In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder ,OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load only the columns this notebook works with; every other CSV column is skipped.
df = pd.read_csv(
    'used_cars_data.csv',
    usecols=[
        'Name',
        'Location',
        'Owner_Type',
        'Fuel_Type',
        'Transmission',
        'Price',
        'Kilometers_Driven',
    ],
)

df

Unnamed: 0,Name,Location,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Price
0,Maruti Wagon R LXI CNG,Mumbai,72000,CNG,Manual,First,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,41000,Diesel,Manual,First,12.50
2,Honda Jazz V,Chennai,46000,Petrol,Manual,First,4.50
3,Maruti Ertiga VDI,Chennai,87000,Diesel,Manual,First,6.00
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,40670,Diesel,Automatic,Second,17.74
...,...,...,...,...,...,...,...
7248,Volkswagen Vento Diesel Trendline,Hyderabad,89411,Diesel,Manual,First,
7249,Volkswagen Polo GT TSI,Mumbai,59000,Petrol,Automatic,First,
7250,Nissan Micra Diesel XV,Kolkata,28000,Diesel,Manual,First,
7251,Volkswagen Polo GT TSI,Pune,52262,Petrol,Automatic,Third,


In [4]:
# Split the data: 'Transmission' is the target, every other column is a feature.
# 30% of the rows are held out for testing; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='Transmission'),
    df['Transmission'],
    random_state=0,
    test_size=0.3,
)
x_train

Unnamed: 0,Name,Location,Kilometers_Driven,Fuel_Type,Owner_Type,Price
6811,Honda City i VTEC CVT VX,Hyderabad,39822,Petrol,First,
5029,Ford Figo Aspire 1.2 Ti-VCT Ambiente,Chennai,29875,Petrol,First,4.85
2423,BMW 3 Series 320d,Mumbai,25000,Diesel,First,17.99
2434,Hyundai Xcent 1.2 Kappa S Option,Kolkata,42000,Petrol,First,3.45
2559,Maruti Swift VDI,Chennai,78000,Diesel,First,4.40
...,...,...,...,...,...,...
4931,Mahindra TUV 300 T8,Pune,67000,Diesel,First,6.30
3264,Toyota Innova 2.5 G (Diesel) 8 Seater,Jaipur,147350,Diesel,Second,4.95
1653,Mahindra Scorpio VLX,Kolkata,62000,Diesel,First,6.25
2607,Hyundai Verna CRDi 1.6 SX Option,Pune,50000,Diesel,First,11.50


In [5]:
# Data cleaning / imputation transformer.
# Column 5 of x_train is Price, the only feature shown with missing values;
# it is filled with the column mean (SimpleImputer's default strategy).
# NOTE: ColumnTransformer emits the transformed columns first and appends the
# passthrough columns after them, so the output layout is
# [Price, Name, Location, Kilometers_Driven, Fuel_Type, Owner_Type].
trf1 = ColumnTransformer(
    transformers=[('impute_price', SimpleImputer(strategy='mean'), [5])],
    remainder='passthrough',
)

### `handle_unknown='ignore'`: suppose the encoder is trained on data like this
City: Delhi, Mumbai, Pune

#### The encoder learns
##### ['Delhi', 'Mumbai', 'Pune']
##### Test / real-world data
##### City: Delhi, Chennai
##### Without `handle_unknown='ignore'`, an unseen value such as 'Chennai' raises `ValueError: Found unknown categories ['Chennai']`
##### With `OneHotEncoder(handle_unknown='ignore')`, an unseen category is encoded as all zeros: Chennai → [0, 0, 0]


In [6]:
#Encoding Phase 
trf2 = ColumnTransformer([
    ('ohe_loc_ftype',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,3])
],remainder='passthrough')

In [7]:
# Scaling Phase
trf3 = ColumnTransformer([
    ('Scale_kilometer',MinMaxScaler(),[2])
])

In [8]:
# Final estimator. random_state is fixed (same seed as the train/test split)
# so repeated runs build the same tree and report the same scores.
trf5 = DecisionTreeClassifier(random_state=0)

# Create Pipeline

In [9]:
# Chain the steps in execution order. The step names matter: they are the
# prefixes used to address parameters later (e.g. 'trf5__max_depth').
pipe = Pipeline(
    steps=[
        ('trf1', trf1),  # imputation
        ('trf2', trf2),  # encoding
        ('trf3', trf3),  # scaling
        ('trf5', trf5),  # classifier
    ]
)
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_price', SimpleImputer(), [5])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_loc_ftype',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 3])]),
 'trf3': ColumnTransformer(transformers=[('Scale_kilometer', MinMaxScaler(), [2])]),
 'trf5': DecisionTreeClassifier()}

In [10]:
# Alternative construction, kept for reference and not executed: make_pipeline
# generates the step names automatically instead of using 'trf1' ... 'trf5'.
# pipe = make_pipeline(trf1,trf2,trf3,trf5)
# x_train
# Show the pipeline steps keyed by name.
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_price', SimpleImputer(), [5])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_loc_ftype',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 3])]),
 'trf3': ColumnTransformer(transformers=[('Scale_kilometer', MinMaxScaler(), [2])]),
 'trf5': DecisionTreeClassifier()}

In [11]:
# Train: fit every transformer in order on x_train, then fit the classifier on
# the transformed features.
pipe.fit(x_train,y_train)

0,1,2
,steps,"[('trf1', ...), ('trf2', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('impute_price', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ohe_loc_ftype', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('Scale_kilometer', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [12]:
# Inspect the pipeline steps by name (the pipeline was fitted in the previous cell).
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_price', SimpleImputer(), [5])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_loc_ftype',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 3])]),
 'trf3': ColumnTransformer(transformers=[('Scale_kilometer', MinMaxScaler(), [2])]),
 'trf5': DecisionTreeClassifier()}

In [13]:
# Predict the Transmission label for every row of the hold-out set; x_test
# goes through the same fitted transformers before reaching the classifier.
y_pred = pipe.predict(x_test)

In [14]:
# Show the predicted labels for x_test.
y_pred

array(['Manual', 'Manual', 'Manual', ..., 'Manual', 'Manual', 'Manual'],
      shape=(2176,), dtype=object)

In [16]:
# Accuracy on the hold-out test set (accuracy_score is imported in the import
# cell at the top of the notebook).
# NOTE(review): compare against the majority-class share of y_test; a score
# close to that baseline means the model is not learning from the features.
accuracy_score(y_test, y_pred)

0.7316176470588235

In [17]:
# Cross-validation with cross_val_score (imported in the import cell at the
# top of the notebook): mean accuracy over 5 folds of the training set. The
# whole pipeline is refitted inside each fold, so preprocessing never sees the
# fold's validation rows.
cross_val_score(pipe, x_train, y_train, cv=5, scoring='accuracy').mean()


np.float64(0.7120348318529149)

In [18]:
# GridSearchCV search space. The key is '<step name>__<parameter>', so this
# tunes max_depth of the 'trf5' step; None lets the tree grow without a depth limit.
params = dict(trf5__max_depth=[1, 2, 3, 4, 5, None])

In [20]:
# Exhaustive search over `params` with 5-fold cross-validation, scored by
# accuracy (GridSearchCV is imported in the import cell at the top of the
# notebook). With the default refit=True the best configuration is refitted on
# all of x_train.
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)

0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'trf5__max_depth': [1, 2, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('impute_price', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ohe_loc_ftype', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('Scale_kilometer', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0
