In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import env


In [3]:
url = f'mysql+pymysql://{env.username}:{env.password}@{env.host}/used_cars'
cars = pd.read_sql('SELECT * FROM cars', url)
cars.columns = [c.lower() for c in cars]
cars.set_index('id', inplace=True)

print('{} rows x {} cols'.format(*cars.shape))
cars.head()

297899 rows x 8 cols


Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


# data prep

In [4]:
cars['avg_saleprice'] = cars.groupby(['year', 'make', 'model']).price.transform('mean')
cars['gt_avg'] = (cars.price > cars.avg_saleprice).astype(int)


In [5]:
cars.drop(columns=['price', 'vin', 'avg_saleprice'], inplace=True)


# encode cats

In [8]:
from sklearn.preprocessing import LabelEncoder

for col in ['state', 'make', 'model', 'year','city']:
    le = LabelEncoder().fit(cars[col])
    cars[col] = le.transform(cars[col])

# split

In [12]:
x, y = cars.drop(columns='gt_avg'), cars.gt_avg

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)


In [15]:
X

Unnamed: 0_level_0,year,mileage,city,state,make,model
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,18,18681,1063,28,7,523
2,18,27592,987,19,7,525
3,18,13650,213,32,7,526
4,18,25195,1554,22,7,525
5,18,22800,1176,38,7,523
...,...,...,...,...,...,...
297895,20,4416,2253,5,40,83
297896,19,2782,1576,44,40,82
297897,20,5702,93,49,40,83
297898,20,3850,1850,5,40,85


# cross val

In [18]:
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

tree = DecisionTreeClassifier(max_depth=2)

cross_val_score(tree, x_train, y_train, cv=3)

# accuracy is the default metric

array([0.59324018, 0.59300101, 0.59242941])

In [19]:
cross_val_score(tree, x_train, y_train, cv=3, scoring='precision')

array([0.59074286, 0.5924219 , 0.58726959])

# Grid Search

In [22]:
from sklearn.model_selection import GridSearchCV


In [23]:
params = {'max_depth':[2,3,4,5],
          'max_features' : [None,1,2,3,4,5]}


In [24]:
tree = DecisionTreeClassifier()

In [27]:
grid = GridSearchCV(tree, params, cv=3)

grid.fit(x_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [2, 3, 4, 5],
                         'max_features': [None, 1, 2, 3, 4, 5]})

In [28]:
results = grid.cv_results_
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [29]:
test_scores = results['mean_test_score']
test_scores

array([0.5928902 , 0.55388776, 0.55369916, 0.57581211, 0.55987158,
       0.57617318, 0.63117923, 0.56123086, 0.55344302, 0.5950009 ,
       0.61435303, 0.62726432, 0.63897969, 0.55793277, 0.59764854,
       0.60152579, 0.61821335, 0.63401996, 0.65179445, 0.58151055,
       0.60695957, 0.62598019, 0.63146041, 0.65116504])

In [30]:
params = results['params']
params

[{'max_depth': 2, 'max_features': None},
 {'max_depth': 2, 'max_features': 1},
 {'max_depth': 2, 'max_features': 2},
 {'max_depth': 2, 'max_features': 3},
 {'max_depth': 2, 'max_features': 4},
 {'max_depth': 2, 'max_features': 5},
 {'max_depth': 3, 'max_features': None},
 {'max_depth': 3, 'max_features': 1},
 {'max_depth': 3, 'max_features': 2},
 {'max_depth': 3, 'max_features': 3},
 {'max_depth': 3, 'max_features': 4},
 {'max_depth': 3, 'max_features': 5},
 {'max_depth': 4, 'max_features': None},
 {'max_depth': 4, 'max_features': 1},
 {'max_depth': 4, 'max_features': 2},
 {'max_depth': 4, 'max_features': 3},
 {'max_depth': 4, 'max_features': 4},
 {'max_depth': 4, 'max_features': 5},
 {'max_depth': 5, 'max_features': None},
 {'max_depth': 5, 'max_features': 1},
 {'max_depth': 5, 'max_features': 2},
 {'max_depth': 5, 'max_features': 3},
 {'max_depth': 5, 'max_features': 4},
 {'max_depth': 5, 'max_features': 5}]

In [32]:
for p, s in zip(params, test_scores):
    p['score'] = s

df = pd.DataFrame(params).sort_values(by='score')

In [37]:
df.sort_values(by = "score",ascending= False)

Unnamed: 0,max_depth,max_features,score
18,5,,0.651794
23,5,5.0,0.651165
12,4,,0.63898
17,4,5.0,0.63402
22,5,4.0,0.63146
6,3,,0.631179
11,3,5.0,0.627264
21,5,3.0,0.62598
16,4,4.0,0.618213
10,3,4.0,0.614353


In [39]:
model = grid.best_estimator_

In [40]:
model

DecisionTreeClassifier(max_depth=5)

In [41]:
model.fit(x_test, y_test)

DecisionTreeClassifier(max_depth=5)

In [53]:
model.score(x_test, y_test)

0.6612621685129239