In [3]:
import pandas as pd
import pydataset


from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


In [6]:
# Load the listings, lower-case every column label and index the rows by `id`.
cars = (
    pd.read_csv('cars.csv')
    .rename(columns=str.lower)
    .set_index('id')
)

# Report the frame's dimensions, then preview the first rows.
n_rows, n_cols = cars.shape
print('{} rows x {} cols'.format(n_rows, n_cols))
cars.head()

297899 rows x 8 cols


Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [7]:
# Mean price of each (year, make, model) group, broadcast back onto every row.
segment_keys = ['year', 'make', 'model']
cars['avg_saleprice'] = cars.groupby(segment_keys)['price'].transform('mean')

# Binary target: 1 when a listing's price exceeds its group's mean, otherwise 0.
cars['gt_avg'] = cars['price'].gt(cars['avg_saleprice']).astype(int)


In [8]:
# Preview the first rows, now including the avg_saleprice and gt_avg columns.
cars.head()

Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model,avg_saleprice,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience,17291.768786,0
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD,16721.350598,0
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather,19080.632911,0
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD,16721.350598,0
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience,17291.768786,0


In [9]:
# Drop price and avg_saleprice (gt_avg was computed from them) along with city and vin.
unused_columns = ['price', 'city', 'vin', 'avg_saleprice']
cars = cars.drop(columns=unused_columns)

In [10]:
from sklearn.preprocessing import LabelEncoder

# Replace each listed column with integer codes; a fresh encoder is fitted per column.
for col in ('state', 'make', 'model', 'year'):
    le = LabelEncoder()
    cars[col] = le.fit_transform(cars[col])

In [11]:
# Preview the frame after state, make, model and year were replaced by integer codes.
cars.head()

Unnamed: 0_level_0,year,mileage,state,make,model,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,18,18681,28,7,523,0
2,18,27592,19,7,525,0
3,18,13650,32,7,526,0
4,18,25195,22,7,525,0
5,18,22800,38,7,523,0


In [12]:
# Separate the predictor columns from the binary target.
X, y = cars.drop(columns='gt_avg'), cars.gt_avg

# Hold out 20% of the rows for testing. The seed is fixed so the split -- and
# every score computed from it -- is reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=123
)

In [13]:
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

# Seed the tree: scikit-learn permutes the features at every split, so tied
# candidate splits could otherwise be resolved differently from run to run.
tree = DecisionTreeClassifier(max_depth=2, random_state=123)

# 3-fold cross-validated score using the estimator's default scorer.
cross_val_score(tree, X_train, y_train, cv=3)

array([0.59468781, 0.59271148, 0.59226576])

In [14]:
# Same 3-fold evaluation of `tree`, scored by precision instead of the default scorer.
cross_val_score(estimator=tree, X=X_train, y=y_train, scoring='precision', cv=3)

array([0.64550202, 0.59026995, 0.59163431])

In [15]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: 3 depths x 3 feature-subset sizes = 9 combinations.
# Named `param_grid` so that `params` is not reused for two different things
# (a later cell binds `params` to the list of evaluated combinations).
param_grid = {'max_depth': [2, 3, 4],
              'max_features': [None, 1, 3]}

# With max_features set to 1 or 3 the tree samples features at random at each
# split, so seed the estimator to make the comparison between grid points
# repeatable.
tree = DecisionTreeClassifier(random_state=123)

grid = GridSearchCV(tree, param_grid, cv=3)

grid.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [2, 3, 4], 'max_features': [None, 1, 3]})

In [16]:
# Keep a handle on the raw cross-validation results and list the available fields.
results = grid.cv_results_
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [17]:
# Mean cross-validated test score of each evaluated parameter combination.
test_scores = results['mean_test_score']
test_scores



array([0.59322169, 0.53931914, 0.58178741, 0.63177085, 0.5592084 ,
       0.58736823, 0.6375824 , 0.56175559, 0.62852732])

In [18]:
# The evaluated parameter combinations, one dict per grid point.
# NOTE: this is the same list object stored inside grid.cv_results_, not a copy.
params = results['params']
params

[{'max_depth': 2, 'max_features': None},
 {'max_depth': 2, 'max_features': 1},
 {'max_depth': 2, 'max_features': 3},
 {'max_depth': 3, 'max_features': None},
 {'max_depth': 3, 'max_features': 1},
 {'max_depth': 3, 'max_features': 3},
 {'max_depth': 4, 'max_features': None},
 {'max_depth': 4, 'max_features': 1},
 {'max_depth': 4, 'max_features': 3}]

In [19]:
# Pair each parameter combination with its mean CV score and rank them.
# The score column is added on the DataFrame rather than written into the
# dicts: `params` is the very list held by grid.cv_results_, so inserting a
# 'score' key in place would alter the search results object itself.
pd.DataFrame(params).assign(score=test_scores).sort_values(by='score')

Unnamed: 0,max_depth,max_features,score
1,2,1.0,0.539319
4,3,1.0,0.559208
7,4,1.0,0.561756
2,2,3.0,0.581787
5,3,3.0,0.587368
0,2,,0.593222
8,4,3.0,0.628527
3,3,,0.631771
6,4,,0.637582


In [20]:
# Search n_neighbors = 1..20 with 5-fold cross-validation.
# NOTE(review): the features are passed in unscaled, so raw mileage likely
# dominates the distance computation -- confirm whether that is intended.
clf = KNeighborsClassifier()
neighbor_grid = {'n_neighbors': range(1, 21)}
grid = GridSearchCV(estimator=clf, param_grid=neighbor_grid, cv=5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 21)})

In [21]:
# Mean cross-validated test score for each candidate in the current search.
grid.cv_results_['mean_test_score']

array([0.5404353 , 0.55109328, 0.55066109, 0.55652299, 0.55702651,
       0.5603372 , 0.56105473, 0.56426471, 0.56400875, 0.56621587,
       0.56642148, 0.56808312, 0.56768869, 0.56964405, 0.56978252,
       0.57103294, 0.57064691, 0.57262745, 0.57235889, 0.57399116])

In [22]:

# The parameter combinations that were evaluated, one dict per candidate.
grid.cv_results_['params']

[{'n_neighbors': 1},
 {'n_neighbors': 2},
 {'n_neighbors': 3},
 {'n_neighbors': 4},
 {'n_neighbors': 5},
 {'n_neighbors': 6},
 {'n_neighbors': 7},
 {'n_neighbors': 8},
 {'n_neighbors': 9},
 {'n_neighbors': 10},
 {'n_neighbors': 11},
 {'n_neighbors': 12},
 {'n_neighbors': 13},
 {'n_neighbors': 14},
 {'n_neighbors': 15},
 {'n_neighbors': 16},
 {'n_neighbors': 17},
 {'n_neighbors': 18},
 {'n_neighbors': 19},
 {'n_neighbors': 20}]

In [23]:
# Best mean cross-validated score and the parameter setting that achieved it.
grid.best_score_, grid.best_params_

(0.5739911589548466, {'n_neighbors': 20})

In [24]:
# The estimator configured with the best parameters found by the search.
grid.best_estimator_

KNeighborsClassifier(n_neighbors=20)

In [25]:
# Exhaustive search over 20 depths x 10 leaf sizes (200 combinations), each
# scored with 10-fold cross-validation.
# The tree is seeded so that tied candidate splits are resolved identically on
# every run and the ranking of grid points is reproducible.
clf = DecisionTreeClassifier(random_state=123)
grid = GridSearchCV(clf, {'max_depth': range(1, 21), 'min_samples_leaf': range(1, 11)}, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(1, 21),
                         'min_samples_leaf': range(1, 11)})

In [26]:
# Attach each combination's mean CV score to its parameter dict.
# This writes into the dicts held by grid.cv_results_['params'] in place.
cv_results = grid.cv_results_
for params, score in zip(cv_results['params'], cv_results['mean_test_score']):
    params.update(score=score)

In [29]:
# Rank the parameter combinations by mean CV score.
# The score column is built here from cv_results_ so this cell does not depend
# on another cell having written a 'score' key into the parameter dicts.
cv_results = grid.cv_results_
(
    pd.DataFrame(cv_results['params'])
    .assign(score=cv_results['mean_test_score'])
    .sort_values(by='score')
)

Unnamed: 0,max_depth,min_samples_leaf,score
0,1,1,0.559225
1,1,2,0.559225
2,1,3,0.559225
3,1,4,0.559225
4,1,5,0.559225
...,...,...,...
116,12,7,0.674176
113,12,4,0.674201
117,12,8,0.674315
118,12,9,0.674516
