In [6]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the listings, lower-case every column name, and index the frame by `id`.
cars = (
    pd.read_csv('cars.csv')
      .rename(columns=str.lower)
      .set_index('id')
)

# Report the frame size, then preview the first rows.
n_rows, n_cols = cars.shape
print('{} rows x {} cols'.format(n_rows, n_cols))
cars.head()

297899 rows x 8 cols


Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [3]:
# Mean price of each (year, make, model) group, broadcast back onto every row.
group_mean_price = cars.groupby(['year', 'make', 'model'])['price'].transform('mean')
cars['avg_saleprice'] = group_mean_price

# Target: 1 when a listing is priced strictly above its group mean, else 0.
cars['gt_avg'] = cars['price'].gt(cars['avg_saleprice']).astype(int)

In [4]:
# `price` and `avg_saleprice` were used to derive `gt_avg`; `city` and `vin`
# are removed along with them before modelling.
dropped_cols = ['price', 'city', 'vin', 'avg_saleprice']
cars = cars.drop(columns=dropped_cols)

In [7]:
# Replace each listed column with integer codes from its own LabelEncoder.
for feature in ('state', 'make', 'model', 'year'):
    le = LabelEncoder()
    cars[feature] = le.fit_transform(cars[feature])

In [8]:
# Preview the frame after the column drop and label encoding.
cars.head()

Unnamed: 0_level_0,year,mileage,state,make,model,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,18,18681,28,7,523,0
2,18,27592,19,7,525,0
3,18,13650,32,7,526,0
4,18,25195,22,7,525,0
5,18,22800,38,7,523,0


In [9]:
# Features are every column except the target; the target is `gt_avg`.
X = cars.drop(columns='gt_avg')
y = cars['gt_avg']

# Hold out 20% of the rows as the test set.
# random_state is fixed so the split (and every score computed from it)
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)

In [10]:
# Carve a validation set (about one third) out of the training rows.
# random_state is fixed so the split is reproducible across kernel restarts.
# NOTE: X_train / y_train are both input and output here, so re-running this
# cell on its own splits the already-reduced training set again.
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train, y_train, test_size=.3333, random_state=42
)

In [11]:
# Baseline: a depth-2 decision tree scored with 3-fold cross-validation
# using the estimator's default scorer.
tree = DecisionTreeClassifier(max_depth=2)

cross_val_score(estimator=tree, X=X_train, y=y_train, cv=3)

array([0.59209259, 0.59331219, 0.590952  ])

In [12]:
# Same 3-fold cross-validation of `tree`, scored on precision instead.
cross_val_score(estimator=tree, X=X_train, y=y_train, scoring='precision', cv=3)

array([0.58724088, 0.58910644, 0.59893631])

### Decision Tree

In [13]:
# Candidate settings: 3 depths x 3 max_features values = 9 combinations.
params = {'max_depth': [2, 3, 4],
          'max_features': [None, 1, 3]}

# random_state is fixed because max_features of 1 or 3 makes the tree consider
# a random subset of features at each split; without a seed the grid scores
# differ from run to run.
tree = DecisionTreeClassifier(random_state=42)

# Exhaustive search over `params`, each combination scored with 3-fold CV.
grid = GridSearchCV(tree, params, cv=3)

grid.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [2, 3, 4], 'max_features': [None, 1, 3]})

In [14]:
# Keep a handle on the search's per-combination results and list the fields it offers.
results = grid.cv_results_
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [15]:
# Mean cross-validated test score, one entry per parameter combination.
test_scores = results['mean_test_score']
test_scores

array([0.59211893, 0.53724972, 0.59143289, 0.63154943, 0.54813802,
       0.59053916, 0.63778658, 0.54591628, 0.62192616])

In [16]:
# NOTE: this rebinds `params` from the search grid to the list of evaluated
# combinations; it is the same list object held in grid.cv_results_, not a copy.
params = results['params']
params

[{'max_depth': 2, 'max_features': None},
 {'max_depth': 2, 'max_features': 1},
 {'max_depth': 2, 'max_features': 3},
 {'max_depth': 3, 'max_features': None},
 {'max_depth': 3, 'max_features': 1},
 {'max_depth': 3, 'max_features': 3},
 {'max_depth': 4, 'max_features': None},
 {'max_depth': 4, 'max_features': 1},
 {'max_depth': 4, 'max_features': 3}]

In [17]:
# Pair each parameter combination with its mean CV score.
# The score column is attached to a new DataFrame rather than written into the
# `params` dicts: those dicts are the objects stored in grid.cv_results_, so
# adding a 'score' key to them would alter the search results themselves.
# Ascending sort: the best-scoring combination is the last row.
pd.DataFrame(params).assign(score=test_scores).sort_values(by='score')

Unnamed: 0,max_depth,max_features,score
1,2,1.0,0.53725
7,4,1.0,0.545916
4,3,1.0,0.548138
5,3,3.0,0.590539
2,2,3.0,0.591433
0,2,,0.592119
8,4,3.0,0.621926
3,3,,0.631549
6,4,,0.637787


In [18]:
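# Best decision tree in this grid: max_depth=4 with max_features=None
# (mean CV accuracy ~0.638). max_depth=4 is the largest depth tried, so
# deeper trees may score higher and are worth adding to the grid.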

### KNN

In [19]:
# Try every neighbour count from 2 through 10.
params = {'n_neighbors': list(range(2, 11))}

knn = KNeighborsClassifier()

# Exhaustive search over `params`, each candidate scored with 3-fold CV.
grid = GridSearchCV(estimator=knn, param_grid=params, cv=3)

grid.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10]})

In [20]:
# Keep a handle on the KNN search's per-candidate results and list the fields it offers.
results = grid.cv_results_
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_n_neighbors', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [21]:
# Mean cross-validated test score, one entry per n_neighbors candidate.
test_scores = results['mean_test_score']
test_scores

array([0.54798693, 0.5468981 , 0.55300937, 0.55305342, 0.55819544,
       0.55828984, 0.5627899 , 0.56170108, 0.56418712])

In [22]:
# NOTE: this rebinds `params` from the search grid to the list of evaluated
# candidates; it is the same list object held in grid.cv_results_, not a copy.
params = results['params']
params

[{'n_neighbors': 2},
 {'n_neighbors': 3},
 {'n_neighbors': 4},
 {'n_neighbors': 5},
 {'n_neighbors': 6},
 {'n_neighbors': 7},
 {'n_neighbors': 8},
 {'n_neighbors': 9},
 {'n_neighbors': 10}]

In [23]:
# Pair each n_neighbors candidate with its mean CV score.
# The score column is attached to a new DataFrame rather than written into the
# `params` dicts: those dicts are the objects stored in grid.cv_results_, so
# adding a 'score' key to them would alter the search results themselves.
# Ascending sort: the best-scoring candidate is the last row.
pd.DataFrame(params).assign(score=test_scores).sort_values(by='score')

Unnamed: 0,n_neighbors,score
1,3,0.546898
0,2,0.547987
2,4,0.553009
3,5,0.553053
4,6,0.558195
5,7,0.55829
7,9,0.561701
6,8,0.56279
8,10,0.564187


In [24]:
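# Best KNN in this grid: n_neighbors=10 (mean CV accuracy ~0.564).
# 10 is the upper edge of the grid and the score is still rising,
# so larger neighbour counts are worth trying.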

### Logistic Regression

In [25]:
# Candidate values for LogisticRegression's `C` parameter.
params = dict(C=[0.2, 0.5, 1, 1.2, 1.5, 2])

logit = LogisticRegression()

# Exhaustive search over `params`, each candidate scored with 3-fold CV.
grid = GridSearchCV(estimator=logit, param_grid=params, cv=3)

grid.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'C': [0.2, 0.5, 1, 1.2, 1.5, 2]})

In [26]:
# Keep a handle on the logistic-regression search's per-candidate results and list its fields.
results = grid.cv_results_
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_C', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [27]:
# Mean cross-validated test score, one entry per C candidate.
test_scores = results['mean_test_score']
test_scores

array([0.58746153, 0.58746153, 0.58746153, 0.58746153, 0.58746153,
       0.58746153])

In [28]:
# NOTE: this rebinds `params` from the search grid to the list of evaluated
# candidates; it is the same list object held in grid.cv_results_, not a copy.
params = results['params']
params

[{'C': 0.2}, {'C': 0.5}, {'C': 1}, {'C': 1.2}, {'C': 1.5}, {'C': 2}]

In [29]:
# Pair each C candidate with its mean CV score.
# The score column is attached to a new DataFrame rather than written into the
# `params` dicts: those dicts are the objects stored in grid.cv_results_, so
# adding a 'score' key to them would alter the search results themselves.
pd.DataFrame(params).assign(score=test_scores).sort_values(by='score')

Unnamed: 0,C,score
0,0.2,0.587462
1,0.5,0.587462
2,1.0,0.587462
3,1.2,0.587462
4,1.5,0.587462
5,2.0,0.587462


In [30]:
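# Every C in the grid gives the same mean CV accuracy (0.587462), so C has
# no measurable effect over this range.
# NOTE(review): identical scores across all candidates may mean the model is
# not fitting meaningfully (e.g. unscaled features or solver non-convergence)
# - TODO confirm before drawing conclusions.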