In [56]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import sklearn.metrics as m
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


In [2]:
df = pd.read_csv('cars - cars.csv')

In [3]:
df.head()

Unnamed: 0,price,year,mileage,city,state,vin,make,model
0,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
1,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
2,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
3,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
4,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [None]:
# Which cars have sold high?

# Well how do we define high?

In [None]:
# Lets try to make an assessment of which cars are similar

In [8]:
df.groupby(['make', 'model', 'year']).price.mean()

make           model     year
AM             General   1997    62489.250000
                         1998    47499.500000
                         1999    48097.500000
                         2000    58658.142857
                         2001    71748.000000
                                     ...     
Mercedes-Benz  Sprinter  2012    31616.800000
                         2013    26869.947368
                         2014    31611.461538
                         2015    35066.157895
                         2016    35125.134615
Name: price, Length: 4796, dtype: float64

In [None]:
lets put this back into original DF

In [9]:
df['mean_price'] = df.groupby(['make', 'model', 'year']).price.transform('mean')

In [10]:
df.head()

Unnamed: 0,price,year,mileage,city,state,vin,make,model,mean_price
0,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience,17291.768786
1,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD,16721.350598
2,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather,19080.632911
3,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD,16721.350598
4,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience,17291.768786


In [11]:
df['sold_high'] = (df.price > df.mean_price).astype(int)

In [12]:
df.head()

Unnamed: 0,price,year,mileage,city,state,vin,make,model,mean_price,sold_high
0,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience,17291.768786,0
1,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD,16721.350598,0
2,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather,19080.632911,0
3,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD,16721.350598,0
4,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience,17291.768786,0


In [13]:
df.sold_high.value_counts()

0    139076
1    122390
Name: sold_high, dtype: int64

In [16]:
num_feats = [col for col in df.columns if 
             (np.issubdtype(df[col], np.number
                           ) and df[col].nunique() > 25)]

In [17]:
num_feats.remove('mean_price')

In [21]:
cars = df.drop(columns={'city', 'vin', 'price', 'mean_price'})

In [22]:
cars.head()

Unnamed: 0,year,mileage,state,make,model,sold_high
0,2015,18681,MO,Buick,EncoreConvenience,0
1,2015,27592,IN,Buick,EncoreFWD,0
2,2015,13650,NC,Buick,EncoreLeather,0
3,2015,25195,LA,Buick,EncoreFWD,0
4,2015,22800,NV,Buick,EncoreConvenience,0


In [28]:
cat_cols = [col for col in cars.columns if col not in num_feats]

In [29]:
cat_cols

['year', 'state', 'make', 'model', 'sold_high']

In [30]:
# remove target

cat_cols.remove('sold_high')

In [None]:
# Lets encode the categorical features

In [31]:
from sklearn.preprocessing import LabelEncoder

In [36]:
for col in ['state', 'make', 'model']:
    encoder = LabelEncoder()
    cars[col] = LabelEncoder.fit_transform(cars[col], cars['sold_high'])

In [None]:
# usually we'd split into train, val, test here

In [37]:
cars.head()

Unnamed: 0,year,mileage,state,make,model,sold_high
0,2015,18681,0,0,0,0
1,2015,27592,0,0,0,0
2,2015,13650,0,0,0,0
3,2015,25195,0,0,0,0
4,2015,22800,0,0,0,0


In [38]:
# Using cv_score

X, y = cars.drop(columns={'sold_high'}),\
cars[['sold_high']]

In [57]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1349)

In [46]:
# We need a model object

tree = DecisionTreeClassifier()

In [61]:
cross_val_score(tree, X_train, y_train, cv=3)

array([1., 1., 1.])

In [60]:
X_train.shape, y_train.shape

((209172, 5), (209172, 1))

In [58]:
X.head()

Unnamed: 0,year,mileage,state,make,model
0,2015,18681,0,0,0
1,2015,27592,0,0,0
2,2015,13650,0,0,0
3,2015,25195,0,0,0
4,2015,22800,0,0,0


In [59]:
y_train.head()

Unnamed: 0,sold_high
203453,1
59785,0
166046,0
53593,1
21873,1


In [63]:
X_train.head()

Unnamed: 0,year,mileage,state,make,model
203453,2012,76351,1,1,1
59785,2012,79750,0,0,0
166046,2013,68781,0,0,0
53593,2012,92365,1,1,1
21873,2016,27850,1,1,1


In [62]:
y_train.head()

Unnamed: 0,sold_high
203453,1
59785,0
166046,0
53593,1
21873,1


In [66]:
param_grid = {
    'max_depth': [None, 10, 4, 3, 2],
    'min_samples_leaf': [1, 3, 5, 20],
    'criterion': ['gini', 'entropy']
}
gsearch = GridSearchCV(DecisionTreeClassifier(), param_grid)

In [67]:
gsearch

In [68]:
gsearch.fit(X_train, y_train)

In [71]:
results = gsearch.cv_results_

In [72]:
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_criterion', 'param_max_depth', 'param_min_samples_leaf', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [74]:
results_df_init = pd.DataFrame(results)

In [76]:
params = pd.DataFrame(results['params'])

In [77]:
results_df_init.head(2)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.037266,0.004662,0.003756,0.000328,gini,,1,"{'criterion': 'gini', 'max_depth': None, 'min_...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
1,0.034294,0.00017,0.003244,0.000113,gini,,3,"{'criterion': 'gini', 'max_depth': None, 'min_...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1


In [83]:
splits = [col for col in results.keys() if 'split' in col]

In [85]:
pd.concat([params, results_df_init[splits]], axis=1)

Unnamed: 0,criterion,max_depth,min_samples_leaf,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
0,gini,,1,1.0,1.0,1.0,1.0,1.0
1,gini,,3,1.0,1.0,1.0,1.0,1.0
2,gini,,5,1.0,1.0,1.0,1.0,1.0
3,gini,,20,1.0,1.0,1.0,1.0,1.0
4,gini,10.0,1,1.0,1.0,1.0,1.0,1.0
5,gini,10.0,3,1.0,1.0,1.0,1.0,1.0
6,gini,10.0,5,1.0,1.0,1.0,1.0,1.0
7,gini,10.0,20,1.0,1.0,1.0,1.0,1.0
8,gini,4.0,1,1.0,1.0,1.0,1.0,1.0
9,gini,4.0,3,1.0,1.0,1.0,1.0,1.0


[]