Within `advanced_topics`, do your work for this exercise in a Jupyter notebook or Python script named `cross_validation`.

Use the cross-validation techniques discussed in the lesson to figure out what kind of model works best with the cars dataset used in the lesson.

In [23]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder


In [7]:
# Load the used-car listings from the gzipped CSV, using the 'Id' column as the row index.
df = pd.read_csv('./used_cars.csv.gz', index_col='Id')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0_level_0,Price,Year,Mileage,City,State,Vin,Make,Model
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [8]:
# Normalise every column name to lowercase (e.g. 'Price' -> 'price').
df.columns = df.columns.str.lower()

In [10]:
# Preview the first five rows to check the renamed columns.
df.head(n=5)

Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [11]:
# (number of rows, number of columns) of the frame.
df.shape

(297899, 8)

In [15]:
# Add `avg_sales`: the mean price of each (year, make, model) group, broadcast
# back onto every row of that group by `transform`.
df['avg_sales'] = (
    df.groupby(['year', 'make', 'model'])['price']
      .transform('mean')
)

In [16]:
# Preview the first five rows, now including `avg_sales`.
df.head(n=5)

Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model,avg_sales
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience,17291.768786
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD,16721.350598
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather,19080.632911
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD,16721.350598
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience,17291.768786


In [17]:
# Add `get_avg`: 1 when a car's price is strictly greater than its group's
# average price (`avg_sales`), otherwise 0.
df['get_avg'] = df['price'].gt(df['avg_sales']).astype(int)

In [20]:
# Preview the first ten rows, now including the `get_avg` flag.
df.head(n=10)

Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model,avg_sales,get_avg
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience,17291.768786,0
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD,16721.350598,0
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather,19080.632911,0
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD,16721.350598,0
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience,17291.768786,0
6,17020,2016,16877,Grand Island,NY,KL4CJASB7GB536760,Buick,EncoreFWD,18122.519084,0
7,15950,2015,27885,West Covina,CA,KL4CJASB3FB241802,Buick,EncoreFWD,16721.350598,0
8,17091,2016,24008,Little Rock,AR,KL4CJASBXGB565542,Buick,EncoreFWD,18122.519084,0
9,16995,2015,8624,Punta Gorda,FL,KL4CJFSB7FB173565,Buick,EncoreConvenience,17291.768786,0
10,17700,2015,13807,Jacksonville,NC,KL4CJBSB8FB068543,Buick,EncoreConvenience,17291.768786,1


In [21]:
# Remove `price` and `avg_sales` (already used to derive `get_avg`) together with
# `city` and `vin`. `df` is rebound to the result instead of using `inplace=True`,
# so the cell does not mutate the existing frame object in place.
df = df.drop(columns=['price', 'city', 'vin', 'avg_sales'])

In [22]:
# Preview the first five rows of the remaining columns.
df.head(n=5)

Unnamed: 0_level_0,year,mileage,state,make,model,get_avg
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2015,18681,MO,Buick,EncoreConvenience,0
2,2015,27592,IN,Buick,EncoreFWD,0
3,2015,13650,NC,Buick,EncoreLeather,0
4,2015,25195,LA,Buick,EncoreFWD,0
5,2015,22800,NV,Buick,EncoreConvenience,0


In [25]:
# Replace each listed column with integer codes. A fresh LabelEncoder is fitted
# per column, so the codes of one column are independent of the others.
for col in ('state', 'make', 'model', 'year'):
    df[col] = LabelEncoder().fit_transform(df[col])

In [26]:
# Preview the first five rows after label encoding.
df.head(n=5)

Unnamed: 0_level_0,year,mileage,state,make,model,get_avg
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,18,18681,28,7,523,0
2,18,27592,19,7,525,0
3,18,13650,32,7,526,0
4,18,25195,22,7,525,0
5,18,22800,38,7,523,0


In [5]:
# Hold out a test set (scikit-learn's default is a 75% / 25% train/test split).
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
train, test = train_test_split(df, random_state=42)

In [6]:
# Features and target, taken from the training split only.
# The column names were lowercased and `price` was dropped after deriving
# `get_avg`, so the capitalised 'Year' / 'Mileage' / `Price` lookups raise on the
# prepared frame; `get_avg` is the label column that remains.
# NOTE(review): the encoded `state`, `make` and `model` columns are also available
# as features — confirm whether they should be included.
X, y = train[['year', 'mileage']], train['get_avg']

In [None]:
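# Notes:
# - A trailing underscore on an attribute (e.g. GridSearchCV's `cv_results_`) means
#   it is derived from the training data; its value isn't known until the model
#   has been fit.
# - TODO: look this up further.
# - R^2 can be negative when the model does worse than always predicting the mean.
# - zip() --> look up use cases.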