In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

%matplotlib inline

# Dataset

In [2]:
remote_csv = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'

!wget $remote_csv -O data.csv

--2023-10-02 13:33:54--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv’


2023-10-02 13:33:58 (403 KB/s) - ‘data.csv’ saved [1475504/1475504]



In [3]:
orig_df = pd.read_csv('data.csv')
orig_df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [4]:
orig_df.columns = orig_df.columns.str.replace(' ', '_').str.lower()
orig_df.head()

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [5]:
orig_df.isna().any()

make                 False
model                False
year                 False
engine_fuel_type      True
engine_hp             True
engine_cylinders      True
transmission_type    False
driven_wheels        False
number_of_doors       True
market_category       True
vehicle_size         False
vehicle_style        False
highway_mpg          False
city_mpg             False
popularity           False
msrp                 False
dtype: bool

In [6]:
orig_df = orig_df.fillna(0)

In [7]:
orig_df.isna().any()

make                 False
model                False
year                 False
engine_fuel_type     False
engine_hp            False
engine_cylinders     False
transmission_type    False
driven_wheels        False
number_of_doors      False
market_category      False
vehicle_size         False
vehicle_style        False
highway_mpg          False
city_mpg             False
popularity           False
msrp                 False
dtype: bool

In [8]:
orig_df = orig_df.rename(columns={'msrp': 'price'})
orig_df.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'price'],
      dtype='object')

In [9]:
hw_columns = ['make', 'model', 'year', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'price']
df_prep = orig_df.copy()
df_prep = df_prep[hw_columns]
df_prep.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


# Question 1

In [10]:
orig_df.transmission_type.mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

# Question 2

In [11]:
numerical = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']

In [12]:
df_prep[numerical].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


* engine_hp and year: 0.338714
* engine_hp and engine_cylinders: 0.774851
* highway_mpg and engine_cylinders: -0.614541
* **highway_mpg and city_mpg: 0.886829**

# Data preparation

In [13]:
price_mean = df_prep.price.mean()
price_mean

40594.737032063116

In [14]:
price_above_avg = (df_prep.price > price_mean)
price_above_avg

0         True
1         True
2        False
3        False
4        False
         ...  
11909     True
11910     True
11911     True
11912     True
11913    False
Name: price, Length: 11914, dtype: bool

In [15]:
df_prep['above_average'] = price_above_avg.astype(int)
df_prep.above_average

0        1
1        1
2        0
3        0
4        0
        ..
11909    1
11910    1
11911    1
11912    1
11913    0
Name: above_average, Length: 11914, dtype: int64

In [16]:
seed = 42

df_full_train, df_test = train_test_split(df_prep, test_size=0.2,random_state=seed)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=seed)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['price']
del df_val['price']
del df_test['price']

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

df_train.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
0,Mitsubishi,Endeavor,2011,225.0,6.0,AUTOMATIC,4dr SUV,19,15
1,Kia,Borrego,2009,276.0,6.0,AUTOMATIC,4dr SUV,21,17
2,Lamborghini,Gallardo,2012,570.0,10.0,MANUAL,Convertible,20,12
3,Chevrolet,Colorado,2016,200.0,4.0,AUTOMATIC,Crew Cab Pickup,27,20
4,Pontiac,Vibe,2009,158.0,4.0,AUTOMATIC,4dr Hatchback,26,20


# Question 3

In [17]:
for c in categorical:
    score = mutual_info_score(df_full_train.above_average, df_full_train[c])
    print(c, score, round(score, 2))

# transmission_type has the lowest mutual info score

make 0.2387236479073192 0.24
model 0.46099440756035703 0.46
transmission_type 0.020883575914971142 0.02
vehicle_style 0.08339022741593435 0.08


# Question 4

In [18]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

In [19]:
# one-hot encoding
dv = DictVectorizer(sparse=False)

cols = categorical + numerical
train_dict = df_train[cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[cols].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [20]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=seed)
model.fit(X_train, y_train)

In [21]:
y_pred = model.predict_proba(X_val)[:, 1]
orig_acc = accuracy_score(y_val, y_pred >= 0.5)
orig_acc_rounded = round(orig_acc, 2)

In [22]:
print(f'{orig_acc=}, {orig_acc_rounded=}')
# don't know why this doesn't match any of the multiple choice answers, so

orig_acc=0.9345362987830466, orig_acc_rounded=0.93


# Question 5

In [23]:
def train_without(df_train, df_val, column_to_drop):
    dv = DictVectorizer(sparse=False)

    cols = (categorical + numerical)
    cols.remove(column_to_drop)

    train_dict = df_train[cols].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    
    val_dict = df_val[cols].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=seed)
    model.fit(X_train, y_train)
    
    y_pred = model.predict_proba(X_val)[:, 1]
    return accuracy_score(y_val, y_pred >= 0.5)

In [24]:
cols_to_drop = ['year', 'engine_hp', 'transmission_type', 'city_mpg']
diffs = []

for c in cols_to_drop:
    acc = train_without(df_train, df_val, c)
    diffs.append(orig_acc - acc)
    print(c, acc, orig_acc - acc)

min(diffs)

year 0.9475451112043642 -0.013008812421317684
engine_hp 0.9299202685690307 0.004616030214015909
transmission_type 0.9458665547629039 -0.011330255979857373
city_mpg 0.9458665547629039 -0.011330255979857373


-0.013008812421317684

# Question 6

In [25]:
from sklearn.linear_model import Ridge

df_full_train, df_test = train_test_split(df_prep, test_size=0.2,random_state=seed)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=seed)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)

del df_train['price']
del df_val['price']
del df_test['price']

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

df_train.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg'],
      dtype='object')

In [26]:
def rmse(y, y_pred):
    err = y - y_pred
    sq_err = err ** 2
    mse = sq_err.mean()
    return np.sqrt(mse)

In [30]:
cols = categorical + numerical
alphas = [0, 0.01, 0.1, 1, 10]
rmse_scores = []

for a in alphas:
    dv = DictVectorizer(sparse=True)

    train_dict = df_train[cols].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    
    val_dict = df_val[cols].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    model = Ridge(solver='sag', alpha=a, random_state=seed)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    rmse_score = rmse(y_val, y_pred)
    rmse_scores.append(rmse_score)

    print(a, rmse_score, round(rmse_score, 3))

min(rmse_scores)

0 0.2511927061623568 0.251
0.01 0.25494057332611675 0.255
0.1 0.25130047172580444 0.251
1 0.2583176591229197 0.258
10 0.33102044719593593 0.331


0.2511927061623568