In [5]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [92]:
data = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv')
data.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [93]:
data = data[['Make',
'Model',
'Year',
'Engine HP',
'Engine Cylinders',
'Transmission Type',
'Vehicle Style',
'highway MPG',
'city mpg',
'MSRP']]

In [94]:
data.columns = data.columns.str.replace(' ', '_').str.lower()

In [95]:
data.fillna(0, inplace=True)
data.rename(columns={'msrp': 'price'}, inplace=True)

In [29]:
data.groupby('transmission_type')['transmission_type'].count()

transmission_type
AUTOMATED_MANUAL     626
AUTOMATIC           8266
DIRECT_DRIVE          68
MANUAL              2935
UNKNOWN               19
Name: transmission_type, dtype: int64

In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_hp          11914 non-null  float64
 4   engine_cylinders   11914 non-null  float64
 5   transmission_type  11914 non-null  object 
 6   vehicle_style      11914 non-null  object 
 7   highway_mpg        11914 non-null  int64  
 8   city_mpg           11914 non-null  int64  
 9   price              11914 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 930.9+ KB


In [96]:
numerical = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'price']
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']

In [35]:
data[numerical].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [37]:
data.price = data.price.apply(lambda x: 1 if x > data.price.mean() else 0)

In [97]:
from sklearn.model_selection import train_test_split

In [98]:
df_train_full, df_test = train_test_split(data, test_size=0.2, random_state=42)

In [99]:
df_train, df_val = train_test_split(df_train_full, test_size=0.2, random_state=42)

In [100]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['price']
del df_val['price']
del df_test['price']

In [42]:
from sklearn.metrics import mutual_info_score

In [45]:
def calculate_mi(series):
    return mutual_info_score(series, df_train_full.price)

df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')


display(df_mi.head())
display(df_mi.tail())

Unnamed: 0,MI
model,0.460994
make,0.238724
vehicle_style,0.08339
transmission_type,0.020884


Unnamed: 0,MI
model,0.460994
make,0.238724
vehicle_style,0.08339
transmission_type,0.020884


In [102]:
numerical.remove('price')

In [103]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [64]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [66]:
results = model.predict(X_val)

In [69]:
from sklearn.metrics import accuracy_score

round(accuracy_score(y_val, results), 2)

0.94

In [74]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'city_mpg': 0.083,
 'engine_cylinders': -0.054,
 'engine_hp': 0.036,
 'highway_mpg': 0.001,
 'make=Acura': 1.115,
 'make=Alfa Romeo': 1.244,
 'make=Aston Martin': 0.434,
 'make=Audi': 3.177,
 'make=BMW': 2.455,
 'make=Bentley': 0.132,
 'make=Bugatti': 0.0,
 'make=Buick': -0.432,
 'make=Cadillac': 2.46,
 'make=Chevrolet': -1.563,
 'make=Chrysler': -1.266,
 'make=Dodge': -2.962,
 'make=FIAT': -0.395,
 'make=Ferrari': 0.251,
 'make=Ford': -2.177,
 'make=GMC': -0.913,
 'make=Genesis': 0.353,
 'make=HUMMER': -0.066,
 'make=Honda': -1.173,
 'make=Hyundai': -2.532,
 'make=Infiniti': 0.353,
 'make=Kia': -1.521,
 'make=Lamborghini': 0.007,
 'make=Land Rover': 1.989,
 'make=Lexus': 1.275,
 'make=Lincoln': 1.083,
 'make=Lotus': 3.517,
 'make=Maserati': 0.795,
 'make=Maybach': 0.003,
 'make=Mazda': -1.529,
 'make=McLaren': 0.0,
 'make=Mercedes-Benz': 0.899,
 'make=Mitsubishi': -1.516,
 'make=Nissan': -1.192,
 'make=Oldsmobile': -0.991,
 'make=Plymouth': -0.144,
 'make=Pontiac': -2.462,
 'make=Por

In [88]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
lst = categorical+numerical[::-1]
for i in categorical+numerical:
    lst.remove(i)
    print(i)
    train_dict = df_train[lst].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val[lst].to_dict(orient='records')
    X_val = dv.transform(val_dict)



    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    results = model.predict(X_val)
    print(round(accuracy_score(y_val, results), 3) - 0.94)

make
0.0050000000000000044
model
-0.05899999999999994
transmission_type
-0.06099999999999994
vehicle_style
-0.06299999999999994
year
-0.061999999999999944
engine_hp
-0.1449999999999999
engine_cylinders
-0.21399999999999997
highway_mpg
-0.22099999999999997
city_mpg


ValueError: Sample sequence X is empty.

In [106]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [107]:
for i in [0, 0.01, 0.1, 1, 10]:
    ridge_model = Ridge(solver='sag', random_state=42, alpha=i)
    ridge_model.fit(X_train, y_train)
    pred_y = ridge_model.predict(X_val)
    print(np.sqrt(mean_squared_error(y_val, pred_y)), i)
    



57378.32560238737 0




57378.340570448214 0.01




57378.47528124028 0.1




57379.82192403147 1
57393.242390071224 10


