In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Get the data

In [3]:
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

--2023-10-02 17:16:30--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8003::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv’


2023-10-02 17:16:31 (9.12 MB/s) - ‘data.csv’ saved [1475504/1475504]



In [4]:
# Load the downloaded CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data.csv")
df.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [5]:
# Raw column names (as spelled in the CSV header) that this notebook works with.
cols_to_keep = (
    ["Make", "Model", "Year"]
    + ["Engine HP", "Engine Cylinders", "Transmission Type", "Vehicle Style"]
    + ["highway MPG", "city mpg"]
    + ["MSRP"]
)

In [6]:
# Take an explicit copy of the selected columns: `data` is modified in the
# cells that follow (column renaming, new columns), and the copy guarantees
# those changes never touch, or depend on, the raw `df`.
data = df[cols_to_keep].copy()
data.head()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [7]:
# Normalise the column names: spaces become underscores, then everything is lower-cased.
data.columns = [name.replace(" ", "_").lower() for name in data.columns]
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [8]:
# Replace every missing value in the frame with 0.
data = data.fillna(value=0)

In [9]:
# Rename the MSRP column to `price`. Reassigning the result (instead of
# inplace=True) avoids mutating the frame in place.
data = data.rename(columns={"msrp": "price"})

In [10]:
# Preview the first rows of `data` to check the result of the preceding steps.
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [11]:
# Q1 - What is the most frequent observation (mode) for the column transmission_type?
# value_counts() sorts by count in descending order, so the mode is the first row.
data.loc[:, "transmission_type"].value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

In [12]:
# Q2 - Correlation
# Pairwise Pearson correlation (the pandas default) between the numeric features.
num_features = ["year", "engine_hp", "engine_cylinders", "highway_mpg", "city_mpg"]
corr_matrix = data.loc[:, num_features].corr(method="pearson")
print(corr_matrix)

                      year  engine_hp  engine_cylinders  highway_mpg  city_mpg
year              1.000000   0.338714         -0.040708     0.258240  0.198171
engine_hp         0.338714   1.000000          0.774851    -0.415707 -0.424918
engine_cylinders -0.040708   0.774851          1.000000    -0.614541 -0.587306
highway_mpg       0.258240  -0.415707         -0.614541     1.000000  0.886829
city_mpg          0.198171  -0.424918         -0.587306     0.886829  1.000000


In [13]:
# Make prices binary: 1 when the price is strictly above the mean price, 0 otherwise.
mean_price = data["price"].mean()
data["above_average"] = data["price"].gt(mean_price).astype(int)
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


In [14]:
# Split dataset in train/val/test
# 20% is held out for test first; 25% of the remaining 80% (i.e. 20% of the
# whole) then becomes validation, leaving 60% for training.
df_full_train, df_test = train_test_split(data, random_state=42, test_size=0.2)
df_train, df_val = train_test_split(df_full_train, random_state=42, test_size=0.25)

# `above_average` is derived from `price`, so the raw price is removed from
# each of the three splits (`df_full_train` keeps it).
for split in (df_train, df_val, df_test):
    del split["price"]

In [15]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,above_average
3972,Mitsubishi,Endeavor,2011,225.0,6.0,AUTOMATIC,4dr SUV,19,15,0
1997,Kia,Borrego,2009,276.0,6.0,AUTOMATIC,4dr SUV,21,17,0
5216,Lamborghini,Gallardo,2012,570.0,10.0,MANUAL,Convertible,20,12,1
2805,Chevrolet,Colorado,2016,200.0,4.0,AUTOMATIC,Crew Cab Pickup,27,20,0
11369,Pontiac,Vibe,2009,158.0,4.0,AUTOMATIC,4dr Hatchback,26,20,0


In [16]:
# Q3 - Mutual Information
# Names of the categorical columns scored against the target below.
cat_features = "make model transmission_type vehicle_style".split()

In [17]:
def mut_info_score_price(series, target=None):
    """Mutual information between one feature column and the binary price target.

    Parameters
    ----------
    series : array-like
        Values of a single feature; passed as the first argument of
        ``mutual_info_score``.
    target : array-like, optional
        Labels to score ``series`` against. When omitted, the notebook-level
        ``df_train["above_average"]`` column is used, which is the original
        behaviour of this helper.

    Returns
    -------
    The value returned by ``sklearn.metrics.mutual_info_score``.
    """
    if target is None:
        # Looked up at call time, so the helper follows whatever `df_train`
        # currently refers to in the notebook.
        target = df_train["above_average"]
    return mutual_info_score(series, target)

In [18]:
# Score every categorical feature against the target and list the scores in
# ascending order (least informative feature first).
mutual_info = pd.Series(
    {column: mut_info_score_price(df_train[column]) for column in cat_features}
)
mutual_info.sort_values()

transmission_type    0.020958
vehicle_style        0.084143
make                 0.239769
model                0.462344
dtype: float64

In [19]:
# Q4 - Logistic regression
# Turn each split into a list of per-row dicts restricted to the model's
# feature columns, then vectorize them into dense matrices.
q4_columns = cat_features + num_features

train_dict = df_train[q4_columns].to_dict(orient="records")
val_dict = df_val[q4_columns].to_dict(orient="records")
test_dict = df_test[q4_columns].to_dict(orient="records")

# The vectorizer is fitted on the training split only; the validation and
# test splits are transformed with that fitted vocabulary.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

# Binary targets as plain NumPy arrays.
y_train = df_train["above_average"].to_numpy()
y_val = df_val["above_average"].to_numpy()
y_test = df_test["above_average"].to_numpy()

In [20]:
# Fit the classifier on the vectorized training split.
logreg_params = dict(solver="liblinear", C=10, max_iter=1000, random_state=42)
model = LogisticRegression(**logreg_params)
model.fit(X_train, y_train)

In [21]:
# Validation accuracy: share of validation rows whose predicted class equals
# the true label, rounded to two decimals.
y_pred = model.predict(X_val)
np.round(np.mean(y_pred == y_val), 2)

0.93

In [22]:
# Q5 - Feature elimination
# Baseline feature set for the elimination experiment; each of the following
# cells retrains with one of these features left out.
all_features = ["year", "engine_hp"] + ["transmission_type", "city_mpg"]

In [23]:
# Baseline: train on every feature in `all_features`.
train_dict = df_train[all_features].to_dict(orient="records")
val_dict = df_val[all_features].to_dict(orient="records")
test_dict = df_test[all_features].to_dict(orient="records")

# Fit the vectorizer on the training rows only, then reuse it for val/test.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

model = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Validation accuracy of the baseline; the elimination cells compare against it.
y_pred = model.predict(X_val)
accuracy_all_features = np.mean(y_pred == y_val)
accuracy_all_features

0.8850188837599664

In [24]:
# Elimination step: retrain without `city_mpg`.
features = [name for name in all_features if name != "city_mpg"]

train_dict = df_train[features].to_dict(orient="records")
val_dict = df_val[features].to_dict(orient="records")
test_dict = df_test[features].to_dict(orient="records")

# Fit the vectorizer on the training rows only, then reuse it for val/test.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

model = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Absolute change in validation accuracy relative to the baseline model.
y_pred = model.predict(X_val)
accuracy = np.mean(y_pred == y_val)
abs(accuracy - accuracy_all_features)

0.008392782207301663

In [25]:
# Elimination step: retrain without `transmission_type`.
features = [name for name in all_features if name != "transmission_type"]

train_dict = df_train[features].to_dict(orient="records")
val_dict = df_val[features].to_dict(orient="records")
test_dict = df_test[features].to_dict(orient="records")

# Fit the vectorizer on the training rows only, then reuse it for val/test.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

model = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Absolute change in validation accuracy relative to the baseline model.
y_pred = model.predict(X_val)
accuracy = np.mean(y_pred == y_val)
abs(accuracy - accuracy_all_features)

0.002937473772555599

In [26]:
# Elimination step: retrain without `engine_hp`.
features = [name for name in all_features if name != "engine_hp"]

train_dict = df_train[features].to_dict(orient="records")
val_dict = df_val[features].to_dict(orient="records")
test_dict = df_test[features].to_dict(orient="records")

# Fit the vectorizer on the training rows only, then reuse it for val/test.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

model = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Absolute change in validation accuracy relative to the baseline model.
y_pred = model.predict(X_val)
accuracy = np.mean(y_pred == y_val)
abs(accuracy - accuracy_all_features)

0.14057910197230383

In [27]:
# Elimination step: retrain without `year`.
features = [name for name in all_features if name != "year"]

train_dict = df_train[features].to_dict(orient="records")
val_dict = df_val[features].to_dict(orient="records")
test_dict = df_test[features].to_dict(orient="records")

# Fit the vectorizer on the training rows only, then reuse it for val/test.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

model = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Absolute change in validation accuracy relative to the baseline model.
y_pred = model.predict(X_val)
accuracy = np.mean(y_pred == y_val)
abs(accuracy - accuracy_all_features)

0.00041963911036513313

In [28]:
#Q6 - Regression with Scikit-Learn. What's the best alpha?
# Rebuild the splits from `data`: the earlier split frames had their `price`
# column deleted, and the regression target needs it. The sizes and seed are
# the same as in the first split.
df_full_train, df_test = train_test_split(data, random_state=42, test_size=0.2)
df_train, df_val = train_test_split(df_full_train, random_state=42, test_size=0.25)

In [33]:
# Per-row dicts restricted to the categorical + numeric feature columns.
q6_columns = cat_features + num_features

train_dict = df_train[q6_columns].to_dict(orient="records")
val_dict = df_val[q6_columns].to_dict(orient="records")
test_dict = df_test[q6_columns].to_dict(orient="records")

# Sparse output this time; the vectorizer is fitted on the training rows only.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

In [34]:
# Regression targets: log1p-transformed price of each split, as NumPy arrays.
y_train = np.log1p(df_train["price"].to_numpy())
y_val = np.log1p(df_val["price"].to_numpy())
y_test = np.log1p(df_test["price"].to_numpy())

In [35]:
def rmse(y, y_pred):
    """Root mean squared error between true values and predictions.

    Both inputs are converted to float NumPy arrays first, so plain lists and
    tuples are accepted, and pandas Series are compared element by element in
    positional order rather than being aligned on their index labels.

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Predicted values, in the same order as ``y``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_pred - y) ** 2))``.
    """
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    error = y_pred - y
    return np.sqrt(np.mean(error ** 2))

In [36]:
# Candidate regularisation strengths for the ridge regression.
alphas = [0, 0.01, 0.1, 1, 10]

# Fit one model per alpha and print its validation RMSE rounded to 3 decimals.
for alpha in alphas:
    model = Ridge(alpha=alpha, solver="sag", random_state=42).fit(X_train, y_train)
    y_pred = model.predict(X_val)

    print(f"{alpha}: {round(rmse(y_val, y_pred), 3)}")
    

0: 0.255
0.01: 0.251
0.1: 0.255
1: 0.258
10: 0.331
