In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

--2023-10-01 09:53:28--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv’


2023-10-01 09:53:28 (38.7 MB/s) - ‘data.csv’ saved [1475504/1475504]



*Dataset*

In [3]:
import pandas as pd

cars = pd.read_csv("data.csv")
cars.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


*Features*

In [4]:
selected_columns = [
    'Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg',
    'MSRP'
]

data = cars[selected_columns]
data.head()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


*Data Preparation*

In [5]:
data.columns = data.columns.str.replace(' ', '_').str.lower()
data.fillna(0, inplace=True)
data.rename(columns={'msrp': 'price'}, inplace=True)

data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


**Question #1**

In [6]:
data['transmission_type'].value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

**Question #2**

In [7]:
numerical_features = [
    'year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg'
]

data[numerical_features].corr().round(3)

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.339,-0.041,0.258,0.198
engine_hp,0.339,1.0,0.775,-0.416,-0.425
engine_cylinders,-0.041,0.775,1.0,-0.615,-0.587
highway_mpg,0.258,-0.416,-0.615,1.0,0.887
city_mpg,0.198,-0.425,-0.587,0.887,1.0


* `engine_hp` and `year` ~ `0.339`
* `engine_hp` and `engine_cylinders` ~ `0.775`
* `highway_mpg` and `engine_cylinders`~ `-0.615`
* `highway_mpg` and `city_mpg` ~ **`0.887`**

*Make `price` binary*

In [8]:
data['above_average'] = (data['price'] > data['price'].mean()).astype(int)

data['above_average'].value_counts()

0    8645
1    3269
Name: above_average, dtype: int64

*Split the data*

In [9]:
from sklearn.model_selection import train_test_split

X, X_test, y, y_test = train_test_split(
    data.drop(columns=['above_average', 'price']), data['above_average'],
    test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.25, random_state=42
)

X_train.shape[0], X_val.shape[0], X_test.shape[0]

(7148, 2383, 2383)

**Question #3**

In [10]:
from sklearn.metrics import mutual_info_score

mi_score = lambda x: mutual_info_score(y_train, x)
X_train.apply(mi_score).round(2).sort_values(ascending=False)

model                0.46
engine_hp            0.36
make                 0.24
engine_cylinders     0.12
vehicle_style        0.08
year                 0.07
city_mpg             0.06
highway_mpg          0.04
transmission_type    0.02
dtype: float64

**Question #4**

In [11]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

categorical_features = X.columns[~X.columns.isin(numerical_features)]

ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
ohe.fit(X_train[categorical_features])

X_train_encoded = np.concatenate(
    (
        X_train[numerical_features].values,
        ohe.transform(X_train[categorical_features])
    ),
    axis=1
)
X_val_encoded = np.concatenate(
    (
        X_val[numerical_features].values,
        ohe.transform(X_val[categorical_features])
    ),
    axis=1
)
X_test_encoded = np.concatenate(
    (
        X_test[numerical_features].values,
        ohe.transform(X_test[categorical_features])
    ),
    axis=1
)

X_train_encoded.shape[0], X_val_encoded.shape[0], X_test_encoded.shape[0]

(7148, 2383, 2383)

In [12]:
from sklearn.linear_model import LogisticRegression

def accuracy_score(y_true, y_pred):
    return (y_true == y_pred).mean().round(2)

model = LogisticRegression(
    solver='liblinear', C=10, max_iter=1000, random_state=42
)
model.fit(X_train_encoded, y_train)

accuracy_score(model.predict(X_val_encoded), y_val)

0.95

**Question #5**

In [13]:
original_accuracy = accuracy_score(model.predict(X_train_encoded), y_train)

features = numerical_features + list(ohe.get_feature_names_out())
feature_to_idx = dict(zip(features, range(len(features))))
scores = dict()

for feature in ('year', 'engine_hp', 'transmission_type', 'city_mpg'):
    mask = np.ones(X_train_encoded.shape[1]).astype(bool)

    if feature == 'transmission_type':
        mask[feature_to_idx['transmission_type_AUTOMATIC']] = False
        mask[feature_to_idx['transmission_type_MANUAL']] = False
        mask[feature_to_idx['transmission_type_AUTOMATED_MANUAL']] = False
        mask[feature_to_idx['transmission_type_DIRECT_DRIVE']] = False
        mask[feature_to_idx['transmission_type_UNKNOWN']] = False
    else:
        mask[feature_to_idx[feature]] = False

    model.fit(X_train_encoded[:, mask], y_train)
    accuracy_without_feature = accuracy_score(model.predict(X_train_encoded[:, mask]), y_train)
    scores['%s' % feature] = original_accuracy - accuracy_without_feature

scores

{'year': 0.0,
 'engine_hp': 0.020000000000000018,
 'transmission_type': 0.020000000000000018,
 'city_mpg': 0.010000000000000009}

**Question #6**

In [14]:
X, X_test, y, y_test = train_test_split(
    data.drop(columns=['above_average', 'price']), np.log(data['price']),
    test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.25, random_state=42
)

ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
ohe.fit(X_train[categorical_features])

X_train_encoded = np.concatenate(
    (
        X_train[numerical_features].values,
        ohe.transform(X_train[categorical_features])
    ),
    axis=1
)
X_val_encoded = np.concatenate(
    (
        X_val[numerical_features].values,
        ohe.transform(X_val[categorical_features])
    ),
    axis=1
)
X_test_encoded = np.concatenate(
    (
        X_test[numerical_features].values,
        ohe.transform(X_test[categorical_features])
    ),
    axis=1
)

In [15]:
from sklearn.linear_model import Ridge

def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))

alphas = [0, 0.01, 0.1, 1, 10]
for alpha in alphas:
    ridge = Ridge(solver='sag', alpha=alpha, random_state=42)
    ridge.fit(X_train_encoded, y_train)

    print(alpha, np.round(rmse(y_val, ridge.predict(X_val_encoded)), 3))

0 0.487
0.01 0.487
0.1 0.487
1 0.487
10 0.487
