In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder



In [4]:
#mean_squared_error(y_true, y_pred)

def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)


In [5]:
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

--2023-10-02 16:21:40--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8001::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv’


2023-10-02 16:21:40 (12.4 MB/s) - ‘data.csv’ saved [1475504/1475504]



In [6]:
!mv data.csv ../data/
car_prices = pd.read_csv("../data/data.csv")
car_prices.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [7]:
car_prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5

In [8]:
car_prices_subset = car_prices[[
    "Make",
    "Model",
    "Year",
    "Engine HP",
    "Engine Cylinders",
    "Transmission Type",
    "Vehicle Style",
    "highway MPG",
    "city mpg",
    "MSRP"
]].rename(
    columns={"MSRP": "price"}
).fillna(0)

car_prices_subset.columns = car_prices_subset.columns.str.replace(' ', '_').str.lower()

car_prices_subset.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


# Q1

In [9]:
car_prices_subset.transmission_type.value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

AUTOMATIC is the mode of `transmission_type`

# Q2

In [10]:
pd.DataFrame(np.corrcoef(car_prices_subset[["year", "engine_hp", "engine_cylinders", "highway_mpg", "city_mpg", "price"]].values, rowvar=False))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
1,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
2,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
3,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
4,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
5,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [11]:
numerical_cols = ["year", "engine_hp", "engine_cylinders", "highway_mpg", "city_mpg", "price"]
car_prices_subset[numerical_cols].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


`highway_mpg` and `city_mpg` !

In [12]:
mean_price = car_prices_subset.price.mean()
mean_price

40594.737032063116

In [13]:
car_prices_subset["above_average"] =\
    np.where(car_prices_subset.price > mean_price, 1, 0)

car_prices_subset.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


# Split data

In [14]:
df_full_train, df_test = train_test_split(car_prices_subset, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

response_cols_to_drop = ["price", "above_average"]

df_train_x = df_train.drop(columns=response_cols_to_drop)
df_val_x = df_val.drop(columns=response_cols_to_drop)
df_test_x = df_test.drop(columns=response_cols_to_drop)

y_train = df_train.above_average
y_val = df_val.above_average
y_test = df_test.above_average

In [15]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

# Q3 mutual information

In [45]:
def mutual_info_response_score(series):
    return mutual_info_score(series, df_train.above_average)


In [46]:
df_train[[
    "make",
    "model",
    "transmission_type",
    "vehicle_style"
]].apply(mutual_info_response_score)

make                 0.239769
model                0.462344
transmission_type    0.020958
vehicle_style        0.084143
dtype: float64

transmission_type

# Q4 model accuracy

In [17]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7148 entries, 3972 to 10286
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               7148 non-null   object 
 1   model              7148 non-null   object 
 2   year               7148 non-null   int64  
 3   engine_hp          7148 non-null   float64
 4   engine_cylinders   7148 non-null   float64
 5   transmission_type  7148 non-null   object 
 6   vehicle_style      7148 non-null   object 
 7   highway_mpg        7148 non-null   int64  
 8   city_mpg           7148 non-null   int64  
 9   price              7148 non-null   int64  
 10  above_average      7148 non-null   int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 670.1+ KB


In [18]:
dv = DictVectorizer(sparse=False)
train_dict = df_train_x.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val_x.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [19]:
train_dict[0:5]

[{'make': 'Mitsubishi',
  'model': 'Endeavor',
  'year': 2011,
  'engine_hp': 225.0,
  'engine_cylinders': 6.0,
  'transmission_type': 'AUTOMATIC',
  'vehicle_style': '4dr SUV',
  'highway_mpg': 19,
  'city_mpg': 15},
 {'make': 'Kia',
  'model': 'Borrego',
  'year': 2009,
  'engine_hp': 276.0,
  'engine_cylinders': 6.0,
  'transmission_type': 'AUTOMATIC',
  'vehicle_style': '4dr SUV',
  'highway_mpg': 21,
  'city_mpg': 17},
 {'make': 'Lamborghini',
  'model': 'Gallardo',
  'year': 2012,
  'engine_hp': 570.0,
  'engine_cylinders': 10.0,
  'transmission_type': 'MANUAL',
  'vehicle_style': 'Convertible',
  'highway_mpg': 20,
  'city_mpg': 12},
 {'make': 'Chevrolet',
  'model': 'Colorado',
  'year': 2016,
  'engine_hp': 200.0,
  'engine_cylinders': 4.0,
  'transmission_type': 'AUTOMATIC',
  'vehicle_style': 'Crew Cab Pickup',
  'highway_mpg': 27,
  'city_mpg': 20},
 {'make': 'Pontiac',
  'model': 'Vibe',
  'year': 2009,
  'engine_hp': 158.0,
  'engine_cylinders': 4.0,
  'transmission_type'

In [20]:
X_train[0:5, 0:5]

array([[ 15.,   6., 225.,  19.,   0.],
       [ 17.,   6., 276.,  21.,   0.],
       [ 12.,  10., 570.,  20.,   0.],
       [ 20.,   4., 200.,  27.,   0.],
       [ 20.,   4., 158.,  26.,   0.]])

In [21]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)


In [22]:
y_pred = model.predict_proba(X_val)[:, 1]
churn_decision = (y_pred >= 0.5)
accuracy = (y_val == churn_decision).mean()
accuracy_rounded = round(accuracy, 2)
print(accuracy_rounded)

0.95


# pipeline it

In [23]:
def split_data_train_model(input_df):
    
    df_full_train, df_test = train_test_split(input_df, test_size=0.2, random_state=42)
    df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

    response_cols_to_drop = ["price", "above_average"]

    df_train_x = df_train.drop(columns=response_cols_to_drop)
    df_val_x = df_val.drop(columns=response_cols_to_drop)
    df_test_x = df_test.drop(columns=response_cols_to_drop)

    y_train = df_train.above_average
    y_val = df_val.above_average
    y_test = df_test.above_average
    
    dv = DictVectorizer(sparse=False)
    train_dict = df_train_x.to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val_x.to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict_proba(X_val)[:, 1]
    churn_decision = (y_pred >= 0.5)
    accuracy = (y_val == churn_decision).mean()
    accuracy_rounded = round(accuracy, 2)
    
    return accuracy, accuracy_rounded

# Q5 remove features and retrain

In [24]:
features_to_remove = ["year", "engine_hp", "transmission_type", "city_mpg"]

for feature in features_to_remove:
    print(feature)
    new_accuracies = split_data_train_model(car_prices_subset.drop(columns=[feature]))
    print("accuracies", new_accuracies)
    print("delta", new_accuracies[0] - accuracy_rounded)

year
accuracies (0.9471254720939991, 0.95)
delta -0.0028745279060008455
engine_hp
accuracies (0.9227864036928242, 0.92)
delta -0.02721359630717579
transmission_type
accuracies (0.9404112463281578, 0.94)
delta -0.009588753671842198
city_mpg
accuracies (0.9458665547629039, 0.95)
delta -0.004133445237096023


Dropping year has the least impact

# Q6

In [39]:
def split_data_train_ridge(input_df, alpha):
    
    input_df = input_df.drop(columns=["above_average"])
    input_df.price = np.log1p(input_df.price)
    
    df_full_train, df_test = train_test_split(input_df, test_size=0.2, random_state=42)
    df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

    response_cols_to_drop = ["price"]

    df_train_x = df_train.drop(columns=response_cols_to_drop)
    df_val_x = df_val.drop(columns=response_cols_to_drop)
    df_test_x = df_test.drop(columns=response_cols_to_drop)

    y_train = df_train.price
    y_val = df_val.price
    y_test = df_test.price
    
    dv = DictVectorizer(sparse=False)
    train_dict = df_train_x.to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val_x.to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model = Ridge(alpha=alpha, solver='sag', random_state=42, max_iter=15000)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    rmse_val = rmse(y_val, y_pred)
    rmse_sklearn = mean_squared_error(y_val, y_pred)**0.5
    
    return alpha, rmse_val, rmse_sklearn

In [40]:
def split_data_train_ridge_norm_feat(input_df, alpha):
    
    scaler = StandardScaler()
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')    
    
    cat_columns = ["make", "model", "transmission_type", "vehicle_style"]
    num_columns = ["year", "engine_hp", "engine_cylinders", "highway_mpg", "city_mpg"]
    
    input_df = input_df.drop(columns=["above_average"])
    input_df.price = np.log1p(input_df.price)
    
    df_full_train, df_test = train_test_split(input_df, test_size=0.2, random_state=42)
    df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

    response_cols_to_drop = ["price"]

    df_train_x = df_train.drop(columns=response_cols_to_drop)
    df_val_x = df_val.drop(columns=response_cols_to_drop)
    df_test_x = df_test.drop(columns=response_cols_to_drop)

    y_train = df_train.price
    y_val = df_val.price
    y_test = df_test.price
    
    X_train_cat = ohe.fit_transform(df_train_x[cat_columns].values)
    X_val_cat = ohe.transform(df_val_x[cat_columns].values)
    X_test_cat = ohe.transform(df_test_x[cat_columns].values)
    
    X_train_num = scaler.fit_transform(df_train_x[num_columns].values)
    X_val_num = scaler.transform(df_val_x[num_columns].values)
    X_test_num = scaler.transform(df_test_x[num_columns].values)
    
    X_train = np.column_stack([X_train_num, X_train_cat])
    X_val = np.column_stack([X_val_num, X_val_cat])
    X_test = np.column_stack([X_test_num, X_test_cat])

    model = Ridge(alpha=alpha, solver='sag', random_state=42, max_iter=15000)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    rmse_val = rmse(y_val, y_pred)
    rmse_sklearn = mean_squared_error(y_val, y_pred)**0.5
    
    return alpha, rmse_val, rmse_sklearn

In [41]:
for alpha in [0, 0.01, 0.1, 1, 10]:
    print(split_data_train_ridge(car_prices_subset, alpha))


(0, 0.42082120603369644, 0.42082120603369644)
(0.01, 0.42082495786127977, 0.42082495786127977)
(0.1, 0.42088347607574905, 0.42088347607574905)
(1, 0.4214776197656962, 0.4214776197656962)
(10, 0.42733214479855136, 0.42733214479855136)


In [42]:
for alpha in [0, 0.01, 0.1, 1, 10]:
    print(split_data_train_ridge_norm_featnorm_feat(car_prices_subset, alpha))



(0, 0.21768697022120245, 0.21768697022120245)




(0.01, 0.21772931784182953, 0.21772931784182953)




(0.1, 0.21830657372954634, 0.21830657372954634)




(1, 0.23128740665436537, 0.23128740665436537)




(10, 0.3210113690102109, 0.3210113690102109)
