In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge

In [34]:
# Load the raw car listings; the path is resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer='car.csv')

In [35]:
# Columns kept for the analysis, spelled in their normalised form
# (lower-case, underscores instead of spaces). 'msrp' is exposed as 'price'.
features = ['make', 'model', 'year', 'engine_hp', 'engine_cylinders', 'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg', 'msrp']

# Build the cleaned frame in one non-mutating chain. Renaming happens *before*
# the column selection and nothing is modified in place, so re-running this
# cell is a no-op instead of failing with KeyError: 'msrp'.
data = (
    data
    .rename(columns=lambda name: name.replace(' ', '_').lower())
    # rename msrp to price
    .rename(columns={'msrp': 'price'})
    .loc[:, ['price' if col == 'msrp' else col for col in features]]
    # missing values in any kept column are replaced with 0
    .fillna(0)
)

data.head()


Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [36]:
# Number of rows per transmission type, most frequent first.
data.transmission_type.value_counts(ascending=False)

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

In [37]:
# Numeric feature columns.
num_feats = [
    'year',
    'engine_hp',
    'engine_cylinders',
    'highway_mpg',
    'city_mpg',
]
# Categorical feature columns.
cat_feats = [
    'make',
    'model',
    'transmission_type',
    'vehicle_style',
]

In [38]:
# Pairwise correlation matrix of the numeric feature columns.
data.loc[:, num_feats].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


In [39]:
# Binarise the target: 'above_average' is 1 when a row's price is strictly
# greater than the mean price of the whole frame, and 0 otherwise.

mean_price = data['price'].mean()
data['above_average'] = data['price'].gt(mean_price).astype(int)



In [40]:
# Split into train / validation / test.
# 20% of the rows are held out as test; 25% of the remaining rows become
# validation. Both splits use random_state=42.
df_full_train, df_test = train_test_split(data, test_size=0.2, random_state=42)
df_train, df_val = (
    part.reset_index(drop=True)
    for part in train_test_split(df_full_train, test_size=0.25, random_state=42)
)
df_test = df_test.reset_index(drop=True)

# Binary targets of each split as NumPy arrays.
y_train, y_val, y_test = (
    split['above_average'].to_numpy()
    for split in (df_train, df_val, df_test)
)



In [41]:
# Mutual information between each categorical column and the binary target.
def mutual_info_score_above_average(series):
    """Return the mutual information score of `series` against the target.

    The target is the `above_average` column of the module-level
    `df_full_train` frame, so `series` must be aligned with that frame.
    """
    target = df_full_train['above_average']
    return mutual_info_score(series, target)


# Score every categorical column, then display the scores from highest to lowest.
mi = df_full_train[cat_feats].apply(mutual_info_score_above_average)
mi.sort_values(ascending=False)


model                0.460994
make                 0.238724
vehicle_style        0.083390
transmission_type    0.020884
dtype: float64

In [42]:
# Turn the training rows into per-row dicts and fit the vectorizer on them;
# the fitted `dv` is reused later to encode other splits.
dv = DictVectorizer(sparse=False)
dicts_train = df_train[[*cat_feats, *num_feats]].to_dict(orient='records')
X_train = dv.fit_transform(dicts_train)

In [43]:
# Classifier for the binary `above_average` target, fitted on the training matrix.
model = LogisticRegression(
    C=10,
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

In [44]:
# Accuracy on the test split: encode it with the vectorizer fitted on the
# training data, predict, and show the share of correct labels (2 decimals).
dicts_test = df_test[[*cat_feats, *num_feats]].to_dict(orient='records')
X_test = dv.transform(dicts_test)
y_pred = model.predict(X_test)
np.mean(y_pred == y_test).round(2)

0.93

In [45]:
def train_model(features):
    """Fit a logistic regression on the given columns and return its accuracy.

    Parameters
    ----------
    features : list of str
        Column names of `df_train` / `df_test` to feed to the model.

    Returns
    -------
    float
        Fraction of rows in `df_test` whose predicted label equals `y_test`.

    Notes
    -----
    Reads the module-level `df_train`, `y_train`, `df_test` and `y_test`.
    NOTE(review): the score is computed on the test split; if this function is
    meant for comparing feature sets, the validation split may be the intended
    target -- confirm.
    """
    # A fresh vectorizer per call so the encoding only covers `features`.
    vectorizer = DictVectorizer(sparse=False)
    train_matrix = vectorizer.fit_transform(df_train[features].to_dict(orient='records'))

    classifier = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    classifier.fit(train_matrix, y_train)

    test_matrix = vectorizer.transform(df_test[features].to_dict(orient='records'))
    return (classifier.predict(test_matrix) == y_test).mean()

# Feature set used as the baseline for the feature-elimination comparison.
all_features = [
    'year',
    'engine_hp',
    'transmission_type',
    'city_mpg',
]

In [46]:
# Feature elimination: retrain with each feature left out in turn and compare
# the accuracy against the model trained on the full `all_features` list.
original_accuracy = train_model(all_features)

print(f"Accuracy with all features: {original_accuracy}")

diffs = []

for i, removed in enumerate(all_features):
    # Slicing already handles i == 0 (the left slice is simply empty).
    new_features = all_features[:i] + all_features[i+1:]
    accuracy = train_model(new_features)
    print(f"With {removed} removed: {accuracy}")
    diff = original_accuracy - accuracy
    diffs.append(diff)
    print(f"Difference: {diff}")

# Key the result by feature name, not by the float difference: two features
# with the same difference would otherwise collide and one would be lost.
differences = dict(zip(all_features, diffs))

# The least influential feature is the one whose removal changes accuracy the
# least in absolute terms; on a tie the first feature in `all_features` wins.
least_influential = min(differences, key=lambda name: abs(differences[name]))
min_diff = differences[least_influential]

print(f"\nLeast influential variable: {least_influential}")
print(differences)

Accuracy with all features: 0.8661351237935375
With year removed: 0.8715904322282837
Difference: -0.0054553084347461755
With engine_hp removed: 0.7582878724297104
Difference: 0.10784725136382711
With transmission_type removed: 0.8678136802349979
Difference: -0.0016785564414604215
With city_mpg removed: 0.8606798153587915
Difference: 0.0054553084347460645

Least influential variable: transmission_type
{-0.0054553084347461755: 'year', 0.10784725136382711: 'engine_hp', -0.0016785564414604215: 'transmission_type', 0.0054553084347460645: 'city_mpg'}


In [47]:

# Regression targets: log1p of the price for the train and validation splits.
y_train_log, y_val_log = (
    np.log1p(split['price'].to_numpy())
    for split in (df_train, df_val)
)

def train_ridge_model(alpha=0):
    """Fit a Ridge regression on the log-price target and return validation RMSE.

    Parameters
    ----------
    alpha : float, default 0
        Regularisation strength passed to `Ridge`.

    Returns
    -------
    float
        `rmse` between `y_val_log` and the predictions for `df_val`.

    Notes
    -----
    Reads the module-level `df_train`, `df_val`, `y_train_log`, `y_val_log`,
    `num_feats` and `cat_feats`, and calls the module-level `rmse`.
    """
    feature_cols = num_feats + cat_feats

    # `sparse=False` is deliberately not passed here, unlike the classifier cells.
    vectorizer = DictVectorizer()
    train_matrix = vectorizer.fit_transform(df_train[feature_cols].to_dict(orient='records'))
    val_matrix = vectorizer.transform(df_val[feature_cols].to_dict(orient='records'))

    regressor = Ridge(alpha=alpha, solver='sag', max_iter=1000, random_state=42)
    regressor.fit(train_matrix, y_train_log)

    return rmse(y_val_log, regressor.predict(val_matrix))

def rmse(y, y_pred):
    """Return the root-mean-squared error between `y` and `y_pred`.

    Both arguments must support element-wise subtraction with each other
    (e.g. NumPy arrays of the same shape).
    """
    residuals = y - y_pred
    return np.sqrt(np.mean(np.square(residuals)))



# Report the validation RMSE (3 decimals) for each regularisation strength.
for i in (0.00, 0.01, 0.10, 1.00, 10.0):
    result = train_ridge_model(alpha=i)
    print(f"{result.round(3)} alpha: {i} ")



0.251 alpha: 0.0 
0.255 alpha: 0.01 
0.255 alpha: 0.1 
0.258 alpha: 1.0 
0.336 alpha: 10.0 
