In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
car_price_dataset_url = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'
columns = ['Make', 'Model', 'Year', 'Engine HP',
           'Engine Cylinders', 'Transmission Type',
           'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']
df = pd.read_csv(car_price_dataset_url, usecols=columns)
df.columns = df.columns.str.lower().str.replace(' ', '_')
categorical_columns = list(df.select_dtypes('object').columns)
df[categorical_columns] = df[categorical_columns].apply(lambda x: x.str.lower().str.replace(' ', '_'))
df.engine_hp.fillna(0, inplace=True)
df.engine_cylinders.fillna(0, inplace=True)
df['above_average'] = (df.msrp > df.msrp.mean()).astype('int64')
numerical_columns = list(set(df.columns) - set(categorical_columns))
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp,above_average
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135,1
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650,1
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350,0
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450,0
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_hp          11914 non-null  float64
 4   engine_cylinders   11914 non-null  float64
 5   transmission_type  11914 non-null  object 
 6   vehicle_style      11914 non-null  object 
 7   highway_mpg        11914 non-null  int64  
 8   city_mpg           11914 non-null  int64  
 9   msrp               11914 non-null  int64  
 10  above_average      11914 non-null  int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 1024.0+ KB


In [4]:
X = df.drop(['msrp', 'above_average'], axis=1)
y = df.above_average

In [5]:
from sklearn.model_selection import train_test_split
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=1)

In [6]:
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

### Question 1: ROC AUC feature importance

ROC AUC could also be used to evaluate feature importance of numerical variables. For each numerical variable, use it as score and compute AUC with the `above_average` variable. Use the training dataset for that. If your AUC is < 0.5, invert this variable by putting "-" in front (e.g. `-df_train['engine_hp']`). AUC can go below 0.5 if the variable is negatively correlated with the target varialble. You can change the direction of the correlation by negating this variable - then negative correlation becomes positive. Which numerical variable (among the following 4) has the highest AUC?

* `engine_hp`
* `engine_cylinders`
* `highway_mpg`
* `city_mpg`

In [7]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone

In [8]:
X_train.shape

(7148, 9)

In [10]:
numerical_columns = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']
auc_scores = []
clf = LogisticRegression(solver="liblinear", random_state=1)
y_tr = y_train.ravel()
for col in numerical_columns:
    if col == 'year':
        X_col = -X_train[col].values.reshape(-1, 1)
    else:
        X_col = X_train[col].values.reshape(-1, 1)
    clf_ = clone(clf)
    clf_.fit(X_col, y_tr)
    y_pred = clf_.predict_proba(X_col)[:, 1]
    auc_scores.append(roc_auc_score(y_tr, y_pred).round(4))
auc_scores_df = pd.DataFrame(auc_scores, index=numerical_columns, columns=['AUC'])
auc_scores_df.sort_values(ascending=False, by='AUC')

Unnamed: 0,AUC
engine_hp,0.9171
engine_cylinders,0.7661
city_mpg,0.6734
highway_mpg,0.6331
year,0.3124
