In [87]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("../data/car-price.csv")

In [22]:
subset = ['Make',
          'Model',
          'Year',
          'Engine HP',
          'Engine Cylinders',
          'Transmission Type',
          'Vehicle Style',
          'highway MPG',
          'city mpg',
          'MSRP']


categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
for col in categorical_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')
    df[col] = df[col].fillna(0)

df_car = df[subset]

### Data preparation


In [23]:
df_car.columns = df_car.columns.str.replace(' ', '_').str.lower()

df_car = df_car.fillna(0)

df_car = df_car.rename(columns={'msrp': 'price'})

### Question 1


In [24]:
df_car["transmission_type"].value_counts()

transmission_type
automatic           8266
manual              2935
automated_manual     626
direct_drive          68
unknown               19
Name: count, dtype: int64

In [25]:
print(
    f"The most represented transmission type is {df_car['transmission_type'].value_counts().index[0]}")

The most represented transmission type is automatic


### Question 2


In [26]:
df_car

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,46120
11910,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,56670
11911,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,50620
11912,acura,zdx,2013,300.0,6.0,automatic,4dr_hatchback,23,16,50920


In [27]:
corr_matrix = df_car.drop(columns=["make", "model", "transmission_type", "vehicle_style"]).corr(
    method="pearson", numeric_only=False
)
corr_matrix

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [28]:
print(f"The highest correlation is between highway_mpg and city_mpg")

The highest correlation is between highway_mpg and city_mpg


### Make price binary


In [29]:
df_car['above_average'] = df_car['price'].apply(
    lambda x: 1 if x > df_car['price'].mean() else 0)

#### Split the data

- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value (above_average) is not in your dataframe.


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

In [36]:
df_train_val, df_test = train_test_split(
    df_car, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(
    df_train_val, test_size=0.25, random_state=42)

In [37]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [71]:
y_train = df_train.above_average.values
X_train = df_train.drop(columns=['above_average', 'price'])  # .values #,

y_val = df_val.above_average.values
X_val = df_val.drop(columns=['above_average', 'price'])  # .values

y_test = df_test.above_average.values
X_test = df_test.drop(columns=['above_average', 'price'])  # .values #'price',

### Question 3

- Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).


In [72]:
print(mutual_info_score(df_train.above_average, df_train.model))
print(mutual_info_score(df_train.above_average, df_train.make))
print(mutual_info_score(df_train.above_average, df_train.transmission_type))
print(mutual_info_score(df_train.above_average, df_train.vehicle_style))

0.46234389209653
0.23976875439118348
0.020957541896301862
0.0841430195677944


In [73]:
print(f"Transmission type has the lowest MI score")

Transmission type has the lowest MI score


### Question 4

- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
  - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
  - model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.


In [74]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [77]:
categorical_columns + numerical_columns

['make',
 'model',
 'transmission_type',
 'vehicle_style',
 'year',
 'engine_hp',
 'engine_cylinders',
 'highway_mpg',
 'city_mpg',
 'price',
 'above_average']

In [92]:
categorical_columns = list(df_car.dtypes[df_car.dtypes == 'object'].index)
numerical_columns = list(df_car.dtypes[df_car.dtypes != 'object'].index)

select_columns = ['make',
                  'model',
                  'transmission_type',
                  'vehicle_style',
                  'year',
                  'engine_hp',
                  'engine_cylinders',
                  'highway_mpg',
                  'city_mpg']

train_dict = df_train[select_columns].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)

In [93]:
model = LogisticRegression(solver='liblinear', C=1.0,
                           max_iter=1000, random_state=42)
model.fit(X_train, y_train)

val_dict = df_val[select_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = model.predict(X_val)

accuracy = np.round(accuracy_score(y_val, y_pred), 2)
print(accuracy)

0.94


In [94]:
print(f"The accuracy is closer to  .95")

The accuracy is closer to  .95


### Question 5

- Let's find the least useful feature using the feature elimination technique.

- Train a model with all these features (using the same parameters as in Q4).

- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.

- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

- Which of following feature has the smallest difference?

  - year
  - engine_hp
  - transmission_type
  - city_mpg


In [97]:
accuracies = []
global_accuracy = accuracy

for col in select_columns:
    subset = select_columns.copy()
    subset.remove(col)

    train_dict = df_train[subset].to_dict(orient='records')
    val_dict = df_val[subset].to_dict(orient='records')
    
    dv.fit(train_dict)

    X_train = dv.transform(train_dict)
    X_val = dv.transform(val_dict)


    model = LogisticRegression(solver='liblinear', C=1.0, random_state=42, max_iter=1000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    score = accuracy_score(y_val, y_pred)
    print(
        f"Column: {col} - Score: {round(score, 4)} - Diff: {round(global_accuracy-score, 5)}")
    accuracies.append(score)

Column: make - Score: 0.9387 - Diff: 0.00127
Column: model - Score: 0.9236 - Diff: 0.01637
Column: transmission_type - Score: 0.9366 - Diff: 0.00337
Column: vehicle_style - Score: 0.9375 - Diff: 0.00253
Column: year - Score: 0.9488 - Diff: -0.0088
Column: engine_hp - Score: 0.9207 - Diff: 0.01931
Column: engine_cylinders - Score: 0.9404 - Diff: -0.00041
Column: highway_mpg - Score: 0.9421 - Diff: -0.00209
Column: city_mpg - Score: 0.9467 - Diff: -0.00671


In [98]:
print(f"Given the option the smallest difference is transmission_type")

Given the option the smallest difference is transmission_type


### Question 6

- For this question, we'll see how to use a linear regression model from Scikit-Learn.
- We'll need to use the original column price. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
- This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
- Round your RMSE scores to 3 decimal digits.

- Which of these alphas leads to the best RMSE on the validation set?

  - 0
  - 0.01
  - 0.1
  - 1
  - 10


In [99]:
# log tranform the price column
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

In [100]:
train_dict = df_train[select_columns].to_dict(orient="records")
val_dict = df_val[select_columns].to_dict(orient="records")

dv.fit(train_dict)

X_train = dv.transform(train_dict)
X_val = dv.transform(val_dict)

In [101]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [102]:
model = Ridge(solver='sag', random_state=42)
model.fit(X_train, y_train_log)

y_pred = model.predict(X_val)


print(np.sqrt(mean_squared_error(y_pred, y_val_log)))
print(np.sqrt(mean_squared_error(np.expm1(y_pred), y_val)))

0.2067084909631019
0.3099982912926203


In [103]:
errors = []

for alpha in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=alpha, random_state=42,solver ='sag')
    model.fit(X_train, y_train_log)
    
    y_pred = model.predict(X_val)
    
    score = np.sqrt(mean_squared_error(y_val_log, y_pred))
    errors.append((alpha, round(score,5)))

In [104]:
errors

[(0, 0.2067), (0.01, 0.2067), (0.1, 0.2067), (1, 0.20671), (10, 0.2068)]