# 0. Imports

## 0.1 Libraries

In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D

from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_percentage_error,mean_absolute_error

import shap

  from .autonotebook import tqdm as notebook_tqdm


## 0.2. Model and data from part 1

In [105]:
from pickle import load
with open("../Part_1/part_1_model.pkl", "rb") as f:
    linear_reg = load(f)

df_train = pd.read_csv('../Part_1/data/processed_train_data.csv',index_col='id')
df_test = pd.read_csv('../Part_1/data/processed_test_data.csv',index_col='id')

In [106]:
df_train.columns

Index(['model_year', 'milage', 'accident', 'clean_title', 'horse_power',
       'tank_size', 'num_cylinders', 'automatic', 'manual', 'dual',
       'brand_Audi', 'brand_BMW', 'brand_Cadillac', 'brand_Chevrolet',
       'brand_Dodge', 'brand_Ford', 'brand_GMC', 'brand_Honda',
       'brand_Hyundai', 'brand_INFINITI', 'brand_Jeep', 'brand_Kia',
       'brand_Land', 'brand_Lexus', 'brand_Lincoln', 'brand_Mazda',
       'brand_Mercedes-Benz', 'brand_Nissan', 'brand_Porsche', 'brand_RAM',
       'brand_Subaru', 'brand_Tesla', 'brand_Toyota', 'brand_other',
       'fuel_type_E85 Flex Fuel', 'fuel_type_Gasoline', 'fuel_type_Hybrid',
       'fuel_type_Plug-In Hybrid', 'ext_col_Black', 'ext_col_Blue',
       'ext_col_Brown', 'ext_col_Gold', 'ext_col_Gray', 'ext_col_Green',
       'ext_col_Orange', 'ext_col_Red', 'ext_col_Silver', 'ext_col_White',
       'ext_col_Yellow', 'ext_col_other', 'int_col_Black', 'int_col_Blue',
       'int_col_Brown', 'int_col_Ebony', 'int_col_Global Black',
       'in

In [34]:
df_test.head()

Unnamed: 0_level_0,model_year,milage,accident,clean_title,horse_power,tank_size,num_cylinders,automatic,manual,dual,...,int_col_Brown,int_col_Ebony,int_col_Global Black,int_col_Gray,int_col_Jet Black,int_col_Orange,int_col_Red,int_col_White,int_col_other,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
37707,2003,46200,False,True,315.0,3.6,6.0,False,True,False,...,False,False,False,False,False,False,False,False,False,28750
37708,2010,115000,False,True,148.0,2.0,4.0,False,True,False,...,False,False,False,False,False,False,False,False,False,8950
37709,2021,39700,False,True,365.0,3.3,6.0,True,False,False,...,False,False,False,False,False,False,False,False,False,44400
37710,2022,22185,False,False,343.0,5.7,6.0,True,False,False,...,False,False,False,False,False,False,False,False,False,46599
37711,2011,152000,False,False,265.0,3.5,6.0,True,False,False,...,False,False,False,False,False,False,False,False,False,50500


# 1. Error Analysis

## 1.1 Explainability changes

To improve the model explainability we would like to transform attributes differently than we did in the previous exercise.

Doing this would allow us to analyse the model errors better, as there would be less features that would clutter our view of the importance of each feature.

This is based on our first attempt at error analysis. We noticed how difficult it was for us to interpret the results from the SHAP graphs of the previous exercise.

In [75]:
org_df_train = pd.read_csv('../Part_1/data/raw_train_data.csv',index_col='id')
org_df_test = pd.read_csv('../Part_1/data/raw_test_data.csv',index_col='id')

### 1.1.1 Color

In the previous exercise we decided to discard of colors which were not of the top 12 most common colors.
This time we do not lose data by grouping defferent shades to their "origin" color.

In exchange we minimize the variety of our data column - under the premis that the differences in shade are not meaningful for the prediction of the car price.

In [74]:
colors_drop = [col for col in df_train if col.startswith(("int", "ext"))]

brands_drop = [col for col in df_train if col.startswith(("brand"))]

In [75]:
df_train = df_train.drop(columns=colors_drop)
df_test = df_test.drop(columns=colors_drop)

df_train = df_train.drop(columns=brands_drop)
df_test = df_test.drop(columns=brands_drop)

In [77]:
df_train = pd.concat([df_train,org_df_train[['brand','int_col', 'ext_col']]],axis=1)
df_test = pd.concat([df_test,org_df_test[['brand','int_col', 'ext_col']]],axis=1)

Colors that fall into "Other":

'oyster w/contrast', 'ceramic', 'gideon',
'sport', '–', 'designo magno matte',
'dark sapphire', 'custom color', 'tempest', 'lunar rock',
'nightfall mica', 'granite', 'Pink', 'c / c', 'yulong',
'go mango!', 'grigio nimbus', 'metallic'

In [78]:
def categorize_color(color):
    color = color.lower()
    # Black group
    if any(x in color for x in [
        'black', 'obsidian', 'raven', 'onyx', 'ebony', 'nero', 'blk',
        'graphite', 'charcoal', 'dark ash', 'anthracite', 'caviar', 
        'dark matter', 'magnetic', 'beluga', 'dark galvanized']):
        return 'Black'
    # White group
    elif any(x in color for x in [
        'white', 'pearl', 'ivory', 'frost', 'platinum', 'ice', 'cloud',
        'chalk', 'bianco', 'glacier', 'linen', 'very light cashmere', 
        'parchment', 'parchment.']):
        return 'White'
    # Blue group
    elif any(x in color for x in [
        'blue', 'navy', 'aqua', 'teal', 'stormy', 'blu', 'eleos']):
        return 'Blue'
    # Red group
    elif any(x in color for x in [
        'red', 'ruby', 'garnet', 'hotspur', 'pimento', 'rosso', 
        'scarlet', 'mars', 'corsa', 'chateau']):
        return 'Red'
    # Silver/Gray group
    elif any(x in color for x in [
        'silver', 'gray', 'grey', 'steel', 'medium pewter', 'slate', 
        'boulder', 'porpoise', 'light slate', 'tungsten', 'gun metallic', 
        'medium stone', 'portland', 'mesa', 'ash']):
        return 'Silver_Gray'
    # Green group
    elif any(x in color for x in [
        'green', 'verde', 'moss', 'deep cypress']):
        return 'Green'
    # Yellow/Orange group
    elif any(x in color for x in [
        'yellow', 'gold', 'orange', 'amber', 'arancio', 'hellayella', 
        'sunset drift', 'sandstone', 'tension']):
        return 'Yellow_Orange'
    # Brown group
    elif any(x in color for x in [
        'brown', 'beige', 'tan', 'mocha', 'brandy', 'chestnut', 
        'espresso', 'roast', 'dark auburn', 'aragon', 'bronze', 
        'dune', 'maroon', 'walnut', 'camel', 'caramel', 'macchiato', 
        'medium light camel', 'shale', 'cappuccino', 'tupelo']):
        return 'Brown'
    # Purple group
    elif any(x in color for x in ['purple', 'plum', 'ametrin', 'orchid', 'pink']):
        return 'Purple_Pink'
    # Default for unclassified entries
    else:
        return 'Other'

In [79]:
df_train['int_col'] = df_train['int_col'].apply(categorize_color)
df_train['ext_col'] = df_train['ext_col'].apply(categorize_color)

df_test['int_col'] = df_test['int_col'].apply(categorize_color)
df_test['ext_col'] = df_test['ext_col'].apply(categorize_color)

### 1.1.2 Brand

Perviously we did..., this time we do...

### 1.1.3 Testing the score and explainability of the model afther the change

We can see that the model's R2 score went up/down and that the explianability is better/worse...

Replacing the data for Not Supported, Nan and - in fuel type to random values and checking if it changes the r squared

In [107]:
# Capture the original column order of df_train and df_test
original_train_columns = df_train.columns
original_test_columns = df_test.columns

# Define the list of fuel types
fuel_types = ['Gasoline', 'Hybrid', 'E85 Flex Fuel', 'Diesel', 'Plug-In Hybrid']

# Check the value of 'fuel_type' in the original dataframe and update the one-hot encoded columns in df
def replace_invalid_fuel_type(row, org_row):
    if pd.isna(org_row['fuel_type']) or org_row['fuel_type'] in ['not supported', '–']:
        new_fuel_type = np.random.choice(fuel_types)
        for fuel in fuel_types:
            if f'fuel_type_{fuel}' not in row.index:
                row[f'fuel_type_{fuel}'] = 0
            else:
                row[f'fuel_type_{fuel}'] = (fuel == new_fuel_type)
    return row

# Apply the function to the 'fuel_type' column in both train and test dataframes
df_train = df_train.apply(lambda row: replace_invalid_fuel_type(row, org_df_train.loc[row.name]), axis=1)
df_test = df_test.apply(lambda row: replace_invalid_fuel_type(row, org_df_test.loc[row.name]), axis=1)

# Restore the original column order
df_train = df_train[original_train_columns]
df_test = df_test[original_test_columns]

In [110]:
# Separate features and target variable
y_train = df_train['price']
X_train = df_train.drop(columns=['price'])
y_test = df_test['price']
X_test = df_test.drop(columns=['price'])


In [112]:
# Make predictions using the original model
y_pred = linear_reg.predict(X_test)

# Calculate the R-squared value
r_squared = r2_score(y_test, y_pred)
print(f'New R-squared value: {r_squared}')

New R-squared value: 0.10631733317458125


As we can see, we got the same R^2 value even with random values imputation. Therefore, we can infer that we need to come up with another way to fill those values. Maybe using KNN.

In [114]:
from sklearn.impute import KNNImputer

# Replace invalid or missing values in org_df_train and org_df_test with NaN
org_df_train['fuel_type'].replace(['–', 'not supported'], np.nan, inplace=True)
org_df_test['fuel_type'].replace(['–', 'not supported'], np.nan, inplace=True)

# Map categorical values to numeric values for KNNImputer
fuel_type_mapping = {
    'E85 Flex Fuel': 1,
    'Gasoline': 2,
    'Hybrid': 3,
    'Diesel': 4,
    'Plug-In Hybrid': 5,
    np.nan: np.nan
}

org_df_train['fuel_type'] = org_df_train['fuel_type'].replace(fuel_type_mapping)
org_df_test['fuel_type'] = org_df_test['fuel_type'].replace(fuel_type_mapping)

# Apply KNNImputer
imputer = KNNImputer(n_neighbors=3)

# Impute missing values in 'fuel_type' for org_df_train and org_df_test
org_df_train['fuel_type'] = imputer.fit_transform(org_df_train[['fuel_type']])
org_df_test['fuel_type'] = imputer.transform(org_df_test[['fuel_type']])

# Convert numeric values back to categorical values
reverse_fuel_type_mapping = {v: k for k, v in fuel_type_mapping.items() if not pd.isna(v)}

org_df_train['fuel_type'] = org_df_train['fuel_type'].round().replace(reverse_fuel_type_mapping)
org_df_test['fuel_type'] = org_df_test['fuel_type'].round().replace(reverse_fuel_type_mapping)

# Define the list of fuel types for one-hot encoding
fuel_types = ['Gasoline', 'Hybrid', 'E85 Flex Fuel', 'Diesel', 'Plug-In Hybrid']

# Update the one-hot encoded columns in df_train using the imputed 'fuel_type' values from org_df_train
def update_fuel_type_encoding(row, imputed_fuel_type):
    for fuel in fuel_types:
        if f'fuel_type_{fuel}' in row.index:
            row[f'fuel_type_{fuel}'] = (fuel == imputed_fuel_type)
    return row

# Apply the updates to df_train and df_test
df_train = df_train.apply(lambda row: update_fuel_type_encoding(row, org_df_train.loc[row.name, 'fuel_type']), axis=1)
df_test = df_test.apply(lambda row: update_fuel_type_encoding(row, org_df_test.loc[row.name, 'fuel_type']), axis=1)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  org_df_train['fuel_type'].replace(['–', 'not supported'], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  org_df_test['fuel_type'].replace(['–', 'not supported'], np.nan, inplace=True)


In [115]:
# Separate features and target variable
y_train = df_train['price']
X_train = df_train.drop(columns=['price'])
y_test = df_test['price']
X_test = df_test.drop(columns=['price'])

# Make predictions using the original model
y_pred = linear_reg.predict(X_test)

# Calculate the R-squared value
r_squared = r2_score(y_test, y_pred)
print(f'New R-squared value: {r_squared}')

New R-squared value: 0.10637623834770804


# Didn't see an improvement - how come?

## 1.2 Data and model changes