In [46]:
import numpy as np
import pandas as pd 
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt

In [3]:
# Read train.csv and use its `id` column as the row index.
df = pd.read_csv(
    "./playground-series-s4e9/train.csv",
    index_col="id",
)
df.head()

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [8]:
# Per-column count of missing values in the raw frame.
df.isnull().sum()

brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

In [11]:
# Frequency of each fuel type (value_counts leaves missing values out by default).
df.loc[:, "fuel_type"].value_counts()

fuel_type
Gasoline          165940
Hybrid              6832
E85 Flex Fuel       5406
Diesel              3955
–                    781
Plug-In Hybrid       521
not supported         15
Name: count, dtype: int64

## Handling Null values in the fuel_type and accident attributes

Since the number of nulls is very low (about 2.7% and 1.3% of the rows, respectively), we have 2 options:

Option 1 - drop those entries and train the model without them.

Option 2 - Replacing them with another value:
1. The most frequent value in the dataset.
2. A value indicating "unknown".

In [None]:
# Option 1 - Fuel Type
# Keep only rows with a usable fuel type: discard the two placeholder strings
# as well as the missing entries.

df1 = (
    df.copy()
    .loc[lambda frame: ~frame['fuel_type'].isin(['not supported', '–'])]
    .dropna(subset=['fuel_type'])
)

In [55]:
# Option 1 - Accident
# Discard the rows that carry no accident information.

df1 = df1.loc[df1['accident'].notna()]

In [56]:
# Check which columns of df1 still contain missing values.
df1.isnull().sum()

brand               0
model               0
model_year          0
milage              0
fuel_type           0
engine              0
transmission        0
ext_col             0
int_col             0
accident            0
clean_title     18448
price               0
dtype: int64

In [None]:
# Option 2 - Fuel Type
# Replace every unusable fuel type -- the two placeholder strings and the
# missing entries -- with the most frequent valid value.

df2 = df.copy()
valid_fuel_type = df2["fuel_type"]
# Blank out the placeholders so they are handled exactly like missing values.
valid_fuel_type = valid_fuel_type.mask(valid_fuel_type.isin(['not supported', '–']))
# mode() ignores missing values; going by the value counts shown earlier in
# the notebook, this evaluates to "Gasoline" for this dataset.
df2["fuel_type"] = valid_fuel_type.fillna(valid_fuel_type.mode().iloc[0])

In [57]:
# Option 2 - Accident
# Fill only the `accident` column. A frame-wide fillna("Unknown") would also
# overwrite the missing `clean_title` values, which the notebook replaces
# with "No" in a later step.

df2 = df2.fillna({"accident": "Unknown"})

## Handling Null values in the clean_title attribute

Since the number of nulls is more significant (about 11% of the rows), we have 2 different options:

Option 1 - Drop those entries. The column would then have only one value remaining and would not contribute to model training, so we would have to drop the attribute entirely.

Option 2 - Replacing them with another value - "No".

We will choose option 2.

In [60]:
# Fill only `clean_title`, so a missing value in any other column is never
# silently relabelled as "No".
df1 = df1.fillna({"clean_title": "No"})
df2 = df2.fillna({"clean_title": "No"})

In [61]:
df1.isna().sum()   

brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression

# Define the columns.
# `price` is the target, so it must not be listed as a feature: it is dropped
# from X below, and the ColumnTransformer would fail to find it at fit time.
# NOTE(review): `engine` is in none of the lists, so it is discarded by the
# ColumnTransformer's default remainder='drop' -- confirm this is intended.
ordinal_cols = ['clean_title']
onehot_cols = ['brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident']
numeric_cols = ['model_year', 'milage']

# Define the transformers: each categorical branch imputes missing values with
# the most frequent category before encoding.
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories unseen during fit are encoded as all zeros instead of raising.
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Numeric features only need their gaps filled with the column mean.
numeric_transformer = SimpleImputer(strategy='mean')

# Combine the transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', ordinal_transformer, ordinal_cols),
        ('onehot', onehot_transformer, onehot_cols),
        ('num', numeric_transformer, numeric_cols)
    ])

# Create the pipeline: preprocessing followed by an ordinary least-squares fit.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Fit the pipeline to the data
# NOTE(review): this fits on the raw `df`, not on the cleaned df1/df2 built
# above, so the manual null handling is bypassed and the pipeline's own
# imputers are used instead -- confirm which frame is meant to be modelled.
X = df.drop(columns=['price'])
y = df['price']
pipeline.fit(X, y)