In [262]:
import numpy as np
import pandas as pd 
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin

In [263]:
df = pd.read_csv("./playground-series-s4e9/train.csv",index_col='id')
df.head()

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [264]:
df.isna().sum()

brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

In [265]:
df["fuel_type"].value_counts()

fuel_type
Gasoline          165940
Hybrid              6832
E85 Flex Fuel       5406
Diesel              3955
–                    781
Plug-In Hybrid       521
not supported         15
Name: count, dtype: int64

# Custom transformations automisation

The following class will preform automizations to the data.
It will be compatible with the scikit learn Pipeline and Transformer objects, making sure the entire pipeline is done automatically on the test data as well.

## Handling Null values in the fuel_type and accident attributes

Since the number of NUlls is very low (3%, 1.5% of the data respectively) we have 2 options:

Option 1 - drop those entries and train the model without them.

Option 2 - Replacing them with another value:
1. The most frequent value in the dataset.
2. A value indicating "unknown".

## Handling Null values in the clean_title attribute

Since the number of NUlls is more significant, we have 2 different options:

Option 1 - drop those entries, since the column would only have one value remaining, it would not contribute to the model training - we would have to drop the attribute entirely.

Option 2 - Replacing them with another value - "No".

We will choose option 2.

In [266]:
df_columns = (df.columns).to_list()
df_columns

['brand',
 'model',
 'model_year',
 'milage',
 'fuel_type',
 'engine',
 'transmission',
 'ext_col',
 'int_col',
 'accident',
 'clean_title',
 'price']

In [None]:
class ImputerOne(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns = ['brand',
                        'model',
                        'model_year',
                        'milage',
                        'fuel_type',
                        'engine',
                        'transmission',
                        'ext_col',
                        'int_col',
                        'accident',
                        'clean_title',
                        'price']

    def fit(self, X, y=None):
        # No fitting necessary for this transformer
        return self

    def transform(self, X):
        X = X.copy()
        
        # Drop the 'id' column if it exists
        if 'id' in X.columns:
            X.drop('id', axis=1, inplace=True)
            
        # Check if all required columns are present
        for col in self.columns:
            if col not in X.columns:
                raise KeyError(f"Column '{col}' is missing from the input DataFrame.")
        
        # Drop null values based on specific conditions
        X = X[X['fuel_type'].notna() & (X['fuel_type'] != 'not supported') & (X['fuel_type'] != '–')]
        X = X[X['accident'].notna()]
        
        # Fill missing values in 'clean_title' column
        X['clean_title'] = X['clean_title'].fillna("No")
        
        # Ensure the DataFrame has only the specified columns
        X = X[self.columns]
        X.reset_index(drop=True, inplace=True)
        
        return X

In [None]:
class EngineTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns = ['engine']
        self.output_columns = ['horse_power', 'tank_size', 'num_cylinders']

    def fit(self, X, y=None):
        # No fitting necessary for this transformer
        return self

    def transform(self, X):
        X = X.copy()
        X = pd.DataFrame(X, columns=self.columns)
   
        # Extract values using regular expressions:
        X['horse_power'] = X['engine'].str.extract(r'(\d+\.?\d*)HP').astype(float)
        X['tank_size'] = X['engine'].str.extract(r'(\d+\.?\d*)L').astype(float)
        X['num_cylinders'] = X['engine'].str.extract(r'(\d+)\sCylinder').astype(float)    
        # Drop the original 'engine' column
        X.drop('engine', axis=1, inplace=True)

        return X[self.output_columns]

In [268]:
class TransmissionTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns = ['transmission']
        self.output_columns = ['automatic', 'manual', 'dual']

    def fit(self, X, y=None):
        # No fitting necessary for this transformer
        return self

    def transform(self, X):
        X = X.copy()
        X = pd.DataFrame(X, columns=self.columns)
        # Initialize all attributes to False
        X['automatic'] = False
        X['manual'] = False
        X['dual'] = False
        
        # Categorize as 'dual' if it contains 'Dual'
        X.loc[X['transmission'].str.contains(r'\b(Dual|At/Mt)\b', case=False, na=False), 'dual'] = True
        
        # Ensure 'dual' is boolean before using ~ operator
        X['dual'] = X['dual'].astype(bool)
        # Categorize as 'automatic' if it contains AT, A/T, CVT, or Automatic (but not Dual)
        X.loc[(X['transmission'].str.contains(r'\b(AT|A/T|CVT|Automatic)\b', case=False, na=False)) & ~X['dual'], 'automatic'] = True
        # Categorize as 'manual' if it contains MT, M/T, or Manual (but not Dual)
        X.loc[(X['transmission'].str.contains(r'\b(MT|M/T|Manual)\b', case=False, na=False)) & ~X['dual'], 'manual'] = True

        # Drop the original 'transmission' column
        X.drop('transmission', axis=1, inplace=True)
        return X[self.output_columns]

In [270]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class ImputerTwo(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # No fitting necessary for this transformer
        return self

    def transform(self, X):
        X = pd.DataFrame(X.copy())
        if 'fuel_type' in X.columns:
            X["fuel_type"] = X["fuel_type"].replace('not supported', "Gasoline")
            X["fuel_type"] = X["fuel_type"].replace('–', "Gasoline")
            X["fuel_type"] = X["fuel_type"].replace(pd.NA, "Gasoline")
        if 'accident' in X.columns:
            X = X['accident'].fillna("Unknown")
        if 'clean_title' in X.columns:
            X = X['clean_title'].fillna("No")
        return X
            

In [271]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression

In [272]:
imp1 = ImputerOne()
X_trail = imp1.fit_transform(df)
X_trail = pd.DataFrame(X_trail)
X_trail.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [258]:
X_trail.isna().sum()

brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

In [273]:
column_transformers = ColumnTransformer(
    transformers=[
        ('engine', EngineTransformer(), ['engine']),
        ('transmission', TransmissionTransformer(), ['transmission']),
    ],
    remainder='passthrough'  # Include the rest of the columns
)

X_trail = column_transformers.fit_transform(X_trail)
X_trail = pd.DataFrame(X_trail)
X_trail.head()

  X.loc[X['transmission'].str.contains(r'\b(Dual|At/Mt)\b', case=False, na=False), 'dual'] = True
  X.loc[(X['transmission'].str.contains(r'\b(AT|A/T|CVT|Automatic)\b', case=False, na=False)) & ~X['dual'], 'automatic'] = True
  X.loc[(X['transmission'].str.contains(r'\b(MT|M/T|Manual)\b', case=False, na=False)) & ~X['dual'], 'manual'] = True


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,172.0,1.6,4.0,True,False,False,MINI,Cooper S Base,2007,213000,Gasoline,Yellow,Gray,None reported,Yes,4200
1,252.0,3.9,8.0,True,False,False,Lincoln,LS V8,2002,143250,Gasoline,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,320.0,5.3,8.0,True,False,False,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,Blue,Gray,None reported,Yes,13900
3,420.0,5.0,8.0,False,False,True,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,Black,Black,None reported,Yes,45000
4,208.0,2.0,4.0,True,False,False,Mercedes-Benz,Metris Base,2021,7388,Gasoline,Black,Beige,None reported,Yes,97500


In [None]:
column_transformers = ColumnTransformer(
    transformers=[
        ('engine', EngineTransformer(), ['engine']),
        ('transmission', TransmissionTransformer(), ['transmission']),
    ],
    remainder='passthrough'  # Include the rest of the columns
)

preprocessor_1 = Pipeline([
	('column_transformers', column_transformers),
    ('imputer_1', ImputerOne())
])

prepro1_df = preprocessor_1.fit_transform(df)

  X.loc[X['transmission'].str.contains(r'\b(Dual|At/Mt)\b', case=False, na=False), 'dual'] = True
  X.loc[(X['transmission'].str.contains(r'\b(AT|A/T|CVT|Automatic)\b', case=False, na=False)) & ~X['dual'], 'automatic'] = True


ImputerOne - Columns before imputation: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
ImputerOne - Columns after imputation: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Shape after preprocessor_1: (182654, 15)


  X.loc[(X['transmission'].str.contains(r'\b(MT|M/T|Manual)\b', case=False, na=False)) & ~X['dual'], 'manual'] = True


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,True,False,False,MINI,Cooper S Base,2007,213000,Gasoline,Yellow,Gray,None reported,Yes,4200
1,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,True,False,False,Lincoln,LS V8,2002,143250,Gasoline,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,True,False,False,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,Blue,Gray,None reported,Yes,13900
3,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,False,False,True,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,Black,Black,None reported,Yes,45000
4,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,True,False,False,Mercedes-Benz,Metris Base,2021,7388,Gasoline,Black,Beige,None reported,Yes,97500


In [None]:
# Create a second preprocessor like the first, but with the second imputer

prepro2_df = preprocessor_2.fit_transform(df)

In [None]:
# Create the pipeline
model = Pipeline(steps=[
    #('preprocessor', preprocessor_1),
    ('regressor', LinearRegression())
])

# Fit the pipeline to the data
X = df.drop(columns=['price'])
y = df['price']
model.fit(X, y)

In [3]:
# preprecoess the color columns