In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [4]:
# Load the raw car-sales table; the path is relative to the working directory.
Data = pd.read_csv('Car_Sales_Extended_Missing_Data.csv')

In [5]:
# Rows without a Price cannot be used as training targets, so discard them.
# Reassign instead of mutating in place: the result is a new frame bound to
# the same name, which keeps the step explicit and chainable.
Data = Data.dropna(subset=['Price'])

In [6]:
# Display the frame as it stands after rows with a missing Price were dropped.
Data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [7]:
# String-valued columns: fill gaps with the literal 'missing', then one-hot
# encode. handle_unknown='ignore' keeps transform from raising on categories
# that were not present when the encoder was fitted.
Categorical_Features = ['Make', 'Colour']
Categorical_Transform = Pipeline(
    steps=[
        ('Imputer', SimpleImputer(fill_value='missing', strategy='constant')),
        ('One_Hot_Encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [8]:
# Show the repr of the categorical preprocessing pipeline.
Categorical_Transform

Pipeline(steps=[('Imputer',
                 SimpleImputer(fill_value='missing', strategy='constant')),
                ('One_Hot_Encoder', OneHotEncoder(handle_unknown='ignore'))])

In [9]:
# 'Doors' is stored as a number but handled as a category here: missing
# values are filled with the constant 4 and the column is one-hot encoded.
Labelled_Features = ['Doors']
Labelled_Transform = Pipeline(
    steps=[
        ('Imputer', SimpleImputer(fill_value=4, strategy='constant')),
        ('One_Hot_Encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [10]:
# Numeric column: missing odometer readings are replaced by the column mean.
Numerical_Features = ['Odometer (KM)']
Numerical_Transform = Pipeline(
    steps=[('Imputer', SimpleImputer(strategy='mean'))]
)

In [11]:
# Route each group of columns through its own preprocessing pipeline.
CT = ColumnTransformer(
    transformers=[
        ('Categorical', Categorical_Transform, Categorical_Features),
        ('Labelled', Labelled_Transform, Labelled_Features),
        ('Numerical', Numerical_Transform, Numerical_Features),
    ]
)

In [12]:
# End-to-end estimator: preprocess the columns with CT, then fit a random
# forest on the transformed features.
# random_state is pinned (same value as the train/test split seed) so the
# fitted forest, and therefore every reported metric, is reproducible when
# the notebook is re-run from a fresh kernel.
Model = Pipeline(
    steps=[
        ('Preprocessor', CT),
        ('Model', RandomForestRegressor(random_state=42)),
    ]
)

In [13]:
# Show the repr of the assembled pipeline (not yet fitted when cells run in order).
Model

Pipeline(steps=[('Preprocessor',
                 ColumnTransformer(transformers=[('Categorical',
                                                  Pipeline(steps=[('Imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('One_Hot_Encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Make', 'Colour']),
                                                 ('Labelled',
                                                  Pipeline(steps=[('Imputer',
                                                                   SimpleImputer(fill_value=4,
                                                                                 strategy='constant')),
           

In [14]:
# Separate the target column from the predictors.
y = Data['Price']
x = Data.drop(columns='Price')

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the
# partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Fit the preprocessing steps and the regressor on the training split.
Model.fit(x_train, y_train)

Pipeline(steps=[('Preprocessor',
                 ColumnTransformer(transformers=[('Categorical',
                                                  Pipeline(steps=[('Imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('One_Hot_Encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Make', 'Colour']),
                                                 ('Labelled',
                                                  Pipeline(steps=[('Imputer',
                                                                   SimpleImputer(fill_value=4,
                                                                                 strategy='constant')),
           

In [17]:
# Predict prices for the held-out rows.
y_pred = Model.predict(x_test)

In [18]:
# Inspect the predicted prices for the test split.
y_pred

array([17574.11      , 21604.88      , 11972.66      ,  9442.71      ,
       12049.02      , 10940.07      , 16299.57      ,  9573.39      ,
       18182.46      , 16301.94779523,  8356.18      , 15929.85      ,
        8764.78      , 10330.06      , 13053.59      , 20068.56      ,
       16716.7       ,  7135.7       , 11539.8       , 14549.78      ,
       12533.8       , 17824.92125   , 18655.01      , 27574.59      ,
        9271.88816667, 21494.41      , 13327.56      ,  7448.1       ,
       20669.28      , 19357.59      , 12011.88      , 16117.38      ,
       10539.26      , 11318.22663095, 29122.96      , 16809.4       ,
       12220.7       , 12278.33      , 21309.85      ,  9530.08      ,
       16333.33      , 20461.67      , 25543.58      , 16185.39      ,
       14127.54828571, 12106.92      , 14866.2       ,  9157.07      ,
       15468.92      , 13963.52      , 11278.88      , 20616.07      ,
       15688.12      ,  5822.73      , 13730.91      ,  9158.22      ,
      

In [19]:
# Pipeline's built-in score on the test split (R^2 for this regressor).
Model.score(x_test, y_test)

0.2155323398147022

In [20]:
# Report hold-out metrics comparing the predictions in y_pred with y_test.
# The first label previously read 'R-Suared'; corrected to 'R-Squared'.
print(f'R-Squared:{r2_score(y_test,y_pred)}')
print(f'MAE:{mean_absolute_error(y_test,y_pred)}')
print(f'MSE:{mean_squared_error(y_test,y_pred)}')

R-Suared:0.2155323398147022
MAE:5775.5477952369265
MSE:52527713.70680046
