# importing the library

In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

# importing dataset

In [6]:
df = pd.read_csv('Data/car-sales-extended-missing-data.csv')
df.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


# getting your your X and Y dataset ready

In [44]:
X = df.drop('Price',axis=1)
X

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0
...,...,...,...,...
995,Toyota,Black,35820.0,4.0
996,,White,155144.0,3.0
997,Nissan,Blue,66604.0,4.0
998,Honda,White,215883.0,4.0


In [17]:
X.head()
df.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


# working with NaN values in d dataset

In [23]:
df.dropna(subset=['Price'],inplace=True)

In [26]:
df.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [27]:
df.shape

(950, 5)

In [61]:
Y = df['Price']
Y

0      15323.0
1      19943.0
2      28343.0
3      13434.0
4      14043.0
        ...   
995    32042.0
996     5716.0
997    31570.0
998     4001.0
999    12732.0
Name: Price, Length: 950, dtype: float64

In [29]:
from sklearn.impute import SimpleImputer

In [38]:
from sklearn.compose import ColumnTransformer
df['Doors'].value_counts()

4.0    768
5.0     71
3.0     64
Name: Doors, dtype: int64

In [42]:
#define categorical features
cat_feat = ['Make','Colour']
cat_odo = ['Odometer (KM)']
cat_doo = ['Doors']

In [40]:
#imputer
feat_imputer = SimpleImputer(strategy='constant',fill_value='missing')
odo_imputer = SimpleImputer(strategy='mean')
doo_imputer = SimpleImputer(strategy='constant',fill_value=4)

In [49]:
CT = ColumnTransformer([('feat_imputer',feat_imputer,cat_feat),
                        ('odo_imputer',odo_imputer,cat_odo),
                        ('doo_imputer',doo_imputer,cat_doo)])

In [51]:
Xnew = CT.fit_transform(X)
Xnew

array([['Honda', 'White', 35431.0, 4.0],
       ['BMW', 'Blue', 192714.0, 5.0],
       ['Honda', 'White', 84714.0, 4.0],
       ...,
       ['Nissan', 'Blue', 66604.0, 4.0],
       ['Honda', 'White', 215883.0, 4.0],
       ['Toyota', 'Blue', 248360.0, 4.0]], dtype=object)

In [55]:
df1 = DataFrame(Xnew,columns=['Make','Colour','Odometer (KM)','Doors'])
df1

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
...,...,...,...,...
945,Toyota,Black,35820,4
946,missing,White,155144,3
947,Nissan,Blue,66604,4
948,Honda,White,215883,4


In [64]:
df1.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
dtype: int64

# encoding categorical data

In [65]:
categorical_features = ['Make','Colour','Doors']

In [66]:
from sklearn.preprocessing import OneHotEncoder

In [68]:
from sklearn.compose import ColumnTransformer

In [69]:
HT = OneHotEncoder()

In [71]:
CT = ColumnTransformer([('HT',HT,categorical_features)], remainder='passthrough')

In [80]:
Xtransform = CT.fit_transform(df1)

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
...,...,...,...,...
945,Toyota,Black,35820,4
946,missing,White,155144,3
947,Nissan,Blue,66604,4
948,Honda,White,215883,4


# spliting dataset into train and test set

In [81]:
from sklearn.ensemble import RandomForestRegressor

In [82]:
RFR = RandomForestRegressor()

In [83]:
from sklearn.model_selection import train_test_split

In [87]:
X_train,X_test,Y_train,Y_test = train_test_split(Xtransform,Y,test_size = 0.2)

RFR.fit(X_train,Y_train)

RandomForestRegressor()

# predicting the output of test set

In [88]:
Ypred = RFR.predict(X_test)

In [89]:
Ypred

array([14988.41333333, 15553.46333333, 10896.53      , 19703.91      ,
       17949.87      , 16068.28      , 12424.31      , 17437.3175    ,
       22665.61      , 11289.23      ,  9832.3       , 10589.0503749 ,
       40602.56      , 18477.01      , 20303.68      , 25070.61      ,
       13328.63      ,  8241.21      , 18653.23      , 12671.4       ,
       37550.38      , 10377.74      , 25151.67      , 10663.99      ,
       23954.34      , 14713.9       , 11791.85      , 14617.        ,
       23071.2       , 13150.15      , 20462.47      ,  5985.04      ,
       17898.39      , 18033.98      ,  8317.85      , 20378.7       ,
        9532.29      , 11283.85      , 13985.86      , 13035.53      ,
       11604.07      , 10351.37      , 13650.3575    , 16305.3       ,
       17451.46421429, 24356.42      , 19815.06      , 34286.65      ,
        9520.24      ,  9761.37      , 14847.29215861, 17609.41166667,
        7520.47      , 22694.7       , 17515.55      , 17731.03      ,
      

In [90]:
RFR.score(X_test,Y_test)

0.29158378391797946

In [92]:
from sklearn.linear_model import Ridge

In [94]:
RE = Ridge(random_state = 5)

In [95]:
from sklearn.model_selection import train_test_split

In [96]:
X_train,X_test,Y_train,Y_test = train_test_split(Xtransform,Y,test_size = 0.2)

In [97]:
RE.fit(X_train,Y_train)

Ridge(random_state=5)

In [98]:
Yr = RE.predict(X_test)

In [99]:
Yr

array([12864.31866148, 15680.42315011, 17604.07197653, 19657.26842814,
       19285.12780656, 19259.88034771, 15674.84005255, 17484.70238788,
       15054.42452135, 16325.44383705, 20033.01582919, 21912.29632329,
       18065.69038162, 15941.86785146, 15739.66327295, 13580.83264099,
       15772.51955467, 15912.24521606, 17655.30801265, 10967.89361122,
       11850.61590875, 15941.86785131, 20346.16336749, 10218.57275816,
       11034.3472937 , 15155.1179078 , 14286.4274653 , 10348.07097482,
       11542.65620431, 10336.70714801, 17829.76745707, 18054.72181855,
       10763.69058572, 14250.2608516 , 19315.21724311, 20267.55533132,
       15731.90622601, 15377.84891124, 20071.55402403, 12965.30849559,
       20473.58645147, 15941.86785138, 15031.49923619, 19854.94960498,
       11368.29557578, 21102.84600491, 18274.09308239, 12480.7648053 ,
       21356.70401191, 21409.96577373, 19710.03611006, 15049.82958232,
       19612.90009553, 18850.28850577, 18495.58888841, 18341.04084424,
      

In [100]:
RE.score(X_test,Y_test)

0.15011857117399174

In [102]:
from sklearn.svm import SVR

In [103]:
SV = SVR()
SV.fit(X_train,Y_train)

SVR()

In [104]:
SV.score(X_test,Y_test)

-0.04321882642773933