# Encoding / Imputation

In [211]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [212]:
df = pd.read_csv('./Data/car-sales-extended.csv')
df.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [213]:
X = df.drop('Price', axis=1)
y = df['Price']

In [214]:
df.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

### Convert cetegorical data to numerical

In [215]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


categorical_feature = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                one_hot,
                                categorical_feature)],
                                remainder='passthrough')

transformed_X = transformer.fit_transform(X)

In [216]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

model = LinearRegression()


In [217]:
np.random.seed(42)

X_train, X_text, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

In [218]:
model.fit(X_train,y_train)
model.score(X_text,y_test)


0.4231640732012397

### Missing Values 

##### 1. Fill them with some value (imputation)
##### 2. Remove Sample with missing values

In [219]:
car_sales_missing = pd.read_csv('./Data/car-sales-extended-missing-data.csv')
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [220]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [221]:
X = df.drop('Price', axis=1)
y = df['Price']

###### Option 1: Fill missing data with Pandas 

In [222]:
car_sales_missing['Make'].fillna('missing', inplace=True)
car_sales_missing['Colour'].fillna('missing', inplace=True)
car_sales_missing['Odometer (KM)'].fillna(car_sales_missing['Odometer (KM)'].mean(), inplace=True)
car_sales_missing['Doors'].fillna(4, inplace = True)

In [223]:
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [224]:
##### Remove Missin Price Row

car_sales_missing['Price'].dropna(inplace=True)

In [225]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


categorical_feature = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                one_hot,
                                categorical_feature)],
                                remainder='passthrough')

transformed_X = transformer.fit_transform(X)

###### Option2: Fill missing values Using Sklearn

In [226]:
car_sales_missing = pd.read_csv('./Data/car-sales-extended-missing-data.csv')
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [227]:
car_sales_missing.dropna(subset=['Price'],inplace=True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [228]:
X = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price']


In [229]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [230]:
# fill missing value with sklearn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
door_imputer = SimpleImputer(strategy='constant', fill_value=4)
num_imputer = SimpleImputer(strategy= "mean")

cat_feature =['Make', 'Colour']
door_feature = ['Doors']
num_feature = ['Odometer (KM)']

imputer = ColumnTransformer([
    ('cat_imputer', cat_imputer, cat_feature),
    ('door_imputer', door_imputer, door_feature),
    ('num_imputer', num_imputer,num_feature)
])

filled_X_Train = imputer.fit_transform(X_train)

filled_X_Test = imputer.fit_transform(X_test)

In [231]:
Filled_Train = pd.DataFrame(filled_X_Train, columns = ['Make', 'Colour', 'Doors', 'Odometer (KM)'])
Fillet_Test = pd.DataFrame(filled_X_Test, columns = ['Make', 'Colour', 'Doors', 'Odometer (KM)'])

In [232]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


categorical_feature = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                one_hot,
                                categorical_feature)],
                                remainder='passthrough')

transformed_X_train = transformer.fit_transform(Filled_Train)
transformed_X_test = transformer.fit_transform(Fillet_Test)

In [233]:
Fillet_Test.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [234]:
transformed_X_train, transformed_X_test

(<760x15 sparse matrix of type '<class 'numpy.float64'>'
 	with 3040 stored elements in Compressed Sparse Row format>,
 <190x15 sparse matrix of type '<class 'numpy.float64'>'
 	with 760 stored elements in Compressed Sparse Row format>)

In [235]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(transformed_X_train,y_train)
model.score(transformed_X_test,y_test)

0.35464881063678433