# No Encoding Car Sales

In [1]:
import pandas as pd
df = pd.read_csv('car-sales-missing-data.csv')
df

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,4000.0
1,Honda,Red,87899.0,4.0,5000.0
2,Toyota,Blue,,3.0,7000.0
3,BMW,Black,11179.0,5.0,22000.0
4,Nissan,White,213095.0,4.0,3500.0
5,Toyota,Green,,4.0,4500.0
6,Honda,,,4.0,7500.0
7,Honda,Blue,,4.0,
8,Toyota,White,60000.0,,
9,,White,31600.0,4.0,9700.0


In [9]:
print(f'Shape: {df.shape}')
print(f'\ntop 5 elements:\n{df.head()}')
print(f'\nlast 5 elements:\n{df.tail()}')
print(f'\nnumber of columns:\n{len(df.columns)}')
print(f'\nnumber of rows:\n{len(df.index)}')
print(f'\nframe showing null values:\n{df.isnull()}')
print(f'\ncolumn wise null value count:\n{df.isnull().sum()}')

Shape: (10, 5)

top 5 elements:
     Make Colour  Odometer  Doors    Price
0  Toyota  White  150043.0    4.0   4000.0
1   Honda    Red   87899.0    4.0   5000.0
2  Toyota   Blue       NaN    3.0   7000.0
3     BMW  Black   11179.0    5.0  22000.0
4  Nissan  White  213095.0    4.0   3500.0

last 5 elements:
     Make Colour  Odometer  Doors   Price
5  Toyota  Green       NaN    4.0  4500.0
6   Honda    NaN       NaN    4.0  7500.0
7   Honda   Blue       NaN    4.0     NaN
8  Toyota  White   60000.0    NaN     NaN
9     NaN  White   31600.0    4.0  9700.0

number of columns:
5

number of rows:
10

frame showing null values:
    Make  Colour  Odometer  Doors  Price
0  False   False     False  False  False
1  False   False     False  False  False
2  False   False      True  False  False
3  False   False     False  False  False
4  False   False     False  False  False
5  False   False      True  False  False
6  False    True      True  False  False
7  False   False      True  False   True
8

## Elemination of null values
We can do this in a couple of ways

1) Drop rows having null values
> df.dropna(inplace = True)\
> Not a good way!

2) Fill up something
> Put zero there
> > df.fillna(0,inplace = True)

> Fill with value of previous row
> > df.fillna(method = 'pad',inplace = True)

> Fill with value of the row after
> > df.fillna(method = 'bfill',inplace = Tue)


In [10]:
# using pad
df.fillna(method = 'pad', inplace = True)
df

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,4000.0
1,Honda,Red,87899.0,4.0,5000.0
2,Toyota,Blue,87899.0,3.0,7000.0
3,BMW,Black,11179.0,5.0,22000.0
4,Nissan,White,213095.0,4.0,3500.0
5,Toyota,Green,213095.0,4.0,4500.0
6,Honda,Green,213095.0,4.0,7500.0
7,Honda,Blue,213095.0,4.0,7500.0
8,Toyota,White,60000.0,4.0,7500.0
9,Toyota,White,31600.0,4.0,9700.0


## Splitting methods

Splitting can be done in two ways

1) Using drop
> This will yeild a data frame as an output\
It's goes like\
df.drop(['Column_name'], axis = 1)\
axis = 1 specifies that we're selecting a cloumn\

2) Using iloc
> This will yeild a array as an output and is fom single column\
It's goes like\
df.iloc[:,:].values This is basically slicing where first half is row and second half is column\ 
start[:]end(not included)\ 
if only [:,x] the x column is selected\
if [:,:] the whole thing is selected 

In [15]:
#we'll use drop for simplicity
x = df.drop(['Price','Make','Colour'],axis = 1)
y = df['Price']
y

0     4000.0
1     5000.0
2     7000.0
3    22000.0
4     3500.0
5     4500.0
6     7500.0
7     7500.0
8     7500.0
9     9700.0
Name: Price, dtype: float64

In [16]:
#Now we'll try to do splitting for train and test
from sklearn.model_selection import train_test_split as tts
x_train, x_test, y_train, y_test =  tts(x,y,test_size = 0.25)

In [17]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(x_train, y_train) #training

LinearRegression()

In [21]:
y_pred = regression.predict(x_test) #putting test values to do sum prediction
x_pred = regression.predict(x_train)
print("Test Score: ",regression.score(x_test,y_test))
print("Train Score: ",regression.score(x_train,y_train))

Test Score:  -3.5382225427796055
Train Score:  0.7426529510790547


# Label Encoding Car Sales

In [47]:
import pandas as pd
df = pd.read_csv('car-sales-missing-data.csv')
df

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,4000.0
1,Honda,Red,87899.0,4.0,5000.0
2,Toyota,Blue,,3.0,7000.0
3,BMW,Black,11179.0,5.0,22000.0
4,Nissan,White,213095.0,4.0,3500.0
5,Toyota,Green,,4.0,4500.0
6,Honda,,,4.0,7500.0
7,Honda,Blue,,4.0,
8,Toyota,White,60000.0,,
9,,White,31600.0,4.0,9700.0


In [48]:
df.isnull().sum()

Make        1
Colour      1
Odometer    4
Doors       1
Price       2
dtype: int64

In [49]:
df.fillna(method = 'pad', inplace = True)
df

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,4000.0
1,Honda,Red,87899.0,4.0,5000.0
2,Toyota,Blue,87899.0,3.0,7000.0
3,BMW,Black,11179.0,5.0,22000.0
4,Nissan,White,213095.0,4.0,3500.0
5,Toyota,Green,213095.0,4.0,4500.0
6,Honda,Green,213095.0,4.0,7500.0
7,Honda,Blue,213095.0,4.0,7500.0
8,Toyota,White,60000.0,4.0,7500.0
9,Toyota,White,31600.0,4.0,9700.0


## Encoding

There are two ways to go with encoding\ 

1) Label Encoding
> Firstly its syntax<br>
from sklearn.preprocessing import LabelEncoder<br>
from sklearn.compose import ColumnTransformer<br>
label_encoder = LabelEncoder()<br>
x['Make'] = label_encoder.fit_transform(x['Make'])
.<br>
.<br>
.<br>
.<br>
and so on for all string values<br>

2) One Hot Encoder
> Firstly its syntax<br>
from sklearn.preprocessing import OneHotEncoder<br>
from sklearn.compose import ColumnTransformer<br>
ohe = ColumnTransformer([("Make,Colour",OneHotEncoder(),0,1])],remainder="passthrough")<br>
x=ct.fit_transform(x)<br>


The main differnce between LE and OHE is:

Let's say you have a list of fruits: apple, orange, and pear.

Label encoding is a way to give each fruit a number. For example, you can give apple the number 1, orange the number 2, and pear the number 3.

One hot encoding is a different way to represent the fruits. Instead of giving them numbers, you make a list of three fruit "columns" and put a "1" in the column for the fruit you want, and a "0" in the other two columns. So, if you want to represent the fruit apple, you would put a "1" in the apple column and a "0" in the orange and pear columns.

In [69]:
x = df.drop(['Price'],axis = 1)
y = df['Price']
y

#Can also do:
#x=df.iloc[:,:-1]
#y=df.iloc[:,4]


0     4000.0
1     5000.0
2     7000.0
3    22000.0
4     3500.0
5     4500.0
6     7500.0
7     7500.0
8     7500.0
9     9700.0
Name: Price, dtype: float64

In [70]:
#applying label encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
label_encoder = LabelEncoder()


In [71]:
x['Make'] = label_encoder.fit_transform(x['Make'])

In [72]:
x['Colour'] = label_encoder.fit_transform(x['Colour'])

In [73]:
x['Doors'] = label_encoder.fit_transform(x['Doors'])

In [74]:
x

Unnamed: 0,Make,Colour,Odometer,Doors
0,3,4,150043.0,1
1,1,3,87899.0,1
2,3,1,87899.0,0
3,0,0,11179.0,2
4,2,4,213095.0,1
5,3,2,213095.0,1
6,1,2,213095.0,1
7,1,1,213095.0,1
8,3,4,60000.0,1
9,3,4,31600.0,1


In [75]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=3)

In [76]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(x_train,y_train)

LinearRegression()

In [77]:
y_pred=regressor.predict(x_test)
x_pred=regressor.predict(x_train)
y_pred

array([ 1569.53402064,  3570.77522123, 12924.71962177])

In [78]:
print('Train Score:', regressor.score(x_train,y_train) )
print('Test Score:', regressor.score(x_test,y_test) )

Train Score: 0.9971809792242989
Test Score: -60.19470377586204


# One Hot Encoding Car Sales

In [79]:
import pandas as pd
df = pd.read_csv('car-sales-missing-data.csv')
df

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,4000.0
1,Honda,Red,87899.0,4.0,5000.0
2,Toyota,Blue,,3.0,7000.0
3,BMW,Black,11179.0,5.0,22000.0
4,Nissan,White,213095.0,4.0,3500.0
5,Toyota,Green,,4.0,4500.0
6,Honda,,,4.0,7500.0
7,Honda,Blue,,4.0,
8,Toyota,White,60000.0,,
9,,White,31600.0,4.0,9700.0


In [80]:
df.isnull().sum()

Make        1
Colour      1
Odometer    4
Doors       1
Price       2
dtype: int64

In [81]:
df.fillna(method = 'pad', inplace = True)
df

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,4000.0
1,Honda,Red,87899.0,4.0,5000.0
2,Toyota,Blue,87899.0,3.0,7000.0
3,BMW,Black,11179.0,5.0,22000.0
4,Nissan,White,213095.0,4.0,3500.0
5,Toyota,Green,213095.0,4.0,4500.0
6,Honda,Green,213095.0,4.0,7500.0
7,Honda,Blue,213095.0,4.0,7500.0
8,Toyota,White,60000.0,4.0,7500.0
9,Toyota,White,31600.0,4.0,9700.0


In [82]:
x = df.drop(['Price'],axis = 1)
y = df['Price']
y

#Can also do:
#x=df.iloc[:,:-1]
#y=df.iloc[:,4]


0     4000.0
1     5000.0
2     7000.0
3    22000.0
4     3500.0
5     4500.0
6     7500.0
7     7500.0
8     7500.0
9     9700.0
Name: Price, dtype: float64

In [83]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct=ColumnTransformer([("Make,Colour",OneHotEncoder(),[0,1])],remainder="passthrough")
x=ct.fit_transform(x)

In [84]:
x

array([[0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 1.50043e+05,
        4.00000e+00],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 8.78990e+04,
        4.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 8.78990e+04,
        3.00000e+00],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.11790e+04,
        5.00000e+00],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 2.13095e+05,
        4.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 2.13095e+05,
        4.0000

In [86]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=3)

In [87]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(x_train,y_train)

LinearRegression()

In [88]:
y_pred=regressor.predict(x_test)
x_pred=regressor.predict(x_train)
y_pred

array([4713.90444146, 3498.22637915])

In [89]:
print('Train Score:', regressor.score(x_train,y_train) )
print('Test Score:', regressor.score(x_test,y_test) )

Train Score: 0.9980774935781943
Test Score: 0.9084834883890277
