# Multicollinearity and its effect when applying One-Hot Encoding

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the car listings; the path is relative to the notebook's working directory.
df = pd.read_csv('cars.csv')

In [3]:
# Preview the first five rows to see the column layout.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [4]:
# (number of rows, number of columns) of the full dataset.
df.shape

(8128, 5)

In [5]:
# Dtypes and non-null counts per column: three object (categorical) columns,
# two int64 columns, and no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   brand          8128 non-null   object
 1   km_driven      8128 non-null   int64 
 2   fuel           8128 non-null   object
 3   owner          8128 non-null   object
 4   selling_price  8128 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 317.6+ KB


## Checking the categorical columns

In [6]:
# Number of distinct values in the 'brand' column.
df['brand'].nunique()

32

In [7]:
# Rows per brand, most frequent first; note the long tail of brands with very few rows.
df['brand'].value_counts()

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Land                6
Force               6
Isuzu               5
Kia                 4
Ambassador          4
MG                  3
Daewoo              3
Opel                1
Peugeot             1
Ashok               1
Name: brand, dtype: int64

In [8]:
# Rows per ownership category.
df['owner'].value_counts()

First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: owner, dtype: int64

In [9]:
# Number of distinct ownership categories.
df['owner'].nunique()

5

In [10]:
# Rows per fuel type.
df['fuel'].value_counts()

Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: fuel, dtype: int64

In [11]:
# Number of distinct fuel types.
df['fuel'].nunique()

4

## 1. OneHotEncoding using Pandas

* With pandas, `get_dummies` on the two columns `fuel` (4 unique values) and `owner` (5 unique values) creates 9 dummy columns; together with the 3 untouched columns the result has 12 columns.

In [12]:
# One dummy column per category of 'fuel' and 'owner'; 'brand', 'km_driven' and
# 'selling_price' are passed through unchanged.
pd.get_dummies(df,columns=['fuel','owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


## 2. K-1 OneHotEncoding
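* Drop one dummy column per encoded feature to avoid multicollinearity: the k dummy columns of a feature always sum to 1, so any one of them is fully determined by the other k-1.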

In [13]:
# drop_first=True drops the first category of each encoded column
# ('fuel_CNG' and 'owner_First Owner'), leaving 10 columns in total.
pd.get_dummies(df,columns=['fuel','owner'],drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


## 3. OneHotEncoding using Sklearn
* The same encoding is done here with scikit-learn.

* First separate the target column (`selling_price`) from the remaining 4 feature columns, then split both into train and test sets.

In [14]:
from sklearn.model_selection import train_test_split
# Features are the first four columns, the target is the last column (selling_price).
# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :4],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=2,
)

In [15]:
# Training features; rows keep their original index labels from df.
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [16]:
# Test features; rows keep their original index labels from df.
X_test.head()

Unnamed: 0,brand,km_driven,fuel,owner
606,Hyundai,80000,Petrol,First Owner
7575,Mahindra,70000,Diesel,Second Owner
7705,Toyota,68089,Petrol,First Owner
4305,Hyundai,70000,Petrol,Second Owner
2685,Mahindra,97000,Diesel,Second Owner


In [17]:
# y_train is the target: the int64 'selling_price' column for the training rows.
y_train.head()

5571    1150000
2038    1689999
2957     580000
7618     150000
6684     320000
Name: selling_price, dtype: int64

In [18]:
from sklearn.preprocessing import OneHotEncoder

In [19]:
# drop='first' drops the first category of every encoded feature (k-1 columns per feature).
# No other option is set here, so the encoder keeps its defaults and returns a
# sparse float64 matrix (see the output of the following cells).
ohe = OneHotEncoder(drop='first')

## From the X_train dataset above we encode two columns: 'fuel' and 'owner'

In [20]:
# Learn the categories from the training data and encode it in one step;
# the result is a SciPy sparse matrix, so only a summary is displayed.
ohe.fit_transform(X_train[['fuel','owner']])

<6502x7 sparse matrix of type '<class 'numpy.float64'>'
	with 8746 stored elements in Compressed Sparse Row format>

In [21]:
# Same fit + encode as the previous cell, this time keeping the sparse result.
X_train_new = ohe.fit_transform(X_train[['fuel','owner']])

In [22]:
# Still sparse: the repr shows only shape, dtype and number of stored elements.
X_train_new

<6502x7 sparse matrix of type '<class 'numpy.float64'>'
	with 8746 stored elements in Compressed Sparse Row format>

In [23]:
# Fit + encode again and convert the sparse result to a dense NumPy array with .toarray().
X_train_new = ohe.fit_transform(X_train[['fuel','owner']]).toarray()

In [24]:
# Now a dense float array of 0./1. indicators.
X_train_new

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [25]:
# 'fuel' has 4 categories and 'owner' has 5 (9 in total); dropping the first
# category of each leaves 3 + 4 = 7 encoded columns.
X_train_new.shape

(6502, 7)

In [26]:
# Encode the test set with the encoder that was fitted on X_train.
# Use transform(), not fit_transform(): refitting would relearn the categories from
# X_test, so a category missing from the test split would change the number and
# meaning of the output columns relative to X_train_new.
X_test_new = ohe.transform(X_test[['fuel','owner']]).toarray()

In [27]:
# One row per test sample, 7 encoded columns.
X_test_new.shape

(1626, 7)

In [28]:
# Dense 0./1. indicator array for the test rows.
X_test_new

array([[0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

# Combining with the Remaining Columns

In [29]:
# The two columns that were not one-hot encoded: 'brand' and 'km_driven'.
X_train[['brand','km_driven']]

Unnamed: 0,brand,km_driven
5571,Hyundai,35000
2038,Jeep,60000
2957,Hyundai,25000
7618,Mahindra,130000
6684,Hyundai,155000
...,...,...
3606,Ford,35000
5704,Maruti,120000
6637,Tata,15000
2575,Maruti,32500


In [30]:
# The same two columns as a NumPy array; mixing strings and integers gives dtype=object.
X_train[['brand','km_driven']].values

array([['Hyundai', 35000],
       ['Jeep', 60000],
       ['Hyundai', 25000],
       ...,
       ['Tata', 15000],
       ['Maruti', 32500],
       ['Isuzu', 121000]], dtype=object)

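# Note: if we pass `sparse=False` (named `sparse_output=False` from scikit-learn 1.2 onwards) to `OneHotEncoder`, it returns a dense array, so we don't need to call `.toarray()`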

## We want to combine the 'X_train_new' array and the remaining 'X_train' columns into one array

In [31]:
# Join the untouched columns and the one-hot block side by side (column-wise).
np.concatenate(
    [X_train[['brand', 'km_driven']].values, X_train_new],
    axis=1,
)

array([['Hyundai', 35000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Jeep', 60000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Hyundai', 25000, 0.0, ..., 0.0, 0.0, 0.0],
       ...,
       ['Tata', 15000, 0.0, ..., 0.0, 0.0, 0.0],
       ['Maruti', 32500, 1.0, ..., 1.0, 0.0, 0.0],
       ['Isuzu', 121000, 1.0, ..., 0.0, 0.0, 0.0]], dtype=object)

In [32]:
# 2 untouched columns + 7 encoded columns = 9 columns.
np.concatenate(
    [X_train[['brand', 'km_driven']].values, X_train_new],
    axis=1,
).shape

(6502, 9)

# Another way of converting Data

In [33]:
# Dense int32 output straight from the encoder, so no .toarray() call is needed.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old name,
# so try the new keyword first and fall back to the old one on older releases.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    ohe = OneHotEncoder(drop='first', sparse=False, dtype=np.int32)

In [34]:
# Fit the encoder on the training data only and encode it.
X_train_new = ohe.fit_transform(X_train[['fuel','owner']])

In [35]:
# Encode the test data with the categories learned from X_train (no refit).
X_test_new = ohe.transform(X_test[['fuel','owner']])

In [36]:
# One row per training sample, 7 encoded columns.
X_train_new.shape

(6502, 7)

In [37]:
# Integer 0/1 indicators this time, because dtype=np.int32 was requested.
X_train_new

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [38]:
# One row per test sample, 7 encoded columns.
X_test_new.shape

(1626, 7)

In [39]:
# Join the untouched columns and the integer one-hot block side by side (column-wise).
np.concatenate(
    [X_train[['brand', 'km_driven']].values, X_train_new],
    axis=1,
)

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

In [40]:
# 2 untouched columns + 7 encoded columns = 9 columns.
np.concatenate(
    [X_train[['brand', 'km_driven']].values, X_train_new],
    axis=1,
).shape

(6502, 9)

## 4. OneHotEncoding with Top Categories

In [41]:
# NOTE: despite its name, `counts` holds the raw 'brand' column at this point;
# a later cell rebinds it to the per-brand value counts.
counts = df['brand']
counts

0        Maruti
1         Skoda
2         Honda
3       Hyundai
4        Maruti
         ...   
8123    Hyundai
8124    Hyundai
8125     Maruti
8126       Tata
8127       Tata
Name: brand, Length: 8128, dtype: object

In [42]:
# `counts` is still the raw 'brand' column here, so this is the number of distinct brands.
counts.nunique()

32

In [43]:
# Rebind `counts` to the number of rows per brand, most frequent first.
counts = df['brand'].value_counts()
counts

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Land                6
Force               6
Isuzu               5
Kia                 4
Ambassador          4
MG                  3
Daewoo              3
Opel                1
Peugeot             1
Ashok               1
Name: brand, dtype: int64

In [44]:
# Brands with at most this many rows are grouped into a single 'uncommon' category below.
threshold = 100

In [45]:
# Brands that occur at most `threshold` times (note the <=, so exactly `threshold` rows
# also counts as rare); these are the labels to be replaced.
repl = counts[counts <= threshold].index
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Land', 'Force', 'Isuzu', 'Kia',
       'Ambassador', 'MG', 'Daewoo', 'Opel', 'Peugeot', 'Ashok'],
      dtype='object')

In [46]:
# Number of rare brands that will be grouped together.
repl.nunique()

20

In [47]:
# Replace the 20 rare brands in `repl` with the single label 'uncommon', then one-hot
# encode: 12 frequent brands + 1 'uncommon' column = 13 columns.
pd.get_dummies(df['brand'].replace(repl, 'uncommon'))

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0


In [48]:
# Random 5-row sample of the same encoding; no random_state is passed, so the rows
# shown differ from run to run.
pd.get_dummies(df['brand'].replace(repl, 'uncommon')).sample(5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
5421,0,0,0,0,1,0,0,0,0,0,0,0,0
789,0,0,0,0,0,0,0,0,0,1,0,0,0
6545,0,0,0,0,1,0,0,0,0,0,0,0,0
5796,0,0,0,0,0,0,0,0,0,0,0,1,0
7140,0,0,0,0,0,0,0,0,0,0,0,0,1
