In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [13]:
# Load the customer data, keeping only the two feature columns and the target.
df = pd.read_csv(
    "customer.csv",
    usecols=['review', 'education', 'purchased'],
)
print(df.shape)
df.head(2)

(50, 3)


Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No


In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='purchased'),   # features: everything except the target
    df['purchased'],                # target
    test_size=0.2,
    random_state=42,
)
print(x_train.shape, x_test.shape)

(40, 2) (10, 2)


In [21]:
# List the distinct categories present in each training feature.
for col in ('review', 'education'):
    print(x_train[col].unique())

['Poor' 'Average' 'Good']
['School' 'UG' 'PG']


In [22]:
# Preview the training features before any encoding is applied.
x_train.head()

Unnamed: 0,review,education
12,Poor,School
4,Average,UG
37,Average,PG
8,Average,UG
3,Good,PG


# Ordinal Encoding

In [23]:
from sklearn.preprocessing import OrdinalEncoder

# `categories` is matched to the input columns by position, so the columns are
# selected explicitly (in the same order) instead of relying on whatever column
# order the loaded frame happens to have.
ordinal_cols = ['review', 'education']
oe = OrdinalEncoder(categories=[['Poor', 'Average', 'Good'],   # review    -> 0, 1, 2
                                ['School', 'UG', 'PG']])       # education -> 0, 1, 2

# Fit on the training split only, then reuse the learned mapping for the test
# split. Note: fit() returns the encoder itself, so fit_transform()/transform()
# must be used to obtain the encoded arrays.
x_train = oe.fit_transform(x_train[ordinal_cols])
x_test = oe.transform(x_test[ordinal_cols])

In [24]:
# Category order held by the fitted encoder, one array per encoded column.
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

In [25]:
# Wrap the encoded array in a DataFrame with readable column names.
x_train = pd.DataFrame(x_train, columns=['Review', 'Education'])
x_train.head()

Unnamed: 0,Review,Education
0,0.0,0.0
1,1.0,1.0
2,1.0,2.0
3,1.0,1.0
4,2.0,2.0


In [26]:
# Preview the target labels before they are label-encoded.
y_train.head()

12     No
4      No
37    Yes
8      No
3      No
Name: purchased, dtype: object

# Label Encoding

In [27]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder is meant for the target (y) only, not for the input features X,
# and is needed only when the target is categorical. It encodes the labels as
# integers between 0 and n_classes-1.
le = LabelEncoder()
le.fit(y_train)

# Apply the mapping learned from the training labels to both splits.
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [28]:
# Classes learned by the label encoder.
le.classes_

array(['No', 'Yes'], dtype=object)

In [29]:
# Wrap the encoded target in a single-column DataFrame for display.
y_train = pd.DataFrame(data=y_train)
y_train.head()

Unnamed: 0,0
0,0
1,0
2,1
3,0
4,0


# One Hot Encoding

In [31]:
# Load the cars dataset used for the one-hot encoding examples.
# Note: this rebinds `df`, replacing the customer data loaded earlier.
df = pd.read_csv(filepath_or_buffer="Cars.csv")
print(df.shape)
df.head()

(8128, 5)


Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [37]:
# Show how often each category occurs in the three categorical columns.
for col in ('fuel', 'owner', 'brand'):
    print(df[col].value_counts().reset_index(), "------------------------", sep="\n")

     fuel  count
0  Diesel   4402
1  Petrol   3631
2     CNG     57
3     LPG     38
------------------------
                  owner  count
0           First Owner   5289
1          Second Owner   2105
2           Third Owner    555
3  Fourth & Above Owner    174
4        Test Drive Car      5
------------------------
            brand  count
0          Maruti   2448
1         Hyundai   1415
2        Mahindra    772
3            Tata    734
4          Toyota    488
5           Honda    467
6            Ford    397
7       Chevrolet    230
8         Renault    228
9      Volkswagen    186
10            BMW    120
11          Skoda    105
12         Nissan     81
13         Jaguar     71
14          Volvo     67
15         Datsun     65
16  Mercedes-Benz     54
17           Fiat     47
18           Audi     40
19          Lexus     34
20           Jeep     31
21     Mitsubishi     14
22          Force      6
23           Land      6
24          Isuzu      5
25            Kia      4
26  

##### 1. OHE using pandas

In [39]:
# Show a single row as a reminder of the column layout before encoding.
df.head(1)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000


In [40]:
# drop_first=True drops the first category of each encoded column, leaving
# k-1 indicator columns for a feature with k categories.
pd.get_dummies(data=df, columns=['fuel', 'owner'], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


##### 2. OHE using sklearn

In [42]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the target.
x_train, x_test, y_train, y_test = train_test_split(
    df[df.columns[:-1]],
    df[df.columns[-1]],
    test_size=0.2,
    random_state=2,
)

print(x_train.shape, x_test.shape)

(6502, 4) (1626, 4)


In [43]:
# Preview the first two training rows of the cars features.
x_train.head(2)

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner


In [44]:
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode; the remaining columns are left untouched.
ohe_cols = ['fuel', 'owner']

# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
# drop='first' removes one indicator per feature; dense int32 output is requested.
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)

# Fit on the training split only, then reuse the learned categories for the test split.
x_train_new = ohe.fit_transform(x_train[ohe_cols])
x_test_new = ohe.transform(x_test[ohe_cols])

In [45]:
# Inspect the one-hot encoded array produced for the training split.
x_train_new

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [46]:
# Place the untouched columns and the one-hot columns side by side in one array.
np.concatenate([x_train[['brand', 'km_driven']].to_numpy(), x_train_new], axis=1)

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

##### OHE with Top Categories

In [57]:
# Brands that occur at most `threshold` times are collected for grouping.
threshold = 100
counts = df['brand'].value_counts()
repl = counts.loc[counts.le(threshold)].index  # .index keeps only the brand names, not the counts
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [59]:
# Collapse the rare brands into a single 'uncommon' bucket, one-hot encode the
# result, and display five randomly chosen rows.
pd.get_dummies(df['brand'].mask(df['brand'].isin(repl), 'uncommon'), dtype='int').sample(5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
6934,0,0,0,0,0,0,0,0,0,1,0,0,0
5529,0,0,0,1,0,0,0,0,0,0,0,0,0
5899,0,0,0,0,1,0,0,0,0,0,0,0,0
1816,0,0,0,0,0,0,0,0,0,0,0,0,1
4675,0,0,0,0,0,0,0,0,0,1,0,0,0
