In [153]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.model_selection import train_test_split

# Ordinal Encoding

In [154]:
df = pd.read_csv("customer.csv")

In [155]:
df.head(5)

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [156]:
df['review'].unique()

array(['Average', 'Poor', 'Good'], dtype=object)

In [157]:
df['education'].unique()

array(['School', 'UG', 'PG'], dtype=object)

In [158]:
X = df[['review','education']] # giving data to iput column

In [159]:
y = df['purchased']

In [160]:
# before apply any encoding we have perform train test split

In [161]:
X_train , X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [162]:
# we can see that review and education columns are ordinal data so apply ordinal encoding
# remeber pass categories in lower to higher order 

In [163]:
oe = OrdinalEncoder(categories = [['Average', 'Poor', 'Good'],['School', 'UG', 'PG']])

In [164]:
# then fit the our input columns

In [165]:
oe.fit(X_train)

In [166]:
# then transform the X_train

In [167]:
X_train = oe.transform(X_train)

In [168]:
X_train = pd.DataFrame(X_train) # by default this sklearn libries give output in array form 

In [169]:
X_train.head()

Unnamed: 0,0,1
0,1.0,0.0
1,2.0,0.0
2,0.0,1.0
3,1.0,2.0
4,2.0,0.0


# label Encoding

In [170]:
# now for y_train , it is a label data means our target varaiable
# so apply label encoding on it 

In [171]:
le = LabelEncoder()

In [172]:
le.fit(y_train)

In [173]:
le.transform(y_train)

array([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1])

# One Hot Encoding

In [174]:
df = pd.read_csv("cars.csv")

In [175]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


**Using Pandas**

In [176]:
# Before any Encoding We have to perform train test split

In [177]:
df = pd.read_csv("cars.csv")

In [203]:
X_train_dummie = pd.get_dummies(df.iloc[:,1:],dtype='int',drop_first = True) # get_dummies used for ohe,drop_first is used for multicolinarity

In [204]:
X_train_dummie

Unnamed: 0,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,145500,450000,1,0,0,0,0,0,0
1,120000,370000,1,0,0,0,1,0,0
2,140000,158000,0,0,1,0,0,0,1
3,127000,225000,1,0,0,0,0,0,0
4,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
8123,110000,320000,0,0,1,0,0,0,0
8124,119000,135000,1,0,0,1,0,0,0
8125,120000,382000,1,0,0,0,0,0,0
8126,25000,290000,1,0,0,0,0,0,0


**using Sklearn OneHotEncoder**

In [180]:
from sklearn.preprocessing import OneHotEncoder

In [189]:
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,:4],df.iloc[:,-1],test_size=0.2)

In [190]:
ohe = OneHotEncoder(sparse_output = False,dtype='int',drop='first') 
# sparse_output by default is set true 
# dtype by fefault float
# drop = 'first' used to avoid multicolinearity problem

In [191]:
ohe.fit(X_train[['fuel','owner']])
#first fit the trainnig data

In [192]:
X_train_ohe = ohe.transform(X_train[['fuel','owner']])
# then transform
X_train_ohe

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [193]:
X_test_ohe = ohe.transform(X_test[['fuel','owner']])
X_test_ohe

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [194]:
X_train.shape

(6502, 4)

In [195]:
import numpy as np

In [198]:
X_train_final = np.hstack((X_train[['brand','km_driven']],X_train_ohe))
X_train_final

array([['Honda', 70000, 1, ..., 0, 0, 0],
       ['Mahindra', 120000, 1, ..., 0, 0, 0],
       ['Skoda', 11000, 0, ..., 0, 0, 0],
       ...,
       ['Toyota', 18890, 1, ..., 0, 0, 0],
       ['Maruti', 110000, 1, ..., 0, 0, 0],
       ['Hyundai', 9400, 0, ..., 0, 0, 0]], dtype=object)

**OneHotEncoding with Top Categories**

In [208]:
counts = df['brand'].value_counts()

In [209]:
counts

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [216]:
repl = counts[counts <= 100].index # get brand which are less occuring

In [218]:
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [219]:
pd.get_dummies(df['brand'].replace(repl,'unknown'),dtype='int')

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,unknown
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0
