In [75]:
import pandas as pd
import numpy as np
import sklearn

In [4]:
df = pd.read_csv('cars.csv')

In [5]:
df

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000
...,...,...,...,...,...
8123,Hyundai,110000,Petrol,First Owner,320000
8124,Hyundai,119000,Diesel,Fourth & Above Owner,135000
8125,Maruti,120000,Diesel,First Owner,382000
8126,Tata,25000,Diesel,First Owner,290000


In [10]:
df['brand'].nunique()

32

In [13]:
df['brand'].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

## 1. OneHot Encoding Using Pandas

In [14]:
pd.get_dummies(df, columns = ['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


## 2. N-1 OneHot Encoding (removing multi-collineaity)

In [70]:
pd.get_dummies(df, columns = ['fuel', 'owner'], drop_first = True, dtype=int)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


## 3. OneHot Encoding Using sklearn

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:4], df.iloc[:,-1], test_size = 0.2, random_state = 1)

In [19]:
from sklearn.preprocessing import OneHotEncoder

In [39]:
# class sklearn.preprocessing.OneHotEncoder(*, categories='auto', drop=None, sparse_output=True, dtype=<class 'numpy.float64'>, 
#handle_unknown='error', min_frequency=None, max_categories=None, feature_name_combiner='concat')
ohe = OneHotEncoder(drop = 'first', sparse_output =False, dtype=np.int32)
# droping first column here to remove Multi-collinearity

In [41]:
X_train_new = ohe.fit_transform(X_train[['fuel','owner']]) # since sparse is set to false in above line we dont need to do .toarray()

In [42]:
X_test_new = ohe.transform(X_test[['fuel', 'owner']])

In [43]:
X_train_new.shape

(6502, 7)

In [45]:
np.hstack((X_train[['brand', 'km_driven']].values, X_train_new))

array([['Honda', 100000, 1, ..., 1, 0, 0],
       ['Maruti', 120000, 1, ..., 1, 0, 0],
       ['Renault', 100000, 1, ..., 1, 0, 0],
       ...,
       ['Hyundai', 70000, 0, ..., 0, 0, 0],
       ['Maruti', 110000, 1, ..., 1, 0, 0],
       ['Maruti', 65755, 0, ..., 0, 0, 0]], dtype=object)

## 4. OneHot Encoding with top categories (combining not frequent categories)

In [48]:
counts = df['brand'].value_counts()

In [66]:
df['brand'].nunique()
threshold = 100

In [67]:
repl = counts[counts<=threshold].index

In [72]:
pd.get_dummies(df['brand'].replace(repl, 'uncommon'), dtype=int)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0


## 5. OneHot Encoding with Column Transformer

In [73]:
df1 = pd.read_csv('covid_toy.csv')

In [74]:
df1

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [76]:
from sklearn.model_selection import train_test_split

In [78]:
X_train, X_test, y_train, y_test = train_test_split(df1.drop(columns = ['has_covid']), df1['has_covid'], test_size= 0.2)

In [84]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [87]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(),['fever']),
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tnf3',OneHotEncoder(sparse_output=False,drop='first'),['gender','city'])
],remainder='passthrough')

In [88]:
X_train

Unnamed: 0,age,gender,fever,cough,city
20,12,Male,98.0,Strong,Bangalore
3,31,Female,98.0,Mild,Kolkata
41,82,Male,,Mild,Kolkata
16,69,Female,103.0,Mild,Kolkata
95,12,Female,104.0,Mild,Bangalore
...,...,...,...,...,...
88,5,Female,100.0,Mild,Kolkata
50,19,Male,101.0,Mild,Delhi
18,64,Female,98.0,Mild,Bangalore
87,47,Male,101.0,Strong,Bangalore


In [91]:
transformer.fit_transform(X_train)

array([[ 98.        ,   1.        ,   1.        ,   0.        ,
          0.        ,   0.        ,  12.        ],
       [ 98.        ,   0.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  31.        ],
       [100.83333333,   0.        ,   1.        ,   0.        ,
          1.        ,   0.        ,  82.        ],
       [103.        ,   0.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  69.        ],
       [104.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  12.        ],
       [100.        ,   1.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  47.        ],
       [102.        ,   1.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  24.        ],
       [ 98.        ,   0.        ,   1.        ,   1.        ,
          0.        ,   0.        ,  83.        ],
       [100.        ,   1.        ,   0.        ,   0.        ,
          1.    

In [92]:
X_test

Unnamed: 0,age,gender,fever,cough,city
94,79,Male,,Strong,Kolkata
9,64,Female,101.0,Mild,Delhi
75,5,Male,102.0,Mild,Kolkata
65,69,Female,102.0,Mild,Bangalore
93,27,Male,100.0,Mild,Kolkata
43,22,Female,99.0,Mild,Bangalore
2,42,Male,101.0,Mild,Delhi
79,48,Female,103.0,Mild,Kolkata
12,25,Female,99.0,Strong,Kolkata
35,82,Female,102.0,Strong,Bangalore


In [93]:
transformer.fit_transform(X_test)

array([[100.88888889,   1.        ,   1.        ,   0.        ,
          1.        ,   0.        ,  79.        ],
       [101.        ,   0.        ,   0.        ,   1.        ,
          0.        ,   0.        ,  64.        ],
       [102.        ,   0.        ,   1.        ,   0.        ,
          1.        ,   0.        ,   5.        ],
       [102.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  69.        ],
       [100.        ,   0.        ,   1.        ,   0.        ,
          1.        ,   0.        ,  27.        ],
       [ 99.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  22.        ],
       [101.        ,   0.        ,   1.        ,   1.        ,
          0.        ,   0.        ,  42.        ],
       [103.        ,   0.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  48.        ],
       [ 99.        ,   1.        ,   0.        ,   0.        ,
          1.    