# Encoding

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Load the autos dataset (path is relative to the kernel's working directory)
# and preview the first five records, transposed so that every column of the
# file is listed as a row.
DATA_PATH = 'autos_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head(5).T

Unnamed: 0,0,1,2,3,4
symboling,3,3,1,2,2
normalized-losses,?,?,?,164,164
make,alfa-romero,alfa-romero,alfa-romero,audi,audi
fuel-type,gas,gas,gas,gas,gas
aspiration,std,std,std,std,std
num-of-doors,two,two,two,four,four
body-style,convertible,convertible,hatchback,sedan,sedan
drive-wheels,rwd,rwd,rwd,fwd,4wd
engine-location,front,front,front,front,front
wheel-base,88.6,88.6,94.5,99.8,99.4


In [None]:
# Frequency of each fuel type.
df.loc[:, 'fuel-type'].value_counts()

gas       185
diesel     20
Name: fuel-type, dtype: int64

In [None]:
# Frequency of each aspiration type.
df.loc[:, 'aspiration'].value_counts()

std      168
turbo     37
Name: aspiration, dtype: int64

In [None]:
# Frequency of each cylinder count; note the counts are stored as words.
df.loc[:, 'num-of-cylinders'].value_counts()

four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: num-of-cylinders, dtype: int64

In [None]:
# Translate the spelled-out cylinder counts into integers.
#
# The result is kept in a separate Series instead of being written back with
# `inplace=True`:
#   * calling an in-place method on `df[...]` is chained assignment, which
#     raises a FutureWarning in pandas 2.x and silently does nothing once
#     Copy-on-Write is enabled;
#   * the LabelEncoder cells further down still operate on the original word
#     labels in df['num-of-cylinders'], so the source column must stay intact.
#
# Series.map yields NaN for any label missing from the mapping; every label
# listed by value_counts() above is covered.
CYLINDER_WORD_TO_INT = {
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'eight': 8,
    'twelve': 12,
}
num_cylinders = df['num-of-cylinders'].map(CYLINDER_WORD_TO_INT)

In [None]:
# Frequency of each fuel-system type.
df.loc[:, 'fuel-system'].value_counts()

mpfi    94
2bbl    66
idi     20
1bbl    11
spdi     9
4bbl     3
mfi      1
spfi     1
Name: fuel-system, dtype: int64

In [None]:
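# Ordinal encoding idea: every category is replaced by an integer that
# preserves its rank, for example:
#
#   high   -> 2
#   medium -> 1
#   low    -> 0
#   high   -> 2
#   medium -> 1
#   medium -> 1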

# 1. Label Encoding

In [None]:
# Small toy frame with a single ordered categorical column, used to
# illustrate the encoders below.
levels = ['high', 'medium', 'low', 'high', 'medium', 'high', 'low']
df_test = pd.DataFrame({'Test': levels})
df_test

Unnamed: 0,Test
0,high
1,medium
2,low
3,high
4,medium
5,high
6,low


In [None]:
# LabelEncoder numbers the classes in sorted (alphabetical) order, so here
# high -> 0, low -> 1, medium -> 2: the natural low < medium < high ranking
# is not preserved.
label_enc = LabelEncoder()
label_enc.fit(df_test['Test'])
label_enc.transform(df_test['Test'])


array([0, 2, 1, 0, 2, 0, 1])

In [None]:
# Manual ordinal encoding that respects the real ranking low < medium < high.
#
# The encoded column is assigned back explicitly: `df_test['Test'].replace(...,
# inplace=True)` is chained assignment, which raises a FutureWarning in
# pandas 2.x and leaves df_test unchanged once Copy-on-Write is enabled.
# astype('int64') makes the integer dtype explicit instead of relying on
# replace() to downcast the object column implicitly.
# Re-running the cell is harmless: the integers are not keys of the mapping.
RISK_ORDER = {'low': 0, 'medium': 1, 'high': 2}
df_test['Test'] = df_test['Test'].replace(RISK_ORDER).astype('int64')
df_test['Test']

0    2
1    1
2    0
3    2
4    1
5    2
6    0
Name: Test, dtype: int64

In [None]:
# Recap of the cylinder labels before they are label-encoded.
df.loc[:, 'num-of-cylinders'].value_counts()

four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: num-of-cylinders, dtype: int64

In [None]:
# Fit a fresh encoder on the cylinder labels and show the integer code
# assigned to every row.
label_enc = LabelEncoder()
label_enc.fit(df['num-of-cylinders'])
label_enc.transform(df['num-of-cylinders'])

array([2, 2, 3, 2, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 3, 3, 5, 2, 2, 2, 2, 2, 6, 6, 6, 6, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 0, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 3, 3, 2])

In [None]:
# Distinct cylinder labels in alphabetical order; the position of a label in
# this list is the code LabelEncoder assigned to it in the previous cell.
sorted(df['num-of-cylinders'].drop_duplicates())

['eight', 'five', 'four', 'six', 'three', 'twelve', 'two']

In [None]:
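# LabelEncoder numbers the labels in alphabetical order (see the sorted list
# above), so the resulting codes say nothing about the real cylinder count:
#
#   four        2
#   six         3
#   five        1
#   eight       0
#   two         6
#   three       4
#   twelve      5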

# 2. One Hot Encoding

In [None]:
# Last five rows of the column that is one-hot encoded next (kept as a
# one-column DataFrame, which is the 2-D shape the encoder is given below).
df.loc[:, ['aspiration']].tail(5)

Unnamed: 0,aspiration
200,std
201,turbo
202,std
203,turbo
204,turbo


In [None]:
# One-hot encode 'aspiration' with scikit-learn: one indicator column per
# category. The encoder result is converted with .toarray() so it can be
# wrapped in a DataFrame.
#
# The resulting columns are positional (0, 1) because the array carries no
# labels; comparing with the tail of df[['aspiration']] above shows that
# column 0 marks 'std' and column 1 marks 'turbo'.
#
# A stray '.' line that followed the last statement was removed: it made the
# whole cell a SyntaxError.
one_hot_enc = OneHotEncoder()
array = one_hot_enc.fit_transform(df[['aspiration']]).toarray()
df_asp = pd.DataFrame(array, dtype=int)
df_asp.tail()

Unnamed: 0,0,1
200,1,0
201,0,1
202,1,0
203,0,1
204,0,1


In [None]:
# Same encoding done with pandas. drop_first=True drops the first dummy level,
# so 'aspiration' is replaced by the single column 'aspiration_turbo'
# (see the column listing below).
#
# dtype=int pins the dummy column to 0/1 integers: pandas >= 2.0 would
# otherwise produce booleans, unlike the integer frame built from the
# OneHotEncoder result above.
df_asp1 = pd.get_dummies(
    df,
    columns=['aspiration'],
    drop_first=True,
    dtype=int,
)
df_asp1.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'num-of-doors',
       'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length',
       'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders',
       'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio',
       'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
       'aspiration_turbo'],
      dtype='object')

In [None]:
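# One-hot encoding idea: one indicator column per category, holding 1 in the
# column that matches the row's value and 0 everywhere else.
#
#              high    medium    low
# high           1       0        0
# medium         0       1        0
# low            0       0        1
# high           1       0        0
# medium         0       1        0
# medium         0       1        0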

In [None]:
# Rebuild the toy frame with its original string labels for the one-hot
# encoding examples.
levels = ['high', 'medium', 'low', 'high', 'medium', 'high', 'low']
df_test = pd.DataFrame({'Test': levels})
df_test

Unnamed: 0,Test
0,high
1,medium
2,low
3,high
4,medium
5,high
6,low


In [None]:
# One indicator column per category, named '<column>_<category>'.
#
# dtype=int keeps the indicators as 0/1 integers (as in the table below) on
# every pandas version; since pandas 2.0 the default dummy dtype is bool,
# which would display True/False instead.
df_test_one = pd.get_dummies(
    df_test,
    columns=['Test'],
    dtype=int,
)
df_test_one

Unnamed: 0,Test_high,Test_low,Test_medium
0,1,0,0
1,0,0,1
2,0,1,0
3,1,0,0
4,0,0,1
5,1,0,0
6,0,1,0


In [None]:
# drop_first=True removes the first dummy level ('high'), which is then
# represented by a row of all zeros; prefix='Risk' replaces the column name in
# the generated headers.
#
# dtype=int keeps the indicators as 0/1 integers (as in the table below) on
# every pandas version; since pandas 2.0 the default dummy dtype is bool,
# which would display True/False instead.
df_test_one = pd.get_dummies(
    df_test,
    columns=['Test'],
    prefix='Risk',
    drop_first=True,
    dtype=int,
)
df_test_one

Unnamed: 0,Risk_low,Risk_medium
0,0,0
1,0,1
2,1,0
3,0,0
4,0,1
5,0,0
6,1,0


In [None]:
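# High-cardinality caveat: a column with 500 unique values (e.g. locations)
# needs 500 distinct codes, numbered 0 through 499 (and would need 500
# separate indicator columns if it were one-hot encoded instead):
#
# 0
# 1
# 2
# 3
# .
# .
# 499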