In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import category_encoders as ce
from sklearn.preprocessing import *

# 1. Reading Data

In [3]:
train_df = pd.read_csv('./dataset/train.csv', index_col='id')
test_df = pd.read_csv('./dataset/test.csv', index_col='id')
submission_df = pd.read_csv('./dataset/sample_submission.csv', index_col='id')

In [4]:
train_df.head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 0 to 299999
Data columns (total 24 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   bin_0   300000 non-null  int64 
 1   bin_1   300000 non-null  int64 
 2   bin_2   300000 non-null  int64 
 3   bin_3   300000 non-null  object
 4   bin_4   300000 non-null  object
 5   nom_0   300000 non-null  object
 6   nom_1   300000 non-null  object
 7   nom_2   300000 non-null  object
 8   nom_3   300000 non-null  object
 9   nom_4   300000 non-null  object
 10  nom_5   300000 non-null  object
 11  nom_6   300000 non-null  object
 12  nom_7   300000 non-null  object
 13  nom_8   300000 non-null  object
 14  nom_9   300000 non-null  object
 15  ord_0   300000 non-null  int64 
 16  ord_1   300000 non-null  object
 17  ord_2   300000 non-null  object
 18  ord_3   300000 non-null  object
 19  ord_4   300000 non-null  object
 20  ord_5   300000 non-null  object
 21  day     300000 non-null  int64 
 

In [6]:
for i in train_df.columns.values:
    print(i, 'has', train_df[i].value_counts().shape[0], 'unique values')

bin_0 has 2 unique values
bin_1 has 2 unique values
bin_2 has 2 unique values
bin_3 has 2 unique values
bin_4 has 2 unique values
nom_0 has 3 unique values
nom_1 has 6 unique values
nom_2 has 6 unique values
nom_3 has 6 unique values
nom_4 has 4 unique values
nom_5 has 222 unique values
nom_6 has 522 unique values
nom_7 has 1220 unique values
nom_8 has 2215 unique values
nom_9 has 11981 unique values
ord_0 has 3 unique values
ord_1 has 5 unique values
ord_2 has 6 unique values
ord_3 has 15 unique values
ord_4 has 26 unique values
ord_5 has 192 unique values
day has 7 unique values
month has 12 unique values
target has 2 unique values


In [7]:
# Checking if dataset is unbalanced
train_df['target'].value_counts()

0    208236
1     91764
Name: target, dtype: int64

# 2. Data Preprocessing

In [8]:
def handle_binary_values(df):
    df['bin_3'] = df['bin_3'].map({'T':1, 'F':0})
    df['bin_4'] = df['bin_4'].map({'Y':1, 'N':0})

In [9]:
def handle_nominal_values():
    pass

In [10]:
def handle_ordinal_values():
    pass

In [11]:
train_df['nom_0'].value_counts()

Green    127341
Blue      96166
Red       76493
Name: nom_0, dtype: int64

In [12]:
train_df['nom_1'].value_counts()

Trapezoid    101181
Square        49597
Star          45904
Circle        37320
Polygon       36143
Triangle      29855
Name: nom_1, dtype: int64

In [13]:
train_df['nom_2'].value_counts()

Lion       101295
Cat         49659
Snake       45979
Dog         37444
Axolotl     36136
Hamster     29487
Name: nom_2, dtype: int64

# 3. Classic Encoding
## 3.1. Label Encoding

In [14]:
encoder = LabelEncoder()
print(
    pd.DataFrame({'OG': train_df['nom_5'], 'Transformed': encoder.fit_transform(train_df['nom_0'])})\
    .reset_index().drop('id', axis=1).head(5)
)

          OG  Transformed
0  50f116bcf            1
1  b3b4d25d0            1
2  3263bdce5            0
3  f12246592            2
4  5b0f5acd5            2


## 3.2. Ordinal Encoding

In [15]:
encoder = ce.OrdinalEncoder(cols=['nom_5'])
encoder.fit_transform(train_df[['nom_5']], train_df['target']).reset_index(drop=True).head(5)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,nom_5
0,1
1,2
2,3
3,4
4,5


## 3.3. One-hot Encoding

In [16]:
encoder = ce.OneHotEncoder()
encoder.fit_transform(train_df['nom_5'], train_df['target']).reset_index(drop=True).head(5)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,nom_5_1,nom_5_2,nom_5_3,nom_5_4,nom_5_5,nom_5_6,nom_5_7,nom_5_8,nom_5_9,nom_5_10,...,nom_5_213,nom_5_214,nom_5_215,nom_5_216,nom_5_217,nom_5_218,nom_5_219,nom_5_220,nom_5_221,nom_5_222
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 3.4. Binary Encoding

In [17]:
encoder = ce.BinaryEncoder()
encoder.fit_transform(train_df['nom_5'], train_df['target']).reset_index(drop=True)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,nom_5_0,nom_5_1,nom_5_2,nom_5_3,nom_5_4,nom_5_5,nom_5_6,nom_5_7,nom_5_8
0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,1,1
3,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...
299995,0,1,0,0,0,1,1,1,0
299996,0,0,0,1,1,1,1,0,1
299997,0,0,0,1,0,1,0,1,0
299998,0,0,0,0,0,1,1,1,0


## 3.5. Base N Encoder

In [18]:
encoder = ce.BaseNEncoder(base=10)
encoder.fit_transform(train_df['nom_5'], train_df['target']).reset_index(drop=True).head(5)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,nom_5_0,nom_5_1,nom_5_2,nom_5_3
0,0,0,0,1
1,0,0,0,2
2,0,0,0,3
3,0,0,0,4
4,0,0,0,5


## 3.6. Hashing  Encoder

In [19]:
encoder = ce.HashingEncoder()
encoder.fit_transform(train_df['nom_5'], train_df['target']).reset_index(drop=True).head(5)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,0,0,0,0,0,1,0,0
1,0,1,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0


## 3.7. Frequency Encoding

In [26]:
encoder = ce.CountEncoder()
encoder.fit_transform(train_df['nom_5'], train_df['target']).reset_index(drop=True)

Unnamed: 0,nom_5
0,2594
1,792
2,2524
3,975
4,2010
...,...
299995,2045
299996,2729
299997,2349
299998,2014


# 4. Contrast Encoding

## 4.1. Helmert Encoding

In [23]:
encoder = ce.HelmertEncoder()
encoder.fit_transform(train_df['nom_5'], train_df['target']).reset_index(drop=True).head(5)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,intercept,nom_5_0,nom_5_1,nom_5_2,nom_5_3,nom_5_4,nom_5_5,nom_5_6,nom_5_7,nom_5_8,...,nom_5_211,nom_5_212,nom_5_213,nom_5_214,nom_5_215,nom_5_216,nom_5_217,nom_5_218,nom_5_219,nom_5_220
0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,1,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,1,0.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,1,0.0,0.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,1,0.0,0.0,0.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


## 4.2. Sum Coding

In [27]:
encoder = ce.SumEncoder()
encoder.fit_transform(train_df['nom_5'], train_df['target']).reset_index(drop=True).head(5)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,intercept,nom_5_0,nom_5_1,nom_5_2,nom_5_3,nom_5_4,nom_5_5,nom_5_6,nom_5_7,nom_5_8,...,nom_5_211,nom_5_212,nom_5_213,nom_5_214,nom_5_215,nom_5_216,nom_5_217,nom_5_218,nom_5_219,nom_5_220
0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4.3. Backward Difference 

In [41]:
encoder = ce.BackwardDifferenceEncoder()
encoder.fit_transform(train_df['nom_5'], train_df['target']).reset_index(drop=True).head(5)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,intercept,nom_5_0,nom_5_1,nom_5_2,nom_5_3,nom_5_4,nom_5_5,nom_5_6,nom_5_7,nom_5_8,...,nom_5_211,nom_5_212,nom_5_213,nom_5_214,nom_5_215,nom_5_216,nom_5_217,nom_5_218,nom_5_219,nom_5_220
0,1,-0.995495,-0.990991,-0.986486,-0.981982,-0.977477,-0.972973,-0.968468,-0.963964,-0.959459,...,-0.045045,-0.040541,-0.036036,-0.031532,-0.027027,-0.022523,-0.018018,-0.013514,-0.009009,-0.004505
1,1,0.004505,-0.990991,-0.986486,-0.981982,-0.977477,-0.972973,-0.968468,-0.963964,-0.959459,...,-0.045045,-0.040541,-0.036036,-0.031532,-0.027027,-0.022523,-0.018018,-0.013514,-0.009009,-0.004505
2,1,0.004505,0.009009,-0.986486,-0.981982,-0.977477,-0.972973,-0.968468,-0.963964,-0.959459,...,-0.045045,-0.040541,-0.036036,-0.031532,-0.027027,-0.022523,-0.018018,-0.013514,-0.009009,-0.004505
3,1,0.004505,0.009009,0.013514,-0.981982,-0.977477,-0.972973,-0.968468,-0.963964,-0.959459,...,-0.045045,-0.040541,-0.036036,-0.031532,-0.027027,-0.022523,-0.018018,-0.013514,-0.009009,-0.004505
4,1,0.004505,0.009009,0.013514,0.018018,-0.977477,-0.972973,-0.968468,-0.963964,-0.959459,...,-0.045045,-0.040541,-0.036036,-0.031532,-0.027027,-0.022523,-0.018018,-0.013514,-0.009009,-0.004505


## 4.4. Polynomial Encoding

In [48]:
encoder = ce.PolynomialEncoder()
encoder.fit_transform(train_df['nom_5'], train_df['target']).reset_index(drop=True).head(5)

  elif pd.api.types.is_categorical(cols):
  raw_poly = scores.reshape((-1, 1)) ** np.arange(n).reshape((1, -1))


Unnamed: 0,intercept,nom_5_0,nom_5_1,nom_5_2,nom_5_3,nom_5_4,nom_5_5,nom_5_6,nom_5_7,nom_5_8,...,nom_5_211,nom_5_212,nom_5_213,nom_5_214,nom_5_215,nom_5_216,nom_5_217,nom_5_218,nom_5_219,nom_5_220
0,1,,,,,,,,,,...,,,,,,,,,,
1,1,,,,,,,,,,...,,,,,,,,,,
2,1,,,,,,,,,,...,,,,,,,,,,
3,1,,,,,,,,,,...,,,,,,,,,,
4,1,,,,,,,,,,...,,,,,,,,,,


# 5. Bayesian Encoding

## 5.1. Target

In [49]:
encoder = ce.TargetEncoder()
encoder.fit_transform(train_df['nom_5'], train_df['target']).reset_index(drop=True).head(5)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,nom_5
0,0.358134
1,0.388889
2,0.274564
3,0.234872
4,0.312438


## 5.2. LeaveOneOut

In [50]:
encoder = ce.LeaveOneOutEncoder()
encoder.fit_transform(train_df['nom_5'], train_df['target']).reset_index(drop=True).head(5)

Unnamed: 0,nom_5
0,0.358272
1,0.389381
2,0.274673
3,0.234086
4,0.312593


## 5.3. WeightOfEvidence

In [51]:
encoder = ce.WOEEncoder()
encoder.fit_transform(train_df['nom_5'], train_df['target']).reset_index(drop=True).head(5)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,nom_5
0,0.236444
1,0.368632
2,-0.151252
3,-0.358546
4,0.031561


In [52]:
train_df.sample(3)

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
290581,0,0,0,F,N,Red,Trapezoid,Lion,Russia,Bassoon,...,3c6b8da7a,2,Grandmaster,Freezing,h,S,cp,1,1,0
227277,0,0,0,F,Y,Green,Triangle,Snake,Costa Rica,Piano,...,dbc8fb63f,2,Grandmaster,Boiling Hot,j,I,sD,3,1,1
85892,0,0,0,F,N,Blue,Trapezoid,Hamster,Canada,Bassoon,...,f88695eff,1,Novice,Boiling Hot,h,Z,FI,5,9,0
