# Encoding (only for catagorical data)
1. Label Encoding or Ordinal Encoding
2. One-Hot Encoding
3. Effect Encoding
4. Binary Encoding
5. Base-N Encoding
6. Hash Encoding

In [1]:
import pandas as pd
import warnings
warnings.simplefilter('ignore')

In [2]:
data = pd.read_csv('no_outliers/data_nooutliers1.csv')
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,...,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,0,0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,...,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,1,1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,...,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,2,2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,...,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,3,3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,...,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,4,4,200004,0,32.0,Self Enquiry,1,8.0,Small Business,Male,...,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [3]:
cat_data=data[[fea for fea in data.columns if data[fea].dtype == 'O']]

In [4]:
cat_data.head()

Unnamed: 0,TypeofContact,Occupation,Gender,ProductPitched,MaritalStatus,Designation
0,Self Enquiry,Salaried,Female,Deluxe,Single,Manager
1,Company Invited,Salaried,Male,Deluxe,Divorced,Manager
2,Self Enquiry,Free Lancer,Male,Basic,Single,Executive
3,Company Invited,Salaried,Female,Basic,Divorced,Executive
4,Self Enquiry,Small Business,Male,Basic,Divorced,Executive


In [5]:
import category_encoders as ce

## Label or ordinal encoding

In [6]:
data.TypeofContact.unique()

array(['Self Enquiry', 'Company Invited'], dtype=object)

In [7]:
enc_toc = ce.OrdinalEncoder(cols=['TypeofContact'],return_df=True,mapping = [{'col':'TypeofContact','mapping':{'Self Enquiry':0,'Company Invited':1}}])

In [8]:
data_enc_toc=enc_toc.fit_transform(cat_data)

In [9]:
data_enc_toc['CustomerID'] = data['CustomerID']

## One Hot encoding

In [10]:
cat_data.columns

Index(['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
       'MaritalStatus', 'Designation'],
      dtype='object')

In [11]:
enc_oh = ce.OneHotEncoder(cols=['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
       'MaritalStatus', 'Designation'],handle_unknown='return_nan',return_df=True,use_cat_names=True)

In [12]:
data_enc_oh = enc_oh.fit_transform(cat_data)

In [13]:
data_enc_oh['CustomerID'] = data['CustomerID']

## Effect encoding

In [14]:
enc_effect = ce.sum_coding.SumEncoder(cols=cat_data.columns,verbose=False)

In [15]:
data_enc_effect = enc_effect.fit_transform(cat_data)

In [16]:
data_enc_effect['CustomerID'] = data['CustomerID']

## Hash encoder

In [17]:
enc_hash = ce.HashingEncoder(cols=cat_data.columns)

In [18]:
data_enc_hash=enc_hash.fit_transform(cat_data)

In [19]:
data_enc_hash['CustomerID'] = data['CustomerID']

## Binary encoder
its useless here though

In [20]:
enc_binary = ce.BinaryEncoder(cols=cat_data.columns)

In [21]:
enc_binary.fit_transform(cat_data)

Unnamed: 0,TypeofContact_0,TypeofContact_1,Occupation_0,Occupation_1,Occupation_2,Gender_0,Gender_1,ProductPitched_0,ProductPitched_1,ProductPitched_2,MaritalStatus_0,MaritalStatus_1,MaritalStatus_2,Designation_0,Designation_1,Designation_2
0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1
1,1,0,0,0,1,1,0,0,0,1,0,1,0,0,0,1
2,0,1,0,1,0,1,0,0,1,0,0,0,1,0,1,0
3,1,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0
4,0,1,0,1,1,1,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4874,0,1,0,1,1,1,0,0,0,1,1,0,0,0,0,1
4875,1,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0
4876,0,1,0,0,1,0,1,0,1,1,0,1,1,0,1,1
4877,0,1,0,1,1,1,0,0,1,0,0,0,1,0,1,0


## Base-N encoding

In [22]:
enc_baseN = ce.BaseNEncoder(cols=cat_data.columns,return_df=True,base=5)

In [23]:
data_enc_baseN=enc_baseN.fit_transform(cat_data)

In [24]:
data_enc_baseN['CustomerID'] = data['CustomerID']

# Turning encoded values to csv

In [25]:
import os
os.makedirs('encoded')

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'encoded'

In [26]:
data_enc_baseN.to_csv('encoded/data_enc_baseN.csv')

In [27]:
data_enc_effect.to_csv('encoded/data_enc_effect.csv')

In [28]:
data_enc_hash.to_csv('encoded/data_enc_hash.csv')

In [29]:
data_enc_oh.to_csv('encoded/data_enc_oh.csv')

In [30]:
data_enc_toc.to_csv('encoded/data_enc_toc.csv')