# Recap:

1. Feature Scaling  
    a. MinMax Scaling  
    b. Robust Scaling  
    c. StandardScaling  

2. Encoding:  
    a. Label Encoding - applied on ordinal variable  
    b. One Hot Encoding - applied on nominal variable  
    c. Ordinal Encoding - applied on ordinal variable  

# Agenda:

1. Complete Feature Encoding
2. Apply both Feature Encoding and Feature Scaling on a dataset
3. Linear Regression

## Load the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Loading the data

In [60]:
# Read the small temperature/colour example into `data`.
# The bare name on the last line makes the notebook display the frame.
data = pd.read_csv('temperature_data.csv')
data

Unnamed: 0,Temperature,Color,Target
0,Hot,Red,1
1,Cold,Yellow,1
2,Very Hot,Blue,1
3,Warm,Blue,0
4,Hot,Red,1
5,Warm,Yellow,0
6,Warm,Red,1
7,Hot,Yellow,0
8,Hot,Yellow,1
9,Cold,Yellow,1


## Note:

1. Temperature is an ordinal variable - we apply label encoding and ordinal encoding on this variable
2. Color is a nominal variable - we apply one hot encoding on this variable

## Label Encoding on Temperature column

In [3]:
# Create the LabelEncoder that the next cell applies to the Temperature column.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le

In [4]:
# Fit the encoder on the Temperature strings and overwrite the column with
# the resulting integer codes.
# NOTE: as the output below shows, the codes follow alphabetical label order
# (Cold=0, Hot=1, Very Hot=2, Warm=3), not the cold-to-hot order.
data['Temperature'] = le.fit_transform(data['Temperature'])
data

Unnamed: 0,Temperature,Color,Target
0,1,Red,1
1,0,Yellow,1
2,2,Blue,1
3,3,Blue,0
4,1,Red,1
5,3,Yellow,0
6,3,Red,1
7,1,Yellow,0
8,1,Yellow,1
9,0,Yellow,1


## Ordinal Encoding on Temperature Column

In [6]:
# Reload the raw CSV: the label-encoding cell above overwrote 'Temperature'
# with integer codes, and this section starts again from the original strings.
data = pd.read_csv('temperature_data.csv')
data

Unnamed: 0,Temperature,Color,Target
0,Hot,Red,1
1,Cold,Yellow,1
2,Very Hot,Blue,1
3,Warm,Blue,0
4,Hot,Red,1
5,Warm,Yellow,0
6,Warm,Red,1
7,Hot,Yellow,0
8,Hot,Yellow,1
9,Cold,Yellow,1


In [7]:
# Create an OrdinalEncoder with default settings (no explicit category order
# is passed here).
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()
oe

In [9]:
# The double brackets pass Temperature as a one-column DataFrame rather than
# a Series. The column is overwritten with the encoder's float codes.
# NOTE: the output below shows the same alphabetical assignment as the label
# encoder (Cold=0.0, Hot=1.0, Very Hot=2.0, Warm=3.0).
data['Temperature'] = oe.fit_transform(data[['Temperature']])
data

Unnamed: 0,Temperature,Color,Target
0,1.0,Red,1
1,0.0,Yellow,1
2,2.0,Blue,1
3,3.0,Blue,0
4,1.0,Red,1
5,3.0,Yellow,0
6,3.0,Red,1
7,1.0,Yellow,0
8,1.0,Yellow,1
9,0.0,Yellow,1


## Manual Encoding the Temperature Variable

In [47]:
# Reload the raw CSV: the ordinal-encoding cell above overwrote 'Temperature'
# with float codes, and the manual mapping below needs the original strings.
data = pd.read_csv('temperature_data.csv')
data

Unnamed: 0,Temperature,Color,Target
0,Hot,Red,1
1,Cold,Yellow,1
2,Very Hot,Blue,1
3,Warm,Blue,0
4,Hot,Red,1
5,Warm,Yellow,0
6,Warm,Red,1
7,Hot,Yellow,0
8,Hot,Yellow,1
9,Cold,Yellow,1


In [48]:
# List the distinct Temperature labels so each one can be given a code by hand.
data['Temperature'].unique()

array(['Hot', 'Cold', 'Very Hot', 'Warm'], dtype=object)

In [49]:
# Hand-written rank for every temperature label, from coldest (0) to hottest (3).
levels_cold_to_hot = ['Cold', 'Warm', 'Hot', 'Very Hot']
dic = {level: rank for rank, level in enumerate(levels_cold_to_hot)}
dic

{'Cold': 0, 'Warm': 1, 'Hot': 2, 'Very Hot': 3}

In [50]:
# Translate every Temperature label through `dic`.
# Series.map builds a new integer column directly, so it does not depend on
# replace()'s implicit object-to-int downcasting. A label that is missing
# from `dic` becomes NaN instead of silently staying as text.
data['Temperature'] = data['Temperature'].map(dic)
data

Unnamed: 0,Temperature,Color,Target
0,2,Red,1
1,0,Yellow,1
2,3,Blue,1
3,1,Blue,0
4,2,Red,1
5,1,Yellow,0
6,1,Red,1
7,2,Yellow,0
8,2,Yellow,1
9,0,Yellow,1


## Nominal Data Encoding

## One Hot Encoding on Color Variable

1. Import the OneHotEncoder class from the sklearn library
2. Apply OneHotEncoder on the nominal variable
3. Store the output of ohe in data_ohe
4. Concatenate the original data and data_ohe
5. Drop the nominal variable from the data

In [30]:
# Show the frame before one-hot encoding: Temperature is already numeric,
# Color is still text.
data

Unnamed: 0,Temperature,Color,Target
0,2,Red,1
1,0,Yellow,1
2,3,Blue,1
3,1,Blue,0
4,2,Red,1
5,1,Yellow,0
6,1,Red,1
7,2,Yellow,0
8,2,Yellow,1
9,0,Yellow,1


In [31]:
# Create a OneHotEncoder with default settings for the Color column.
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe

In [32]:
# The double brackets pass Color as a one-column DataFrame.
# The result is a 10x3 sparse matrix (see the output below), not a DataFrame.
data_ohe = ohe.fit_transform(data[['Color']])
data_ohe

<10x3 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [33]:
# Convert the sparse result to a dense array so the 0/1 pattern can be inspected.
data_ohe.toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [35]:
# Wrap the dense one-hot array in a DataFrame. The column names are read from
# the fitted encoder (ohe.categories_[0] holds the Color labels in the same
# order as the output columns) instead of being typed by hand, so names and
# columns cannot drift apart if the data changes.
data_ohe = pd.DataFrame(data_ohe.toarray(), columns=ohe.categories_[0])
data_ohe

Unnamed: 0,Blue,Red,Yellow
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0
5,0.0,0.0,1.0
6,0.0,1.0,0.0
7,0.0,0.0,1.0
8,0.0,0.0,1.0
9,0.0,0.0,1.0


In [36]:
# Show the original frame again before the one-hot columns are attached.
data

Unnamed: 0,Temperature,Color,Target
0,2,Red,1
1,0,Yellow,1
2,3,Blue,1
3,1,Blue,0
4,2,Red,1
5,1,Yellow,0
6,1,Red,1
7,2,Yellow,0
8,2,Yellow,1
9,0,Yellow,1


In [37]:
# axis=1 places the one-hot columns to the right of the existing columns.
data = pd.concat([data, data_ohe], axis = 1)
data

Unnamed: 0,Temperature,Color,Target,Blue,Red,Yellow
0,2,Red,1,0.0,1.0,0.0
1,0,Yellow,1,0.0,0.0,1.0
2,3,Blue,1,1.0,0.0,0.0
3,1,Blue,0,1.0,0.0,0.0
4,2,Red,1,0.0,1.0,0.0
5,1,Yellow,0,0.0,0.0,1.0
6,1,Red,1,0.0,1.0,0.0
7,2,Yellow,0,0.0,0.0,1.0
8,2,Yellow,1,0.0,0.0,1.0
9,0,Yellow,1,0.0,0.0,1.0


In [38]:
# Color is now represented by the Blue/Red/Yellow columns, so drop the text column.
data = data.drop('Color', axis = 1)
data

Unnamed: 0,Temperature,Target,Blue,Red,Yellow
0,2,1,0.0,1.0,0.0
1,0,1,0.0,0.0,1.0
2,3,1,1.0,0.0,0.0
3,1,0,1.0,0.0,0.0
4,2,1,0.0,1.0,0.0
5,1,0,0.0,0.0,1.0
6,1,1,0.0,1.0,0.0
7,2,0,0.0,0.0,1.0
8,2,1,0.0,0.0,1.0
9,0,1,0.0,0.0,1.0


## One Hot Encoding on Color variable using pandas pd.get_dummies()

In [51]:
# Rebuild the frame this section starts from. The previous section dropped
# 'Color', but the output below shows it present with 'Temperature' already
# encoded by hand, so reload the CSV and re-apply the manual mapping `dic`.
data = pd.read_csv('temperature_data.csv')
data['Temperature'] = data['Temperature'].map(dic)
data

Unnamed: 0,Temperature,Color,Target
0,2,Red,1
1,0,Yellow,1
2,3,Blue,1
3,1,Blue,0
4,2,Red,1
5,1,Yellow,0
6,1,Red,1
7,2,Yellow,0
8,2,Yellow,1
9,0,Yellow,1


In [61]:
# One indicator column per Color value.
# dtype=int keeps the indicators as 0/1 integers, matching the output shown
# below; newer pandas versions default to True/False booleans.
data_ohe = pd.get_dummies(data['Color'], dtype=int)
data_ohe

Unnamed: 0,Blue,Red,Yellow
0,0,1,0
1,0,0,1
2,1,0,0
3,1,0,0
4,0,1,0
5,0,0,1
6,0,1,0
7,0,0,1
8,0,0,1
9,0,0,1


In [54]:
# axis=1 places the dummy columns to the right of the existing columns.
data = pd.concat([data, data_ohe], axis = 1)
data

Unnamed: 0,Temperature,Color,Target,Blue,Red,Yellow
0,2,Red,1,0,1,0
1,0,Yellow,1,0,0,1
2,3,Blue,1,1,0,0
3,1,Blue,0,1,0,0
4,2,Red,1,0,1,0
5,1,Yellow,0,0,0,1
6,1,Red,1,0,1,0
7,2,Yellow,0,0,0,1
8,2,Yellow,1,0,0,1
9,0,Yellow,1,0,0,1


In [55]:
# Color is now represented by the Blue/Red/Yellow columns, so drop the text column.
data = data.drop('Color', axis = 1)
data

Unnamed: 0,Temperature,Target,Blue,Red,Yellow
0,2,1,0,1,0
1,0,1,0,0,1
2,3,1,1,0,0
3,1,0,1,0,0
4,2,1,0,1,0
5,1,0,0,0,1
6,1,1,0,1,0
7,2,0,0,0,1
8,2,1,0,0,1
9,0,1,0,0,1


## Importing the Kaggle data

In [74]:
# The file is tab-separated, hence sep='\t'. This rebinds `data` to the new dataset.
# head() displays only the first five rows.
data = pd.read_csv('marketing_campaign.csv', sep = '\t')
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


In [75]:
# Print each column's dtype and non-null count (Income has fewer non-null
# values than the other columns in the output below).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

## Since Education is an ordinal variable, apply Label Encoding on the Education variable

In [76]:
# Create a fresh LabelEncoder for the Education column (rebinds `le`).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le

In [77]:
# Overwrite the Education strings with the encoder's integer codes.
# NOTE(review): in the Temperature example the codes followed alphabetical
# label order rather than the natural ranking; presumably the same happens
# here, so confirm the codes reflect education level before relying on them.
data['Education'] = le.fit_transform(data['Education'])
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,2,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,2,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,2,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,2,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,4,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


## The Marital_Status variable is nominal, so apply one hot encoding on Marital_Status

In [78]:
# One indicator column per Marital_Status value.
# dtype=int keeps the indicators as 0/1 integers, matching the output shown
# below; newer pandas versions default to True/False booleans.
data_ohe = pd.get_dummies(data['Marital_Status'], dtype=int)
data_ohe

Unnamed: 0,Absurd,Alone,Divorced,Married,Single,Together,Widow,YOLO
0,0,0,0,0,1,0,0,0
1,0,0,0,0,1,0,0,0
2,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
2235,0,0,0,1,0,0,0,0
2236,0,0,0,0,0,1,0,0
2237,0,0,1,0,0,0,0,0
2238,0,0,0,0,0,1,0,0


In [79]:
# axis=1 places the Marital_Status dummy columns to the right of the existing columns.
data = pd.concat([data, data_ohe], axis = 1)
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,Z_Revenue,Response,Absurd,Alone,Divorced,Married,Single,Together,Widow,YOLO
0,5524,1957,2,Single,58138.0,0,0,04-09-2012,58,635,...,11,1,0,0,0,0,1,0,0,0
1,2174,1954,2,Single,46344.0,1,1,08-03-2014,38,11,...,11,0,0,0,0,0,1,0,0,0
2,4141,1965,2,Together,71613.0,0,0,21-08-2013,26,426,...,11,0,0,0,0,0,0,1,0,0
3,6182,1984,2,Together,26646.0,1,0,10-02-2014,26,11,...,11,0,0,0,0,0,0,1,0,0
4,5324,1981,4,Married,58293.0,1,0,19-01-2014,94,173,...,11,0,0,0,0,1,0,0,0,0


In [80]:
# Marital_Status is now represented by its dummy columns, so drop the text column.
data = data.drop('Marital_Status', axis = 1)
data

Unnamed: 0,ID,Year_Birth,Education,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,Z_Revenue,Response,Absurd,Alone,Divorced,Married,Single,Together,Widow,YOLO
0,5524,1957,2,58138.0,0,0,04-09-2012,58,635,88,...,11,1,0,0,0,0,1,0,0,0
1,2174,1954,2,46344.0,1,1,08-03-2014,38,11,1,...,11,0,0,0,0,0,1,0,0,0
2,4141,1965,2,71613.0,0,0,21-08-2013,26,426,49,...,11,0,0,0,0,0,0,1,0,0
3,6182,1984,2,26646.0,1,0,10-02-2014,26,11,4,...,11,0,0,0,0,0,0,1,0,0
4,5324,1981,4,58293.0,1,0,19-01-2014,94,173,43,...,11,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,2,61223.0,0,1,13-06-2013,46,709,43,...,11,0,0,0,0,1,0,0,0,0
2236,4001,1946,4,64014.0,2,1,10-06-2014,56,406,0,...,11,0,0,0,0,0,0,1,0,0
2237,7270,1981,2,56981.0,0,0,25-01-2014,91,908,48,...,11,0,0,0,1,0,0,0,0,0
2238,8235,1956,3,69245.0,0,1,24-01-2014,8,428,30,...,11,0,0,0,0,0,0,1,0,0


# Feature Scaling

- Income and the amount spent on wines (MntWines) are not on the same scale, hence apply feature scaling

In [81]:
# Create a MinMaxScaler with default settings for the two numeric columns below.
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms

In [82]:
# Fit the scaler on Income and MntWines together and write the scaled values
# back into the same two columns.
# NOTE(review): data.info() above reports 2216 of 2240 non-null Income values;
# confirm the missing incomes are handled as intended around this step.
data[['Income', 'MntWines']] = mms.fit_transform(data[['Income', 'MntWines']])
data.head()

Unnamed: 0,ID,Year_Birth,Education,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,Z_Revenue,Response,Absurd,Alone,Divorced,Married,Single,Together,Widow,YOLO
0,5524,1957,2,0.084832,0,0,04-09-2012,58,0.425318,88,...,11,1,0,0,0,0,1,0,0,0
1,2174,1954,2,0.067095,1,1,08-03-2014,38,0.007368,1,...,11,0,0,0,0,0,1,0,0,0
2,4141,1965,2,0.105097,0,0,21-08-2013,26,0.285332,49,...,11,0,0,0,0,0,0,1,0,0
3,6182,1984,2,0.037471,1,0,10-02-2014,26,0.007368,4,...,11,0,0,0,0,0,0,1,0,0
4,5324,1981,4,0.085065,1,0,19-01-2014,94,0.115874,43,...,11,0,0,0,0,1,0,0,0,0
