## Backward difference coding using `category_encoders`

https://contrib.scikit-learn.org/categorical-encoding/backward_difference.html#

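### Installing and importing categorical encoders

Install the package once if it is not already available (`pip install category_encoders`), then import it together with the other libraries used below.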

In [1]:
import numpy as np
import pandas as pd

import category_encoders as ce
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols

### Creating a dataframe
The same ol' Iris dataset

In [2]:
# Load the Iris CSV; the path is relative to the notebook's working directory.
iris_data = pd.read_csv("datasets/iris.csv")

# Peek at 10 randomly chosen rows. No seed is passed, so the rows shown
# change on every run.
iris_data.sample(10)

Unnamed: 0,Species,sepal_length,sepal_width,petal_length,petal_width
15,Iris-setosa,5.7,4.4,1.5,0.4
96,Iris-versicolor,5.7,2.9,4.2,1.3
89,Iris-versicolor,5.5,2.5,4.0,1.3
138,Iris-virginica,6.0,3.0,4.8,1.8
128,Iris-virginica,6.4,2.8,5.6,2.1
54,Iris-versicolor,6.5,2.8,4.6,1.5
43,Iris-setosa,5.0,3.5,1.6,0.6
83,Iris-versicolor,6.0,2.7,5.1,1.6
44,Iris-setosa,5.1,3.8,1.9,0.4
60,Iris-versicolor,5.0,2.0,3.5,1.0


In [3]:
# Keep only the categorical feature (Species) and the regression target
# (petal_length). Selecting the wanted columns gives the same two-column frame
# as dropping the other three, but the cell stays safe to re-run: an in-place
# drop raises KeyError the second time because the columns are already gone.
iris_data = iris_data[['Species', 'petal_length']]

# Random preview of the reduced frame (unseeded, so rows vary between runs).
iris_data.sample(5)

Unnamed: 0,Species,petal_length
91,Iris-versicolor,4.6
22,Iris-setosa,1.0
97,Iris-versicolor,4.3
61,Iris-versicolor,4.2
85,Iris-versicolor,4.5


In [4]:
# Summary statistics of the numeric columns (only petal_length is left).
iris_data.describe()

Unnamed: 0,petal_length
count,150.0
mean,3.758667
std,1.76442
min,1.0
25%,1.6
50%,4.35
75%,5.1
max,6.9


In [5]:
# Mean petal_length for each species level.
iris_data.groupby(by='Species').mean()

Unnamed: 0_level_0,petal_length
Species,Unnamed: 1_level_1
Iris-setosa,1.464
Iris-versicolor,4.26
Iris-virginica,5.552


In [6]:
# Difference between each species' mean and the mean of the species on the
# previous row; the first row has no predecessor and is therefore NaN.
iris_data.groupby(by='Species').mean().diff()

Unnamed: 0_level_0,petal_length
Species,Unnamed: 1_level_1
Iris-setosa,
Iris-versicolor,2.796
Iris-virginica,1.292


In [7]:
# Regress petal_length on Species, coding Species with the `Diff`
# (backward-difference) contrast.
mod = ols(formula="petal_length ~ C(Species, Diff)", data=iris_data)
res = mod.fit()

# In the summary, the two contrast coefficients (2.796 and 1.292) equal the
# adjacent-species mean differences computed in the previous cell, and the
# intercept (3.7587) matches the overall petal_length mean.
res.summary()

0,1,2,3
Dep. Variable:,petal_length,R-squared:,0.941
Model:,OLS,Adj. R-squared:,0.941
Method:,Least Squares,F-statistic:,1179.0
Date:,"Tue, 28 Jul 2020",Prob (F-statistic):,3.05e-91
Time:,18:06:42,Log-Likelihood:,-84.84
No. Observations:,150,AIC:,175.7
Df Residuals:,147,BIC:,184.7
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.7587,0.035,106.978,0.000,3.689,3.828
"C(Species, Diff)[D.Iris-setosa]",2.7960,0.086,32.488,0.000,2.626,2.966
"C(Species, Diff)[D.Iris-versicolor]",1.2920,0.086,15.012,0.000,1.122,1.462

0,1,2,3
Omnibus:,4.393,Durbin-Watson:,2.0
Prob(Omnibus):,0.111,Jarque-Bera (JB):,5.37
Skew:,0.121,Prob(JB):,0.0682
Kurtosis:,3.895,Cond. No.,3.0


https://contrib.scikit-learn.org/categorical-encoding/backward_difference.html

### Creating an encoder object and passing the column that needs to be encoded as an argument

In [8]:
# Backward-difference encoder restricted to the 'Species' column.
encoder = ce.BackwardDifferenceEncoder(cols=['Species'])
# Echo the object so its parameters are displayed.
encoder

BackwardDifferenceEncoder(cols=['Species'], drop_invariant=False,
                          handle_missing='value', handle_unknown='value',
                          mapping=None, return_df=True, verbose=0)

### DataFrame with the encoded columns of `Species`

In [9]:
# Fit the encoder and transform the frame in one step. In the result, Species
# is replaced by `intercept`, `Species_0` and `Species_1`, while petal_length
# is carried through unchanged.
species_encoded = encoder.fit_transform(iris_data)

species_encoded.head()

Unnamed: 0,intercept,Species_0,Species_1,petal_length
0,1,-0.666667,-0.333333,1.4
1,1,-0.666667,-0.333333,1.4
2,1,-0.666667,-0.333333,1.3
3,1,-0.666667,-0.333333,1.5
4,1,-0.666667,-0.333333,1.4


In [10]:
# Random rows of the encoded frame (unseeded, so rows vary between runs).
species_encoded.sample(5)

Unnamed: 0,intercept,Species_0,Species_1,petal_length
4,1,-0.666667,-0.333333,1.4
25,1,-0.666667,-0.333333,1.6
91,1,0.333333,-0.333333,4.6
90,1,0.333333,-0.333333,4.4
30,1,-0.666667,-0.333333,1.6


In [11]:
# Place the original Species labels next to their encoded columns so each
# label can be compared with its contrast values. Both frames share the same
# row index, so an index join lines the rows up one-to-one.
encoded_iris = iris_data[['Species']].join(species_encoded)

encoded_iris.sample(10)

Unnamed: 0,Species,intercept,Species_0,Species_1,petal_length
54,Iris-versicolor,1,0.333333,-0.333333,4.6
136,Iris-virginica,1,0.333333,0.666667,5.6
35,Iris-setosa,1,-0.666667,-0.333333,1.2
64,Iris-versicolor,1,0.333333,-0.333333,3.6
122,Iris-virginica,1,0.333333,0.666667,6.7
13,Iris-setosa,1,-0.666667,-0.333333,1.1
39,Iris-setosa,1,-0.666667,-0.333333,1.5
19,Iris-setosa,1,-0.666667,-0.333333,1.5
23,Iris-setosa,1,-0.666667,-0.333333,1.7
72,Iris-versicolor,1,0.333333,-0.333333,4.9


In [12]:
# Target: the petal length to be predicted.
y = encoded_iris['petal_length']

# Features: everything except the raw label and the target, i.e. the
# intercept column plus the two contrast columns.
X = encoded_iris.drop(['Species', 'petal_length'], axis=1)

In [13]:
# Confirm the feature matrix holds only intercept, Species_0 and Species_1.
X.head()

Unnamed: 0,intercept,Species_0,Species_1
0,1,-0.666667,-0.333333
1,1,-0.666667,-0.333333
2,1,-0.666667,-0.333333
3,1,-0.666667,-0.333333
4,1,-0.666667,-0.333333


In [14]:
# LinearRegression is imported in the notebook's top import cell.
# X already contains the encoder's constant `intercept` column, so the model's
# own intercept is switched off; the coefficient on that column takes its place.
linear_model = LinearRegression(fit_intercept=False)

linear_model.fit(X, y)

# R^2 on the same data the model was fitted on (no held-out set is used).
print("Training_score : " , linear_model.score(X, y))

Training_score :  0.9413189735606261


In [15]:
# Coefficients in X's column order (intercept, Species_0, Species_1); they
# reproduce the OLS estimates from the statsmodels fit above.
linear_model.coef_

array([3.75866667, 2.796     , 1.292     ])

In [16]:
# 0.0 because the model was created with fit_intercept=False.
linear_model.intercept_

0.0