## Simple Coding 
This coding method compares each level to a fixed reference level. In simple coding, the intercept is the grand mean of all the levels of the factors.

https://www.statsmodels.org/dev/contrasts.html#simple-coding

### Importing library

In [1]:
import pandas as pd
import numpy as np

from statsmodels.formula.api import ols
import statsmodels.api as sm

### Reading dataset

In [2]:
iris_data = pd.read_csv("datasets/iris.csv")

iris_data.sample(10)

Unnamed: 0,Species,sepal_length,sepal_width,petal_length,petal_width
13,Iris-setosa,4.3,3.0,1.1,0.1
63,Iris-versicolor,6.1,2.9,4.7,1.4
140,Iris-virginica,6.7,3.1,5.6,2.4
2,Iris-setosa,4.7,3.2,1.3,0.2
47,Iris-setosa,4.6,3.2,1.4,0.2
119,Iris-virginica,6.0,2.2,5.0,1.5
51,Iris-versicolor,6.4,3.2,4.5,1.5
36,Iris-setosa,5.5,3.5,1.3,0.2
35,Iris-setosa,5.0,3.2,1.2,0.2
133,Iris-virginica,6.3,2.8,5.1,1.5


In [3]:
iris_data.shape

(150, 5)

In [4]:
iris_data.drop(columns=['sepal_length', 'sepal_width', 'petal_width'], 
               inplace=True)

iris_data.sample(5)

Unnamed: 0,Species,petal_length
83,Iris-versicolor,5.1
30,Iris-setosa,1.6
87,Iris-versicolor,4.4
36,Iris-setosa,1.3
131,Iris-virginica,6.4


In [5]:
iris_data.describe()

Unnamed: 0,petal_length
count,150.0
mean,3.758667
std,1.76442
min,1.0
25%,1.6
50%,4.35
75%,5.1
max,6.9


In [6]:
iris_species_mean = iris_data.groupby(by='Species').mean()

iris_species_mean

Unnamed: 0_level_0,petal_length
Species,Unnamed: 1_level_1
Iris-setosa,1.464
Iris-versicolor,4.26
Iris-virginica,5.552


In [7]:
iris_species_mean.loc['Iris-versicolor']['petal_length'] - \
    iris_species_mean.loc['Iris-setosa']['petal_length']

2.796

In [8]:
iris_species_mean.loc['Iris-virginica']['petal_length'] - \
    iris_species_mean.loc['Iris-setosa']['petal_length']

4.087999999999999

In [9]:
mod = ols("petal_length ~ Species", 
          data=iris_data)

res = mod.fit()

res.summary()

0,1,2,3
Dep. Variable:,petal_length,R-squared:,0.941
Model:,OLS,Adj. R-squared:,0.941
Method:,Least Squares,F-statistic:,1179.0
Date:,"Tue, 28 Jul 2020",Prob (F-statistic):,3.05e-91
Time:,17:50:39,Log-Likelihood:,-84.84
No. Observations:,150,AIC:,175.7
Df Residuals:,147,BIC:,184.7
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.4640,0.061,24.057,0.000,1.344,1.584
Species[T.Iris-versicolor],2.7960,0.086,32.488,0.000,2.626,2.966
Species[T.Iris-virginica],4.0880,0.086,47.500,0.000,3.918,4.258

0,1,2,3
Omnibus:,4.393,Durbin-Watson:,2.0
Prob(Omnibus):,0.111,Jarque-Bera (JB):,5.37
Skew:,0.121,Prob(JB):,0.0682
Kurtosis:,3.895,Cond. No.,3.73


In [10]:
from patsy.contrasts import ContrastMatrix

def _name_levels(prefix, levels):
    return ["[%s%s]" % (prefix, level) for level in levels]

#### Simple effect coding
https://www.statsmodels.org/dev/contrasts.html#user-defined

In [11]:
class Simple(object):
    
    def _simple_contrast(self, levels):
        nlevels = len(levels)
        
        contr = -1./nlevels * np.ones((nlevels, nlevels-1))
        contr[1:][np.diag_indices(nlevels-1)] = (nlevels-1.)/nlevels
        
        return contr

    def code_with_intercept(self, levels):
        contrast = np.column_stack((np.ones(len(levels)),
                                   self._simple_contrast(levels)))
        
        return ContrastMatrix(contrast, _name_levels("Simp.", levels))

    def code_without_intercept(self, levels):
        contrast = self._simple_contrast(levels)
        
        return ContrastMatrix(contrast, _name_levels("Simp.", levels[:-1]))


In [12]:
mod = ols("petal_length ~ C(Species, Simple)", 
          data=iris_data)

res = mod.fit()

res.summary()

0,1,2,3
Dep. Variable:,petal_length,R-squared:,0.941
Model:,OLS,Adj. R-squared:,0.941
Method:,Least Squares,F-statistic:,1179.0
Date:,"Tue, 28 Jul 2020",Prob (F-statistic):,3.05e-91
Time:,17:50:39,Log-Likelihood:,-84.84
No. Observations:,150,AIC:,175.7
Df Residuals:,147,BIC:,184.7
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.7587,0.035,106.978,0.000,3.689,3.828
"C(Species, Simple)[Simp.Iris-setosa]",2.7960,0.086,32.488,0.000,2.626,2.966
"C(Species, Simple)[Simp.Iris-versicolor]",4.0880,0.086,47.500,0.000,3.918,4.258

0,1,2,3
Omnibus:,4.393,Durbin-Watson:,2.0
Prob(Omnibus):,0.111,Jarque-Bera (JB):,5.37
Skew:,0.121,Prob(JB):,0.0682
Kurtosis:,3.895,Cond. No.,3.0


In [13]:
iris_data.Species.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [14]:
levels = [0, 1, 2]

In [15]:
contrast_with_intercept = Simple().code_with_intercept(levels)

contrast_with_intercept.matrix

array([[ 1.        , -0.33333333, -0.33333333],
       [ 1.        ,  0.66666667, -0.33333333],
       [ 1.        , -0.33333333,  0.66666667]])

In [16]:
contrast_without_intercept = Simple().code_without_intercept(levels)

contrast_without_intercept.matrix

array([[-0.33333333, -0.33333333],
       [ 0.66666667, -0.33333333],
       [-0.33333333,  0.66666667]])

In [17]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

In [18]:
iris_data_enc = iris_data.copy()

iris_data_enc['species_encoded'] = label_encoder.fit_transform(iris_data.Species)

In [19]:
label_encoder.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

#### Setting iris-setosa is the reference class
Out of the 3 classes, we can generate a contrast matrix with iris-setosa (index 0) as the reference class by subtracting 0 from the encoded label values

In [20]:
iris_data_contrast_setosa = \
    contrast_without_intercept.matrix[iris_data_enc.species_encoded - 0, :]

iris_data_contrast_setosa[:5]

array([[-0.33333333, -0.33333333],
       [-0.33333333, -0.33333333],
       [-0.33333333, -0.33333333],
       [-0.33333333, -0.33333333],
       [-0.33333333, -0.33333333]])

In [21]:
contrast_with_intercept.matrix[iris_data_enc.species_encoded - 0, :][:5]

array([[ 1.        , -0.33333333, -0.33333333],
       [ 1.        , -0.33333333, -0.33333333],
       [ 1.        , -0.33333333, -0.33333333],
       [ 1.        , -0.33333333, -0.33333333],
       [ 1.        , -0.33333333, -0.33333333]])

#### Setting iris-versicolor is the reference class
Out of the 3 classes, we can generate a contrast matrix with iris-versicolor (index 1) as the reference class by subtracting 1 from the encoded label values

In [22]:
iris_data_contrast_versicolor = \
    contrast_without_intercept.matrix[iris_data_enc.species_encoded - 1, :]

iris_data_contrast_versicolor[:5]

array([[-0.33333333,  0.66666667],
       [-0.33333333,  0.66666667],
       [-0.33333333,  0.66666667],
       [-0.33333333,  0.66666667],
       [-0.33333333,  0.66666667]])

In [23]:
contrast_with_intercept.matrix[iris_data_enc.species_encoded - 1, :][:5]

array([[ 1.        , -0.33333333,  0.66666667],
       [ 1.        , -0.33333333,  0.66666667],
       [ 1.        , -0.33333333,  0.66666667],
       [ 1.        , -0.33333333,  0.66666667],
       [ 1.        , -0.33333333,  0.66666667]])

#### Setting iris-virginica is the reference class
Out of the 3 classes, we can generate a contrast matrix with iris-virginica (index 2) as the reference class by subtracting 2 from the encoded label values

In [24]:
iris_data_contrast_virginica = \
    contrast_without_intercept.matrix[iris_data_enc.species_encoded - 2, :]

iris_data_contrast_virginica[:5]

array([[ 0.66666667, -0.33333333],
       [ 0.66666667, -0.33333333],
       [ 0.66666667, -0.33333333],
       [ 0.66666667, -0.33333333],
       [ 0.66666667, -0.33333333]])

In [25]:
contrast_with_intercept.matrix[iris_data_enc.species_encoded - 2, :][:5]

array([[ 1.        ,  0.66666667, -0.33333333],
       [ 1.        ,  0.66666667, -0.33333333],
       [ 1.        ,  0.66666667, -0.33333333],
       [ 1.        ,  0.66666667, -0.33333333],
       [ 1.        ,  0.66666667, -0.33333333]])

In [26]:
iris_data_contrast = pd.DataFrame(iris_data_contrast_setosa, 
                                  columns =['Iris-versicolor', 'Iris-virginica'])

iris_data_contrast.sample(10)

Unnamed: 0,Iris-versicolor,Iris-virginica
18,-0.333333,-0.333333
81,0.666667,-0.333333
30,-0.333333,-0.333333
92,0.666667,-0.333333
13,-0.333333,-0.333333
70,0.666667,-0.333333
91,0.666667,-0.333333
52,0.666667,-0.333333
117,-0.333333,0.666667
98,0.666667,-0.333333


In [27]:
iris_data_enc = pd.concat([iris_data, iris_data_contrast], 
                          axis=1)

iris_data_enc.sample(10)

Unnamed: 0,Species,petal_length,Iris-versicolor,Iris-virginica
28,Iris-setosa,1.4,-0.333333,-0.333333
60,Iris-versicolor,3.5,0.666667,-0.333333
109,Iris-virginica,6.1,-0.333333,0.666667
16,Iris-setosa,1.3,-0.333333,-0.333333
31,Iris-setosa,1.5,-0.333333,-0.333333
18,Iris-setosa,1.7,-0.333333,-0.333333
25,Iris-setosa,1.6,-0.333333,-0.333333
136,Iris-virginica,5.6,-0.333333,0.666667
47,Iris-setosa,1.4,-0.333333,-0.333333
142,Iris-virginica,5.1,-0.333333,0.666667


In [28]:
X = iris_data_enc.drop(columns=['Species', 'petal_length'])

y = iris_data_enc.petal_length

In [29]:
X.head()

Unnamed: 0,Iris-versicolor,Iris-virginica
0,-0.333333,-0.333333
1,-0.333333,-0.333333
2,-0.333333,-0.333333
3,-0.333333,-0.333333
4,-0.333333,-0.333333


In [30]:
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()

linear_model.fit(X, y)

print("Training_score : " , linear_model.score(X, y))

Training_score :  0.9413189735606261


In [31]:
linear_model.coef_

array([2.796, 4.088])

In [32]:
linear_model.intercept_

3.7586666666666657