# Regression Analysis with dummy encoded categorical values

In [1]:
import pandas as pd

In [2]:
student_health = pd.read_csv('datasets/student_health.csv')

student_health.head()

Unnamed: 0,Grade,Gender,Height_cm,Weight_kg
0,First,Male,105,21
1,First,Female,126,25
2,First,Male,126,25
3,First,Male,112,20
4,First,Female,133,32


In [3]:
student_health.shape

(413, 4)

In [4]:
grade_mean = student_health.groupby(by='Grade').mean()

grade_mean

Unnamed: 0_level_0,Height_cm,Weight_kg
Grade,Unnamed: 1_level_1,Unnamed: 2_level_1
First,122.357143,26.183673
Second,126.168317,33.148515
Third,134.775701,36.070093


In [5]:
grade_mean.loc['Second']['Weight_kg'] - grade_mean.loc['First']['Weight_kg']

6.96484138209739

In [6]:
grade_mean.loc['Third']['Weight_kg'] - grade_mean.loc['First']['Weight_kg']

9.88641998855617

In [7]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

import numpy as np

#### OLS Regression against Grade
The coefficients and T-statistics confirm that the grade does have a bearing on student weight

In [8]:
mod = ols("Weight_kg ~ Grade", 
          data=student_health)

res = mod.fit()

res.summary()

0,1,2,3
Dep. Variable:,Weight_kg,R-squared:,0.465
Model:,OLS,Adj. R-squared:,0.463
Method:,Least Squares,F-statistic:,178.3
Date:,"Tue, 28 Jul 2020",Prob (F-statistic):,1.89e-56
Time:,17:49:57,Log-Likelihood:,-1186.2
No. Observations:,413,AIC:,2378.0
Df Residuals:,410,BIC:,2391.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,26.1837,0.434,60.382,0.000,25.331,27.036
Grade[T.Second],6.9648,0.609,11.443,0.000,5.768,8.161
Grade[T.Third],9.8864,0.524,18.882,0.000,8.857,10.916

0,1,2,3
Omnibus:,23.848,Durbin-Watson:,2.854
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.558
Skew:,0.582,Prob(JB):,2.82e-06
Kurtosis:,2.641,Cond. No.,4.51


#### OLS with Treatment (Dummy) coding
It generates the same output as previously, showing that OLS accounts for the dummy trap and the default coding is dummy coding (called Treatment Coding in R)

In [9]:
mod = ols("Weight_kg ~ C(Grade, Treatment)", 
          data=student_health)

res = mod.fit()

res.summary()

0,1,2,3
Dep. Variable:,Weight_kg,R-squared:,0.465
Model:,OLS,Adj. R-squared:,0.463
Method:,Least Squares,F-statistic:,178.3
Date:,"Tue, 28 Jul 2020",Prob (F-statistic):,1.89e-56
Time:,17:49:57,Log-Likelihood:,-1186.2
No. Observations:,413,AIC:,2378.0
Df Residuals:,410,BIC:,2391.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,26.1837,0.434,60.382,0.000,25.331,27.036
"C(Grade, Treatment)[T.Second]",6.9648,0.609,11.443,0.000,5.768,8.161
"C(Grade, Treatment)[T.Third]",9.8864,0.524,18.882,0.000,8.857,10.916

0,1,2,3
Omnibus:,23.848,Durbin-Watson:,2.854
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.558
Skew:,0.582,Prob(JB):,2.82e-06
Kurtosis:,2.641,Cond. No.,4.51


In [10]:
student_health.drop(columns= ['Gender', 'Height_cm'], inplace=True)

student_health.head()

Unnamed: 0,Grade,Weight_kg
0,First,21
1,First,25
2,First,25
3,First,20
4,First,32


### The `Grade` column will be dummy encoded, dummy encoder does not encode string value so it is necesary label encode the `Grade` column first 

Label encoding the `Grade` Column

- 0 - First
- 1 - Second
- 2 - Third

In [11]:
from sklearn.preprocessing import LabelEncoder

grade_encoder = preprocessing.LabelEncoder()

student_health['Grade'] = grade_encoder.fit_transform(student_health.Grade)

NameError: name 'preprocessing' is not defined

In [None]:
student_health.sample(5)

In [None]:
grade_encoder.classes_

In [None]:
student_health.describe()

### Installing and Importing the Patsy library

In [None]:
! pip install patsy

In [None]:
from patsy.contrasts import Treatment

https://www.statsmodels.org/devel/contrasts.html#treatment-dummy-coding

In [None]:
student_health.Grade.unique()

### There are three classes

In [None]:
levels = [0, 1, 2]

### Contrast matrix without intercept

In [None]:
contrast_without_intercept_0 = Treatment(reference=0)\
                                .code_without_intercept(levels)

print(contrast_without_intercept_0.matrix)

In [None]:
contrast_without_intercept_1 = Treatment(reference=1)\
                                .code_without_intercept(levels)

print(contrast_without_intercept_1.matrix)

In [None]:
contrast_without_intercept_2 = Treatment(reference=2)\
                                .code_without_intercept(levels)

print(contrast_without_intercept_2.matrix)

### Contrast matrix with intercept

In [None]:
contrast_with_intercept = Treatment(reference=0)\
                            .code_with_intercept(levels)

print(contrast_with_intercept.matrix)

In [None]:
contrast_with_intercept = Treatment(reference=1)\
                            .code_with_intercept(levels)

print(contrast_with_intercept.matrix)

## 1) Taking Contrast without intercept for Regression

In [None]:
health_data_dummy = student_health

### Creating Contrast matrix for the `Grade` column of the dataset

In [None]:
health_data_contrast = contrast_without_intercept_0.matrix[health_data_dummy.Grade, :]
    
health_data_contrast[90:105]

### Assigning name to the columns

In [None]:
student_health_contrast_df = pd.DataFrame(health_data_contrast, 
                                          columns =['grade_2', 'grade_3'], dtype=np.int)

student_health_contrast_df.sample(5)

### concatenating the encoded columns with the datset

In [None]:
health_data_dummy = pd.concat([health_data_dummy, student_health_contrast_df], 
                              axis=1)

health_data_dummy.sample(10)

### Deleting the column `Grade`

In [None]:
health_data_dummy.drop(columns = ['Grade'], 
                       inplace=True)

In [None]:
health_data_dummy.sample(5)

In [None]:
X = health_data_dummy.drop('Weight_kg', axis='columns')

y = health_data_dummy.Weight_kg

In [None]:
X.sample(5)

In [None]:
y.sample(5)

In [None]:
X_with_constant = sm.add_constant(X)

mod = sm.OLS(y, X_with_constant)

res = mod.fit()

res.summary()

### Importing sklearn linear regression model
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

### Creating the model

In [None]:
from sklearn.linear_model import LinearRegression

linear_model_dummy = LinearRegression(fit_intercept=True)

### Training the model

In [None]:
linear_model_dummy.fit(X, y)

print("Training_score : " , linear_model_dummy.score(X, y))

In [None]:
linear_model_dummy.coef_

In [None]:
linear_model_dummy.intercept_

## 2) Taking Contrast matrix with intercept for regression

In [None]:
health_data_ohe = student_health

In [None]:
contrast_with_intercept.matrix

### Creating Contrast matrix for the `Grade` column of the dataset

In [None]:
health_data_contrast = contrast_with_intercept.matrix[health_data_ohe.Grade, :]

health_data_contrast[90:105]

### Assigning name to the columns

In [None]:
health_data_contrast = pd.DataFrame(health_data_contrast, 
                                    columns = ['grade_1', 'grade_2', 'grade_3'],
                                    dtype=np.int)

health_data_contrast.sample(5)

### Concatenating the encoded columns with the dataset

In [None]:
health_data_ohe = pd.concat([health_data_ohe, health_data_contrast], 
                            axis=1)

health_data_ohe.sample(10)

### Dropping column `'Grade'`

In [None]:
health_data_ohe.drop(columns = ['Grade'], 
                     inplace=True) 

In [None]:
X = health_data_ohe.drop('Weight_kg', axis='columns')

y = health_data_ohe.Weight_kg

X.head()

### Creating the model

In [None]:
linear_model_ohe = LinearRegression(fit_intercept=False)

### Training the model

In [None]:
linear_model_ohe.fit(X, y)

print("Training_score : " , linear_model_ohe.score(X, y))

### Testing the model

In [None]:
linear_model_ohe.coef_

In [None]:
linear_model_ohe.intercept_