## Example 1 - Label encoding vs One-hot encoding

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf

In [3]:
# Load the example data set from a CSV file in the working directory.
data0 = pd.read_csv('small.csv')
# Echo the frame so the raw values are visible before any encoding.
data0

Unnamed: 0,X1,X2,Y
0,S,-0.1,19.19
1,S,2.53,22.74
2,S,4.86,23.91
3,M,0.26,7.07
4,M,2.55,7.93
5,M,4.87,8.93
6,L,0.08,20.63
7,L,2.62,23.46
8,L,5.09,25.75


## Label encoding

In [4]:
# Work on an independent copy so the raw frame `data0` stays untouched.
data1 = data0.copy(deep=True)
data1

Unnamed: 0,X1,X2,Y
0,S,-0.1,19.19
1,S,2.53,22.74
2,S,4.86,23.91
3,M,0.26,7.07
4,M,2.55,7.93
5,M,4.87,8.93
6,L,0.08,20.63
7,L,2.62,23.46
8,L,5.09,25.75


In [5]:
# Replace categories (S,M,L) with integers

In [6]:
# Label-encode X1 with an explicit category -> integer mapping.
# The codes are read from the untouched raw frame `data0`, so re-running this
# cell always gives the same result, and `map` yields an integer column
# directly instead of relying on `replace` to downcast an object column.
x1_codes = {'S': 0, 'M': 1, 'L': 2}
data1['X1'] = data0['X1'].map(x1_codes)
data1

Unnamed: 0,X1,X2,Y
0,0,-0.1,19.19
1,0,2.53,22.74
2,0,4.86,23.91
3,1,0.26,7.07
4,1,2.55,7.93
5,1,4.87,8.93
6,2,0.08,20.63
7,2,2.62,23.46
8,2,5.09,25.75


In [7]:
# Separate the predictor matrix from the response vector
predictor_cols = ['X1', 'X2']
X = data1[predictor_cols]
y = data1['Y']
X

Unnamed: 0,X1,X2
0,0,-0.1
1,0,2.53
2,0,4.86
3,1,0.26
4,1,2.55
5,1,4.87
6,2,0.08
7,2,2.62
8,2,5.09


In [8]:
# Response vector (the Y column of data1)
y

0    19.19
1    22.74
2    23.91
3     7.07
4     7.93
5     8.93
6    20.63
7    23.46
8    25.75
Name: Y, dtype: float64

### sklearn

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
# Ordinary least squares on the label-encoded predictors
m1 = LinearRegression().fit(X=X, y=y)

In [11]:
# Fitted intercept of the label-encoded model
m1.intercept_

15.167783009625168

In [12]:
# Fitted slopes, in the column order of X (X1, then X2)
m1.coef_

array([0.60192355, 0.77691744])

In [13]:
# Least Squares plane 

In [14]:
# Yhat = 15.168 + 0.602 X1 + 0.777 X2

In [15]:
# R-squared

In [16]:
# R-squared of m1, evaluated on the same data the model was fitted to
R2 = m1.score(X,y)
R2

0.05259259304144803

In [17]:
# number of rows, number of predictors

In [18]:
# Take the sample size and predictor count from the design matrix itself
# rather than hard-coding them, so they stay correct if the data change.
n, p = X.shape

In [19]:
# adj R-squared

In [20]:
# Adjusted R-squared: penalises R-squared for the number of predictors p
adj_R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
print(adj_R2)

-0.2632098759447359


In [21]:
# Same label-encoded regression, fitted through the statsmodels formula API
m11 = smf.ols(formula='Y ~ X1 + X2', data=data1).fit()
m11.summary()



0,1,2,3
Dep. Variable:,Y,R-squared:,0.053
Model:,OLS,Adj. R-squared:,-0.263
Method:,Least Squares,F-statistic:,0.1665
Date:,"Mon, 18 Sep 2023",Prob (F-statistic):,0.85
Time:,13:39:58,Log-Likelihood:,-30.212
No. Observations:,9,AIC:,66.42
Df Residuals:,6,BIC:,67.02
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,15.1678,5.682,2.670,0.037,1.265,29.070
X1,0.6019,3.474,0.173,0.868,-7.899,9.103
X2,0.7769,1.428,0.544,0.606,-2.716,4.270

0,1,2,3
Omnibus:,3.603,Durbin-Watson:,0.961
Prob(Omnibus):,0.165,Jarque-Bera (JB):,1.553
Skew:,-0.704,Prob(JB):,0.46
Kurtosis:,1.53,Cond. No.,7.46


## One-hot encoding

In [22]:
# Start again from the raw data; X1 is still the categorical S/M/L column here
data2 = data0.copy(deep=True)
X = data2.drop(columns=['Y'])
y = data2['Y']
X

Unnamed: 0,X1,X2
0,S,-0.1
1,S,2.53
2,S,4.86
3,M,0.26
4,M,2.55
5,M,4.87
6,L,0.08
7,L,2.62
8,L,5.09


In [23]:
# Creating binary columns from X1

In [24]:
# One 0/1 indicator column per level of X1.
# dtype=int is passed explicitly: pandas >= 2.0 returns boolean indicators by
# default, which would display True/False instead of the 0/1 coding shown here.
X_binary = pd.get_dummies(X, columns=['X1'], dtype=int)
X_binary

Unnamed: 0,X2,X1_L,X1_M,X1_S
0,-0.1,0,0,1
1,2.53,0,0,1
2,4.86,0,0,1
3,0.26,0,1,0
4,2.55,0,1,0
5,4.87,0,1,0
6,0.08,1,0,0
7,2.62,1,0,0
8,5.09,1,0,0


In [25]:
# Drop the indicator for 'S' so that S becomes the baseline level
# (all three indicators together with an intercept would be collinear).
# Reassign rather than use inplace=True, so the frame is not mutated in place.
X_binary = X_binary.drop(columns='X1_S')
X_binary

Unnamed: 0,X2,X1_L,X1_M
0,-0.1,0,0
1,2.53,0,0
2,4.86,0,0
3,0.26,0,1
4,2.55,0,1
5,4.87,0,1
6,0.08,1,0
7,2.62,1,0
8,5.09,1,0


In [26]:
# rename columns by name (not by position), so each indicator keeps the
# right label even if the column order of X_binary ever changes
X_binary = X_binary.rename(columns={'X1_L': 'L', 'X1_M': 'M'})
X_binary

Unnamed: 0,X2,L,M
0,-0.1,0,0
1,2.53,0,0
2,4.86,0,0
3,0.26,0,1
4,2.55,0,1
5,4.87,0,1
6,0.08,1,0
7,2.62,1,0
8,5.09,1,0


In [27]:
# Put the indicator columns first and the numeric predictor last
column_order = ['M', 'L', 'X2']
X_binary = X_binary.reindex(columns=column_order)
X_binary

Unnamed: 0,M,L,X2
0,0,0,-0.1
1,0,0,2.53
2,0,0,4.86
3,1,0,0.26
4,1,0,2.55
5,1,0,4.87
6,0,1,0.08
7,0,1,2.62
8,0,1,5.09


In [28]:
# Ordinary least squares on the one-hot encoded predictors
m2 = LinearRegression().fit(X=X_binary, y=y)

In [29]:
# R-squared of m2, evaluated on the same data the model was fitted to
R2 = m2.score(X_binary,y)
R2

0.9926482907525312

In [30]:
# Find adj R-squared with sklearn

In [31]:
# Take the sample size and predictor count from the design matrix itself
# rather than hard-coding them, so they stay correct if the data change.
n, p = X_binary.shape

In [32]:
# Adjusted R-squared: penalises R-squared for the number of predictors p
adj_R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
print(adj_R2)

0.98823726520405


In [33]:
# Fitted intercept of the one-hot model
m2.intercept_

19.96499706357271

In [34]:
# Fitted slopes, in the column order of X_binary (M, L, X2)
m2.coef_

array([-14.07601525,   1.19741635,   0.81550189])

### statsmodels.formula.api

In [35]:
import statsmodels.formula.api as smf

In [36]:
# One-hot fit via the formula API: C(...) marks X1 as categorical and
# Treatment(reference="S") makes S the baseline level
m3 = smf.ols(
    formula='Y ~ C(X1,Treatment(reference = "S")) + X2',
    data=data0,
).fit()
m3.summary()



0,1,2,3
Dep. Variable:,Y,R-squared:,0.993
Model:,OLS,Adj. R-squared:,0.988
Method:,Least Squares,F-statistic:,225.0
Date:,"Mon, 18 Sep 2023",Prob (F-statistic):,9.42e-06
Time:,13:39:58,Log-Likelihood:,-8.3472
No. Observations:,9,AIC:,24.69
Df Residuals:,5,BIC:,25.48
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,19.9650,0.580,34.413,0.000,18.474,21.456
"C(X1, Treatment(reference=""S""))[T.L]",1.1974,0.671,1.786,0.134,-0.526,2.921
"C(X1, Treatment(reference=""S""))[T.M]",-14.0760,0.670,-20.998,0.000,-15.799,-12.353
X2,0.8155,0.138,5.920,0.002,0.461,1.170

0,1,2,3
Omnibus:,0.672,Durbin-Watson:,1.798
Prob(Omnibus):,0.715,Jarque-Bera (JB):,0.529
Skew:,0.003,Prob(JB):,0.768
Kurtosis:,1.812,Cond. No.,11.3


### fitted equations

$ \hat{Y}_S =  19.9650 \qquad +\qquad 0.8155 X2$ <br>
$ \hat{Y}_M = (19.9650 - 14.0760) + 0.8155 X2$ <br>
$ \hat{Y}_L = (19.9650 + 1.1974) + 0.8155 X2$