## Helmert Coding using `category_encoders`

### Importing modules

In [1]:
import pandas as pd
import numpy as np

from statsmodels.formula.api import ols

### Loading the dataset
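Source: https://www.kaggle.com/toramky/automobile-dataset (read below from the local copy `datasets/auto-mpg.csv`, with `?` treated as a missing value).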

In [2]:
# Read the raw table; a literal '?' in any field is parsed as NaN.
car_data = pd.read_csv(
    'datasets/auto-mpg.csv',
    na_values='?',
)

# Preview ten randomly chosen rows.
car_data.sample(n=10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
305,28.4,4,151.0,90.0,2670,16.0,79,1,buick skylark limited
57,24.0,4,113.0,95.0,2278,15.5,72,3,toyota corona hardtop
388,26.0,4,156.0,92.0,2585,14.5,82,1,chrysler lebaron medallion
207,20.0,4,130.0,102.0,3150,15.7,76,2,volvo 245
367,28.0,4,112.0,88.0,2605,19.6,82,1,chevrolet cavalier
304,37.3,4,91.0,69.0,2130,14.7,79,2,fiat strada custom
165,20.0,8,262.0,110.0,3221,13.5,75,1,chevrolet monza 2+2
182,28.0,4,107.0,86.0,2464,15.5,76,2,fiat 131
55,27.0,4,97.0,60.0,1834,19.0,71,2,volkswagen model 111
257,19.4,6,232.0,90.0,3210,17.2,78,1,amc concord


In [3]:
# Keep only the response (mpg) and the categorical predictor (cylinders).
# .copy() makes the result an independent frame, so any subsequent modification
# of car_data never acts on a slice of the originally loaded table.
car_data = car_data[['mpg', 'cylinders']].copy()

In [4]:
# Drop every row that has a missing value in any remaining column.
# Rebinding the returned frame (instead of inplace=True) avoids in-place mutation.
car_data = car_data.dropna()

In [5]:
# (rows, columns) of the frame after the missing-value drop.
car_data.shape

(398, 2)

In [6]:
# Preview ten randomly chosen rows of the two-column frame.
car_data.sample(n=10)

Unnamed: 0,mpg,cylinders
216,31.5,4
385,38.0,4
195,29.0,4
229,16.0,8
327,36.4,5
65,14.0,8
91,13.0,8
147,24.0,4
397,31.0,4
75,14.0,8


### Printing the different classes present

In [7]:
# Distinct cylinder counts, i.e. the categories that will be contrast-coded.
car_data['cylinders'].unique()

array([8, 4, 6, 3, 5])

In [8]:
# Order the rows by cylinder count and renumber them from 0, discarding the old
# index. The chained form returns a new frame rather than mutating in place.
car_data = (
    car_data
    .sort_values(by=['cylinders'])
    .reset_index(drop=True)
)

car_data.head(10)

Unnamed: 0,mpg,cylinders
0,18.0,3
1,19.0,3
2,23.7,3
3,21.5,3
4,27.5,4
5,30.0,4
6,25.1,4
7,36.1,4
8,39.4,4
9,36.1,4


In [9]:
# Mean of every column taken over all rows.
car_data.mean()

mpg          23.514573
cylinders     5.454774
dtype: float64

In [10]:
# Average mpg within each cylinder group; the result is indexed by cylinder count.
car_data_grouped = (
    car_data
    .groupby(['cylinders'])
    .mean()
)

car_data_grouped

Unnamed: 0_level_0,mpg
cylinders,Unnamed: 1_level_1
3,20.55
4,29.286765
5,27.366667
6,19.985714
8,14.963107


In [11]:
# Unweighted mean of the per-cylinder group means: each group counts once,
# regardless of how many cars it contains.
car_data_grouped['mpg'].mean()

22.430450490875963

In [12]:
# Manual Helmert contrast for the 4-cylinder level: the gap between its mean mpg
# and the mean mpg of the preceding level (3 cylinders), divided by 2.
coefficient_cylinder_4 = (
    car_data_grouped.loc[4, 'mpg'] - car_data_grouped.loc[3, 'mpg']
) / 2

coefficient_cylinder_4

4.368382352941179

In [13]:
# Average of the 3- and 4-cylinder group means (the levels preceding 5).
mean_34 = (
    car_data_grouped.loc[3, 'mpg'] + car_data_grouped.loc[4, 'mpg']
) / 2

# Manual Helmert contrast for the 5-cylinder level: its mean mpg minus the
# average of the preceding levels' means, divided by 3.
coefficient_cylinder_5 = (car_data_grouped.loc[5, 'mpg'] - mean_34) / 3

coefficient_cylinder_5

0.8160947712418279

In [14]:
# Regress mpg on cylinders, treated as a categorical factor with Helmert contrasts.
mod = ols(formula="mpg ~ C(cylinders, Helmert)", data=car_data)

# Fit by ordinary least squares and show the full regression report.
res = mod.fit()

res.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.637
Model:,OLS,Adj. R-squared:,0.634
Method:,Least Squares,F-statistic:,172.6
Date:,"Tue, 28 Jul 2020",Prob (F-statistic):,3.68e-85
Time:,18:06:54,Log-Likelihood:,-1180.8
No. Observations:,398,AIC:,2372.0
Df Residuals:,393,BIC:,2392.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,22.4305,0.739,30.353,0.000,20.978,23.883
"C(cylinders, Helmert)[H.4]",4.3684,1.194,3.657,0.000,2.020,6.717
"C(cylinders, Helmert)[H.5]",0.8161,0.994,0.821,0.412,-1.138,2.770
"C(cylinders, Helmert)[H.6]",-1.4372,0.329,-4.371,0.000,-2.084,-0.791
"C(cylinders, Helmert)[H.8]",-1.8668,0.206,-9.079,0.000,-2.271,-1.463

0,1,2,3
Omnibus:,48.011,Durbin-Watson:,1.255
Prob(Omnibus):,0.0,Jarque-Bera (JB):,71.51
Skew:,0.793,Prob(JB):,2.96e-16
Kurtosis:,4.341,Cond. No.,12.8


### Importing `category_encoders`

In [15]:
import category_encoders as ce

### Creating a HelmertEncoder on the 'cylinders' column

In [16]:
# Encoder that applies Helmert contrast coding to the 'cylinders' column only.
ce_helmert = ce.HelmertEncoder(cols=['cylinders'])
ce_helmert

HelmertEncoder(cols=['cylinders'], drop_invariant=False, handle_missing='value',
               handle_unknown='value', mapping=None, return_df=True, verbose=0)

### Encoding the dataset

In [17]:
# Learn the category-to-contrast mapping from car_data and apply it in one step.
car_he = ce_helmert.fit_transform(car_data)

# Preview ten randomly chosen encoded rows.
car_he.sample(n=10)

Unnamed: 0,intercept,mpg,cylinders_0,cylinders_1,cylinders_2,cylinders_3
248,1,16.0,0.0,0.0,3.0,-1.0
177,1,24.0,1.0,-1.0,-1.0,-1.0
388,1,14.0,0.0,0.0,0.0,4.0
304,1,15.0,0.0,0.0,0.0,4.0
159,1,26.0,1.0,-1.0,-1.0,-1.0
288,1,16.0,0.0,0.0,3.0,-1.0
0,1,18.0,-1.0,-1.0,-1.0,-1.0
148,1,26.0,1.0,-1.0,-1.0,-1.0
180,1,35.0,1.0,-1.0,-1.0,-1.0
189,1,30.0,1.0,-1.0,-1.0,-1.0


In [18]:
# Put the original cylinder value next to its encoded columns for comparison.
pd.concat(
    [car_data['cylinders'], car_he],
    axis='columns',
).sample(n=10)

Unnamed: 0,cylinders,intercept,mpg,cylinders_0,cylinders_1,cylinders_2,cylinders_3
128,4,1,29.8,1.0,-1.0,-1.0,-1.0
196,4,1,19.0,1.0,-1.0,-1.0,-1.0
341,8,1,11.0,0.0,0.0,0.0,4.0
30,4,1,29.5,1.0,-1.0,-1.0,-1.0
241,6,1,19.8,0.0,0.0,3.0,-1.0
327,8,1,15.0,0.0,0.0,0.0,4.0
211,6,1,18.0,0.0,0.0,3.0,-1.0
191,4,1,28.0,1.0,-1.0,-1.0,-1.0
231,6,1,20.2,0.0,0.0,3.0,-1.0
300,8,1,9.0,0.0,0.0,0.0,4.0


In [19]:
# Predictors: every encoded column except the response.
X = car_he.drop(columns=['mpg'])

# Response variable.
y = car_he['mpg']

X.sample(n=10)

Unnamed: 0,intercept,cylinders_0,cylinders_1,cylinders_2,cylinders_3
56,1,1.0,-1.0,-1.0,-1.0
334,1,0.0,0.0,0.0,4.0
167,1,1.0,-1.0,-1.0,-1.0
237,1,0.0,0.0,3.0,-1.0
90,1,1.0,-1.0,-1.0,-1.0
371,1,0.0,0.0,0.0,4.0
126,1,1.0,-1.0,-1.0,-1.0
27,1,1.0,-1.0,-1.0,-1.0
111,1,1.0,-1.0,-1.0,-1.0
376,1,0.0,0.0,0.0,4.0


In [20]:
# First ten response values.
y.head(n=10)

0    18.0
1    19.0
2    23.7
3    21.5
4    27.5
5    30.0
6    25.1
7    36.1
8    39.4
9    36.1
Name: mpg, dtype: float64

In [21]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting are chained.
# NOTE(review): X still contains the constant `intercept` column while
# LinearRegression also fits its own intercept by default; consider dropping
# the column or passing fit_intercept=False.
linear_model = LinearRegression().fit(X, y)

# R^2 evaluated on the same data the model was fitted to.
print("Training_score : ", linear_model.score(X, y))

Training_score :  0.6372420899156167


In [22]:
# Fitted coefficients, one per column of X in column order.
linear_model.coef_

array([ 0.        ,  4.36838235,  0.81609477, -1.43719071, -1.86683592])