# Power Transformer

## 1. Box-Cox Transform

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PowerTransformer

In [3]:
df=pd.read_csv('datasets/concrete_data.csv')

In [4]:
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [6]:
df.shape

(1030, 9)

In [7]:
X=df.drop(columns=['Strength'])
y=df.iloc[:,-1]

In [8]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [10]:
#Applying Regression withoput any transformation
lr=LinearRegression()
lr.fit(X_train,y_train)
y_pred=lr.predict(X_test)
r2_score(y_test,y_pred)

0.627553179231485

In [11]:
#Cross check
lr=LinearRegression()
np.mean(cross_val_score(lr,X,y,scoring='r2'))

np.float64(0.46099404916628633)

In [12]:
#applying Box-cox
pt=PowerTransformer(method='box-cox')
X_train_trf=pt.fit_transform(X_train+0.000001)
X_test_trf=pt.transform(X_test+0.000001)

pd.DataFrame({'cols':X_train.columns,'box_cox_lambdas':pt.lambdas_})

Unnamed: 0,cols,box_cox_lambdas
0,Cement,0.177025
1,Blast Furnace Slag,0.025093
2,Fly Ash,-0.03897
3,Water,0.772682
4,Superplasticizer,0.098811
5,Coarse Aggregate,1.129813
6,Fine Aggregate,1.782018
7,Age,0.066631


In [13]:
#Applying Regression on transformed data
lr=LinearRegression()
lr.fit(X_train_trf,y_train)
y_pred=lr.predict(X_test_trf)
r2_score(y_test,y_pred)

0.8047825013722173

In [14]:
#Cross check
pt=PowerTransformer(method='box-cox')
X_trf=pt.fit_transform(X+0.000001)
lr=LinearRegression()
np.mean(cross_val_score(lr,X_trf,y,scoring='r2'))

np.float64(0.6662950317141856)

In [18]:
#applying yeo-jhonson
pt1=PowerTransformer()

X_train_trf2=pt1.fit_transform(X_train)
X_test_trf2=pt1.transform(X_test)

lr=LinearRegression()
lr.fit(X_train_trf2,y_train)

y_pred3=lr.predict(X_test_trf2)

print(r2_score(y_test,y_pred3))
pd.DataFrame({'cols':X_train.columns,'yeo_johnson_lambdas':pt.lambdas_})

0.8161906513339305


Unnamed: 0,cols,yeo_johnson_lambdas
0,Cement,0.174348
1,Blast Furnace Slag,0.015715
2,Fly Ash,-0.161447
3,Water,0.771307
4,Superplasticizer,0.253935
5,Coarse Aggregate,1.13005
6,Fine Aggregate,1.7831
7,Age,0.019885


In [19]:
#Cross check
pt3=PowerTransformer()
X_trf=pt3.fit_transform(X)
lr=LinearRegression()
np.mean(cross_val_score(lr,X_trf,y,scoring='r2'))

np.float64(0.6834625134285744)