In [151]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from scipy import stats

In [152]:
delivery_df = pd.read_csv("delivery_time.csv")
delivery_df.head()

Unnamed: 0,Delivery Time,Sorting Time
0,21.0,10
1,13.5,4
2,19.75,6
3,24.0,9
4,29.0,10


In [153]:
delivery_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Delivery Time  21 non-null     float64
 1   Sorting Time   21 non-null     int64  
dtypes: float64(1), int64(1)
memory usage: 468.0 bytes


In [154]:
delivery_df.isnull().sum()

Delivery Time    0
Sorting Time     0
dtype: int64

In [155]:
delivery_df['sortingTime_Log'] = np.log(delivery_df['Sorting Time'])
delivery_df['DeliveryTime_Log'] = np.log(delivery_df['Delivery Time'])

delivery_df['sortingTime_sqrt'] = np.sqrt(delivery_df['Sorting Time'])
delivery_df['DeliveryTime_sqrt'] = np.sqrt(delivery_df['Delivery Time'])

delivery_df['sortingTime_exp'] = np.exp(delivery_df['Sorting Time'])
delivery_df['DeliveryTime_exp'] = np.exp(delivery_df['Delivery Time'])

delivery_df['sortingTime_boxcox'],_ = stats.boxcox(delivery_df['Sorting Time'])
delivery_df['DeliveryTime_boxcox'], _ = stats.boxcox(delivery_df['Delivery Time'])

delivery_df.head()

Unnamed: 0,Delivery Time,Sorting Time,sortingTime_Log,DeliveryTime_Log,sortingTime_sqrt,DeliveryTime_sqrt,sortingTime_exp,DeliveryTime_exp,sortingTime_boxcox,DeliveryTime_boxcox
0,21.0,10,2.302585,3.044522,3.162278,4.582576,22026.465795,1318816000.0,5.719365,7.818086
1,13.5,4,1.386294,2.60269,2.0,3.674235,54.59815,729416.4,2.338756,5.750615
2,19.75,6,1.791759,2.983153,2.44949,4.444097,403.428793,377847000.0,3.57325,7.500267
3,24.0,9,2.197225,3.178054,3.0,4.898979,8103.083928,26489120000.0,5.211954,8.547516
4,29.0,10,2.302585,3.367296,3.162278,5.385165,22026.465795,3931334000000.0,5.719365,9.676687


In [163]:
x = delivery_df[['Sorting Time']]
y = delivery_df[['Delivery Time']]

x_log = delivery_df[['sortingTime_Log']]
y_log = delivery_df[['DeliveryTime_Log']]

x_sqrt = delivery_df[['sortingTime_sqrt']]
y_sqrt = delivery_df[['DeliveryTime_sqrt']]

x_exp = delivery_df[['sortingTime_exp']]
y_exp = delivery_df[['DeliveryTime_exp']]

x_boxcox = delivery_df[['sortingTime_boxcox']]
y_boxcox = delivery_df[['DeliveryTime_boxcox']]

In [164]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)
x_sqrt_train,x_srqt_test,y_sqrt_train,y_sqrt_test = train_test_split(x_sqrt,y_sqrt,test_size=0.2)
x_log_train,x_log_test,y_log_train,y_log_test = train_test_split(x_log,y_log,test_size=0.2)
x_exp_train,x_exp_test,y_exp_train,y_exp_test = train_test_split(x_exp,y_exp,test_size=0.2)
x_boxcox_train,x_boxcox_test,y_boxcox_train,y_boxcox_test = train_test_split(x_boxcox,y_boxcox,test_size=0.2)

In [165]:
model = LinearRegression()
model_sqrt = LinearRegression()
model_log = LinearRegression()
model_exp = LinearRegression()
model_boxcox = LinearRegression()

In [166]:
model.fit(x_train,y_train)
model_sqrt.fit(x_sqrt_train,y_sqrt_train)
model_exp.fit(x_exp_train,y_exp_train)
model_log.fit(x_log_train,y_log_train)
model_boxcox.fit(x_boxcox_train, y_boxcox_train)

In [167]:
y_pred = model.predict(x_test)
y_pred_sqrt = model_sqrt.predict(x_srqt_test)
y_pred_log = model_log.predict(x_log_test)
y_pred_exp = model_exp.predict(x_exp_test)
y_pred_boxcox = model_boxcox.predict(x_boxcox_test)


In [169]:
print("No Transformation Score : " , model.score(x_train,y_train)*100)
print("Log Transformation Score : " , model_log.score(x_log_train,y_log_train)*100)
print("sqrt Transformation Score : " , model_sqrt.score(x_sqrt_train,y_sqrt_train)*100)
print("exp Transformation Score : " , model_exp.score(x_exp_train,y_exp_train)*100)
print("boxcox Transformation Score : " , model_boxcox.score(x_boxcox_train,y_boxcox_train)*100)

No Transformation Score :  77.050191720921
Log Transformation Score :  86.41356250934858
sqrt Transformation Score :  68.29550783171236
exp Transformation Score :  26.30539896424238
boxcox Transformation Score :  78.99452270473854


In [171]:
print("No Transformartion")
print(model.coef_)
print(model.intercept_)

print("Log Transformation")
print(model_log.coef_)
print(model_log.intercept_)

print("exp Transformation")
print(model_exp.coef_)
print(model_exp.intercept_)

print("sqrt Transformation")
print(model_sqrt.coef_)
print(model_sqrt.intercept_)

print("boxcox Transformation")
print(model_boxcox.coef_)
print(model_boxcox.intercept_)

No Transformartion
[[1.47201409]]
[6.85641809]
Log Transformation
[[0.56346649]]
[1.75670781]
exp Transformation
[[58908884.62555607]]
[-8.24537262e+10]
sqrt Transformation
[[0.87727638]]
[1.86917052]
boxcox Transformation
[[0.76467632]]
[3.67571182]


In [116]:
r2 = r2_score(y_test,y_pred)
print("R-Squared : ", r2)

rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("RMSE : ", rmse)

R-Squared :  0.11067437996307095
RMSE :  3.505093110113077


In [172]:
print("Log transformation")
r2 = r2_score(y_log_test,y_pred_log)
print("R-Squared : ", r2)

rmse = np.sqrt(mean_squared_error(y_log_test,y_pred_log))
print("RMSE : ", rmse)

Log transformation
R-Squared :  -0.3740445381764348
RMSE :  0.24928022878182543


In [173]:
print("exp transformation")
r2 = r2_score(y_exp_test,y_pred_exp)
print("R-Squared : ", r2)

rmse = np.sqrt(mean_squared_error(y_exp_test,y_pred_exp))
print("RMSE : ", rmse)

exp transformation
R-Squared :  -10137237.630590938
RMSE :  68376140936.64054


In [174]:
print("boxcox transformation")
r2 = r2_score(y_boxcox_test,y_pred_boxcox)
print("R-Squared : ", r2)

rmse = np.sqrt(mean_squared_error(y_boxcox_test,y_pred_boxcox))
print("RMSE : ", rmse)

boxcox transformation
R-Squared :  0.34716758583861684
RMSE :  1.1692234636924206


In [175]:
print("sqrt transformation")
r2 = r2_score(y_sqrt_test,y_pred_sqrt)
print("R-Squared : ", r2)

rmse = np.sqrt(mean_squared_error(y_sqrt_test,y_pred_sqrt))
print("RMSE : ", rmse)

sqrt transformation
R-Squared :  0.7453974006474182
RMSE :  0.41877575147573154


#### SALARY PROBLEM

In [199]:
salary_df = pd.read_csv("Salary_Data.csv")
salary_df.head()

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [200]:
salary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearsExperience  30 non-null     float64
 1   Salary           30 non-null     float64
dtypes: float64(2)
memory usage: 612.0 bytes


In [201]:
salary_df.describe()

Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
25%,3.2,56720.75
50%,4.7,65237.0
75%,7.7,100544.75
max,10.5,122391.0


In [202]:
salary_df.isnull().sum()

YearsExperience    0
Salary             0
dtype: int64

In [203]:
salary_df['YearsExperience_Log'] = np.log(salary_df['YearsExperience'])
salary_df['Salary_Log'] = np.log(salary_df['Salary'])

salary_df['YearsExperience_sqrt'] = np.sqrt(salary_df['YearsExperience'])
salary_df['Salary_sqrt'] = np.sqrt(salary_df['Salary'])

salary_df['YearsExperience_boxcox'],_ = stats.boxcox(salary_df['YearsExperience'])
salary_df['Salary_boxcox'], _ = stats.boxcox(salary_df['Salary'])

salary_df.head()

Unnamed: 0,YearsExperience,Salary,YearsExperience_Log,Salary_Log,YearsExperience_sqrt,Salary_sqrt,YearsExperience_boxcox,Salary_boxcox
0,1.1,39343.0,0.09531,10.580073,1.048809,198.3507,0.097408,15.936022
1,1.3,46205.0,0.262364,10.740843,1.140175,214.953483,0.278674,16.285303
2,1.5,37731.0,0.405465,10.538237,1.224745,194.244691,0.445296,15.845799
3,2.0,43525.0,0.693147,10.681091,1.414214,208.62646,0.81497,16.15501
4,2.2,39891.0,0.788457,10.593906,1.48324,199.727314,0.948515,15.965914


In [204]:
x = salary_df[['YearsExperience']]
y = salary_df[['Salary']]

x_log = salary_df[['YearsExperience_Log']]
y_log = salary_df[['Salary_Log']]

x_sqrt = salary_df[['YearsExperience_sqrt']]
y_sqrt = salary_df[['Salary_sqrt']]

x_boxcox = salary_df[['YearsExperience_boxcox']]
y_boxcox = salary_df[['Salary_boxcox']]


In [205]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)
x_sqrt_train,x_srqt_test,y_sqrt_train,y_sqrt_test = train_test_split(x_sqrt,y_sqrt,test_size=0.2)
x_log_train,x_log_test,y_log_train,y_log_test = train_test_split(x_log,y_log,test_size=0.2)
x_boxcox_train,x_boxcox_test,y_boxcox_train,y_boxcox_test = train_test_split(x_boxcox,y_boxcox,test_size=0.2)

In [206]:
model = LinearRegression()
model_sqrt = LinearRegression()
model_log = LinearRegression()
model_boxcox = LinearRegression()

In [207]:
model.fit(x_train,y_train)
model_sqrt.fit(x_sqrt_train,y_sqrt_train)
model_log.fit(x_log_train,y_log_train)
model_boxcox.fit(x_boxcox_train, y_boxcox_train)

In [208]:
y_pred = model.predict(x_test)
y_pred_sqrt = model_sqrt.predict(x_srqt_test)
y_pred_log = model_log.predict(x_log_test)
y_pred_boxcox = model_boxcox.predict(x_boxcox_test)


In [209]:
print("No Transformation Score : " , model.score(x_train,y_train)*100)
print("Log Transformation Score : " , model_log.score(x_log_train,y_log_train)*100)
print("sqrt Transformation Score : " , model_sqrt.score(x_sqrt_train,y_sqrt_train)*100)
print("boxcox Transformation Score : " , model_boxcox.score(x_boxcox_train,y_boxcox_train)*100)

No Transformation Score :  95.95834333305253
Log Transformation Score :  91.19651966187699
sqrt Transformation Score :  94.9687786191766
boxcox Transformation Score :  93.32787910623388


In [210]:
print("No Transformartion")
print(model.coef_)
print(model.intercept_)

print("Log Transformation")
print(model_log.coef_)
print(model_log.intercept_)

print("sqrt Transformation")
print(model_sqrt.coef_)
print(model_sqrt.intercept_)

print("boxcox Transformation")
print(model_boxcox.coef_)
print(model_boxcox.intercept_)

No Transformartion
[[9237.53257476]]
[26552.92368259]
Log Transformation
[[0.60026841]]
[10.26685305]
sqrt Transformation
[[78.33605467]]
[97.42011863]
boxcox Transformation
[[0.67237854]]
[15.6850015]


In [211]:
r2 = r2_score(y_test,y_pred)
print("R-Squared : ", r2)

rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("RMSE : ", rmse)

R-Squared :  0.9407677969931093
RMSE :  6442.173894526046


In [212]:
print("Log transformation")
r2 = r2_score(y_log_test,y_pred_log)
print("R-Squared : ", r2)

rmse = np.sqrt(mean_squared_error(y_log_test,y_pred_log))
print("RMSE : ", rmse)

Log transformation
R-Squared :  0.8527398396841676
RMSE :  0.1343146737500836


In [213]:
print("boxcox transformation")
r2 = r2_score(y_boxcox_test,y_pred_boxcox)
print("R-Squared : ", r2)

rmse = np.sqrt(mean_squared_error(y_boxcox_test,y_pred_boxcox))
print("RMSE : ", rmse)

boxcox transformation
R-Squared :  0.9792868720085286
RMSE :  0.11597703614451249


In [214]:
print("sqrt transformation")
r2 = r2_score(y_sqrt_test,y_pred_sqrt)
print("R-Squared : ", r2)

rmse = np.sqrt(mean_squared_error(y_sqrt_test,y_pred_sqrt))
print("RMSE : ", rmse)

sqrt transformation
R-Squared :  0.8468513331094205
RMSE :  15.213727651721825
