## Modelling

### Model Imports / Data Splits

In [11]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [12]:
# Create X and y
df = pd.read_csv('train.csv')
x = df.drop(["Strength"], axis=1).drop(["id"], axis=1)
y = df['Strength']

# Create K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf.get_n_splits(x)

# Create Train Test Split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [13]:
x.head()
print(x.shape)

(5407, 8)


In [14]:
y.head()
print(y.shape)

(5407,)


### Linear Models

In [15]:
# Linear Regression

for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = LinearRegression()
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('')

R2 Score:  0.23829629673484254
MSE:  207.27399542045347
MAE:  11.463267268124591
RMSE:  14.397013420166472

R2 Score:  0.1730954143297203
MSE:  215.5711865476042
MAE:  11.479107841246014
RMSE:  14.682342679136877

R2 Score:  0.2010132603793059
MSE:  215.46412025146626
MAE:  11.600727937734622
RMSE:  14.67869613594703

R2 Score:  0.21484074717285528
MSE:  213.21150736702617
MAE:  11.289895860620563
RMSE:  14.601763844379423

R2 Score:  0.20698829030310262
MSE:  213.82513232234098
MAE:  11.3547198841332
RMSE:  14.62276076267204



In [16]:
# Lasso Regression

for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = Lasso()
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('')
    
print(y_pred)

R2 Score:  0.23803770361086807
MSE:  207.3443635042171
MAE:  11.47555582572651
RMSE:  14.399457055882944

R2 Score:  0.17247037198466486
MSE:  215.73413294105887
MAE:  11.490405898358873
RMSE:  14.687890690669606

R2 Score:  0.20039956492333055
MSE:  215.6296165544297
MAE:  11.611284053203088
RMSE:  14.684332349631347

R2 Score:  0.21458383919236468
MSE:  213.28127122394795
MAE:  11.300342315409596
RMSE:  14.604152533575784

R2 Score:  0.20771064213900936
MSE:  213.63035969161263
MAE:  11.35719508315948
RMSE:  14.616099332298361

[58.31854521 36.85546365 27.15736117 ... 35.55313949 29.67003356
 29.01595873]


In [17]:
# Ridge Regression

for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = Ridge()
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    print(y_pred)
    
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('')

[42.12861408 31.14823034 40.43406538 ... 31.86580424 41.52827786
 40.46708629]
R2 Score:  0.2382963399116359
MSE:  207.27398367123075
MAE:  11.463267814362982
RMSE:  14.397013012122715

[26.43493486 40.56337238 34.76879859 ... 27.11970797 42.98770235
 35.80041385]
R2 Score:  0.17309531825825197
MSE:  215.57121159310617
MAE:  11.479108886517206
RMSE:  14.682343532049172

[33.20380613 29.62295122 31.66013786 ... 29.05096705 28.81879415
 44.75584819]
R2 Score:  0.20101319730232237
MSE:  215.46413726154424
MAE:  11.600729063476543
RMSE:  14.678696715360811

[35.67331855 31.70777661 54.9132088  ... 31.88109443 30.44536453
 42.38590987]
R2 Score:  0.2148407136005942
MSE:  213.21151648363835
MAE:  11.289896473484681
RMSE:  14.601764156554452

[58.52216894 36.65043196 27.13383731 ... 35.86862873 29.55776584
 28.95887323]
R2 Score:  0.20698844815026485
MSE:  213.82508976093874
MAE:  11.35471967272111
RMSE:  14.62275930735847



### Nonlinear Models

In [18]:
# Random Forest Regression

for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model =  RandomForestRegressor()
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('')

R2 Score:  0.4202183707157503
MSE:  157.76955561327085
MAE:  9.817269347442224
RMSE:  12.560635159627513

R2 Score:  0.36211514849113235
MSE:  166.29439079606163
MAE:  9.865181361602788
RMSE:  12.895518244570926

R2 Score:  0.37333402705103147
MSE:  168.99408445386604
MAE:  9.849387078354424
RMSE:  12.99977247700382

R2 Score:  0.3800681385721675
MSE:  168.3436909950976
MAE:  9.852994668496326
RMSE:  12.974732790893908

R2 Score:  0.34003621051675037
MSE:  177.95051811800695
MAE:  10.23259063353426
RMSE:  13.339809523303058



In [19]:
# CatBoost Regression

for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = cb.CatBoostRegressor()  
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('')

Learning rate set to 0.051602
0:	learn: 16.0444195	total: 156ms	remaining: 2m 35s
1:	learn: 15.7355588	total: 159ms	remaining: 1m 19s
2:	learn: 15.4738669	total: 161ms	remaining: 53.4s
3:	learn: 15.2032389	total: 162ms	remaining: 40.5s
4:	learn: 14.9648815	total: 164ms	remaining: 32.7s
5:	learn: 14.7376112	total: 167ms	remaining: 27.6s
6:	learn: 14.5347357	total: 169ms	remaining: 23.9s
7:	learn: 14.3363510	total: 171ms	remaining: 21.2s
8:	learn: 14.1558874	total: 173ms	remaining: 19.1s
9:	learn: 13.9832388	total: 176ms	remaining: 17.4s
10:	learn: 13.8424866	total: 179ms	remaining: 16.1s
11:	learn: 13.6964034	total: 181ms	remaining: 14.9s
12:	learn: 13.5719142	total: 183ms	remaining: 13.9s
13:	learn: 13.4590655	total: 187ms	remaining: 13.2s
14:	learn: 13.3427604	total: 189ms	remaining: 12.4s
15:	learn: 13.2449556	total: 191ms	remaining: 11.7s
16:	learn: 13.1541890	total: 193ms	remaining: 11.2s
17:	learn: 13.0637518	total: 195ms	remaining: 10.7s
18:	learn: 12.9926848	total: 197ms	remaini

In [20]:
# XGBoost Regression

for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    
    model = xgb.XGBRegressor()  
    
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('')

R2 Score:  0.4126696823145717
MSE:  159.8236966111277
MAE:  9.781385447366402
RMSE:  12.64213971648501

R2 Score:  0.3761819762991444
MSE:  162.62721708087906
MAE:  9.683981007348587
RMSE:  12.752537672199956

R2 Score:  0.3706694559032728
MSE:  169.7126439114007
MAE:  9.812811925272275
RMSE:  13.027380546809889

R2 Score:  0.36645068196647834
MSE:  172.04153756437603
MAE:  9.94669418873112
RMSE:  13.116460557802018

R2 Score:  0.3723873052657166
MSE:  169.22747275703207
MAE:  10.017482199302766
RMSE:  13.008746010166854



### Submission

In [21]:
# Load Test Data and Preprocess

X_test = pd.read_csv('test.csv').drop(["id"], axis=1)

In [22]:
# Model Ensemble 
models = [LinearRegression(), Lasso(), Ridge()]
preds = []

for model in models:
    model.fit(x, y)
    preds.append(model.predict(X_test))
    
mean = np.mean(preds, axis=0)
sample = pd.read_csv('sample_submission.csv')
#sample['Strength'] = mean
sample.to_csv('submission.csv', index=False)
    
    

    

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- id


In [None]:
# Non-Linear Model Ensemble 
models = [cb.CatBoostRegressor(), xgb.XGBRegressor(), RandomForestRegressor()]    
preds = []

for model in models:
    model.fit(x, y)
    preds.append(model.predict(X_test))
    
mean = np.mean(preds, axis=0)
sample = pd.read_csv('sample_submission.csv')
#sample['Strength'] = mean
sample.to_csv('submission.csv', index=False)

In [None]:
# Entire Ensemble

models = [LinearRegression(), Lasso(), Ridge(), cb.CatBoostRegressor(), xgb.XGBRegressor(), RandomForestRegressor()]
preds = []

for model in models:
    model.fit(x, y)
    preds.append(model.predict(X_test))
    
mean = np.mean(preds, axis=0)
sample = pd.read_csv('sample_submission.csv')
#sample['Strength'] = mean
sample.to_csv('submission.csv', index=False)