## Modelling

### Model Imports / Data Splits

In [48]:
# Import Linear Regression Packages

from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Import Decision Tree Packages

from sklearn.tree import DecisionTreeRegressor

# Import Random Forest Packages

from sklearn.ensemble import RandomForestRegressor

# Import XgBoost, LightGBM, and CatBoost Packages

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Import Train Test Split and k-Fold Cross Validation

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold

# Import Metrics

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Import Data Manipulation Packages

import pandas as pd
import numpy as np

In [49]:
# Create X and y
x = pd.read_csv('x.csv')
y = pd.read_csv('y.csv')

# Create K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf.get_n_splits(x)

# Create Train Test Split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [50]:
x.head()
print(x.shape)

(37137, 8)


In [51]:
y.head()
print(y.shape)

(37137, 1)


### Linear Models

In [52]:
# Linear Regression

for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = LinearRegression()
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('')

R2 Score:  0.5973894621135667
MSE:  0.5473421072488117
MAE:  0.5571920159937205
RMSE:  0.739825727620236

R2 Score:  0.6103672291354474
MSE:  0.5190319349866706
MAE:  0.552972642818307
RMSE:  0.7204387100834259

R2 Score:  0.6134092507486852
MSE:  0.524200439266629
MAE:  0.5494221518913832
RMSE:  0.7240168777498415

R2 Score:  0.6163494656333648
MSE:  0.5102983230810044
MAE:  0.5439338548644912
RMSE:  0.714351680253504

R2 Score:  0.6177421664034521
MSE:  0.5096535952698756
MAE:  0.5436707570874992
RMSE:  0.7139002698345727



In [53]:
# Lasso Regression

for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = Lasso()
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('')
    
print(y_pred)

R2 Score:  -7.19949165872702e-05
MSE:  1.3595806904899517
MAE:  0.9176034232749167
RMSE:  1.166010587640589

R2 Score:  -0.00021506162521345296
MSE:  1.3323919281384458
MAE:  0.9165767500056885
RMSE:  1.1542928259928005

R2 Score:  -0.00012119599076854115
MSE:  1.356121353843917
MAE:  0.9248372125803813
RMSE:  1.1645262357902963

R2 Score:  -3.787866584592514e-05
MSE:  1.3301627569557395
MAE:  0.9100020607302933
RMSE:  1.153326821397881

R2 Score:  -3.170492374904299e-05
MSE:  1.3333140854248144
MAE:  0.9124189509688222
RMSE:  1.1546922037602985

[2.08273847 2.08273779 2.08273719 ... 2.07955399 2.07955382 2.07955356]


In [54]:
# Ridge Regression

for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = Ridge()
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
        
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('')

R2 Score:  0.5973408080598819
MSE:  0.5474082515986587
MAE:  0.5572117268363843
RMSE:  0.7398704289256726

R2 Score:  0.6104249548831644
MSE:  0.5189550382039139
MAE:  0.5529079883015818
RMSE:  0.7203853400812054

R2 Score:  0.6133811334878125
MSE:  0.5242385650638162
MAE:  0.5494469029247244
RMSE:  0.7240432066277649

R2 Score:  0.6163759387236152
MSE:  0.5102631108960825
MAE:  0.5439045059014853
RMSE:  0.7143270335750163

R2 Score:  0.6177324623887329
MSE:  0.5096665333591848
MAE:  0.5436575149222608
RMSE:  0.7139093313293956



### Nonlinear Models

In [55]:
# Random Forest Regression

for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model =  RandomForestRegressor()
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('')

  model.fit(x_train, y_train)


R2 Score:  0.6693195293164231
MSE:  0.44955441703076415
MAE:  0.47391995367528283
RMSE:  0.6704881930584342



  model.fit(x_train, y_train)


R2 Score:  0.6877557253753636
MSE:  0.4159422978907304
MAE:  0.46769715550619295
RMSE:  0.6449358866513247



  model.fit(x_train, y_train)


R2 Score:  0.6881663601366761
MSE:  0.422833012199684
MAE:  0.4631160591625153
RMSE:  0.6502561127737931



  model.fit(x_train, y_train)


R2 Score:  0.6993791026553211
MSE:  0.39985957546326206
MAE:  0.45424086440016176
RMSE:  0.6323445069447999



  model.fit(x_train, y_train)


R2 Score:  0.6954181315076586
MSE:  0.40609042036003434
MAE:  0.4586856320721692
RMSE:  0.6372522423342537



In [56]:
# CatBoost Regression

for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = cb.CatBoostRegressor()  
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('')

Learning rate set to 0.069967
0:	learn: 1.1080784	total: 5.68ms	remaining: 5.67s
1:	learn: 1.0637189	total: 9.14ms	remaining: 4.56s
2:	learn: 1.0251744	total: 12.7ms	remaining: 4.22s
3:	learn: 0.9880891	total: 16.4ms	remaining: 4.08s
4:	learn: 0.9544430	total: 19.8ms	remaining: 3.93s
5:	learn: 0.9238877	total: 23.2ms	remaining: 3.84s
6:	learn: 0.8961150	total: 26.7ms	remaining: 3.79s
7:	learn: 0.8712295	total: 30ms	remaining: 3.72s
8:	learn: 0.8484713	total: 33.5ms	remaining: 3.69s
9:	learn: 0.8279853	total: 37.2ms	remaining: 3.68s
10:	learn: 0.8095567	total: 41.3ms	remaining: 3.71s
11:	learn: 0.7928059	total: 44.9ms	remaining: 3.7s
12:	learn: 0.7777833	total: 48.8ms	remaining: 3.71s
13:	learn: 0.7642577	total: 52.5ms	remaining: 3.7s
14:	learn: 0.7524954	total: 56.1ms	remaining: 3.69s
15:	learn: 0.7413601	total: 59.8ms	remaining: 3.68s
16:	learn: 0.7314235	total: 63.4ms	remaining: 3.66s
17:	learn: 0.7223891	total: 66.9ms	remaining: 3.65s
18:	learn: 0.7149375	total: 70.6ms	remaining: 3.

In [57]:
# XGBoost Regression

for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    
    model = xgb.XGBRegressor()  
    
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('')

R2 Score:  0.6722546467923143
MSE:  0.445564175263346
MAE:  0.4686700995485309
RMSE:  0.6675059365004524

R2 Score:  0.6900881203863976
MSE:  0.41283530180042705
MAE:  0.4667644989454746
RMSE:  0.6425226080072413

R2 Score:  0.6858698923176153
MSE:  0.4259469238539197
MAE:  0.46534949859951
RMSE:  0.6526460938777767

R2 Score:  0.7030562774342286
MSE:  0.394968519788206
MAE:  0.4509713254608748
RMSE:  0.6284652096880192

R2 Score:  0.6958666517203231
MSE:  0.40549242100241517
MAE:  0.45657103172421093
RMSE:  0.6367828680189309



### Submission

In [58]:
# Load Test Data and Preprocess

X_test = pd.read_csv('x_test.csv')

print(X_test.shape)

(24759, 8)


In [59]:
# Model Ensemble 
models = [LinearRegression(), Lasso(), Ridge()]
preds = np.zeros((X_test.shape[0], len(models)))

for i, model in enumerate(models):
    model.fit(x, y)
    preds[:, i] = model.predict(X_test).reshape(-1)
    
mean = np.mean(preds, axis=1)
sample = pd.read_csv('sample_submission.csv')
sample['MedHouseVal'] = mean
sample.to_csv('linear_submission.csv', index=False)
    
    

    

In [61]:
# Non-Linear Model Ensemble 
models = [cb.CatBoostRegressor(), xgb.XGBRegressor(), RandomForestRegressor()]    
preds = np.zeros((X_test.shape[0], len(models)))


for i, model in enumerate(models):
    model.fit(x, y)
    preds[:, i] = model.predict(X_test).reshape(-1)
    
    
mean = np.mean(preds, axis=1)
sample = pd.read_csv('sample_submission.csv')
sample['MedHouseVal'] = mean
sample.to_csv('nonlinear_submission.csv', index=False)

Learning rate set to 0.072478
0:	learn: 1.1093075	total: 5.14ms	remaining: 5.14s
1:	learn: 1.0636942	total: 9.72ms	remaining: 4.85s
2:	learn: 1.0237494	total: 13.9ms	remaining: 4.62s
3:	learn: 0.9856351	total: 19.1ms	remaining: 4.75s
4:	learn: 0.9514502	total: 23.9ms	remaining: 4.76s
5:	learn: 0.9208488	total: 28.6ms	remaining: 4.74s
6:	learn: 0.8939079	total: 33.5ms	remaining: 4.75s
7:	learn: 0.8687573	total: 38.3ms	remaining: 4.75s
8:	learn: 0.8458937	total: 43.2ms	remaining: 4.75s
9:	learn: 0.8255383	total: 48.7ms	remaining: 4.82s
10:	learn: 0.8070087	total: 53.2ms	remaining: 4.78s
11:	learn: 0.7902635	total: 58.7ms	remaining: 4.83s
12:	learn: 0.7754019	total: 63.2ms	remaining: 4.8s
13:	learn: 0.7621507	total: 68.3ms	remaining: 4.81s
14:	learn: 0.7505459	total: 73.1ms	remaining: 4.8s
15:	learn: 0.7398906	total: 78.4ms	remaining: 4.82s
16:	learn: 0.7305993	total: 83.1ms	remaining: 4.8s
17:	learn: 0.7216685	total: 87.7ms	remaining: 4.79s
18:	learn: 0.7139720	total: 93.4ms	remaining: 4

  model.fit(x, y)


In [None]:
# Entire Ensemble

models = [LinearRegression(), Lasso(), Ridge(), cb.CatBoostRegressor(), xgb.XGBRegressor(), RandomForestRegressor()]
preds = np.zeros((X_test.shape[0], len(models)))

for i, model in enumerate(models):
    model.fit(x, y)
    preds[:, i] = model.predict(X_test).reshape(-1)
    
mean = np.mean(preds, axis=1)
sample = pd.read_csv('sample_submission.csv')
sample['MedHouseVal'] = mean
sample.to_csv('submission.csv', index=False)

Learning rate set to 0.072478
0:	learn: 1.1093075	total: 5.3ms	remaining: 5.29s
1:	learn: 1.0636942	total: 9.7ms	remaining: 4.84s
2:	learn: 1.0237494	total: 13.7ms	remaining: 4.56s
3:	learn: 0.9856351	total: 17.6ms	remaining: 4.38s
4:	learn: 0.9514502	total: 21.4ms	remaining: 4.25s
5:	learn: 0.9208488	total: 25.5ms	remaining: 4.22s
6:	learn: 0.8939079	total: 29.5ms	remaining: 4.18s
7:	learn: 0.8687573	total: 33.3ms	remaining: 4.13s
8:	learn: 0.8458937	total: 37.5ms	remaining: 4.13s
9:	learn: 0.8255383	total: 41.8ms	remaining: 4.14s
10:	learn: 0.8070087	total: 46.1ms	remaining: 4.14s
11:	learn: 0.7902635	total: 50.4ms	remaining: 4.15s
12:	learn: 0.7754019	total: 54.7ms	remaining: 4.15s
13:	learn: 0.7621507	total: 58.6ms	remaining: 4.13s
14:	learn: 0.7505459	total: 62.7ms	remaining: 4.12s
15:	learn: 0.7398906	total: 67.3ms	remaining: 4.14s
16:	learn: 0.7305993	total: 71.6ms	remaining: 4.14s
17:	learn: 0.7216685	total: 76.4ms	remaining: 4.17s
18:	learn: 0.7139720	total: 80.9ms	remaining: 

  model.fit(x, y)
  arr = asanyarray(a)


ValueError: could not broadcast input array from shape (24759,1) into shape (24759,)