## Modelling

### Model Imports / Data Splits

In [None]:
# Import Linear Regression Packages

from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Import Decision Tree Packages

from sklearn.tree import DecisionTreeRegressor

# Import Random Forest Packages

from sklearn.ensemble import RandomForestRegressor

# Import XGBoost, LightGBM, and CatBoost Packages

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Import Train Test Split and k-Fold Cross Validation

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold

# Import Metrics

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Create X and y
# NOTE(review): both assignments below are unfilled placeholders -- `##` is
# parsed as a comment, so each line is an incomplete statement and this cell
# raises a SyntaxError until real expressions are supplied.
# Later cells call `.head()`, `.shape` and `.iloc` on these names, so they
# presumably need to be pandas objects -- TODO confirm when filling them in.
x = ##
y = ##

# Create K-Fold Cross Validation
# Five shuffled folds with a fixed seed, shared by the cross-validation cells below.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# The return value of this call is neither stored nor displayed (it is not the
# cell's last expression).
kf.get_n_splits(x)

# Create Train Test Split
# 80/20 hold-out split with a fixed seed.
# NOTE(review): the cross-validation cells below rebind y_train / y_test and
# the submission section rebinds X_test, so these hold-out variables are
# overwritten before anything visible in this notebook reads them.

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Report the feature matrix dimensions, then preview its first rows.
# The preview has to be the cell's final expression: Jupyter only renders the
# value of the last expression, so a bare `x.head()` on an earlier line is
# evaluated and silently discarded.
print(x.shape)
x.head()

In [None]:
# Report the target's dimensions, then preview its first rows.
# The preview has to be the cell's final expression: Jupyter only renders the
# value of the last expression, so a bare `y.head()` on an earlier line is
# evaluated and silently discarded.
print(y.shape)
y.head()

### Linear Models

In [None]:
# Linear Regression: fit and score a fresh model on every fold of `kf`.

for train_index, test_index in kf.split(x):
    # Select this fold's rows by position: training partition, then validation partition.
    x_train, y_train = x.iloc[train_index], y.iloc[train_index]
    x_test, y_test = x.iloc[test_index], y.iloc[test_index]

    model = LinearRegression()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # MSE feeds both the MSE and the RMSE rows, so evaluate it a single time.
    mse = mean_squared_error(y_test, y_pred)
    fold_metrics = (
        ('R2 Score: ', r2_score(y_test, y_pred)),
        ('MSE: ', mse),
        ('MAE: ', mean_absolute_error(y_test, y_pred)),
        ('RMSE: ', np.sqrt(mse)),
    )
    for label, value in fold_metrics:
        print(label, value)
    # Blank line separates one fold's report from the next.
    print('')

In [None]:
# Lasso Regression: K-fold cross-validation with per-fold metrics.

for train_index, test_index in kf.split(x):
    # Positional row selection for this fold's train / validation partitions.
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A new estimator is created for every fold.
    model = Lasso()
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)

    # Compute MSE once and reuse it for the RMSE line.
    mse = mean_squared_error(y_test, y_pred)
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mse)
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mse))
    print('')

# Sanity-check preview of the last fold's predictions: show only the first few
# values rather than dumping the whole prediction array into the cell output.
print(y_pred[:5])

In [None]:
# Ridge Regression: K-fold cross-validation with per-fold metrics.

for train_index, test_index in kf.split(x):
    # Positional row selection for this fold's train / validation partitions.
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A new estimator is created for every fold.
    model = Ridge()
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    # Preview only the first few predictions per fold; printing the full array
    # on every fold buries the metric lines below.
    print(y_pred[:5])

    # Compute MSE once and reuse it for the RMSE line.
    mse = mean_squared_error(y_test, y_pred)
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mse)
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mse))
    print('')

### Nonlinear Models

In [None]:
# Random Forest Regression: K-fold cross-validation with per-fold metrics.

for train_index, test_index in kf.split(x):
    # Positional row selection for this fold's train / validation partitions.
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Seed the forest so the fold scores are reproducible across runs; the
    # value matches the seed already used for KFold and train_test_split.
    model = RandomForestRegressor(random_state=42)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)

    # Compute MSE once and reuse it for the RMSE line.
    mse = mean_squared_error(y_test, y_pred)
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mse)
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mse))
    print('')

In [None]:
# CatBoost Regression: K-fold cross-validation with per-fold metrics.

for train_index, test_index in kf.split(x):
    # Positional row selection for this fold's train / validation partitions.
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # verbose=0 silences CatBoost's per-iteration training log, which would
    # otherwise be printed for every fold and drown out the metrics below.
    model = cb.CatBoostRegressor(verbose=0)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)

    # Compute MSE once and reuse it for the RMSE line.
    mse = mean_squared_error(y_test, y_pred)
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mse)
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', np.sqrt(mse))
    print('')

In [None]:
# XGBoost Regression: train and evaluate one booster per fold of `kf`.

for train_index, test_index in kf.split(x):
    # Training partition of this fold.
    x_train = x.iloc[train_index]
    y_train = y.iloc[train_index]
    # Validation partition of this fold.
    x_test = x.iloc[test_index]
    y_test = y.iloc[test_index]

    model = xgb.XGBRegressor()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Evaluate the metrics first, then emit the report in a fixed order.
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    for label, value in zip(('R2 Score: ', 'MSE: ', 'MAE: ', 'RMSE: '), (r2, mse, mae, rmse)):
        print(label, value)
    # Blank line separates one fold's report from the next.
    print('')

### Submission

In [None]:
# Load Test Data and Preprocess
# NOTE(review): only the load happens in this cell -- no preprocessing is
# applied to the frame here. The ensemble cells below pass X_test straight to
# model.predict(), so it presumably has to carry the same columns as `x`
# already -- TODO confirm.
# NOTE(review): this rebinds X_test, replacing the hold-out features returned
# by train_test_split earlier in the notebook.

X_test = pd.read_csv('test.csv')

In [None]:
# Model Ensemble
# Fit each linear model on the full training data, average their test-set
# predictions, and write the average into the submission file.
models = [LinearRegression(), Lasso(), Ridge()]
preds = []

for model in models:
    model.fit(x, y)
    # Flatten each prediction so an (n, 1) result (2-D target) averages and
    # assigns exactly like an (n,) result.
    preds.append(np.ravel(model.predict(X_test)))

# Element-wise mean across models -> one prediction per test row.
mean = np.mean(preds, axis=0)
sample = pd.read_csv('sample_submission.csv')
# Store the ensemble prediction; without this assignment the file written
# below is just an unmodified copy of sample_submission.csv.
# Assumes the submission's target column is named 'Strength' (taken from the
# previously commented-out assignment) -- TODO confirm.
sample['Strength'] = mean
# NOTE: the other ensemble cells in this section write to the same
# 'submission.csv'; whichever cell runs last determines the file's contents.
sample.to_csv('submission.csv', index=False)
    
    

    

In [None]:
# Non-Linear Model Ensemble
# Fit each tree-based model on the full training data, average their test-set
# predictions, and write the average into the submission file.
models = [
    # verbose=0 silences CatBoost's per-iteration training log.
    cb.CatBoostRegressor(verbose=0),
    xgb.XGBRegressor(),
    # Seeded so the submission is reproducible; matches the seed used for the
    # KFold / train_test_split objects earlier in the notebook.
    RandomForestRegressor(random_state=42),
]
preds = []

for model in models:
    model.fit(x, y)
    # Flatten each prediction so every model contributes an (n,) vector.
    preds.append(np.ravel(model.predict(X_test)))

# Element-wise mean across models -> one prediction per test row.
mean = np.mean(preds, axis=0)
sample = pd.read_csv('sample_submission.csv')
# Store the ensemble prediction; without this assignment the file written
# below is just an unmodified copy of sample_submission.csv.
# Assumes the submission's target column is named 'Strength' (taken from the
# previously commented-out assignment) -- TODO confirm.
sample['Strength'] = mean
# NOTE: the other ensemble cells in this section write to the same
# 'submission.csv'; whichever cell runs last determines the file's contents.
sample.to_csv('submission.csv', index=False)

In [None]:
# Entire Ensemble
# Fit every linear and tree-based model on the full training data, average
# their test-set predictions, and write the average into the submission file.

models = [
    LinearRegression(),
    Lasso(),
    Ridge(),
    # verbose=0 silences CatBoost's per-iteration training log.
    cb.CatBoostRegressor(verbose=0),
    xgb.XGBRegressor(),
    # Seeded so the submission is reproducible; matches the seed used for the
    # KFold / train_test_split objects earlier in the notebook.
    RandomForestRegressor(random_state=42),
]
preds = []

for model in models:
    model.fit(x, y)
    # Flatten each prediction: with a 2-D target the linear models return
    # (n, 1) while the tree models return (n,), and mixed shapes cannot be
    # stacked for averaging.
    preds.append(np.ravel(model.predict(X_test)))

# Element-wise mean across models -> one prediction per test row.
mean = np.mean(preds, axis=0)
sample = pd.read_csv('sample_submission.csv')
# Store the ensemble prediction; without this assignment the file written
# below is just an unmodified copy of sample_submission.csv.
# Assumes the submission's target column is named 'Strength' (taken from the
# previously commented-out assignment) -- TODO confirm.
sample['Strength'] = mean
# NOTE: the other ensemble cells in this section write to the same
# 'submission.csv'; whichever cell runs last determines the file's contents.
sample.to_csv('submission.csv', index=False)