# Individual Models

## Pre-process

In [None]:
import utility
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.datasets import make_regression
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge

ModuleNotFoundError: ignored

## Data Generation via SMOTE-NC

In [None]:
# Sweep a range of test fractions and record how each train/test split affects
# the quality of the SMOTE-generated data.

splits = [0.1,0.2,0.25,0.3,0.4,0.5,0.6,0.7]
all_max_corr = []
all_mean_corr = []

# Load once: nothing inside the loop modifies `data`, so re-reading the file for
# every split was redundant. Raw string because '\k' and '\d' are not valid
# escape sequences (the resulting path value is unchanged).
data = utility.load_and_preprocess(r'\kaggle_dataset\data.csv')

for split in splits:

    drop_variables = ['elongation','roughness','tension_strength']
    X = data.drop(drop_variables, axis=1)
    y = data['tension_strength']

    # Split before generation so only the training rows are handed to SMOTE
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=5)

    # Swap roles for generation: 'material_pla' becomes the label passed as y, and the
    # regression target takes over its column slot (renamed to 'tension_strength').
    # assign/rename build a new frame instead of writing into the slice returned by
    # train_test_split, which is what triggered SettingWithCopyWarning.
    material_labels = X_train['material_pla']
    X_train = (
        X_train
        .assign(material_pla=y_train)
        .rename(columns={'material_pla': 'tension_strength'})
    )
    y_train = material_labels

    # use training data for generation
    X_gen, y_gen = utility.generate_data_smote(X=X_train, y=y_train, num_of_desired_samples=100) #desired samples per class

    # Reference frame for the comparison: the entire dataset without 'material_pla'.
    # Re-wrapping the numpy values replaces the column names with default integer labels
    # (presumably to line up with X_gen's labels - confirm in utility).
    drop_variables = ['elongation','roughness','material_pla']
    X = data.drop(drop_variables, axis=1)
    X = pd.DataFrame(data=X.to_numpy())
    y = data['material_pla']

    # evaluate quality of generation by comparing to distribution of entire dataset
    max_corr, mean_corr = utility.evaluate_data_generation(X_orig=X, X_gen=X_gen, plot=False)

    all_max_corr.append(max_corr)
    all_mean_corr.append(mean_corr)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


AttributeError: 'SMOTENC' object has no attribute '_validate_data'

In [None]:
# Plot the two correlation-difference curves collected by the split sweep in the previous cell

fig, ax = plt.subplots(figsize=(6,6))
ax.plot(splits, all_max_corr, label='Max absolute')
ax.plot(splits, all_mean_corr, label='Mean absolute')
ax.set_title('SMOTE data generation quality over different train/test splits')
ax.set_xlabel('Test data fraction of dataset')
ax.set_ylabel('Correlation difference')
ax.legend()

In [None]:
# USING BEST SPLIT:

# load and preprocess
# Raw string: '\k' and '\d' are not valid escape sequences, so the non-raw literal
# raised an invalid-escape warning. The resulting path value is unchanged.
data = utility.load_and_preprocess(r'\kaggle_dataset\data.csv')

# Features: everything except the three measured output columns.
# Target: tension_strength.
drop_variables = ['elongation','roughness','tension_strength']
X = data.drop(drop_variables, axis=1)
y = data['tension_strength']

In [None]:
# Split data before data generation

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=101)

# If using bootstrap uncomment
# drop_variables = ['elongation','roughness','tension_strength']
# X_test = data.drop(drop_variables, axis=1)
# y_test = data['tension_strength']
# X_train = X
# y_train = y


# Swap roles for generation: 'material_pla' becomes the label passed to SMOTE as y, and
# the regression target takes over its column slot (renamed to 'tension_strength').
# assign/rename return a new frame rather than writing into the slice produced by
# train_test_split, which avoids SettingWithCopyWarning.
y_8 = y_train
y_train = X_train['material_pla']
X_train = (
    X_train
    .assign(material_pla=y_8)
    .rename(columns={'material_pla': 'tension_strength'})
)
print(X_train)

In [None]:
# Generate synthetic samples from the training portion only
X_gen, y_gen = utility.generate_data_smote(
    X=X_train,
    y=y_train,
    num_of_desired_samples=100,  # desired samples per class
)

# Reference frame: the original dataset without 'material_pla'.
# (The numpy re-wrap used in the split sweep is deliberately left disabled here:
#  X_orig = pd.DataFrame(data=X_orig.to_numpy()))
drop_variables = ['elongation','roughness','material_pla']
X_orig = data.drop(columns=drop_variables)

# Compare the generated data against the distribution of the entire dataset.
# NOTE(review): the split-sweep cell unpacks two values (max, mean) from this helper;
# confirm what a single-name assignment receives here.
mean_corr_diff = utility.evaluate_data_generation(X_orig=X_orig, X_gen=X_gen)


In [None]:
# Reformat the generated data back into regression form: pull the regression target
# out of the feature matrix and put the generated class label back in its place.
# NOTE(review): column 8 is assumed to be the slot that carried the regression target
# during generation - confirm against the column order of the frame given to SMOTE.
TARGET_COLUMN = 8

# Work on copies so X_gen itself is never modified. The original swapped the column in
# place, so running this cell a second time silently turned y_generated into the class labels.
y_generated = X_gen[TARGET_COLUMN].copy()
X_generated = X_gen.copy()
X_generated[TARGET_COLUMN] = y_gen
print(X_generated)

In [None]:
# Models are built (via CV) on the generated data;
# the original held-out data is used to evaluate them (.632 bootstrap was the stated plan).
X_train, y_train = X_generated, y_generated

# Compare the generated training features against the held-out original features
mean_corr_diff = utility.evaluate_data_generation(X_gen=X_train, X_orig=X_test)

In [None]:
# Debugging aid: show which container type holds the held-out targets.
print(type(y_test))

In [None]:
# Standardise features: statistics are learned from the training set only and then
# applied unchanged to the test set.
# Note: X_train / X_test are overwritten, so re-running this cell scales them again.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## SVR Model

In [None]:
# SVR: hyperparameter selection with a 5-fold cross-validated grid search.
# NOTE(review): an earlier note recorded "C = 70 --> 100 samples", but the grid below
# pins C to 60 - confirm which value is intended.
param_grid = dict(
    C=[60],
    gamma=[0.3, 0.2, 0.1, 0.01, 0.001],
    kernel=['rbf', 'poly', 'sigmoid'],
)
grid = GridSearchCV(estimator=SVR(), param_grid=param_grid, cv=5, refit=True, verbose=2)

# refit=True retrains the winning configuration on the full training set
best_svr = grid.fit(X_train, y_train).best_estimator_


In [None]:
# SVR evaluation on the training data and on the hold-out test set.
# utility.evaluate returns three values, unpacked here as (mape, rmse, r2).

# training set (plot=False is passed)
y_train_pred = grid.predict(X_train)
print('Train Results:\n')
train_mape, train_rmse, train_r2 = utility.evaluate(y_train_pred, y_train, plot=False)

# hold-out test set (helper called without the plot argument)
y_test_pred = grid.predict(X_test)
print('\nTest Results:\n')
test_mape, test_rmse, test_r2 = utility.evaluate(y_test_pred, y_test)

# show the selected configuration
print(best_svr)

## Ridge Regression

In [None]:
# Ridge: pick the regularisation strength with 5-fold cross-validation
grid = RidgeCV(alphas=[0.001,0.0001,0.01,0.05,0.1,0.15,0.2, 0.5], cv=5)
grid.fit(X_train, y_train)

# Unfitted Ridge carrying the selected alpha (kept as a base model for the ensemble cells)
best_ridge = Ridge(alpha=grid.alpha_)

print(best_ridge)


In [None]:
# Ridge evaluation on the training data and on the hold-out test set.
# utility.evaluate returns three values, unpacked here as (mape, rmse, r2).

# training set (plot=False is passed)
y_train_pred = grid.predict(X_train)
print('Train Results:\n')
train_mape, train_rmse, train_r2 = utility.evaluate(y_train_pred, y_train, plot=False)

# hold-out test set (helper called without the plot argument)
y_test_pred = grid.predict(X_test)
print('\nTest Results:\n')
test_mape, test_rmse, test_r2 = utility.evaluate(y_test_pred, y_test)



## Lasso Regression

In [None]:
# model/hyperparameter selection
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso


# Parameter search - 5-fold CV. The previous comment said LOOCV, but LassoCV with no
# `cv` argument performs k-fold cross-validation, not leave-one-out. cv=5 is now passed
# explicitly so this cell matches the Ridge and Elastic Net searches.
grid = LassoCV(alphas=[0.01,0.05,0.001,0.005,0.0001,0.1,0.3,0.31,0.32,0.33,0.34,0.5], cv=5).fit(X_train, y_train)

# Unfitted Lasso carrying the selected alpha (kept as a base model for the ensemble cells)
best_lasso = Lasso(alpha=grid.alpha_)
print(best_lasso)

In [None]:
# Lasso evaluation on the training data and on the hold-out test set.
# utility.evaluate returns three values, unpacked here as (mape, rmse, r2).

# training set (plot=False is passed)
y_train_pred = grid.predict(X_train)
print('Train Results:\n')
train_mape, train_rmse, train_r2 = utility.evaluate(y_train_pred, y_train, plot=False)

# hold-out test set (helper called without the plot argument)
y_test_pred = grid.predict(X_test)
print('\nTest Results:\n')
test_mape, test_rmse, test_r2 = utility.evaluate(y_test_pred, y_test)



## Elastic Net regression

In [None]:
# Elastic Net: joint search over alpha and l1_ratio with 5-fold cross-validation
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNet


grid = ElasticNetCV(
    l1_ratio=[0.7,0.9,0.95,0.99,0.999],
    alphas=[0.4,0.5,0.55,0.6,0.65,0.7,0.8,0.9],
    cv=5,
)
grid.fit(X_train, y_train)

# Unfitted ElasticNet carrying the selected hyperparameters (base model for the ensemble cells)
best_elastic = ElasticNet(alpha=grid.alpha_, l1_ratio=grid.l1_ratio_)

# report the selected values
print(grid.alpha_)
print(grid.l1_ratio_)

In [None]:
# Elastic Net evaluation on the training data and on the hold-out test set.
# utility.evaluate returns three values, unpacked here as (mape, rmse, r2).

# training set (plot=False is passed)
y_train_pred = grid.predict(X_train)
print('Train Results:\n')
train_mape, train_rmse, train_r2 = utility.evaluate(y_train_pred, y_train, plot=False)

# hold-out test set (helper called without the plot argument)
y_test_pred = grid.predict(X_test)
print('\nTest Results:\n')
test_mape, test_rmse, test_r2 = utility.evaluate(y_test_pred, y_test)



## Kernel Ridge Regression

In [None]:
# Kernel ridge: cross-validated grid search (cv is left at the GridSearchCV default).
# Earlier note kept from the author: alpha = 0.2 for 100 samples.
param_grid = dict(
    alpha=[0.2],
    gamma=[0.3, 0.2, 0.1, 0.05],
    kernel=['rbf', 'poly', 'sigmoid'],
    degree=[1, 2, 3],
)
grid = GridSearchCV(estimator=KernelRidge(), param_grid=param_grid, refit=True, verbose=2)

# refit=True retrains the winning configuration on the full training set
best_krr = grid.fit(X_train, y_train).best_estimator_
print(best_krr)


In [None]:
# Kernel ridge evaluation on the training data and on the hold-out test set.
# utility.evaluate returns three values, unpacked here as (mape, rmse, r2).

# training set (plot=False is passed)
y_train_pred = grid.predict(X_train)
print('Train Results:\n')
train_mape, train_rmse, train_r2 = utility.evaluate(y_train_pred, y_train, plot=False)

# hold-out test set (helper called without the plot argument)
y_test_pred = grid.predict(X_test)
print('\nTest Results:\n')
test_mape, test_rmse, test_r2 = utility.evaluate(y_test_pred, y_test)



## Random Forests Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest: cross-validated grid search over tree depth and split size
# (cv is left at the GridSearchCV default; the forest itself is seeded for repeatability)
param_grid = {
    'max_depth': list(range(2, 11)),
    'min_samples_split': [2, 3],
}
forest = RandomForestRegressor(random_state=0, bootstrap=True)
grid = GridSearchCV(estimator=forest, param_grid=param_grid, refit=True, verbose=2)

# refit=True retrains the winning configuration on the full training set
best_forest = grid.fit(X_train, y_train).best_estimator_
print(best_forest)


In [None]:
# Random forest evaluation on the training data and on the hold-out test set.
# utility.evaluate returns three values, unpacked here as (mape, rmse, r2).

# training set (plot=False is passed)
y_train_pred = grid.predict(X_train)
print('Train Results:\n')
train_mape, train_rmse, train_r2 = utility.evaluate(y_train_pred, y_train, plot=False)

# hold-out test set (helper called without the plot argument)
y_test_pred = grid.predict(X_test)
print('\nTest Results:\n')
test_mape, test_rmse, test_r2 = utility.evaluate(y_test_pred, y_test)



## MLP Regressor - not enough data to get meaningful results

In [None]:

# MLP: architecture search
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

# Only the layer layout varies; alpha, initial learning rate and momentum are pinned.
# NOTE(review): every candidate ends with a hidden layer of width 1. hidden_layer_sizes
# lists hidden layers only, so confirm this bottleneck is intended.
param_grid = {
    'hidden_layer_sizes': [
        [5,10,10,1],
        [4,8,4,1],
        [2,3,6,1],
        [2,4,5,1],
        [2,20,20,1],
        [2,10,20,1],
        [5,5,5,5,1],
        [5,5,5,5,5,1],
        [8,7,6,5,4,3,2,1],
    ],
    'alpha': [1],
    'learning_rate_init': [0.01],
    'momentum': [0.2],
}

# parameter search - cross-validated with cv=3
mlp = MLPRegressor(batch_size=10, max_iter=1000, random_state=5, early_stopping=True)
grid = GridSearchCV(estimator=mlp, param_grid=param_grid, refit=True, verbose=2, cv=3)

# fit; refit=True retrains the winning layout on the full training set
grid.fit(X_train, y_train)


In [None]:
# MLP evaluation on the training data and on the hold-out test set.
# utility.evaluate returns three values, unpacked here as (mape, rmse, r2).

# training set (plot=False is passed)
y_train_pred = grid.predict(X_train)
print('Train Results:\n')
train_mape, train_rmse, train_r2 = utility.evaluate(y_train_pred, y_train, plot=False)

# hold-out test set (helper called without the plot argument)
y_test_pred = grid.predict(X_test)
print('\nTest Results:\n')
test_mape, test_rmse, test_r2 = utility.evaluate(y_test_pred, y_test)

In [None]:
# Keep the refit MLP while `grid` still refers to the MLP search above; the ensemble
# cells below rebind `grid`, so this cell must run before them.
best_mlp= grid.best_estimator_

# Ensemble Models

## AdaBoost - just makes models overfit on training data, no real performance advantage

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostRegressor

# Boost each tuned base model in turn and evaluate the boosted version.
boosted_models = []
estimators = [best_svr, best_krr, best_elastic, best_lasso, best_ridge, best_forest]

for e in estimators:

    # AdaBoost around the current base model. The estimator is passed positionally
    # because the keyword was renamed from `base_estimator` to `estimator` in newer
    # scikit-learn; the first positional slot works on both old and new releases.
    model = AdaBoostRegressor(e, random_state=0, n_estimators=50)

    #parameter search - 5 fold cross validation
    param_grid = {'learning_rate':[0.05,0.1,0.15,0.2]}
    grid = GridSearchCV(model,param_grid,refit=True, cv=5, verbose=2)

    #fit data to best model
    grid.fit(X_train,y_train)

    boosted_models.append(grid.best_estimator_)

    # model evaluation

    #make predictions on test data
    y_test_pred = grid.predict(X_test)
    y_train_pred = grid.predict(X_train)


    print('Train Results:\n')
    train_mape, train_rmse, train_r2 = utility.evaluate(y_train_pred, y_train, plot=False)

    #plot predictions
    print('\nTest Results:\n')
    test_mape, test_rmse, test_r2= utility.evaluate(y_test_pred, y_test)



## Bagging - simple average

In [None]:
from sklearn.ensemble import BaggingRegressor

# Bag ten copies of the tuned SVR. The estimator is passed positionally because the
# keyword was renamed from `base_estimator` to `estimator` in newer scikit-learn;
# the first positional slot works on both old and new releases.
grid = BaggingRegressor(best_svr, n_estimators=10, random_state=0).fit(X_train, y_train)

#make predictions on test data
y_test_pred = grid.predict(X_test)
y_train_pred = grid.predict(X_train)


print('Train Results:\n')
train_mape, train_rmse, train_r2 = utility.evaluate(y_train_pred, y_train, plot=False)

#plot predictions
print('\nTest Results:\n')
test_mape, test_rmse, test_r2= utility.evaluate(y_test_pred, y_test)


## Stacking - helps significantly when best 3 models are used: svr, krr, forest

## Currently the best model

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor

# Base learners for the stack.
# NOTE(review): the section heading mentions three models (svr, krr, forest), but the
# MLP is stacked here as well - confirm which set is intended.
estimators = [
    ('svr', best_svr),
    ('krr', best_krr),
    ('forest', best_forest),
    ('mlp', best_mlp),
]

# final_estimator is left at the StackingRegressor default
reg = StackingRegressor(estimators=estimators)
reg.fit(X_train, y_train)

# Stack evaluation on the training data and on the hold-out test set.

# training set (plot=False is passed)
y_train_pred = reg.predict(X_train)
print('Train Results:\n')
train_mape, train_rmse, train_r2 = utility.evaluate(y_train_pred, y_train, plot=False)

# hold-out test set (helper called without the plot argument)
y_test_pred = reg.predict(X_test)
print('\nTest Results:\n')
test_mape, test_rmse, test_r2 = utility.evaluate(y_test_pred, y_test)
