# Train
This notebook is used to train the model. The goal is to try multiple settings and models to find the best model. This model will be saved in a pickle file.

## Content
* **Imports**: Imports of needed Python packages
* **Settings**: Hard coded variables needed to collect data like sources, tablenames, columnnames, etc. 
* **Functions**: Reusable functions
* **Load data**: Load data
* **Pipelines**: Create pipelines
* **Train**: Train and fit best model
* **Evaluate**: Evaluate the model
    * **Scoring metric**
    * **Feature importance**
* **Save model**: Writing result to '../data'

## Requirements
The packages to be installed (besides standard Python packages) are:
* TODO: to be filled in later

# Imports

In [None]:
# zorgen voor de juiste modules
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
from datetime import datetime
import pickle

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold, GridSearchCV, cross_validate, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, LassoCV, ElasticNet, BayesianRidge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA

from xgboost import XGBRegressor

from ColumnSelector import ColumnSelector

# Adjust the pandas display settings
pd.set_option('display.max_rows', 500) # show up to 500 rows
pd.set_option('display.max_columns', 500) # show up to 500 columns
pd.set_option('display.width', 1000) # display width
pd.set_option("display.precision", 2)     # precision used when displaying columns
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) # render floats with 3 decimals

# Settings

In [None]:
## Dataframe parameters
# location of the dataset
# NOTE(review): DF_LOCATION and DF_READ are not referenced in the visible cells; the data is loaded via get_latest_file(datapath)
DF_LOCATION = 'C:/_NoBackup/Git/__JADS/WMO_execute_group_project/data/df_dataset_WMO.parquet.gzip'
# Location of all data (input datasets and saved models)
datapath = '../data/'
# how to load the dataset, for example read_parquet or read_csv
DF_READ = pd.read_parquet

## X & Y parameters
# the columns that must be removed from the X dataset: at least the y column and possibly more.
# X_DROP_VALUES = ['wmoclienten', 'eenpersoonshuishoudens', 'huishoudenszonderkinderen', 'huishoudensmetkinderen']
X_DROP_VALUES = ['wmoclienten', 'percentagewmoclienten','wmoclientenper1000inwoners']
# the column that is used as the y value (kept as a list, so df[Y_VALUE] yields a one-column DataFrame)
Y_VALUE = ['wmoclientenper1000inwoners']
# test size for the train/test split
TEST_SIZE = 0.3
# random state for the train/test split, e.g. random_state = 42 as a fixed seed for reproducibility
RANDOM_STATE = 42

## Pipeline parameters
# strategy and fill value for empty categorical columns
NAN_VALUES_CAT_STRATEGY = 'constant'
NAN_VALUES_CAT_VALUES = 'Missing'
# strategy for filling empty numeric columns, for example mean or median
NAN_VALUES_NUM_STRATEGY = 'mean'
# columns handed to ColumnSelector in the numeric pipeline; an explicit list is kept below for reference
#COLS_SELECT = ['aantalinwoners', 'mannen', 'vrouwen', 'k0tot15jaar'
#               , 'k15tot25jaar', 'k25tot45jaar', 'k45tot65jaar', 'k65jaarofouder', 'gescheiden'
#               , 'verweduwd', 'westerstotaal', 'sterftetotaal', 'gemiddeldehuishoudensgrootte'
#               , 'gemiddeldewoningwaarde', 'koopwoningen', 'huurwoningentotaal', 'inbezitwoningcorporatie'
#               , 'gemiddeldinkomenperinkomensontvanger', 'k40personenmetlaagsteinkomen', 'k20personenmethoogsteinkomen'
#               , 'actieven1575jaar', 'k40huishoudensmetlaagsteinkomen', 'k20huishoudensmethoogsteinkomen'
#               , 'huishoudensmeteenlaaginkomen', 'personenpersoortuitkeringaow', 'rucultuurrecreatieoverigediensten'
#               , 'personenautosperhuishouden', 'matevanstedelijkheid']
COLS_SELECT = None

## Model parameters
# cross-validation scheme for the models, for example 10 or RepeatedKFold(n_splits=30, n_repeats=5, random_state=1)
CROSS_VALIDATE = 10
# scoring method used in the models
MODEL_SCORING = 'neg_mean_squared_error'

## Scoring parameters
# These can be added later once there are several ways of scoring, i.e. not only the RMSLE

# Functions

In [None]:
def get_latest_file(mypath='../data/'):
    """
    Load the most recent prepared training dataset from a directory.

    Candidates are the regular files in ``mypath`` whose name contains
    "df_prep_for_train_WMO". The candidate that sorts last by file name is
    treated as the latest one and is read with ``pd.read_parquet``.

    :params str mypath: Directory in which the data can be found. A trailing
        path separator is optional. Default = '../data/'

    :raises FileNotFoundError: if ``mypath`` contains no matching file

    return: pd.DataFrame
    """
    marker = "df_prep_for_train_WMO"
    # Only regular files whose name carries the marker are candidates
    candidates = sorted(
        f for f in listdir(mypath) if marker in f and isfile(join(mypath, f))
    )
    if not candidates:
        raise FileNotFoundError(
            f"No file containing '{marker}' found in '{mypath}'"
        )
    # join() instead of string concatenation, so a missing trailing slash is harmless
    return pd.read_parquet(join(mypath, candidates[-1]))

def get_latest_pickle_file(datapath='../data/'):
    """
    Load the most recent pickled model from a directory.

    Candidates are the regular files in ``datapath`` whose name contains
    "best_model". The candidate that sorts last by file name is treated as
    the latest one and is unpickled.

    :params str datapath: Directory in which the model files can be found.
        A trailing path separator is optional. Default = '../data/'

    :raises FileNotFoundError: if ``datapath`` contains no matching file

    return: pickle model
    """
    marker = "best_model"
    # Only regular files whose name carries the marker are candidates
    candidates = sorted(
        f for f in listdir(datapath) if marker in f and isfile(join(datapath, f))
    )
    if not candidates:
        raise FileNotFoundError(
            f"No file containing '{marker}' found in '{datapath}'"
        )
    # NOTE: unpickling can execute arbitrary code; only use this on trusted model files.
    # The context manager closes the file handle again (the handle used to be leaked).
    with open(join(datapath, candidates[-1]), 'rb') as model_file:
        return pickle.load(model_file)

# Score the cross-validation estimator selected for one algorithm on hold-out data
def get_best_model_rmsle(cv_scores, X_eval=None, y_eval=None):
    """
    Return the hold-out RMSE of the most conservative cross-validation estimator.

    The estimator with the largest training RMSE over the folds is selected
    (np.argmax) and scored on the evaluation data. Despite the function name
    the value is a plain RMSE, not an RMSLE, and the score is returned rather
    than the model itself.

    :params dict cv_scores: Result of ``cross_validate``; it must contain the
        keys 'train_neg_mean_squared_error' and 'estimator'.
    :params X_eval: Features to evaluate on. Default = None, which falls back
        to the notebook-level ``X_test``.
    :params y_eval: Targets to evaluate on. Default = None, which falls back
        to the notebook-level ``y_test``.

    return: RMSE of the selected estimator on the evaluation data
    """
    # Fall back to the notebook globals so existing one-argument calls keep working
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test
    # The scores are negated MSE values; flip the sign before taking the root
    train_rmse = np.sqrt(-cv_scores['train_neg_mean_squared_error'])
    model = cv_scores['estimator'][np.argmax(train_rmse)]
    return np.sqrt(mean_squared_error(y_eval, model.predict(X_eval)))

# Load data

In [None]:
# Load the most recent prepared dataset found in `datapath`
df = get_latest_file(mypath=datapath)

## The steps below may be moved to the prepare step; to be decided later

In [None]:
# Drop the rows where the y value is empty, otherwise the models cannot handle them.
# `thresh` is deliberately not passed: recent pandas versions raise a TypeError
# when `how` and `thresh` are both given.
df.dropna(
    axis=0,
    how='any',
    subset=Y_VALUE,
    inplace=True
)

In [None]:
# Create X and y
X = df.drop(X_DROP_VALUES, axis=1)
# y = df[Y_VALUE]*100 # 0.01 -> 1.0 percentage
y = df[Y_VALUE] # target taken as stored (no rescaling); selected with the list Y_VALUE

In [None]:
# Split X and y into train and test sets (size and seed come from the settings).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = TEST_SIZE, random_state = RANDOM_STATE)

In [None]:
# Split the X_train columns into categorical and numeric ones so they can be transformed separately.
# Columns of any other dtype end up in neither group.
cat_cols = X_train.select_dtypes(include=['category']).columns
num_cols = X_train.select_dtypes(include=['int64','float64','float32','int32']).columns

# Pipelines

In [None]:
# Build the pipelines (pl) for imputing, scaling and one-hot encoding per data type

# Register the placeholder used for missing values as a category on every
# categorical column, so the one-hot encoder below knows it as a valid level.
# Work on an explicit copy so the column assignments cannot trigger pandas'
# chained-assignment warning.
X_train = X_train.copy()
for col in cat_cols:
    # `add_categories` no longer accepts `inplace` in current pandas, so the
    # result is assigned back. Columns that already carry the category are
    # skipped so this cell can be re-run without a ValueError.
    if NAN_VALUES_CAT_VALUES not in X_train[col].cat.categories:
        X_train[col] = X_train[col].cat.add_categories(NAN_VALUES_CAT_VALUES)
categories = [X_train[col].cat.categories for col in cat_cols]

# Pipeline for the categorical columns: impute missing values (strategy and
# fill value come from the settings), then one-hot encode with the known categories
pl_ppc_cat = make_pipeline(
    SimpleImputer(
        missing_values=np.nan,
        strategy=NAN_VALUES_CAT_STRATEGY,
        fill_value=NAN_VALUES_CAT_VALUES,
    ),
    OneHotEncoder(categories=categories),
)

# Pipeline for the numeric columns: select columns, impute, scale, then PCA
pl_ppc_num = make_pipeline(
    ColumnSelector(cols=COLS_SELECT),
    SimpleImputer(
        missing_values=np.nan,
        strategy=NAN_VALUES_NUM_STRATEGY,
    ),
    StandardScaler(),
    # PCA has a considerable (positive) influence on the scores
    PCA(),
)

In [None]:
# Combine both preprocessing pipelines; columns that belong to neither group are dropped
pl_ppc_total = make_column_transformer(
    (pl_ppc_cat, cat_cols),
    (pl_ppc_num, num_cols),
    remainder='drop',
)

In [None]:
# Linear regression on top of the shared preprocessing
pl_lr = make_pipeline(pl_ppc_total, LinearRegression())

In [None]:
# Ridge regression on top of the shared preprocessing
pl_rr = make_pipeline(pl_ppc_total, Ridge())

In [None]:
# Lasso (alpha fixed at 0.001) on top of the shared preprocessing
pl_lasso = make_pipeline(pl_ppc_total, Lasso(alpha=0.001))

In [None]:
# K nearest neighbours regression on top of the shared preprocessing
pl_knn = make_pipeline(pl_ppc_total, KNeighborsRegressor())

In [None]:
# Support vector regression on top of the shared preprocessing
pl_svm = make_pipeline(pl_ppc_total, SVR())

In [None]:
# XGBoost regression on top of the shared preprocessing
pl_xgb = make_pipeline(pl_ppc_total, XGBRegressor())

# Train

### Linear Regression

In [None]:
# Cross-validate the linear regression pipeline on the training data.
# The scoring is passed as a list so the result keys carry the metric name.
lr_scores = cross_validate(
    pl_lr,
    X_train,
    y_train,
    scoring=[MODEL_SCORING],
    cv=CROSS_VALIDATE,
    return_estimator=True,
    return_train_score=True,
)

### Ridge Regression

In [None]:
# Cross-validate the ridge regression pipeline on the training data
rr_scores = cross_validate(
    pl_rr,
    X_train,
    y_train,
    scoring=[MODEL_SCORING],
    cv=CROSS_VALIDATE,
    return_estimator=True,
    return_train_score=True,
)

### Lasso

In [None]:
# Cross-validate the lasso pipeline on the training data
lasso_scores = cross_validate(
    pl_lasso,
    X_train,
    y_train,
    scoring=[MODEL_SCORING],
    cv=CROSS_VALIDATE,
    return_estimator=True,
    return_train_score=True,
)

### K Nearest Neighbor

In [None]:
# Cross-validate the k nearest neighbours pipeline on the training data
knn_scores = cross_validate(
    pl_knn,
    X_train,
    y_train,
    scoring=[MODEL_SCORING],
    cv=CROSS_VALIDATE,
    return_estimator=True,
    return_train_score=True,
)

### Support Vector Machines

In [None]:
# Cross-validate the support vector regression pipeline on the training data
svm_scores = cross_validate(
    pl_svm,
    X_train,
    y_train,
    scoring=[MODEL_SCORING],
    cv=CROSS_VALIDATE,
    return_estimator=True,
    return_train_score=True,
)

### XGBoost

In [None]:
# Cross-validate the XGBoost pipeline on the training data
xgb_scores = cross_validate(
    pl_xgb,
    X_train,
    y_train,
    scoring=[MODEL_SCORING],
    cv=CROSS_VALIDATE,
    return_estimator=True,
    return_train_score=True,
)

# Evaluate

## Calculate scores

In [None]:
# Collect one (algorithm name, RMSE) pair per model
scores = [
    (label, get_best_model_rmsle(cv_result))
    for label, cv_result in (
        ('Linear Regression', lr_scores),
        ('Ridge Regression', rr_scores),
        ('Lasso', lasso_scores),
        ('K Nearest Neighbor', knn_scores),
        ('Support Vector Machines', svm_scores),
        ('XGBoost', xgb_scores),
    )
]

In [None]:
# Build a dataframe holding the scores per model, lowest RMSE first
scores = pd.DataFrame(scores, columns=['Algorithm', 'RMSE'])
# Sort while RMSE is still numeric: sorting the formatted strings would be
# lexicographic ('10.0' before '9.0') and could put the wrong model on top.
scores = scores.sort_values('RMSE', ascending=True).reset_index(drop=True)
# Format for display only after the ordering is fixed
scores['RMSE'] = scores['RMSE'].map('{:,.10f}'.format)
scores

In [None]:
# and the winner is... (the first row of the sorted scores dataframe)
print(f"Het algoritme met de laagste RMSE is:\n\n{scores.iloc[0,0]}, met een RMSE van {scores.iloc[0,1]}")

## Feature importance

In [None]:
## TODO

# Save model

In [None]:
# Temporary: select best model (needs function)
# Lasso is hard-wired here; of its cross-validation estimators the one with
# the largest training RMSE is kept.
BEST_SCORE = lasso_scores
index = np.sqrt(-BEST_SCORE['train_neg_mean_squared_error']).argmax()
model = BEST_SCORE['estimator'][index]

In [None]:
# The timestamp suffix (minute resolution) keeps earlier models on disk and
# makes the file names sort chronologically
suffix_datetime = datetime.strftime(datetime.now(), format='%Y%m%d%H%M')
# Write into `datapath`, the directory the model is loaded back from afterwards
filename = join(datapath, f'best_model_{suffix_datetime}.pickle')

# The context manager flushes and closes the file before it is read back
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

## Test load model

In [None]:
# Read back the newest saved model from `datapath` to verify the pickle round trip
loaded_model = get_latest_pickle_file(datapath=datapath)

In [None]:
# Sanity check: score the reloaded model on the test split.
# NOTE(review): `score` is presumably the estimator's default metric (R^2 for
# scikit-learn regressors), not the RMSE reported earlier - confirm.
result = loaded_model.score(X_test, y_test)
print(result)

In [None]:
# Show the predictions of the reloaded model for the test split
loaded_model.predict(X_test)