# Imports

In [None]:
# Python packages
import sys
sys.path.append('../../')
from datetime import datetime
import numpy as np
import pandas as pd
import pickle

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold, GridSearchCV, cross_validate, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, LassoCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor, XGBClassifier

# Custom functions
import src.settings as settings
import src.mapper_cols as mapper_cols
from src.run_all.main_get_data import get_data
from src.run_all.main_preprocess import preprocess_data
from src.utilities.utilities import get_latest_file, list_filenames

# Adjust how pandas renders dataframes in the notebook output
pd.set_option('display.max_rows', 500)     # show up to 500 rows
pd.set_option('display.max_columns', 500)  # show up to 500 columns
pd.set_option('display.width', 1000)       # display width
pd.set_option('display.precision', 2)      # decimal precision of the columns
# Print floats with 15 decimals
pd.set_option('display.float_format', '{:.15f}'.format)

### Settings

In [None]:
# Location of all data
datapath = '../../data/'
# Columns dropped from the X set (the target columns taken from the project settings)
X_DROP_VALUES = settings.Y_TARGET_COLS
# The column that is used as the y value
Y_VALUE = ['wmoclientenper1000inwoners']
# Test size for the train/test split
TEST_SIZE = 0.3
# Random state for the train/test split, e.g. random_state = 42 as a fixed seed for reproducibility
RANDOM_STATE = 42

## Model parameters
# Cross-validation strategy used in the models, e.g. 10 or RepeatedKFold(n_splits=30, n_repeats=5, random_state=1)
CROSS_VALIDATE = 5
# Scoring method used in the models
MODEL_SCORING = 'neg_mean_squared_error'
## Grid Search parameters

# Candidate hyperparameter values used in the grid search
ALPHA = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
NEIGHBORS = [3, 5, 11, 19]
NORMALIZE = [True, False]
# NOTE(review): 'precomputed' presumably needs a precomputed kernel matrix instead of a
# plain feature matrix -- confirm before enabling the SVR grid with this list.
KERNEL = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
GAMMA = [0.5, 1, 1.5, 2, 5]
N_ESTIMATORS = [50, 100, 200]
# Fixed typo: the list previously contained `0,1` (two separate items, 0 and 1) instead of
# `0.1`, which put C=0 and a duplicate 1 in the candidates.
C_REGULARIZATION = [0.001, 0.01, 0.1, 1]

### Functions

In [None]:
def drop_nan_from_specific_columns(df, columns_to_check):
    """
    Drop, in place, all rows that have a NaN value in any of the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to clean; it is modified in place.
    columns_to_check : column label or list of column labels
        Only these columns are inspected for missing values.

    Returns
    -------
    None
        The rows are removed from `df` itself.
    """
    # `thresh` is deliberately not passed: recent pandas versions raise a TypeError
    # when `how` and `thresh` are both supplied, even if `thresh` is None.
    df.dropna(
        axis=0,
        how='any',
        subset=columns_to_check,
        inplace=True,
    )
    
def split_clf_and_params(best_estimator_clf):
    """
    Split the string form of an estimator into its name and its parameter string.

    Parameters
    ----------
    best_estimator_clf : object
        Estimator (or any object) whose `str()` looks like ``Name(param=value, ...)``.

    Returns
    -------
    list of str
        ``[name, params]``: the text before the first "(" and the text between that
        "(" and the closing ")". `params` is an empty string when there are none.
    """
    clf_text = str(best_estimator_clf)
    # Split on the first "(" only, so parentheses inside parameter values
    # (e.g. "monotone_constraints='()'") do not cut the parameter string short.
    clf_name, _, clf_params = clf_text.partition("(")
    # Drop only the closing parenthesis that belongs to the estimator itself.
    if clf_params.endswith(")"):
        clf_params = clf_params[:-1]
    return [clf_name, clf_params]

def rmse_from_neg_mean_squared_error(neg_mean_squared_error):
    """
    Converts a negated mean squared error into the RMSE
    """
    # Flip the sign back to a regular MSE before taking the square root
    mse = -neg_mean_squared_error
    return np.sqrt(mse)

# Helper that computes the RMSE of the grid search's best estimator on a hold-out set
def rmse_from_gridsearch_best_estimator(grid_search, X=None, y=None):
    """
    Calculates RMSE from the grid search best estimator

    Parameters
    ----------
    grid_search : fitted search object exposing `best_estimator_`
        Its best estimator is used to predict.
    X : features to predict on, optional
        Defaults to the notebook-level `X_test`.
    y : true target values, optional
        Defaults to the notebook-level `y_test`.

    Returns
    -------
    The square root of the mean squared error between `y` and the predictions.
    """
    # Fall back to the notebook globals so existing single-argument calls keep working
    features = X_test if X is None else X
    targets = y_test if y is None else y
    predictions = grid_search.best_estimator_.predict(features)
    return np.sqrt(mean_squared_error(targets, predictions))

# Get data
This step will load and combine several tables from CBS statline. 

Note: This step takes several minutes and, without changes to the settings, gives the same result. Therefore this code is commented out and the original dataset is loaded instead.

In [None]:
%%time 
# Three ways to obtain the 'Get Data' dataframe; only the hardcoded file load is active.

# ## CREATE NEW DATASET
# df_get_data_WMO= get_data(save_all=True)

# ## HARDCODED
# Read a previously saved parquet file from the data folder
datapath = '../../data/'
filename = 'df_get_data_WMO_WIJK_HUISHOUDENS_BEVOLKING_HEFFING_202104042111.parquet.gzip'
df_get_data_WMO = pd.read_parquet(datapath + filename)

# ## SELECT LAST FILE
# datapath = '../../data/'
# df = get_latest_file(filename_str_contains='df_WMO_', datapath=datapath, filetype='parquet')

# Report the shape and show a random sample of five rows as a quick check
print(f"The shape of the dataframe from step 'Get Data': {df_get_data_WMO.shape}")
df_get_data_WMO.sample(5)

# Preprocess --> Create test sets
This step will transform (select columns, impute, scale) the dataframe to be used in train/predict. 

In [None]:
%%time 
# Run the project's preprocessing step on the loaded dataframe
df_preprocessed = preprocess_data(df=df_get_data_WMO, save_all=True)

# Report the shape and show a random sample of five rows as a quick check
print(f"The shape of the dataframe from step 'Preprocess': {df_preprocessed.shape}")
df_preprocessed.sample(5)

In [None]:
# # For testing train
# datapath = '../../data/'
# filename = 'df_preprocessed_202104042029_Boerenverstand_Maikel.parquet.gzip'
# df = pd.read_parquet(datapath + filename)

In [None]:
# Work on a copy so the in-place row drops below do not touch df_preprocessed
df = df_preprocessed.copy()

# Train

## Train model

#### Stappen hieronder mogelijk verplaatsten naar prepare stap, later beoordelen

In [None]:
# Check whether the dataframe has rows where the Y_VALUE is empty; those rows are removed.
drop_nan_from_specific_columns(df,Y_VALUE)

In [None]:
# Create the target y and the feature set X (all target columns removed)
y = df[Y_VALUE]
X = df.drop(columns=X_DROP_VALUES)
# Split X and y into a train and a test part
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

## Gridsearch

In [None]:
# Pipeline used inside the grid search; the 'clf' step only holds a placeholder
# estimator that the parameter grids below replace with each candidate model
pl_gs_total = Pipeline(steps=[('clf', LinearRegression())])

# One parameter grid per candidate model: the estimator plus its hyperparameters.
# Add new models (and their parameters) here.
# NOTE(review): the `normalize` option of LinearRegression was removed in newer
# scikit-learn releases -- confirm against the installed version.
linear_regression_grid = {
    'clf': [LinearRegression()],
    'clf__normalize': NORMALIZE,
}
ridge_grid = {
    'clf': [Ridge()],
    'clf__alpha': ALPHA,
}
lasso_grid = {
    'clf': [Lasso()],
    'clf__alpha': ALPHA,
}
knn_grid = {
    'clf': [KNeighborsRegressor()],
    'clf__n_neighbors': NEIGHBORS,
}
# Disabled SVR grid, kept for reference:
# svr_grid = {
#     'clf': [SVR()],
#     'clf__kernel': KERNEL,
#     'clf__C': C_REGULARIZATION,
# }
xgb_grid = {
    'clf': [XGBRegressor()],
    'clf__gamma': GAMMA,
    'clf__n_estimators': N_ESTIMATORS,
}
param_grid_total = [
    linear_regression_grid,
    ridge_grid,
    lasso_grid,
    knn_grid,
    xgb_grid,
]

# Create the grid search over all candidate models
grid_search_total = GridSearchCV(
    estimator=pl_gs_total,
    param_grid=param_grid_total,
    cv=CROSS_VALIDATE,
    scoring=MODEL_SCORING,
    return_train_score=True,
)

In [None]:
%%time
# Run the grid search on the training data
grid_search_total.fit(X_train, y_train)

### Evaluate

In [None]:
# Get the best estimator from the grid search (best train score)
# NOTE(review): per the scikit-learn docs, best_score_ is the mean cross-validated score
# of the best estimator, so the "train score" printed below is really a
# cross-validation RMSE -- confirm the wording is intended.
print(f"Het model met de beste train score is:\n{grid_search_total.best_estimator_['clf']}")
# Calculate the RMSE for the best estimator
print(f"Dit model heeft een train score RMSE van {rmse_from_neg_mean_squared_error(grid_search_total.best_score_)}") 
print(f"Dit model heeft een test score RMSE van  {rmse_from_gridsearch_best_estimator(grid_search_total)}")

### Save best model and best model properties

In [None]:
# Save the best estimator from the grid search to a pickle file
suffix_datetime = datetime.now().strftime('%Y%m%d%H%M')
output_filename = f'../../data/best_model_{suffix_datetime}.pickle'
# A context manager ensures the file handle is flushed and closed after the dump
with open(output_filename, 'wb') as model_file:
    pickle.dump(grid_search_total.best_estimator_, model_file)

In [None]:
# Temporary placeholder value for input_filename
input_filename = 'Hier komt uiteindelijk de input_filename_locatie'
# Split the best estimator's text form once into [model name, parameter string]
best_clf_parts = split_clf_and_params(grid_search_total.best_estimator_['clf'])
# Collect every property of the best model that has to be saved (one row)
best_model_properties_dict = {
    "Model": [best_clf_parts[0]],
    "Gridsearch_Params": [best_clf_parts[1]],
    "Train_RMSE": [rmse_from_neg_mean_squared_error(grid_search_total.best_score_)],
    "Test_RMSE": [rmse_from_gridsearch_best_estimator(grid_search_total)],
    "Number_of_features": [len(X.columns)],
    "Y_value": Y_VALUE,
    "Input_filename": [input_filename],
    "Output_filename": [output_filename],
}
best_model_properties = pd.DataFrame(best_model_properties_dict)

In [None]:
# Display the dataframe with the best model properties
best_model_properties

In [None]:
# Save the best model properties to a csv file (same timestamp suffix as the pickle file)
best_model_properties.to_csv(f'../../data/log_train/best_model_properties_{suffix_datetime}.csv', index = False, header=True)

## Code voor combineren output CSV's voor visualisatie en vergelijking

In [None]:
# ## combineren van de verschillende best model properties csv's naar één dataframe
# all_filenames = list_filenames(settings.train['LOG_PATH'], filename_str_contains='.csv')
# combined_logging = pd.concat([pd.read_csv(f"{settings.train['LOG_PATH']}{f}") for f in all_filenames ])
# combined_logging

## Code voor testen pickle file & predict

In [None]:
# # pickle file inladen voor predict
# loaded_model = get_latest_file(filename_str_contains='best_model_', datapath=datapath, filetype='pickle')
# # hoe moet ik deze score interpreteren?
# result = loaded_model.score(X_test, y_test)
# print(result)
# loaded_model.predict(X_test)

In [None]:
# regel om te testen of opgeslagen pickle file overeen komt met model
#grid_search_total.best_estimator_.predict(X_test)

In [None]:
# dit is het beste model uit de grid search
#grid_search_total.best_estimator_