In [1]:
import sys

# Make the pipeline's stage folders importable from this notebook.
# Each folder is pushed to the front of sys.path in turn, so the last one listed
# ("Utils/") ends up first in the module search order.
# NOTE(review): the entries are relative paths, so this presumably relies on the
# kernel's working directory being the folder that contains them — confirm.
for module_dir in (
    "Loading/",
    "Preprocessing/",
    "Modeling/",
    "Evaluation/",
    "Interpretability/",
    "Monitoring/",
    "Utils/",
):
    sys.path.insert(0, module_dir)


import loading
import preprocessing
import modeling
import evaluation
import interpretability
import monitoring 
import utils as u

import json 
import pandas as pd
import numpy as np

#Package allowing to reload a package (or a script), without having to reload the whole notebook
import importlib

In [2]:
# Parameters needed for the run. Launch this cell only if you do not launch the main script.

path_conf = "../params/conf/conf.json"

# path_conf ='../conf/conf.json'
# Read the configuration through a context manager so the file handle is closed
# as soon as the JSON has been parsed.
with open(path_conf, 'r') as conf_file:
    conf = json.load(conf_file)

path_log = conf['path_log']  # e.g. "../log/my_log_file.txt"
log_level = conf['log_level']  # e.g. "DEBUG"

# Be careful to create the logger only once, otherwise each line will be duplicated
logger = u.my_get_logger(path_log, log_level, my_name="main_logger")

In [98]:
# Reload the conf file (useful after modifying it); the context manager closes
# the file handle once the JSON has been parsed.
with open(path_conf, 'r') as conf_file:
    conf = json.load(conf_file)

## Loading and Preprocessing

In [99]:
# Re-import the loading module so that edits made to it since the first import are picked up
importlib.reload(loading)

# Read the dataset selected in the conf file
df = loading.read_csv_from_name(conf)

In [100]:
# Re-import the preprocessing and utils modules so that recent edits are picked up
importlib.reload(preprocessing)
importlib.reload(u)

# Preprocess the selected dataset.
# NOTE(review): X_columns / y_column are presumably the feature column names and
# the target column name — confirm against main_preprocessing_from_name.
df_preprocessed, X_columns, y_column = preprocessing.main_preprocessing_from_name(df,conf)

# Write the preprocessed dataset
loading.write_preprocessed_csv_from_name(df_preprocessed,conf)

'OK'

In [101]:
# Basic split between train and test.
# NOTE(review): 0.25 is presumably the test fraction; the same literal is used again
# in the Evaluation section — confirm against preprocessing.basic_split.
X_train, X_test, y_train, y_test = preprocessing.basic_split( df_preprocessed , 0.25 , X_columns, y_column)

## Modeling

In [102]:
# Re-import the modeling module so that recent edits are picked up
importlib.reload(modeling)
# Fit the model selected in the conf file on the training split;
# the call returns the fitted model together with best_params.
clf, best_params = modeling.main_modeling_from_name(X_train,y_train,conf)
# Save the model
u.save_model(clf, conf)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.1min finished


'OK'

## Evaluation

In [103]:
# This step is independent from the previous ones, so we reload what we need:
importlib.reload(loading)
importlib.reload(u)

# Load the model
clf = u.load_model(conf)
# Load the preprocessed dataset.
# Note: this rebinds `df`, which the Loading section bound to the dataset read by read_csv_from_name.
df = loading.load_preprocessed_csv_from_name(conf)

# Basic split: every column except the target column is used as a feature.
# NOTE(review): for the metrics to be meaningful this split presumably has to reproduce
# the one used before training (same 0.25 argument) — confirm basic_split is deterministic.
y_column = u.get_y_column_from_conf(conf)
X_columns = [x for x in df.columns if x != y_column ]
X_train, X_test, y_train, y_test = preprocessing.basic_split( df , 0.25 , X_columns, y_column)

../Outputs/Models/drift_random_forest.sav


In [104]:
# Re-import the evaluation module so that recent edits are picked up
importlib.reload(evaluation)
# Compute the metrics from the model and the train/test splits
dict_metrics = evaluation.main_evaluation(clf, X_train, y_train, X_test, y_test)
# Bare last expression: displays the metrics dict as the cell output
dict_metrics

{'f1_score': 0.8074476543864298,
 'accuracy': 0.76752,
 'recall': 0.8628478368618565,
 'precision': 0.7587323329805118,
 'confusion_matrix': {'tn': 7002, 'fp': 3875, 'fn': 1937, 'tp': 12186}}

## Monitoring

In [None]:
# Loading what you need ... etc.

# OTHER TESTS - TMP

In [33]:
# Drift check: fit a LightGBM model on the first chunk of rows, then score it on
# successive chunks of the dataset to see how the score evolves from chunk to chunk.

import lightgbm as lgb
from sklearn.metrics import f1_score, accuracy_score,confusion_matrix
import numpy as np

CHUNK_SIZE = 10000        # number of rows per chunk; the first chunk is the training set
N_CHUNKS = 10             # maximum number of chunks to score
TARGET_COLUMN = "class"   # label column of df

params = {'objective': 'binary'}

# Every column except the target is a feature (computed once, reused for every chunk).
feature_columns = [x for x in df.columns if x != TARGET_COLUMN]

# Positional slicing (.iloc) on purpose: label-based .loc[a:b] includes the end label,
# which yields CHUNK_SIZE + 1 rows and makes consecutive chunks share a row, and it
# only selects the intended rows when the index is the default 0..n-1 range.
# NOTE: this cell rebinds X_train, y_train and clf, which the Modeling / Evaluation
# cells also use — re-run those cells before relying on these names again.
y_train = df[TARGET_COLUMN].iloc[:CHUNK_SIZE]
X_train = df[feature_columns].iloc[:CHUNK_SIZE]

dftrainLGB = lgb.Dataset(data=X_train, label=y_train, feature_name=list(X_train))

clf = lgb.train(
    params,
    dftrainLGB,
    num_boost_round=100,
)

for i in range(N_CHUNKS):
    start, stop = i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE
    sub_x_test = df[feature_columns].iloc[start:stop]
    sub_y_test = df[TARGET_COLUMN].iloc[start:stop]
    if sub_x_test.empty:
        # Fewer than N_CHUNKS * CHUNK_SIZE rows available: nothing left to score.
        break

    # Threshold the model output at 0.5 to obtain 0/1 predictions.
    # Chunk 0 is the training chunk itself, so its score is an in-sample baseline.
    y_test_pred = (clf.predict(sub_x_test) >= 0.5).astype(np.float32)
    # NOTE(review): micro-averaged F1 on a single-label problem is equal to accuracy;
    # consider average='binary' if a true F1 is wanted — confirm the intended metric.
    print(i, f1_score(sub_y_test, y_test_pred, average='micro'))
