In [1]:
#basic package
import tqdm
import os
import numpy as np
import glob
import pandas as pd
import sys
import shutil
import time
import datetime as dt
import random

#modelling
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
#catboost, for a better support of categorical data
from catboost import CatBoostClassifier

#visuals
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
PACKAGE_PARENT = '../'
SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser('__file__'))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))
import config_origins_chapter0 as config

# Download Input Data

In [3]:
#define parameters
path_extracted_data = config.path_extracted_data
path_initial_data = config.path_initial_data
id_run = config.id_run
dico_matching = config.dico_matching
dico_garden_opening_hour = config.dico_garden_opening_hour
path_extracted_data_cleaning = os.path.join(path_extracted_data,'Cleaning')
path_extracted_data_cleaning_model = os.path.join(path_extracted_data_cleaning,'model') 
#create a director if not existing
if not os.path.exists(path_extracted_data_cleaning_model):
    os.makedirs(path_extracted_data_cleaning_model)    
#features for model
li_cont_select = config.li_cont_select
li_bin = config.li_bin
li_cat_select = config.li_cat_select

In [15]:
df_cleaning = pd.read_csv(os.path.join(path_extracted_data_cleaning, id_run+'InputCleaning.csv'), parse_dates=['Timestamp', 'date'], 
                 dayfirst=True, sep=';')
df_cleaning = df_cleaning[df_cleaning['VideoAnalyse'].isin([0,1])]
df_cleaning = df_cleaning[df_cleaning['algo_type']=='cleaning'].copy()
display(df_cleaning['VideoAnalyse'].value_counts())
print(df_cleaning.shape)
df_cleaning.head(3)

1    3738
0     536
Name: VideoAnalyse, dtype: int64

(5139, 34)


Unnamed: 0,Timestamp,TagID,HenID,Trackingsystem_Zone,Observed_Zone,PenID,VideoAnalyse,system,date,signalstrength,...,duration_bounded_mn,previous_duration_bounded_mn,algo_type,BatchID,previousZone==NextZone,next_duration_bounded_mn,next2zone==Zone,previous2zone==Zone,zone3_match_exist,is_first_observation
0,2020-10-23 11:04:12,tag_138,hen_141,3_Zone,3_Zone,pen10,1,10 - 12,2020-10-23,16.0,...,0.133333,1.316667,cleaning,,True,0.766667,1.0,1.0,1.0,0.0
1,2020-10-18 10:39:01,tag_125,hen_54,3_Zone,2_Zone,pen10,0,10 - 12,2020-10-18,3.0,...,0.2,8.383333,cleaning,,True,8.533333,1.0,1.0,0.0,0.0
2,2020-10-23 11:42:56,tag_138,hen_141,4_Zone,4_Zone,pen10,1,10 - 12,2020-10-23,13.0,...,0.383333,3.316667,cleaning,,True,0.1,1.0,1.0,1.0,0.0


In [14]:
df_cleaning['Trackingsystem_Zone'].value_counts()

3_Zone    2393
2_Zone     916
5_Zone     561
4_Zone     404
Name: Trackingsystem_Zone, dtype: int64

In [6]:
display(df_cleaning[config.li_cont_select].head(3))
display(df_cleaning[config.li_cat_select].head(3))

Unnamed: 0,signalstrength,signalstzone2,duration_bounded_mn,next_duration_bounded_mn,previous_duration_bounded_mn,next2zone==Zone,previous2zone==Zone,zone3_match_exist
0,16.0,6.0,0.133333,0.766667,1.316667,1.0,1.0,1.0
1,3.0,2.0,0.2,8.533333,8.383333,1.0,1.0,0.0
2,13.0,12.0,0.383333,0.1,3.316667,1.0,1.0,1.0


Unnamed: 0,Trackingsystem_Zone,zone2_match,previous1_zone,next1_zone,system
0,3_Zone,3_Zone,3_Zone,3_Zone,10 - 12
1,3_Zone,2_Zone,2_Zone,2_Zone,10 - 12
2,4_Zone,3_Zone,3_Zone,3_Zone,10 - 12


# Split into training and validation (for parameter tuning) & dummies

In [8]:
####### train, val, test (no need of dummie)#######
#we need to keep into dataframe format in order to have the columns names to specify in the hyperparamters the names of the
#categorical variables
#split into train, test and validation
#for algo from sklearn, we should add val+train together, while for the Catboost we will not add them up
#train
df_X_train = df_cleaning.filter(li_cont_select+li_cat_select, axis=1)
df_y_train = df_cleaning[['VideoAnalyse']]

#split training into validation: for algo from sklearn, we should add val+train together, while for the Catboost we will not 
#add them up
val_size = 0.2; rs = 0
df_X_train_cb, df_X_val_cb, df_y_train_cb, df_y_val_cb = train_test_split(df_X_train, df_y_train, test_size=val_size, 
                                                                          random_state=rs)
print(df_X_train_cb.shape, df_y_train_cb.shape, df_X_val_cb.shape, df_y_val_cb.shape)

(3419, 13) (3419, 1) (855, 13) (855, 1)


# Catboost

In [9]:
li_dico = []

In [10]:
#Parameters fine tunning
#Info: https://catboost.ai/docs/concepts/python-reference_parameters-list.html
x1 = df_cleaning[df_cleaning['VideoAnalyse']==1].shape[0]
x0 = df_cleaning[df_cleaning['VideoAnalyse']==0].shape[0]
print(x1/x0) 
P = {'n_estimators': range(200,1000,100), #the more trees you add, the higher the risk of overfitting!! default:1000!
     'l2_leaf_reg': range(2,5),#default is 3
     'loss_function': ['Logloss'], #for binary classification
     'max_depth': range(4,7), #if to big: risk of overfitting default=6
     'min_data_in_leaf': range(1,4), #The minimum number of training samples in a leaf (equivalent to min_samples_leaf). default 1
     'class_weights': [{0:x1/x0, 1:1}], #similar as compute_class_weight
     'od_pval': [10**-6],#[x for x in np.linspace(start = 10**-6, stop = 10**-2, num = 5)], #The larger the value, the earlier overfitting is detected., defualt=0, but it is recommanded to be tested for these range
     'od_wait':[10,15], #default 20 =#iterations to continue the training after the iteration with the optimal metric value.
     'use_best_model':[True],
     'task_type':['CPU'],#is actually faster
     'cat_features':[np.array(li_cat_select)]
    }
#define the model with all these options and fit it (as many combination as asked for). 
#refit: fit an estimator using the best found parameters on the whole dataset, then we can extract it with best_estimator_
mod = GridSearchCV(estimator = CatBoostClassifier(), param_grid = P, cv = 3, verbose=True, refit=True)
mod.fit(df_X_train_cb, df_y_train_cb, eval_set=(df_X_val_cb,df_y_val_cb), verbose=False)
#extract best already fitted model and save it 
mod_final = mod.best_estimator_
display(mod_final.get_params())
mod_final.save_model(os.path.join(path_extracted_data_cleaning_model,'selected_Catboost'), format="cbm")
#145mn for 648 fits

6.973880597014926
Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1296 out of 1296 | elapsed: 374.1min finished


{'l2_leaf_reg': 2,
 'loss_function': 'Logloss',
 'od_pval': 1e-06,
 'od_wait': 10,
 'use_best_model': True,
 'class_weights': {0: 6.973880597014926, 1: 1},
 'task_type': 'CPU',
 'max_depth': 6,
 'n_estimators': 500,
 'cat_features': array(['Trackingsystem_Zone', 'zone2_match', 'previous1_zone',
        'next1_zone', 'system'], dtype='<U19'),
 'min_data_in_leaf': 1}

In [11]:
#save
li_dico.append({'algo':'best-Catboost', 'model_param_used':mod_final.get_params(), 
                'accuracy_training':round(mod_final.score(df_X_train_cb, df_y_train_cb),3),
                'accuracy_validation':round(mod_final.score(df_X_val_cb, df_y_val_cb),3)})
                #'accuracy_testing':round(mod_final.score(df_X_test_cb, df_y_test_cb),3)})
pd.DataFrame(li_dico)

Unnamed: 0,algo,model_param_used,accuracy_training,accuracy_validation
0,best-Catboost,"{'l2_leaf_reg': 2, 'loss_function': 'Logloss',...",0.998,0.992


# save final results

In [12]:
df_res = pd.DataFrame(li_dico)
print(df_res.shape)
df_res.to_csv(os.path.join(path_extracted_data_cleaning, 'CB_finetuning.csv'), sep=';', index=False)      

(1, 4)
