In [1]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.python.keras.layers import deserialize, serialize
from tensorflow.keras.layers import Input, Embedding, LayerNormalization, concatenate, LSTM, BatchNormalization, Dense, Reshape
from tensorflow.python.keras.saving import saving_utils
from tensorflow.random import set_seed
from tensorflow.keras.optimizers import Nadam
from sklearn.utils import shuffle
from itertools import product

import scipy as sp
import pandas as pd
import numpy as np

import os, sys
import datetime
import json
import dill as pickle #более мощная библиотека позволяющая сохранять функции
import random
import copy

from IPython.display import display

from scipy.stats import kstest, anderson #lilliefors

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, learning_curve
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, r2_score, roc_auc_score
from sklearn.utils import resample

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

import catboost as catb

import seaborn as sns
from matplotlib import pyplot as plt

import warnings
# Suppress every warning category for the whole kernel session.
# NOTE(review): this also hides deprecation notices from pandas/sklearn/seaborn - confirm that is intended.
warnings.simplefilter("ignore")

%matplotlib inline
# Never truncate the list of columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Reporting helpers; `utils` is not defined in this notebook (presumably a project-local module - verify it is importable).
from utils import short_model_score_report, scoring_report, report_by_product, slice_report
# Returns the current local time formatted as "YYYY-MM-DD HH:MM:SS".
now_str = lambda : datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

In [2]:
class model_dev:
    """
    Class builds container with predictive models based on parameters
    
    Parameters
    ----------
        
    df: pandas DataFrame
        DataFrame with model training data. Must include a column whose name equals the TARGET_NAME parameter.
    
    TARGET_NAME: string
        Name of column in train dataframe with training target (0 or 1 for classification)
    
    models: dictionary
        Python dictionary with models like {'model_title': model_object}. Where 'model_title': string model name; model_object: model object with preset parameters.
        Model objects must have the following methods: fit, predict, predict_proba

    cat_columns: list of strings
        List of categorical column names
        
    scaler_columns: list of strings
        List of numeric columns to be scaled
        
    scaler_type: string
        Type of scaler applied to the columns in the scaler_columns list.
        'std'  - standardization
        'norm' - normalization
        'none' - no scaling applied
        Default value: 'std'
    
    target_class_ratio: float
        Training data will be rebalanced so that the share of class '1' in the data matches target_class_ratio
        if target_class_ratio = -1, training data will not be rebalanced
        Default value: 0.5
        
    downsample: bool
        If False, upsampling is applied when rebalancing class '1' in the training data to target_class_ratio
        If True - downsampling
        Default value: False
                 
    SKF_splits: integer
        Number of stratified K-fold splits and consequently the number of trained models per configured model (each split is used for validation while the remaining data is used for training)
        Default value: 5
                 
    sample_weight: bool
        If True, each training sample is weighted by the inverse of the share of its class in the training data
        This allows the class decision threshold to be kept at 0.5
        Default value: True

    random_state: integer
        random_state fix random seed for model building
        Default value: 42
    
    """
    

    # Class for data scailing
    class Scaler:

        """
        Scales numeric columns using statistics learned by `fit`.

        Parameters
        ----------

        scaler_type: string
            Type of scaling applied by `transform`:
            'std'  - standardization: (x - mean) / std
            'norm' - normalization: (x - min) / (max - min)
            'none' - no scaling applied
            Default value: 'std'

        """

        def __init__(self, scaler_type='std'):
            self.scaler_type = scaler_type
            # Placeholders; replaced by per-column statistics in fit().
            self.min = 0
            self.max = 0
            self.std = 0
            self.mean = 0

        def fit(self, data):
            """Store the column-wise min, max, std and mean of `data` and return self."""
            self.min = np.min(data, axis=0)
            self.max = np.max(data, axis=0)
            self.std = np.std(data, axis=0)
            self.mean = np.mean(data, axis=0)
            return self

        def train(self, data):
            """Alias of `fit`; returns self."""
            return self.fit(data)

        def transform(self, data):
            """
            Return `data` scaled according to `scaler_type`.

            For 'std' and 'norm' a new frame is returned (float64 columns are stored as
            float32 and infinities become NaN); for 'none' the input object itself is returned.

            Raises
            ------
            ValueError
                If `scaler_type` is not 'std', 'norm' or 'none'.
            """
            if self.scaler_type == 'std':
                return self.compact_types((data - self.mean) / self.std)

            if self.scaler_type == 'norm':
                return self.compact_types((data - self.min) / (self.max - self.min))

            if self.scaler_type == 'none':
                return data

            # Previously an unknown type fell through and returned None without any error.
            raise ValueError("Unknown scaler_type {!r}: expected 'std', 'norm' or 'none'".format(self.scaler_type))

        def fit_transform(self, data):
            """Fit on `data` and return the transformed `data`."""
            return self.fit(data).transform(data)

        def compact_types(self, data):
            """
            Return a copy of `data` with float64 columns cast to float32 and +/-inf replaced by NaN.

            The input frame is left untouched.
            """
            cols = data.select_dtypes(include='float64').columns
            compacted = data.astype({col: 'float32' for col in cols})
            return compacted.replace({-np.inf: np.nan, np.inf: np.nan})
    
    def __init__(self,
                 df,
                 TARGET_NAME,
                 models={},
                 cat_columns=[],
                 scaler_columns=[],
                 scaler_type = 'std',
                 target_class_ratio=0.5,
                 downsample = False,
                 SKF_splits=5,
                 sample_weight=True,
                 random_state=42):
        
        # Vertion of model
        self.version = '1'
        
        self.df = df
        self.TARGET_NAME = TARGET_NAME
        self.df_bl_index = None
        self.df_bl_pr = None
        
        self.df_columns = list(df.columns)
        self.model_columns = None
        self.cat_columns = cat_columns
        
        self.scaler_columns = scaler_columns
        self.scaler_type = scaler_type

        self.target_class_ratio = target_class_ratio
        self.downsample = downsample

        self.SKF_splits = SKF_splits
        self.SKF_list = None

        self.sample_weight = sample_weight

        self.random_state = random_state


        self.models = []
        
        for model in models.keys():
            self.models.append({'model_name':model,
                                'model_class':models[model].__class__.__name__,
                                'model_sample': copy.deepcopy(models[model]),
                                'feature_importance':None,
                                'folds':[{'fold':i+1,
                                          'scaler':None,
                                          'trained_model':None,
                                          'feature_prediction_values_change':None,                                                
                                          'feature_loss_function_change':None,
                                          'feature_interaction_importance':None} for i in range(SKF_splits)]})


    # Models training
    def fit(self, fold_report=True, train_report=False):

        """
        Train every configured model on every cross-validation fold.

        Side effects: fills `self.train_result` (out-of-fold class and probability per model
        plus overall 'proba' and 'score'), stores the fitted scaler and model of each fold in
        `self.models`, fixes `self.model_columns`, and finally sets `self.df` and
        `self.df_bl_pr` to None to release memory.

        Parameters
        ----------

        fold_report: bool
            If True, metrics of each fold are shown
            Default value: True

        train_report: bool
            If True, metrics on the training part of each fold are shown as well
            (only evaluated when fold_report is True)
            Default value: False
        """

        # Frame for out-of-fold predictions. Copied so that the columns added below are
        # written to an independent frame and not to a slice of self.df.
        self.train_result = self.df[[self.TARGET_NAME]].copy()
        for model in self.models:
            self.train_result[model['model_name']] = float("NaN")
            self.train_result[model['model_name'] + '_p'] = float("NaN")

        # Balancing training data by target class (works only for classification models)
        self._balance_df_by_target()

        # Data preprocessing
        # NOTE(review): with upsampling df_bl_index holds repeated labels; label-based .loc
        # selection (here and per fold below) returns every matching row for each repeat -
        # confirm this is the intended behaviour.
        self.df_bl_pr = self._data_processing(self.df.loc[self.df_bl_index], training_mode=True)

        # Fixing model columns list
        self.model_columns = list(self.df_bl_pr.drop(columns = self.TARGET_NAME))
        self.cat_columns = list(set(self.cat_columns) & set(self.model_columns))
        self.scaler_columns = list(set(self.scaler_columns) & set(self.model_columns))

        # Forming SKF indexes for cross-validation
        self._SKF(df=self.df_bl_pr)

        uses_catboost = any(model['model_class'] in ('CatBoostClassifier', 'CatBoostRegressor')
                            for model in self.models)

        for split, (train_index, valid_index) in enumerate(self.SKF_list, start=1):

            # Training and validation dataframes creation
            df_train_pr = self.df_bl_pr.loc[train_index].copy()
            df_valid_pr = self.df_bl_pr.loc[valid_index].copy()

            # Scaling data (critical for linear models); statistics come from the training part only
            data_scaler = self.Scaler(scaler_type=self.scaler_type).fit(df_train_pr[self.scaler_columns])

            df_train_pr[self.scaler_columns] = data_scaler.transform(df_train_pr[self.scaler_columns])
            df_valid_pr[self.scaler_columns] = data_scaler.transform(df_valid_pr[self.scaler_columns])

            X_train = df_train_pr.drop(columns=self.TARGET_NAME)
            y_train = df_train_pr[self.TARGET_NAME]

            X_valid = df_valid_pr.drop(columns=self.TARGET_NAME)
            y_valid = df_valid_pr[self.TARGET_NAME]

            # removing unnecessary objects
            del df_train_pr
            del df_valid_pr

            cat_feat_idx = list(np.where(X_train.columns.isin(self.cat_columns))[0])

            # Weights for each sample: inverse class share, or all ones when weighting is off
            if self.sample_weight:
                class_weight_dict = dict(1 / (y_train.value_counts()/y_train.shape[0]))
                train_sample_weight = y_train.map(class_weight_dict)
            else:
                # The original stored this under another name, leaving train_sample_weight undefined.
                train_sample_weight = np.ones(y_train.shape[0])

            if uses_catboost:
                catb_valid_pool = catb.Pool(data=X_valid, label=y_valid, cat_features=cat_feat_idx)
                catb_train_pool = catb.Pool(data=X_train, label=y_train, cat_features=cat_feat_idx, weight=train_sample_weight)

            # Training models
            for model in self.models:

                model_name = model['model_name']
                is_first_model = (model_name == self.models[0]['model_name'])
                fold_slot = model['folds'][split-1]
                model_to_train = copy.deepcopy(model['model_sample'])

                if model['model_class'] == 'CatBoostClassifier':
                    model_to_train.fit(X=catb_train_pool, eval_set=catb_valid_pool, plot=False, use_best_model=True)
                    # saving features importance data on validation datasets
                    fold_slot['feature_prediction_values_change'] = model_to_train.get_feature_importance(data=catb_valid_pool)
                    fold_slot['feature_loss_function_change'] = model_to_train.get_feature_importance(type='LossFunctionChange', data=catb_valid_pool)
                    fold_slot['feature_interaction_importance'] = model_to_train.get_feature_importance(type='Interaction', data=catb_valid_pool)

                elif model['model_class'] == 'CatBoostRegressor':
                    model_to_train.fit(X=catb_train_pool, eval_set=(X_valid, y_valid), plot=False, use_best_model=True)

                else:
                    model_to_train.fit(X_train, y_train, sample_weight=train_sample_weight)

                # Saving trained model for each fold
                fold_slot['scaler'] = copy.deepcopy(data_scaler)
                fold_slot['trained_model'] = copy.deepcopy(model_to_train)

                # writing model predictions on validation dataset
                # NOTE(review): predict_proba is called for every model class, including
                # CatBoostRegressor - confirm regressors are really supported here.
                y_valid_pred_proba = model_to_train.predict_proba(X_valid)[:,1]
                y_valid_pred = np.round(y_valid_pred_proba).astype(int)
                self.train_result.loc[valid_index, model_name + '_p'] = y_valid_pred_proba
                self.train_result.loc[valid_index, model_name] = y_valid_pred

                # Making report for each fold
                if fold_report:
                    if is_first_model:
                        print('\nFOLD ' + str(split) + ' REPORT')

                    short_model_score_report(y_true=y_valid,
                                             y_pred_proba=y_valid_pred_proba,
                                             name=model_name + '_' +str(split) + '_valid',
                                             header=is_first_model,
                                             model_type='classification')

                    #  Making report for training data
                    if train_report:
                        short_model_score_report(y_true=y_train,
                                                 y_pred_proba=model_to_train.predict_proba(X_train)[:,1],
                                                 name=len(model_name + '_' +str(split)) * ' ' + ' train',
                                                 header=False,
                                                 model_type='classification')

        # Making final report (all folds)
        print('\nFINAL REPORT ({} folds AVG)'.format(self.SKF_splits))
        slice_index = self.df_bl_index

        # Probability columns are addressed by model name; matching on the '_p' suffix would
        # also pick up a target column whose name happens to end in '_p'.
        p_cols = [model['model_name'] + '_p' for model in self.models]

        # Result of each model on all folds
        for model in self.models:

            short_model_score_report(y_true=self.train_result[self.TARGET_NAME].loc[slice_index],
                                     y_pred_proba=self.train_result[model['model_name']+'_p'].loc[slice_index],
                                     name=model['model_name'],
                                     header=(model['model_name'] == self.models[0]['model_name']),
                                     model_type='classification')

        # Average result of all models
        y_valid = self.df[self.TARGET_NAME].loc[slice_index]
        y_valid_pred_proba = np.sum(self.train_result[p_cols], axis=1) / len(self.models)
        y_valid_pred_proba = y_valid_pred_proba.loc[slice_index]

        short_model_score_report(y_true=y_valid,
                                 y_pred_proba=y_valid_pred_proba,
                                 name='MIX RESULT ',
                                 header=False,
                                 model_type='classification')

        # Final probability is the average over models (same formula as MIX RESULT above), so
        # 'proba' stays within [0, 1] and 'score' = (1 - proba) * 1000 for any number of models.
        # Rows without predictions keep the previous convention: proba 0, score 1000.
        self.train_result['proba'] = self.train_result[p_cols].sum(axis=1) / max(len(p_cols), 1)
        self.train_result['score'] = 1000 - self.train_result['proba'] * 1000

        # Calculation of average feature importance
        for model in self.models:
            if model['model_class'] == 'CatBoostClassifier':
                model['feature_importance'] = self._catb_feature_importance(model)

        # Cleaning memory
        self.df = None
        self.df_bl_pr = None

        
    # Data preprocessing
    def _data_processing(self, df, training_mode=False):

        return df

    
    # Prediction making method
    def predict(self,
                input_data=None,
                input_type='df',
                output_type='df',        # 'df', 'json', 'csv'
                output_file_name=None,   # название файла в случае если output_type=='csv'
                prod_mode=True,          # If true returns only score (without probability by models)
                features=False,          # в выходной массив добавляются занчения признаков
                shap_values=False):      # в выходной массив добавлюются shap значения с суффиксом _SHAP
        
        
        """
        Generating predictions
        
        Parameters
        ----------
        
        input_data: pandas DataFrame or string
            Input data which includes features for model to make predictions. Can be different format types based on 'input_type' parameter
            
        input_type: string
            Description of input data type, can be:
            'df'        - input_data is pandas DataFrame object
            'csv'       - input_data is CSV filename
            'json'      - input data is JSON string
            'json_file' - input data is JSON file (for test purpose only, inefficient because parse file as stdin)
            Default value: 'df'
        
        output_type: string
            Format of method return:
            'df'        - returns pandas DataFrame object
            'json'      - returns JSON string with predictions
            'csv'       - returns CSV file with predictions
            Default value: 'df'        
        
        output_file_name: string
            Filename for predictions CSV file in case if output_type is set to 'csv'
            Default value: None
            
        prod_mode: bool
            Format of predictions generated. 
            If True, method returns only 'score' for each input record calclated as (1 - probability) * 1000
            If False, method in addition to 'score' returns target class probability for each model and resulted everage probability
            
        features: bool
            If True, method returns model features in addition to predictions
            
        shap_values: bool
            If True, method returns shap values for each feature in addition to predictions
            
        """                         
        
        # Формирование предсказания из входного датафрейма и контейнера с моделями
        def predict_from_df(df_test, shap_values=False, features=False):

            model_name_lst = [model['model_name'] for model in self.models]

            # Формирование датасета для записи результатом работы моделей
            df_test_result = pd.DataFrame(index=df_test.index)
            
            #Обработка данных функцией и выстраивание одинаковой очередности и кол-ва признаков                
            df_test_pr = self._data_processing(df_test.copy(), training_mode=False)[self.model_columns]
            
            for model in self.models:
                # Формирование дополнительныъ колонок для записи результатом работы моделей
                df_test_result[model['model_name']] = np.zeros(df_test.shape[0])
                df_test_result[model['model_name'] + '_p'] = np.zeros(df_test.shape[0])    
                    
                for fold in model['folds']:
                    
                    data_scaler = fold['scaler']
                    tr_model = fold['trained_model']

                    # Масштабирование данных для моделей (критично для линейных моделей)
                    df_test_pr_sc = df_test_pr.copy()
                    df_test_pr_sc[self.scaler_columns] = data_scaler.transform(df_test_pr[self.scaler_columns])
                    X_test = df_test_pr_sc
                    
                    # Обработка пропусков
                    if model['model_class'] != 'CatBoostClassifier':
                        X_test.replace({-np.inf:0, np.inf:0, np.nan:0}, inplace=True)

                    # Запись предсказаний моделей в журнал
                    y_pred_proba = tr_model.predict_proba(X_test)[:,1]

                    if shap_values:
                        cat_feat_idx = list(np.where(X_test.columns.isin(self.cat_columns))[0])
                        catb_valid_pool=catb.Pool(data=X_test, label=None, cat_features=cat_feat_idx)
                        if fold['fold'] == 1:
                            shap_array = tr_model.get_feature_importance(type='ShapValues', data=catb_valid_pool)
                        else:
                            shap_array += tr_model.get_feature_importance(type='ShapValues', data=catb_valid_pool)

                    df_test_result[model['model_name'] + '_p'] += y_pred_proba
                    df_test_result[model['model_name']] += np.round(y_pred_proba)

                
                df_test_result = df_test_result / len(model['folds'])                    
                    
                if shap_values and (model['model_class'] == 'CatBoostClassifier'):
                    shap_array = shap_array / len(model['folds'])
                    shap_cols = [model['model_name'] + '_shap_' + col for col in self.model_columns] + [model['model_name'] + '_shap_BASE_VALUE']
                    shap_df =  pd.DataFrame(data=shap_array, columns=shap_cols, index=df_test_result.index)
                    shap_df[model['model_name'] + '_shap_RESULT_VALUE'] = np.sum(shap_df.iloc[:,:-1], axis=1)
                    df_test_result = pd.concat((df_test_result, shap_df), axis=1)
                          
            # Округление до 0 знаков среднего значения предскзаний модели на всех фолдах 
            df_test_result[model_name_lst] =  np.round(df_test_result[model_name_lst]).astype(int)
            p_cols = [col for col in df_test_result.columns if col[-2:]=='_p']
            df_test_result['proba'] = np.sum(df_test_result[p_cols], axis=1)
            df_test_result['score'] = np.round(1000 - np.sum(df_test_result[p_cols], axis=1) * 1000).astype(int)

            if prod_mode:
                drop_cols=['proba'] + model_name_lst + p_cols
                df_test_result.drop(columns = drop_cols, inplace=True)
            
            if features:
                df_test_result = pd.concat((df_test[self.model_columns],
                                            df_test_result),
                                           axis=1)
                
            return df_test_result

        
        # JSON string as input
        def predict_from_json(df_json, shap_values=False, features=False):
            
            # Converting JSON to pandas DataFrame
            df = pd.DataFrame(json.loads(input_data)['data'])
            
            # Making predictions
            df_pred = predict_from_df(df.set_index('dataItemId'), shap_values=shap_values, features=features)

            return df_pred
        
        
        # JSON file as STDIN input (redundunt)
        def predict_from_json_file(file_name, shap_values=False, features=False):
            read_stdin = ''
            open_counter = 0
            json_counter = 0
            pred =pd.DataFrame()

            with open(file_name, 'r') as stdandar_input:
                for line in stdandar_input:
                    for s in line:
                        open_counter += (s == '{')
                        open_counter -= (s == '}')
                        read_stdin += s
                        if open_counter == 0:
                            if read_stdin[0] == '{':
                                json_counter += 1
                                if pred.shape[0] == 0:
                                    pred = predict_from_json(df_json=read_stdin, shap_values=shap_values, features=features)
                                else:
                                    pred = pd.concat([pred, predict_from_json(df_json=read_stdin, shap_values=shap_values, features=features)], axis=0)
                            read_stdin = ''
            return pred

        # Input data processing
        if input_type == 'df':
            df_result = predict_from_df(input_data, shap_values=shap_values, features=features)

        elif input_type == 'csv':
            with open(input_data, encoding='utf8') as f:
                h = f.readline()
            if h.count(';')>0: 
                df_result = predict_from_df(pd.read_csv(input_data,';'), shap_values=shap_values, features=features)
            else:
                df_result = predict_from_df(pd.read_csv(input_data,','), shap_values=shap_values, features=features)
            
        elif input_type == 'json':
            df_result = predict_from_json(input_data, shap_values=shap_values, features=features)

        elif input_type == 'json_file':
            df_result = predict_from_json_file(input_data, shap_values=shap_values, features=features)

        # returning predictions
        if output_type == 'df':
            return df_result

        if output_type == 'json':                                     

            # converting numpy data types to JSON compatible data types function
            def convert_to_json_type(x):
                if str(type(x))[:-4] == "<class 'numpy.int":
                    x = int(x)
                elif str(type(x))[:-4] == "<class 'numpy.float'>":
                    x = float(x)
                else:
                    x = str(x) 
                return x

            df_result.reset_index(inplace=True)
            cols = list(df_result.columns)

            pred_dict = {'version': self.version, 
                         'data': [{col: convert_to_json_type(df_result[col].values[i]) for col in cols} for i in range(df_result.shape[0])]}

            return json.dumps(pred_dict)    

        if output_type == 'csv':
            df_result.to_csv(output_file_name)
            return 'Data exported to file: ' + output_file_name

    # split data into stratified folds
    def _SKF(self, df):
        TARGET_NAME = self.TARGET_NAME
        n_splits = self.SKF_splits
        max_layers_qty = 2
        random_state = self.random_state
        
        if n_splits==1:
            return [[np.array(df.index), np.array(df.index)]]

        # Determine qty of layers
        layers_qty = min(max_layers_qty, np.unique(df[TARGET_NAME]).shape[0])
        # calculating split points
        split_points = np.linspace(np.min(df[TARGET_NAME]), np.max(df[TARGET_NAME]), layers_qty+1)
        #split_points = np.quantile(np.sort(np.unique(df[TARGET_NAME])), np.linspace(0, 1, layers_qty+1))

        # forming list with indexes and list with training and validation folds
        layers=[]
        SKF_list = [[np.array([]),np.array([])] for i in range(n_splits)]

        np.random.seed(random_state) 

        for i in range(len(split_points)-1):
            # determining indexes to be incuded in layer
            if i == len(split_points)-2:
                layer_index = np.array(df.loc[(df[TARGET_NAME]>=split_points[i]) & (df[TARGET_NAME]<=split_points[i+1])].index)
            else:
                layer_index = np.array(df.loc[(df[TARGET_NAME]>=split_points[i]) & (df[TARGET_NAME]< split_points[i+1])].index)

            np.random.shuffle(layer_index)
            b_qty = layer_index.shape[0] // n_splits

            for j in range(n_splits):
                if j == n_splits - 1:              
                    SKF_list[j][1] = np.hstack((SKF_list[j][1], layer_index[b_qty * j:]))
                    SKF_list[j][0] = np.hstack((SKF_list[j][0], layer_index[0:b_qty * j]))
                else:
                    SKF_list[j][1] = np.hstack((SKF_list[j][1], layer_index[b_qty * j: b_qty * (j+1)]))
                    SKF_list[j][0] = np.hstack((SKF_list[j][0], layer_index[0: b_qty * j], layer_index[b_qty * (j+1):]))
        
        self.SKF_list = SKF_list
    
    
    # Balancing by taget class
    def _balance_df_by_target(self):
        
        df_bl = self.df
        
        if self.target_class_ratio != -1:
            
            counts_0 = (df_bl[self.TARGET_NAME] == 0).sum()
            counts_1 = (df_bl[self.TARGET_NAME] == 1).sum()

            if self.downsample:
                if self.target_class_ratio > (counts_1 / (df_bl.shape[0])):
                    keep_to_balance = int(counts_0 - df_bl.shape[0] + counts_1 / self.target_class_ratio)
                    cut_class = 0
                else:
                    keep_to_balance = int(counts_1 - df_bl.shape[0] + counts_0 /(1 - self.target_class_ratio))
                    cut_class = 1

                sample = df_bl[df_bl[self.TARGET_NAME] == cut_class].sample(n=keep_to_balance, replace=False, random_state=self.random_state)
                df_bl = pd.concat([sample, df_bl[df_bl[self.TARGET_NAME] != cut_class]], axis=0)
            
            else:
                if self.target_class_ratio > (counts_1 / (df_bl.shape[0])):
                    disbalance = int(counts_0 / (1 - self.target_class_ratio) - df_bl.shape[0])
                    add_class = 1
                else:
                    disbalance = int(counts_1 / self.target_class_ratio - df_bl.shape[0])
                    add_class = 0

                sample = df_bl[df_bl[self.TARGET_NAME] == add_class].sample(n=disbalance, replace=True, random_state=self.random_state)
                df_bl = pd.concat([sample, df_bl], axis=0)

                df_bl = df_bl.astype(dtype=dict(self.df.dtypes))
        
        self.df_bl_index = df_bl.index

    
    # Feature importance for catboostclassifier models
    def _catb_feature_importance(self, catb_trained_model):
        
        SKF_number = len(catb_trained_model['folds'])
        cols = self.model_columns
        n_cols = len(cols)

        col_dict = {i: self.model_columns[i] for i in range(n_cols)}

        feat_importance = pd.DataFrame({'PredictionValuesChange':np.zeros(n_cols),
                                        'FeatureLossFunctionChange':np.zeros(n_cols)}, index=cols)

        for fold in catb_trained_model['folds']:
            feat_importance['PredictionValuesChange'] += pd.DataFrame({'feture_importance': fold['trained_model'].get_feature_importance()}, index=self.model_columns)['feture_importance'] / SKF_number
            feat_importance['FeatureLossFunctionChange'] += pd.DataFrame({'feture_importance': fold['feature_loss_function_change']}, index=self.model_columns)['feture_importance'] / SKF_number
    
        if catb_trained_model['folds'][0]['trained_model'].get_all_params()['depth'] == 1:
            feat_importance['feature_interaction'] = np.zeros(feat_importance.shape[0])
            return feat_importance.sort_values(by='PredictionValuesChange', ascending=False)
        
        # making pare wise feature imortance table
        fi_interaction = pd.DataFrame(data=catb_trained_model['folds'][0]['feature_interaction_importance'], columns=['f1', 'f2', 'strength'])
        fi_interaction['feature_1'] = fi_interaction['f1'].map(col_dict)
        fi_interaction['feature_2'] = fi_interaction['f2'].map(col_dict)
        fi_interaction['strength_cum'] = np.cumsum(fi_interaction['strength'])
        fi_interaction['n_unique'] = None
        fi_interaction['unique_features'] = None
        for i in range(fi_interaction.shape[0]):
            fi_interaction.loc[i, 'n_unique'] = np.unique(np.vstack((fi_interaction.iloc[0:i]['f1'].values, fi_interaction.iloc[0:i]['f2'].values))).shape[0]
            unique_fetures = np.unique(np.vstack((fi_interaction.iloc[0:i]['f1'].values, fi_interaction.iloc[0:i]['f2'].values))).astype('int')
            fi_interaction.loc[i, 'unique_features'] = str(list(map(col_dict.get, unique_fetures)))[1:-1]

        fi_interaction=fi_interaction[['f1','feature_1', 'f2', 'feature_2', 'strength', 'strength_cum', 'n_unique', 'unique_features']]

        feat_importance['feature_interaction'] = np.zeros(feat_importance.shape[0])

        for i in feat_importance.index:
            feat_importance.loc[i, 'feature_interaction'] = fi_interaction.loc[fi_interaction['feature_1']==i, 'strength'].sum() + fi_interaction.loc[fi_interaction['feature_2']==i, 'strength'].sum()
        feat_importance['feature_interaction'] =  feat_importance['feature_interaction']  / 2

        return feat_importance.sort_values(by='PredictionValuesChange', ascending=False)
    

    # Plot feature importance for catboostclassifier models
    def plot_catb_feature_importance(self, catb_feature_importance_df, top_n=20):
        
        """
        Plot feature importance for catboostclassifier models
        
        Draws three horizontal bar charts side by side (PredictionValuesChange,
        feature_interaction, FeatureLossFunctionChange) for the first `top_n` rows
        of the given table, then prints an explanation of each measure.
        
        Parameters
        ----------
        
        catb_feature_importance_df: pandas DataFrame
            Dataframe with catboost feature importance data generated while model training (i-model feature importance: model.models[i]['feature_importance']).
            Must contain the columns 'PredictionValuesChange', 'feature_interaction' and 'FeatureLossFunctionChange';
            the index values are used as bar labels and rows are plotted in the order given.
            
        top_n: integer
            Number of top features to be plotted
            Default value: 20
        
        """
        
        feat_importance_short = catb_feature_importance_df.iloc[:top_n,:]
        fig, ax = plt.subplots(nrows=1, ncols=3)
        fig.set_size_inches(18, top_n * 0.5)
        plt.subplots_adjust(wspace=0.0, hspace=0.0)
        ax = ax.flatten()

        # x/y are passed by keyword in every call: recent seaborn releases accept
        # them as keyword-only arguments and reject the positional form.
        sns.barplot(x=feat_importance_short['PredictionValuesChange'], y=feat_importance_short.index, ax=ax[0])
        ax[0].set_title('Feature importance\n*Prediction Values Change*')
        ax[0].set_xlabel('Feature importance')
        ax[0].set_ylabel('Features')

        # The middle and right panels share the feature order of the left one, so their tick labels are hidden.
        sns.barplot(x=feat_importance_short['feature_interaction'], y=feat_importance_short.index, ax=ax[1])
        ax[1].set_title('Feature importance\n** Feature interaction')
        ax[1].set_xlabel('Feature importance')
        ax[1].set_yticks([])

        sns.barplot(x=feat_importance_short['FeatureLossFunctionChange'], y=feat_importance_short.index, ax=ax[2])
        ax[2].set_title('Feature importance\n*** FeatureLoss Function Change')
        ax[2].set_xlabel('Feature importance')
        ax[2].set_ylabel('Features')
        ax[2].set_yticks([])

        # Remove the frame around every panel
        for ax_i in ax:
            for side in ('top', 'right', 'bottom', 'left'):
                ax_i.spines[side].set_visible(False)
        plt.show()

        # Footnotes explaining the *, ** and *** markers used in the panel titles
        print('* For each feature, PredictionValuesChange shows how much on average the prediction changes \
    if the feature value changes. The bigger the value of the importance the bigger on average is the \
    change to the prediction value, if this feature is changed')
        print('\nFeature importance values are normalized so that the sum of importances of all features is equal \
    to 100. This is possible because the values of these importances are always non-negative.')
        print('\nFormula values inside different groups may vary significantly in ranking modes. \
    This might lead to high importance values for some groupwise features, even though these \
    features dont have a large impact on the resulting metric value.')
        print('\n** TTL Sum for feature interaction is 100. Calculated as sum of importance of all paired combination with given factor devided by 2')
        print('\n*** For each feature the value represents the difference between the loss value of the model with this \
    feature and without it. The model without this feature is equivalent to the one that would have been trained \
    if this feature was excluded from the dataset.')
        print('\nThis feature importance approximates the difference between metric values calculated on the following models: \
    \n - The model with the -th feature excluded \
    \n - The original model with all features')
    
   
    # Saving model container
    def save(self, file_name, prod_mode=False):
        """
        Pickle a deep copy of this model container to a file

        Parameters
        ----------

        file_name: string
            Path of the file the container is written to (opened in binary write mode)

        prod_mode: boolean
            When True, the saved copy has its training-only data replaced by None:
            SKF_list, df_bl_index, df_bl_pr, train_result, every model's
            'feature_importance' and the per-fold feature importance entries.
            The container itself (self) is never modified.
            Default value: False

        """

        # Work on a deep copy so that stripping never touches the live container
        snapshot = copy.deepcopy(self)

        if prod_mode:
            for attr_name in ('SKF_list', 'df_bl_index', 'df_bl_pr', 'train_result'):
                setattr(snapshot, attr_name, None)

            fold_keys_to_clear = ('feature_prediction_values_change',
                                  'feature_loss_function_change',
                                  'feature_interaction_importance')
            for model_entry in snapshot.models:
                model_entry['feature_importance'] = None
                for fold_entry in model_entry['folds']:
                    for fold_key in fold_keys_to_clear:
                        fold_entry[fold_key] = None

        with open(file_name, 'wb') as out_file:
            pickle.dump(snapshot, out_file)
        del snapshot
        print('Model saved to file:', file_name)
    
    # Exporting production script
    def generate_prod_script(self):
        """
        Export a standalone production scoring script for this model container

        The generated script defines a class named after this class without its
        last 4 characters. That class loads the pickled container from
        'models/<generated class name>' and exposes `predict`, which forwards to the
        container's own `predict`. Run from the command line with a csv path, the
        script scores the file and writes '<input name>_scored.csv'.

        The script is written to the current working directory as
        '<class name without its last 10 characters>.py'
        (presumably class names are expected to end in '_model_dev' - TODO confirm).
        If that stem is empty, as it is for a class called 'model_dev', the
        generated class name is used instead so the file is never named just '.py'.

        """
        dev_class_name = self.__class__.__name__
        prod_class_name = dev_class_name[:-4]

        # One list entry per line of the generated script; joined once at the end.
        # Raw strings keep sequences such as '\n' literal inside the generated code.
        script_lines = [
            r"#!/usr/bin/env python",
            r"# coding: utf-8",
            r"# noinspection PyUnresolvedReferences",
            r"import sys, json, os, dill as pickle, pandas as pd, numpy as np",
            r"import warnings",
            r"warnings.simplefilter('ignore')",
            r"",
            r"model_dev = None",
            r"class_name = '" + prod_class_name + "'",
            r"",
            r"class " + prod_class_name + "():",
            r"",
            r"    def __init__(self):",
            r"",
            r"        # Загрузка модели",
            r"        global model_dev",
            r"        if not model_dev:",
            r"            model_file_name = 'models' + os.sep + self.__class__.__name__",
            r"            model_dev = pickle.load(open(model_file_name, 'rb'))",
            r"",
            r"        self.model_dev = model_dev",
            r"",
            r"    def predict(self, input_data=None, input_type='json', output_type='json', output_file_name=None, features=False, shap_values=False, prod_mode=True):",
            r"",
            r"        pred = self.model_dev.predict(input_type=input_type,              # 'df', 'csv', 'json', 'json_file'",
            r"                                      input_data=input_data,              # объект 'df',название файла 'csv', Строка 'json', файл с 'json' строками ",
            r"                                      output_type=output_type,            # 'df', 'json', 'csv'",
            r"                                      output_file_name=output_file_name,  # название файла в случае если output_type=='csv'",
            r"                                      prod_mode=prod_mode,                # If true returns only score (without probability by models)",
            r"                                      features=features,                  # в выходной массив добавляются занчения признаков",
            r"                                      shap_values=shap_values)            # в выходной массив добавлюются shap значения с суффиксом _SHAP",
            r"        return pred",
            r"",
            r"if __name__ == '__main__' and (sys.argv[0][-12:] != '_launcher.py'):",
            r"",
            r"    if (len(sys.argv) != 2) or (sys.argv[1].lower() in ['help','-help', 'h', '-h']):",
            r"        help_msg = '\nHELP:     To use script put as an argument path to csv data file that should be scored\n'",
            r"        help_msg += '\nEXAMPLE:  python ' + sys.argv[0] + ' data_file.csv\n'",
            r"        help_msg += '\nNOTICE:   Scored data file name will have suffix _scored \n'",
            r"        print(help_msg)",
            r"",
            r"    else: ",
            r"        input_file_name = sys.argv[1]",
            r"        output_file_name = sys.argv[1][:-4] + '_scored.csv'",
            r"",
            r"        if os.path.exists(input_file_name):",
            r"            with open(input_file_name, encoding='utf-8') as f:",
            r"                h = f.readline()",
            r"                sep = (';' if (h.count(';') > 10) else ',')",
            r"",
            r"            target_cols = ['FID', 'RETRO_DATE', 'ErrorCode', 'ExclusionCode', 'ID', 'PASSPORT_SER', 'PASSPORT_NUM', 'ACCT_NUM', 'Number', 'OWN_SCORE', 'DEFAULT', 'Score']",
            r"",
            r"            df = pd.read_csv(input_file_name, sep=sep, dtype={col:str for col in target_cols}, encoding='utf-8')",
            r"",
            r"            pred = vars()[class_name]().predict(input_type='df',",
            r"                                                input_data=df,",
            r"                                                output_type='df')",
            r"",
            r"            df['ErrorCode'] = None",
            r"            df['ExclusionCode'] = None",
            r"            df['Score'] = pred['score']",
            r"",
            r"            output_cols = [col for col in df.columns if col.lower() in [c.lower() for c in target_cols]]",
            r"            output_cols.sort(key=lambda s: [c.lower() for c in target_cols].index(s.lower()))",
            r"",
            r"            df[output_cols].to_csv(output_file_name, index=False, encoding='utf-8', sep=',')",
            r"",
            r"            print('\nScored data file was saved at: ' + output_file_name)",
            r"",
            r"        else:",
            r"            print('\nFile ' + input_file_name + ' does not exist\n')",
        ]
        script_text = '\n'.join(script_lines) + '\n'

        # The [:-10] stem is empty for short class names (e.g. 'model_dev'), which
        # used to produce a hidden file called '.py'; fall back to a non-empty name.
        script_stem = dev_class_name[:-10] or prod_class_name or dev_class_name
        prod_script_file_name = script_stem + '.py'
        with open(prod_script_file_name, 'w', encoding='utf-8') as f:
            f.write(script_text)
        
        print('Production script saved to file ', prod_script_file_name)

### PARAMETERS

In [3]:
# --- Data location ---
PATH_TO_DATA = '/opt/exchange/PROJECTS/BCG_GAMMA/(data)/'
df_filename = 'icl_train.csv'
feat_filename = 'features.csv'

# --- Date boundaries (strings) ---
date_data_start = '2019-01-01 07:00:00'
date_data_finish = '2021-03-31'

date_train_start = '2019-01-01'
date_train_finish = '2021-01-01'

date_valid_start = '2019-01-01'
date_valid_finish = '2021-04-01'

random_state = 42

# Retrospective steps reviewed per sequence: 90 * 24 = 2160
max_seq_len = 90 * 24

# --- Sequenced-data LSTM model preparation dictionary ---
# Feature groups shared by the RNN and the direct inputs
_shared_num_feat = ['time_of_the_day', 'lune_phase', 'school_holiday', 'max_temp', 'min_temp', 'precipitation', 'days_off']
_shared_cat_feat = ['weekday', 'day_month', 'week', 'month', 'hour_day']

# (NAME, PROJECTION, DICT_SIZE) for every categorical feature
_cat_specs = (('weekday',   3,  7),
              ('day_month', 4,  32),
              ('week',      8,  54),
              ('month',     4,  13),
              ('hour_day',  6,  24),
              ('zone',      9,  6),
              ('reason',    10, 10))

data_dict = {
    'TARGET_NAME': 'qty',
    'RNN_NUM_FEAT': _shared_num_feat + ['qty'],
    'RNN_CAT_FEAT': list(_shared_cat_feat),
    'DIR_NUM_FEAT': list(_shared_num_feat),
    # reason must be last feature in the list
    'DIR_CAT_FEAT': _shared_cat_feat + ['zone', 'reason'],
    'CAT': [{'NAME': name, 'PROJECTION': projection, 'DICT_SIZE': dict_size}
            for name, projection, dict_size in _cat_specs],
}

# Integer code -> label as it appears in the source data
reason_dict = dict(enumerate(['Перевозка плановая',
                              'Перевозка экстренная',
                              'без сознания',
                              'боли в животе',
                              'выс. давление( боли в сердце)',
                              'выс. давление( голов.боль,головокруж)',
                              'выс. темп.',
                              'задыхается',
                              'плохо',
                              'плохо с сердцем']))

zone_dict = dict(enumerate(['П/станция 1',
                            'П/станция 2',
                            'П/станция 3',
                            'П/станция 6',
                            'П/станция 8',
                            'П/станция 9']))

In [4]:
%%time
# DATA LOADING AND PREPROCESSING

# Hourly grid: one row per (timestamp, zone) pair for every zone code in zone_dict
hourly_grid = pd.date_range(start=date_data_start, end=date_data_finish, freq='H')
df_flat = pd.concat([pd.DataFrame(data={'date': hourly_grid, 'zone': zone}) for zone in zone_dict], axis=0)

df_feat = pd.read_csv(f'{PATH_TO_DATA}{feat_filename}', index_col=0)

# Data transformation df_feat: rename the Russian weather columns to the names used by the model
mapper={'Максимальная температура, С':'max_temp',
        'Минимальная температура, С':'min_temp',
        'Осадки, часы':'precipitation'}

df_feat.rename(columns=mapper, inplace=True)
df_feat['date'] = pd.to_datetime(df_feat['date'], format='%Y-%m-%d %H:%M:%S')

# Merging with df_feat
cols = ['weekday', 'day_month', 'week', 'month', 'hour_day', 'time_of_the_day', 'lune_phase', 'school_holiday', 'max_temp', 'min_temp', 'precipitation', 'days_off']
int_cols = ['weekday', 'day_month', 'week', 'month', 'hour_day', 'time_of_the_day', 'school_holiday', 'max_temp', 'min_temp', 'precipitation', 'days_off']

# Build the date-indexed lookup once instead of re-indexing df_feat for every column
feat_by_date = df_feat.set_index('date')
for col in cols:
    df_flat[col] = df_flat['date'].map(feat_by_date[col])
# Timestamps missing from df_feat become 0 in the integer features (lune_phase is not in int_cols)
for col in int_cols:
    df_flat[col] = df_flat[col].fillna(0).astype('int8')

# DF_start processing: keep date, zone and one column per reason, restricted to the known zones
df_start = pd.read_csv(f'{PATH_TO_DATA}{df_filename}')
df_start = df_start[['date', 'zone'] + list(reason_dict.values())]
# .copy() so that the column assignments below act on an independent frame, not on a slice
df_start = df_start[df_start['zone'].isin(list(zone_dict.values()))].copy()
# Replace zone labels and reason column names by their integer codes
df_start['zone'] = df_start['zone'].map({label: code for code, label in zone_dict.items()})
df_start.rename(columns={label: code for code, label in reason_dict.items()}, inplace=True)
df_start['date'] = pd.to_datetime(df_start['date'], format='%Y-%m-%d %H:%M:%S')
    
# Merging with df_start
df_flat = pd.merge(left=df_flat,
                   right=df_start,
                   how='left',
                   on=['date', 'zone'])
# Grid rows without a match in df_start get -1 in the reason columns.
# NOTE(review): the 'zone': slice also covers the calendar/weather columns, including
# lune_phase, which is not in int_cols - confirm that -1 fill / int8 cast is intended for them.
df_flat.loc[:,'zone':] = df_flat.loc[:,'zone':].fillna(-1).astype('int8')

del df_feat, df_start, feat_by_date, hourly_grid

CPU times: user 1.68 s, sys: 212 ms, total: 1.9 s
Wall time: 1.9 s


In [5]:
%%time
# df and df_ preparation
feat_lst = data_dict['DIR_NUM_FEAT'] + data_dict['DIR_CAT_FEAT']
# All columns taken straight from df_flat; 'reason' (last in feat_lst) is added per reason below
base_cols = ['date'] + feat_lst[:-1]
df_lst = []

# Long format: stack one copy of the grid per reason, with that reason's column as the target
for col in reason_dict.keys():
    # Explicit copy: the new columns are written to an independent frame, never to a slice of df_flat
    reason_frame = df_flat[base_cols].copy()
    reason_frame[data_dict['TARGET_NAME']] = df_flat[col]
    reason_frame['reason'] = col
    reason_frame['reason'] = reason_frame['reason'].astype('int16')
    df_lst.append(reason_frame)
# reset_index keeps each row's position in df_flat; it is renamed to 'index_flat'
df = pd.concat(df_lst, axis=0)[['date'] + feat_lst + [data_dict['TARGET_NAME']]].reset_index()
df.rename(columns={'index':'index_flat'}, inplace=True)

del df_lst, reason_frame

# forming df for training: keep only rows later than the timestamp at position max_seq_len
# of the sorted unique dates
df_ = df[df['date'] > df_flat['date'].drop_duplicates().sort_values().iloc[max_seq_len]]
df_ = df_.reset_index()

# Converting numeric features to lists (one-element list per cell); applied to df only,
# after df_ has been derived from it
for col in data_dict['RNN_NUM_FEAT']:
    df[col] = df[col].map(lambda x:[x])

CPU times: user 6.43 s, sys: 662 ms, total: 7.09 s
Wall time: 7.09 s


In [6]:
# Binarise the target: 1 where qty is positive, otherwise 0 (stored as int8)
df_['qty'] = df_['qty'].gt(0).astype('int8')

In [7]:
# Display the training frame df_ (pandas truncates the middle rows in the rendered output)
df_

Unnamed: 0,index,index_flat,date,time_of_the_day,lune_phase,school_holiday,max_temp,min_temp,precipitation,days_off,weekday,day_month,week,month,hour_day,zone,reason,qty
0,2161,2161,2019-04-01 08:00:00,1,0,0,5,-1,0,0,0,1,14,4,8,0,0,1
1,2162,2162,2019-04-01 09:00:00,1,0,0,5,-1,0,0,0,1,14,4,9,0,0,1
2,2163,2163,2019-04-01 10:00:00,1,0,0,5,-1,0,0,0,1,14,4,10,0,0,1
3,2164,2164,2019-04-01 11:00:00,1,0,0,5,-1,0,0,0,1,14,4,11,0,0,1
4,2165,2165,2019-04-01 12:00:00,1,0,0,5,-1,0,0,0,1,14,4,12,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050775,1180435,118039,2021-03-30 20:00:00,0,0,0,6,-4,0,0,1,30,13,3,20,5,9,0
1050776,1180436,118040,2021-03-30 21:00:00,0,0,0,6,-4,0,0,1,30,13,3,21,5,9,0
1050777,1180437,118041,2021-03-30 22:00:00,0,0,0,6,-4,0,0,1,30,13,3,22,5,9,0
1050778,1180438,118042,2021-03-30 23:00:00,0,0,0,6,-4,0,0,1,30,13,3,23,5,9,0


In [8]:
# General modelling settings
TARGET_NAME = 'qty'  # name of the target column
cat_columns = [
    'weekday',
    'zone',
    'reason',
]

In [14]:
%%time
# Model preparation
catbm = catb.CatBoostClassifier(
    eval_metric='AUC',
    silent=True,
    iterations=1000,
    random_state=21,
)

models = {'catbm': catbm}

model = model_dev(
    df=df_.drop(['index', 'index_flat', 'date'], axis=1),  # dataset without the bookkeeping columns
    TARGET_NAME=TARGET_NAME,     # name of the target column
    models=models,               # dictionary of models (each must provide fit and predict methods)
    cat_columns=cat_columns,     # names of the categorical feature columns
    scaler_columns=[],           # names of the columns that must be scaled
    scaler_type='std',           # feature scaling type ('none', 'std', 'norm')
    target_class_ratio=-1,       # coefficient for balancing the target class ratio
    downsample=False,            # when True, balancing is done through downsampling
    SKF_splits=5,                # number of validation folds
    sample_weight=True,          # pass a weight for every observation to the CatboostClassifier model
    random_state=42,             # Random_State
)

CPU times: user 16.1 ms, sys: 13 µs, total: 16.1 ms
Wall time: 16 ms


In [15]:
# Train the model container; with train_report=True the run prints the per-fold
# valid/train metric report shown in the cell output
model.fit(train_report=True)


FOLD 1 REPORT
[4mModel                                        Qty     1 %  f1 score    Recall  Precission   ROC AUC      Gini[0m
catbm_1_valid                             210155  19.09%    0.4634    0.7283      0.3398    0.7758    0.5516
        train                             840625  19.09%    0.4682    0.7359      0.3433    0.7815     0.563

FOLD 2 REPORT
[4mModel                                        Qty     1 %  f1 score    Recall  Precission   ROC AUC      Gini[0m
catbm_2_valid                             210155  19.09%    0.4615    0.7278      0.3379    0.7734    0.5468
        train                             840625  19.09%    0.4678     0.738      0.3425    0.7823    0.5646

FOLD 3 REPORT
[4mModel                                        Qty     1 %  f1 score    Recall  Precission   ROC AUC      Gini[0m
catbm_3_valid                             210155  19.09%    0.4613    0.7247      0.3384    0.7739    0.5478
        train                             840625  19.09%   

In [16]:
# Pickle the trained container; prod_mode=True makes save() replace the training-only data
# (fold split list, train results, feature-importance tables) with None in the saved copy
model.save(file_name='prob_model_boosting', prod_mode=True)

Model saved to file: prob_model_boosting
