In [0]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np 
import fasttext 
import csv
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import lightgbm as lgb
import json
import matplotlib.pyplot as plt
import datetime
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from random import randrange
import random, warnings
from shutil import copyfile
import pickle

folder_name='xxxxx'



In [0]:
# helper function for train_and_add_fasttext
def get_and_join_embeddings(X_, model_embed, text_col):
    """
    Output: (pandas Dataframe) 
    copy of X_ with one extra column per embedding dimension, named
    "<text_col>_fasttext_<i>", holding the sentence vector of each
    row's text. Missing text is replaced by "" in the returned frame;
    the frame passed in is not modified.
    _____________________________________________________
    Parameters
    X_: (pandas Dataframe) input data
    model_embed: fasttext model (needs get_sentence_vector, and
                 get_dimension when X_ has no rows)
    text_col: (string) name of text column to be embedded
    """
    text = X_[text_col].fillna("")  # safeguard for NA
    vectors = [model_embed.get_sentence_vector(str(st)) for st in text]
    # Take the width from the vectors themselves rather than assuming 100,
    # so models trained with another `dim` work. An empty frame has no
    # vector to measure, so ask the model instead.
    n_dims = len(vectors[0]) if vectors else model_embed.get_dimension()
    embeddings = pd.DataFrame(
        vectors,
        index=X_.index,  # same index as X_ so the join lines rows up
        columns=[text_col + "_fasttext_{}".format(i) for i in range(n_dims)],
    )
    X_out = X_.copy()
    X_out[text_col] = text
    return X_out.join(embeddings)

# helper function for train_and_add_fasttext
def classify(x, th_1, th_2):
    """
    Output: string
    fasttext-style label ("__label__Low", "__label__Medium" or
    "__label__High") for the value x
    _____________________________________________________
    Parameters:
    x: (float) value to be classified
    th_1: (float) values strictly below this are Low
    th_2: (float) values strictly below this (and not Low) are Medium;
          everything else is High
    """
    # Check the upper bounds in order; the first one x falls strictly
    # below decides the label.
    for upper_bound, label in ((th_1, "__label__Low"), (th_2, "__label__Medium")):
        if x < upper_bound:
            return label
    return "__label__High"


def train_and_add_fasttext( X_train, X_valid, X_test, col_for_label, text_col):
    """
    Output: (pandas Dataframe tuple)
    X_train, X_valid, X_test with fasttext embeddings 
    trained and added (each also carries the helper column
    "Classification_balanced")
    _____________________________________________________
    Parameters
    X_train: (pandas Dataframe) train data
    X_valid: (pandas Dataframe) valid data
    X_test: (pandas Dataframe) test data
    col_for_label: (string) column used for creating labels
    text_col: (string) name of text column to be embedded

    Notes
    - "Classification_balanced" is also written onto the frames passed in.
    - The label thresholds are the 1/3 and 2/3 quantiles of col_for_label
      over train, valid and test combined.
    - The training text is written to
      /dbfs/mnt/<folder_name>/raw data/sd_fasttext/temp_text_train.txt,
      overwriting any previous file.
    """
    # create balanced labels
    full_data_set = pd.concat([X_train, X_valid, X_test])
    th1 = full_data_set[col_for_label].quantile(1/3)
    th2 = full_data_set[col_for_label].quantile(2/3)

    for X_ in (X_train, X_valid, X_test):
        X_["Classification_balanced"] = X_[col_for_label].apply(lambda x: classify(x, th1, th2))

    # build dataset: text first, label second. Selecting columns keeps
    # rows aligned by position even if the index has duplicates.
    data_train_text = X_train[[text_col, "Classification_balanced"]]

    # save set for immediate use (training), overwriting previous save.
    # The file goes inside the sd_fasttext directory that is created here.
    data_dir = os.path.join('/dbfs/mnt/' + folder_name + '/raw data', 'sd_fasttext')
    os.makedirs(data_dir, exist_ok=True)
    train_file = os.path.join(data_dir, 'temp_text_train.txt')

    data_train_text.to_csv(train_file,
                           index = False,
                           sep = ' ',
                           header = None,
                           quoting = csv.QUOTE_NONE,
                           quotechar = "",
                           escapechar = " ")

    # train fasttext model
    model_embeddings = fasttext.train_supervised(train_file,
                                                 wordNgrams = 2,
                                                 epoch=10,
                                                 dim=100)

    # Include embeddings in dataset
    X_train = get_and_join_embeddings(X_train, model_embeddings, text_col)
    X_valid = get_and_join_embeddings(X_valid, model_embeddings, text_col)
    X_test = get_and_join_embeddings(X_test, model_embeddings, text_col)

    return (X_train, X_valid, X_test)
  
def train_save_and_add_fasttext( X_train, X_valid, col_for_label, text_col, write_folder='/tmp/', output_folder='xxxx', output_suffix=''):
    """
    Output: (pandas Dataframe tuple)
    X_train, X_valid with fasttext embeddings trained and added
    (each also carries the helper column "Classification_balanced").
    The trained model is saved as
    <output_folder>fasttext_model<output_suffix>.bin
    _____________________________________________________
    Parameters
    X_train: (pandas Dataframe) train data
    X_valid: (pandas Dataframe) valid data
    col_for_label: (string) column used for creating labels
    text_col: (string) name of text column to be embedded
    write_folder: (string) scratch folder for the training text and the
                  first copy of the model; used as a plain string
                  prefix, so it needs its trailing separator
    output_folder: (string) prefix of the final model path (same
                   trailing-separator rule)
    output_suffix: (string) suffix added to the model file name

    Notes
    - "Classification_balanced" is also written onto the frames passed in.
    - The label thresholds are the 1/3 and 2/3 quantiles of col_for_label
      over train and valid combined.
    """
    # create balanced labels
    full_data_set = pd.concat([X_train, X_valid])
    th1 = full_data_set[col_for_label].quantile(1/3)
    th2 = full_data_set[col_for_label].quantile(2/3)

    for X_ in (X_train, X_valid):
        X_["Classification_balanced"] = X_[col_for_label].apply(lambda x: classify(x, th1, th2))

    # build dataset: text first, label second. Selecting columns keeps
    # rows aligned by position even if the index has duplicates.
    data_train_text = X_train[[text_col, "Classification_balanced"]]

    # save set for immediate use (training), overwriting previous save
    data_dir = write_folder + "sd_fasttext/"
    # makedirs also creates a missing write_folder and tolerates an
    # already existing directory
    os.makedirs(data_dir, exist_ok=True)
    train_file = data_dir + 'temp_text_train.txt'

    data_train_text.to_csv(train_file,
                           index = False,
                           sep = ' ',
                           header = None,
                           quoting = csv.QUOTE_NONE,
                           quotechar = "",
                           escapechar = " ")

    # train fasttext model
    model_embeddings = fasttext.train_supervised(train_file,
                                                 wordNgrams = 2,
                                                 epoch=10,
                                                 dim=100)

    # save to the scratch folder first, then copy to the final location
    tmp_model_path = write_folder + 'fasttext_model.bin'
    final_model_path = output_folder + 'fasttext_model' + output_suffix + '.bin'
    model_embeddings.save_model(tmp_model_path)
    copyfile(tmp_model_path, final_model_path)
    print('Saved fasttext model in ' + final_model_path)

    # Include embeddings in dataset
    X_train = get_and_join_embeddings(X_train, model_embeddings, text_col)
    X_valid = get_and_join_embeddings(X_valid, model_embeddings, text_col)

    return (X_train, X_valid)

def train_and_add_LDA( X_train, y_train, X_valid, X_test, ncomp):
    """
    Output: (array tuple)
    X_train, X_valid, X_test, each with its LDA projection appended
    as extra column(s) on the right
    _____________________________________________________
    Parameters
    X_train: (array) train data, also used to fit the LDA
    y_train: (1D array) training labels
    X_valid: (array) valid data
    X_test: (array) test data
    ncomp: (int) nb dimensions of LDA output
    
    """
    projector = LinearDiscriminantAnalysis(n_components=ncomp)
    projector.fit(X_train, y_train)
    # The same fitted projection is applied to all three sets.
    augmented = [np.c_[features, projector.transform(features)]
                 for features in (X_train, X_valid, X_test)]
    return tuple(augmented)

def train_save_and_add_LDA( X_train, y_train, X_valid, ncomp, write_folder='/tmp/', output_folder='xxxx', output_suffix=''):
    """
    Output: (array tuple)
    X_train, X_valid, each with its LDA projection appended as extra
    column(s) on the right. The fitted LDA is pickled to
    <output_folder>lda_model<output_suffix>.pkl
    _____________________________________________________
    Parameters
    X_train: (array) train data, also used to fit the LDA
    y_train: (1D array) training labels
    X_valid: (array) valid data
    ncomp: (int) nb dimensions of LDA output
    write_folder: (string) scratch folder the pickle is written to
                  first; used as a plain string prefix, so it needs its
                  trailing separator
    output_folder: (string) prefix of the final pickle path (same
                   trailing-separator rule)
    output_suffix: (string) suffix added to the pickle file name
    
    """
    lda_model = LinearDiscriminantAnalysis(n_components=ncomp).fit(X_train, y_train)

    file_name = 'lda_model' + output_suffix + '.pkl'
    tmp_path = write_folder + file_name
    final_path = output_folder + file_name
    with open(tmp_path, 'wb') as file:
        pickle.dump(lda_model, file)
    copyfile(tmp_path, final_path)
    print('Saved LDA model in ' + final_path)

    # The fitted model is already in memory, so it is used directly
    # rather than being read back from the pickle that was just written.
    X_train_lda = lda_model.transform(X_train)
    X_valid_lda = lda_model.transform(X_valid)
    return np.c_[X_train, X_train_lda], np.c_[X_valid, X_valid_lda]


In [0]:
def split_random_train_test(index_length, train_pct=0.75, seed = 1):
    """
    Output: (list of two int lists) [ind_train, ind_test]
    random split of the positions 0 .. index_length-1; both lists are
    in ascending order and together cover every position exactly once
    _____________________________________________________
    Parameters
    index_length: (int) number of rows to split
    train_pct: (float) share of rows drawn for training (rounded down)
    seed: (int) seed for the random module, so the split is reproducible
    """
    random.seed(seed)
    n_train = int(np.floor(train_pct * index_length))
    ind_train = sorted(random.sample(range(index_length), n_train))
    # Everything that was not drawn for training is test.
    drawn = set(ind_train)
    ind_test = [position for position in range(index_length) if position not in drawn]
    return [ind_train, ind_test]
    
def split_random_train_test_valid(index_length, train_pct=0.75, test_pct = 0.15, seed = 1):
    """
    Output: (list of three int lists) [ind_train, ind_test, ind_valid]
    random split of the positions 0 .. index_length-1. ind_train and
    ind_valid are in ascending order; ind_test keeps the order in which
    it was drawn. Validation gets whatever train and test leave over.
    _____________________________________________________
    Parameters
    index_length: (int) number of rows to split
    train_pct: (float) share of rows drawn for training (rounded down)
    test_pct: (float) share of ALL rows drawn for testing (rounded down)
    seed: (int) seed for the random module, so the split is reproducible
    """
    random.seed(seed)
    positions = range(index_length)

    n_train = int(np.floor(train_pct * index_length))
    ind_train = sorted(random.sample(positions, n_train))
    in_train = set(ind_train)
    not_train = [position for position in positions if position not in in_train]

    # The test rows are drawn from what training left over.
    n_test = int(np.floor(test_pct * index_length))
    ind_test = random.sample(not_train, n_test)
    in_test = set(ind_test)
    ind_valid = [position for position in not_train if position not in in_test]
    return [ind_train, ind_test, ind_valid]

def prepare_data_for_training(path_to_data = folder_name + 'data_prepped.csv',
                              method = 'random', # no alternative implemented here, but could be chronological time series splits
                              seed = 1,
                              column_for_classification = 'Gross_Incurred_Detrended',
                              quantile_high = 0.90,
                              quantile_medium = 0.80,
                              proportion_train = 0.80,
                              proportion_test = 0.15,
                              cutoff_claim_loss_date = 2020, # Claims with Claim_Loss_Date from this point onwards are discarded,
                              path_to_output_train = folder_name + 'data_train.csv',
                              path_to_output_valid = folder_name + 'data_valid.csv',
                              path_to_output_test = folder_name + 'data_test.csv'
                             ):
    """
    Output: (pandas Dataframe tuple)
    data_train, data_valid, data_test: DataFrames that together equal the total data set (with adjustments such as removing undeveloped claims, sorting, and labels added)
    _____________________________________________________
    Parameters
    path_to_data: (string) path to csv file with data (required to be clean first using prepare_dataset())
    method: (string) currently not used by the code; the split is always random. Alternative splits such as time series splits could be implemented here
    seed: (int) seed to be used for randomly dividing data into train, valid and test
    column_for_classification: (string) numeric column the quantile thresholds are computed on
    quantile_high: (float) quantile for high claims (default: 0.90, i.e. only top 10% of claims are high)
    quantile_medium: (float) quantile for medium claims (default: 0.80, i.e. 80% of claims are lower than medium)
    proportion_train: (float) proportion of data that is used for training
    proportion_test: (float) proportion of data that is used for testing. Note that 1 - proportion_train - proportion_test will be used for validation
    cutoff_claim_loss_date: (int or string) claims whose Claim_Loss_Date is not below str(cutoff_claim_loss_date) are discarded as insufficiently developed. The comparison is on strings
    path_to_output_train: (string) path to where the train data set is saved as csv
    path_to_output_valid: (string) path to where the valid data set is saved as csv
    path_to_output_test: (string) path to where the test data set is saved as csv
    """
    data = pd.read_csv(path_to_data, low_memory=False) # read data
    data = data[data["Claim_Loss_Date"]<str(cutoff_claim_loss_date)] # cut-off claims that aren't sufficiently developed
    data = data.sort_values(by="Claim_Loss_Date").reset_index(drop=True) # sort by Claim_Loss_Date

    # Calculate thresholds and set levels for the given high and medium quantile
    values = data[column_for_classification]
    high_threshold = values.quantile(quantile_high) # threshold between medium and high
    medium_threshold = values.quantile(quantile_medium) # threshold between low and medium
    data['Classification'] = 'Low'
    data.loc[medium_threshold < values,'Classification'] = 'Medium'
    data.loc[high_threshold < values,'Classification'] = 'High'

    # Classification for training (binary, high vs non-high, 1 or 0)
    data['Classification_for_training'] = 0
    data.loc[high_threshold < values,'Classification_for_training'] = 1

    # Split data into train, valid and test with the shared helper, so the
    # split logic lives in one place. The index was reset above, so the
    # returned positions are also the row labels.
    ind_train, ind_test, ind_valid = split_random_train_test_valid(len(data),
                                                                   train_pct=proportion_train,
                                                                   test_pct=proportion_test,
                                                                   seed=seed)
    data_train, data_valid, data_test = data.reindex(ind_train), data.reindex(ind_valid), data.reindex(ind_test)
    data_train.to_csv(path_to_output_train, index=False)
    data_valid.to_csv(path_to_output_valid, index=False)
    data_test.to_csv(path_to_output_test, index=False)
    return data_train, data_valid, data_test


def set_model_settings(lgbm_params = {'num_leaves': 31, 'objective': 'binary', 'boosting':'dart', 'num_iterations':1000, 'verbose': 0 },
                       path_to_model_columns = folder_name + 'Model_v14.08.2022/model_columns.csv',
                       add_Fasttext = True,
                       add_LDA = True,
                       path_to_output = folder_name + 'Model_v14.08.2022/model_settings.pkl',
                       write_folder = '/tmp/'
                      ):
    """
    Updates and saves settings for LGBM model
    Output: (dictionary) dictionary of model settings with keys
    'lgbm_params', 'model_columns', 'add_Fasttext' and 'add_LDA'
    _____________________________________________________
    Parameters
    lgbm_params: (dictionary) dictionary with lgbm training settings. It is copied, not modified; the copy gets an extra key 'categorical_feature'
    path_to_model_columns: (string) csv file with columns to include for training (column "columns"), as well as which columns are to be treated as categorical (column "is_cat_feature")
    add_Fasttext: (bool) include Fasttext embedding in model or not
    add_LDA: (bool) include LDA in model or not
    path_to_output: (string) pkl file where model settings will be stored
    write_folder: (string) folder where intermediate steps can temporarily be written to (necessary due to Databricks restrictions)
    """
    # Work on a copy: adding 'categorical_feature' to the argument itself
    # would alter the caller's dict and the default dict shared by all calls.
    lgbm_params = dict(lgbm_params)

    columns_df = pd.read_csv(path_to_model_columns) # read csv file with which columns to include
    model_columns = columns_df["columns"].to_list()
    is_categorical = columns_df["is_cat_feature"] == True # explicit comparison: missing flags count as not categorical
    cat_columns = columns_df.loc[is_categorical, "columns"].to_list()
    # lgbm is given the categorical features as positions within model_columns
    lgbm_params['categorical_feature'] = [model_columns.index(c) for c in cat_columns]

    model_settings = {'lgbm_params':lgbm_params, 'model_columns': model_columns, 'add_Fasttext':add_Fasttext, 'add_LDA':add_LDA} # model settings are combined here
    tmp_path = write_folder + 'model_settings.pkl'
    with open(tmp_path,'wb') as file:
        pickle.dump(model_settings, file) # save model settings to temporary folder
    copyfile(tmp_path, path_to_output) # copy model settings to final folder
    print()
    print('model settings saved in ' + path_to_output )  
    print(model_settings)
    print()
    return model_settings


def train_lgbm_binary(data_train,  data_valid,
                    lgb_params, 
                    cols, 
                    text_col_to_embed="concatenated_text", 
                    add_LDA=False,
                    add_Fasttext=True,
                    col_for_label="Gross_Incurred_Detrended",
                    col_with_label="Classification_for_training",
                    time_col_name="Claim_Loss_Date",
                    write_folder='/tmp/', 
                    output_folder='xxxx', 
                    output_suffix=''):
    """
    Trains a binary LGBM classifier on data_train, evaluated on data_valid,
    optionally adding fasttext embeddings and an LDA projection as
    features first. The fitted classifier is pickled to
    <output_folder>lgbm_model<output_suffix>.pkl
    Output: (lgb.LGBMClassifier) Fitted LGBM Model
    _____________________________________________________
    Parameters
    data_train: (pandas Dataframe) training data
    data_valid: (pandas Dataframe) validation data (used as eval_set)
    lgb_params: (dict) parameters for the LGBM classifier
    cols: (string list) columns to use for training the model
    text_col_to_embed: (string) column to embed in fasttext
    add_LDA: (bool) if True, add LDA column before training LGBM
    add_Fasttext: (bool) if True, add fasttext embedding before training LGBM
    col_for_label: (string) column used for generating labels
    col_with_label: (string) column with the current (actual) labels
    time_col_name: (string) not used in this function
    write_folder: (string) scratch folder files are written to before being copied (plain string prefix)
    output_folder: (string) prefix of the final model file paths (plain string prefix)
    output_suffix: (string) suffix added to the model file names
    """
    print("Training model with training set of size " + str(data_train.shape[0]) + " and validation set of size " + str(data_valid.shape[0]) )
    print("-----------------------------------------------------------------------------------")
    print()
    
    # Select columns
    y_train, y_valid = data_train[col_with_label], data_valid[col_with_label]
    # the text and label-source columns are carried along for the fasttext
    # step only; both are dropped again before the lgbm is trained
    cols_full = cols + [text_col_to_embed, col_for_label]
    X_train, X_valid = data_train[cols_full], data_valid[cols_full]
    #print(X_train.dtypes)
    
    # add fasttext embeddings
    if add_Fasttext: 
        print()
        print("Training Fasttext embedding...")
        X_train, X_valid = train_save_and_add_fasttext(X_train, X_valid, col_for_label=col_for_label, text_col=text_col_to_embed, write_folder = write_folder, output_folder = output_folder, output_suffix = output_suffix) 
        # helper column added by train_save_and_add_fasttext; not a model feature
        X_train = X_train.drop(columns="Classification_balanced")
        X_valid = X_valid.drop(columns="Classification_balanced")
    X_train = X_train.drop(columns=[text_col_to_embed, col_for_label])
    X_valid = X_valid.drop(columns=[text_col_to_embed, col_for_label])
    
    # Make array
    y_train, y_valid = y_train.to_numpy(), y_valid.to_numpy()    
    X_train, X_valid = X_train.to_numpy(), X_valid.to_numpy()
 
    # add LDA features
    if add_LDA: 
        print()
        print("Training Linear Discriminant Analysis...")
        X_train, X_valid = train_save_and_add_LDA( X_train,y_train, X_valid, ncomp=1, write_folder = write_folder, output_folder = output_folder, output_suffix = output_suffix) 
        
    import warnings
    # NOTE(review): this installs an "ignore everything" filter that is not
    # removed again, so warnings stay silenced after this function returns.
    warnings.filterwarnings('ignore')
    # Light GBM model
    bst = lgb.LGBMClassifier(**lgb_params)   # Initialise lgbm
    print()
    print("Training lgbm...")
    callbacks = [lgb.early_stopping(10, verbose=-1), lgb.log_evaluation(period=100)] # Settings for printing output during training
    bst.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], callbacks=callbacks) # Actual training
    
    # Save results: pickle to the scratch folder, then copy to the final location
    with open(write_folder + 'lgbm_model' + output_suffix + '.pkl','wb') as file:
        pickle.dump(bst, file)
    copyfile(write_folder + 'lgbm_model' + output_suffix + '.pkl', output_folder + 'lgbm_model' + output_suffix + '.pkl')
    print()
    print('lgbm model saved in ' + output_folder + 'lgbm_model' + output_suffix + '.pkl' )        
    return bst

def test_lgbm_binary(data_test,
                    cols, 
                    text_col_to_embed="concatenated_text", 
                    add_Fasttext=True,
                    add_LDA=False,
                    output_folder='xxxx', 
                    output_suffix='',
                    col_predicted_prob = 'predicted_prob'
                    ):
    """
    Applies the saved fasttext / LDA / lgbm models to data_test
    Output: (Dataframe) data_test itself, with the predicted probability
    of the second class (column 1 of predict_proba) added in
    col_predicted_prob
    _____________________________________________________
    Parameters
    data_test: (pandas Dataframe) full dataset; modified in place
    cols: (string list) columns the model was trained on
    text_col_to_embed: (string) column to embed in fasttext
    add_Fasttext: (bool) if True, add fasttext embedding before predicting
    add_LDA: (bool) if True, add LDA column before predicting
    output_folder: (string) prefix of the saved model file paths
    output_suffix: (string) suffix that was added to the model file names
    col_predicted_prob: (string) name of the column to write
    """
    def saved_path(stem, extension):
        # model files are named <output_folder><stem><output_suffix><extension>
        return output_folder + stem + output_suffix + extension

    features = data_test[cols + [text_col_to_embed]] # remove unnecessary columns
    if add_Fasttext:
        embedder = fasttext.load_model(saved_path('fasttext_model', '.bin'))
        features = get_and_join_embeddings(features, embedder, text_col_to_embed)
    # the raw text itself is never a model feature
    feature_matrix = features.drop(columns=text_col_to_embed).to_numpy()

    if add_LDA:
        with open(saved_path('lda_model', '.pkl'), 'rb') as file:
            lda_model = pickle.load(file)
        feature_matrix = np.c_[feature_matrix, lda_model.transform(feature_matrix)]

    with open(saved_path('lgbm_model', '.pkl'), 'rb') as file:
        lgb_model = pickle.load(file)
    data_test[col_predicted_prob] = lgb_model.predict_proba(feature_matrix)[:, 1]
    return data_test


In [0]:
# Train model
def train_model(path_to_data_train = folder_name + 'data_train.csv',
                path_to_data_valid = folder_name + 'data_valid.csv',
                path_to_model_settings = folder_name + 'model_settings.pkl', 
                output_folder = folder_name,
                output_suffix = "_v14.08.2022",
                col_for_label='Gross_Incurred_Detrended'
               ):
    """
    Trains a binary LGBM classifier on the training set, evaluated on
    the validation set, via train_lgbm_binary
    Output: the trained model as returned by train_lgbm_binary
    _____________________________________________________
    Parameters
    path_to_data_train: (string) csv file with training data  
    path_to_data_valid: (string) csv file with validation data
    path_to_model_settings: (string) pkl file with model settings (keys 'lgbm_params', 'model_columns', 'add_Fasttext', 'add_LDA')
    output_folder: (string) folder where model files will be stored
    output_suffix: (string) (optional) suffix to be added to model file names
    col_for_label: (string) column used for generating the fasttext labels
    """
    data_train = pd.read_csv(path_to_data_train, low_memory=False) # reads data set used for training
    data_valid = pd.read_csv(path_to_data_valid, low_memory=False) # reads data set used for validation

    # NOTE: pickle executes code on load; only point this at settings
    # files produced by this project.
    with open(path_to_model_settings,'rb') as file:
        model_settings = pickle.load(file) # loads model settings

    return train_lgbm_binary(data_train = data_train,
                             data_valid = data_valid,
                             lgb_params = model_settings['lgbm_params'],
                             cols = model_settings['model_columns'],
                             text_col_to_embed = 'concatenated_text',
                             add_Fasttext = model_settings['add_Fasttext'],
                             add_LDA = model_settings['add_LDA'],
                             col_for_label = col_for_label,
                             col_with_label = 'Classification_for_training',
                             time_col_name = 'Claim_Loss_Date',
                             write_folder = '/tmp/', # temporary writing folder because Databricks has restrictions on writing to files
                             output_folder = output_folder,
                             output_suffix = output_suffix)

In [0]:
def load_and_apply_lgbm(path_to_data = folder_name + 'unlabelled_data_prepped.csv',
                        model_folder = '/dbfs/mnt/'+folder_name+'/clean_gbm/',
                        model_suffix = "v14.08.2022",
                        path_to_model_settings = folder_name + 'model_settings.pkl', 
                        path_to_output = folder_name + 'unlabelled_data_predicted_probs.csv',
                        col_predicted_prob = 'predicted_prob'
                       ):
    """
    Applies a trained binary LGBM classifier to a given data set and
    saves the result as csv
    Output: (pandas DataFrame) data set with a column of predicted probabilities added
    _____________________________________________________
    Parameters
    path_to_data: (string) csv file with data for which probabilities will be predicted  
    model_folder: (string) folder where model files are stored
    model_suffix: (string) suffix that was added to model files
    path_to_model_settings: (string) pkl file with model settings
    path_to_output: (string) csv file where data set with added predicted probabilities will be stored
    col_predicted_prob: (string) name of the column the probabilities are written to
    """
    # NOTE(review): the default model_suffix has no leading underscore while
    # train_model's default output_suffix is "_v14.08.2022" - confirm the
    # suffix passed here matches the one the models were saved with.
    unlabelled = pd.read_csv(path_to_data, low_memory=False) # read data to apply model to

    with open(path_to_model_settings,'rb') as file:
        settings = pickle.load(file) # load model settings

    # Use the same columns and feature switches the model was trained with.
    scoring_options = {
        'cols': settings['model_columns'],
        'text_col_to_embed': "concatenated_text",
        'add_Fasttext': settings['add_Fasttext'],
        'add_LDA': settings['add_LDA'],
        'output_folder': model_folder,
        'output_suffix': model_suffix,
        'col_predicted_prob': col_predicted_prob,
    }
    scored = test_lgbm_binary(data_test = unlabelled, **scoring_options)

    scored.to_csv(path_to_output, index=False) # save data set with added probabilities
    return scored
    

In [0]:
def tp_tn_fp_fn(predicted, actual):
    """
    Output: (int tuple)
    for binary classification evaluation
    number of true positives, true negatives, false positives, 
    false negatives 
    _____________________________________________________
    Parameters:
    predicted: (int sequence) predicted classes; 0 is negative, any
               other value counts as a positive prediction
    actual: (int sequence) labels, at least as long as predicted
            (entries beyond len(predicted) are ignored)

    Raises IndexError when actual is shorter than predicted.
    """
    if len(actual) < len(predicted):
        raise IndexError("actual has fewer entries than predicted")

    tp, tn, fp, fn = 0, 0, 0, 0
    # Pair the values by position. Indexing with [i] would be a label
    # lookup on a pandas Series and break on any index other than 0..n-1.
    for pred, label in zip(predicted, actual):
        correct = pred == label
        if pred == 0:
            if correct:
                tn += 1
            else:
                fn += 1
        elif correct:
            tp += 1
        else:
            fp += 1
    return tp, tn, fp, fn

def precision(tp, tn, fp, fn):
    """
    Output: (float)
    precision based on true / false +ve, true / false -ve:
    tp / (tp + fp), or NaN when nothing was predicted positive
    _____________________________________________________
    Parameters:
    tp, tn, fp, fn : (all int) number of true positives, 
                    true negatives, false positives, false 
                    negatives (tn and fn are not used)
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        # undefined without positive predictions; report NaN instead of
        # failing with a ZeroDivisionError
        return float("nan")
    return tp / predicted_positive

def recall(tp, tn, fp, fn):
    """
    Output: (float)
    recall based on true / false +ve, true / false -ve:
    tp / (tp + fn), or NaN when there are no actual positives
    _____________________________________________________
    Parameters:
    tp, tn, fp, fn : (all int) number of true positives, 
                    true negatives, false positives, false 
                    negatives (tn and fp are not used)
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        # undefined without actual positives; report NaN instead of
        # failing with a ZeroDivisionError
        return float("nan")
    return tp / actual_positive

def total_predict(tp, tn, fp, fn):
    """
    Output: (float)
    proportion predicted 1 based on true / false +ve and
    true / false -ve: (tp + fp) / (tp + tn + fp + fn), or NaN when all
    four counts are zero
    _____________________________________________________
    Parameters:
    tp, tn, fp, fn : (all int) number of true positives, 
                    true negatives, false positives, false 
                    negatives 
    """
    total = tp + tn + fp + fn
    if total == 0:
        # no observations at all; report NaN instead of failing with a
        # ZeroDivisionError
        return float("nan")
    return (tp + fp) / total

# helper functions for prediction 

def predict(y, level, ind):
    """
    Output: (int list)
    one 0/1 prediction per entry of y: 1 when the relevant probability
    is strictly above level, 0 otherwise
    _____________________________________________________
    Parameters:
    y: (iterable) sequence of probability class predictions
    level: (float) between (0, 1) - probability threshold 
            to predict 1
    ind: (int) if 0: y is a list of probabilities
               else: y is a list of vectors of probabilities,
                     and ind gives the index of the class we 
                     want to predict
    """
    # ind == 0 means "y is already flat"; otherwise pick class `ind`
    # out of every probability vector first.
    if ind == 0:
        scores = y
    else:
        scores = (vector[ind] for vector in y)
    return [int(score > level) for score in scores]


# helper function for find_thresh
def calc_total_predict(pred_weight, actual, level):
    """
    Output: (float)
    proportion of observations predicted 1 when the class-1
    probabilities in pred_weight are cut at level
    _____________________________________________________
    Parameters:
    pred_weight: (float list) prediction probabilities of class 1 
    actual: (int list) list of labels
    level: (float) between (0, 1) - probability threshold 
            to predict 1
    """
    # threshold -> confusion counts -> share of positive predictions
    confusion_counts = tp_tn_fp_fn(predict(pred_weight, level, 0), actual)
    return total_predict(*confusion_counts)

def find_thresh(pred_weight, actual, level_a, level_b, predicted=0.1, error=0.001):
    """
    Output: (float tuple) (level, amount)
    probability threshold level needed to reach an amount 
    of 1 prediction (within error on amount), together with the amount
    actually reached at that level
    (obtained by dichotomy search)
    _____________________________________________________
    Parameters:
    pred_weight: (float list) prediction probabilities of class 1 
    actual: (int list) list of labels
    level_a: (float) between (0, 1) - probability threshold lower bound
    level_b: (float) between (0, 1) - probability threshold upper bound
    predicted: (float) amount of prediction 1 aimed
    error: (float) absolute error allowed on amount of prediction 1

    Note: there is no cap on the number of halvings. If no level
    between level_a and level_b reaches the target within error, the
    recursion only ends at Python's recursion limit.
    """
    pos_a = calc_total_predict(pred_weight, actual, level_a)
    pos_b = calc_total_predict(pred_weight, actual, level_b)

    if abs(pos_a - predicted) < error:
        return level_a, pos_a
    if abs(pos_b - predicted) < error:
        return level_b, pos_b

    level_c = (level_a + level_b) / 2
    pos_c = calc_total_predict(pred_weight, actual, level_c)

    if abs(pos_c - predicted) < error:
        return level_c, pos_c
    # A higher threshold predicts fewer 1s: too many at the midpoint
    # means searching the upper half, too few the lower half. `error`
    # is passed on so the caller's tolerance applies at every depth.
    if pos_c > predicted:
        return find_thresh(pred_weight, actual, level_c, level_b, predicted=predicted, error=error)
    if pos_c < predicted:
        return find_thresh(pred_weight, actual, level_a, level_c, predicted=predicted, error=error)
    # only reached when pos_c compares neither above nor below the target
    return None

def get_probability_threshold(data, col_predicted_prob='predicted_prob', target_pct = 0.10):
    """
    Output: (float)
    predicted probability found at position int((1 - target_pct) * n)
    of the ascending-sorted probabilities (n = number of rows); the
    position is capped at the last row, so target_pct = 0 gives the
    largest probability
    _____________________________________________________
    Parameters:
    data: (pandas Dataframe) data with a predicted probability column
    col_predicted_prob: (string) name of that column
    target_pct: (float) between [0, 1] - share of rows aimed to lie
                above the threshold

    Raises ValueError when target_pct is outside [0, 1] or data is empty.
    """
    if not 0 <= target_pct <= 1:
        raise ValueError("target_pct must be between 0 and 1")
    # only the probability column is needed, so only that column is sorted
    sorted_probs = data[col_predicted_prob].sort_values()
    n_rows = len(sorted_probs)
    if n_rows == 0:
        raise ValueError("data has no rows")
    position = min(int((1 - target_pct) * n_rows), n_rows - 1)
    return sorted_probs.iloc[position]

def get_probability_thresholds(data, col_predicted_prob='predicted_prob', quantiles = [0.10, 0.20]):
    """
    Output: (float list)
    one threshold per entry of quantiles, in the same order: the
    predicted probability found at position int((1 - q) * n) of the
    ascending-sorted probabilities (n = number of rows); the position
    is capped at the last row, so q = 0 gives the largest probability
    _____________________________________________________
    Parameters:
    data: (pandas Dataframe) data with a predicted probability column
    col_predicted_prob: (string) name of that column
    quantiles: (float list) each between [0, 1] - shares of rows aimed
               to lie above the respective threshold (not modified)

    Raises ValueError when a quantile is outside [0, 1] or data is empty.
    """
    # only the probability column is needed, and it is sorted once
    sorted_probs = data[col_predicted_prob].sort_values()
    n_rows = len(sorted_probs)
    if n_rows == 0:
        raise ValueError("data has no rows")

    thresholds = []
    for q in quantiles:
        if not 0 <= q <= 1:
            raise ValueError("quantiles must be between 0 and 1")
        position = min(int((1 - q) * n_rows), n_rows - 1)
        thresholds.append(sorted_probs.iloc[position])
    return thresholds


def predict_labels_from_probs(path_to_data = 'unlabelled_data_predicted_probs', 
                              probability_thresholds = [0.3], # cut-offs; the label counts how many are exceeded
                              col_predicted_prob='predicted_prob',
                              col_predicted_label='predicted_label',
                              path_to_output = folder_name + 'data_predicted_labels.csv'):
    """
    Output: (pandas Dataframe)
    data read from path_to_data with col_predicted_label added; the same
    dataset is also written to path_to_output as csv (without index)
    _____________________________________________________
    Parameters
    path_to_data: (string) csv file holding the predicted probabilities
    probability_thresholds: (float list) probability cut-offs; the label of
        a row is the number of cut-offs its probability is strictly above
        (0 / 1 for a single cut-off)
    col_predicted_prob: (string) name of the probability column
    col_predicted_label: (string) name of the label column to create
    path_to_output: (string) where the labelled data is written
    """
    data = pd.read_csv(path_to_data, low_memory=False)
    data[col_predicted_label] = 0
    # The label is a count of exceeded cut-offs, so their order is irrelevant:
    # the thresholds are only read, never sorted in place, which keeps the
    # caller's list (and the shared default list) unmodified.
    for threshold in probability_thresholds:
        # Vectorized comparison; missing probabilities compare False and add 0.
        exceeds = (data[col_predicted_prob] > threshold).astype(int)
        data[col_predicted_label] = data[col_predicted_label] + exceeds
    data.to_csv(path_to_output,index=False)
    return data

def print_performance_metrics(path_to_data = folder_name + 'data_predicted_labels.csv',
                         col_actual_labels = 'Classification_for_training',
                         col_predicted_label = 'predicted_label',
                         col_predicted_probability = 'predicted_prob'):
    """
    Output: (dict)
    performance measures computed on the labelled predictions, with keys
    AUC, recall, precision, ratio_high, TP, TN, FP, FN
    (the dictionary is also printed)
    _____________________________________________________
    Parameters
    path_to_data: (string) csv file with actual labels, predicted labels
        and predicted probabilities
    col_actual_labels: (string) name of the actual label column
    col_predicted_label: (string) name of the predicted label column
    col_predicted_probability: (string) name of the predicted probability column
    """
    scored = pd.read_csv(path_to_data, low_memory=False)

    # Confusion counts: true/false positives/negatives.
    predicted_labels = scored[col_predicted_label]
    actual_labels = scored[col_actual_labels]
    tp, tn, fp, fn = tp_tn_fp_fn(predicted_labels, actual_labels)

    # Measures derived from the counts, then the AUC from the probabilities.
    recall_value = recall(tp, tn, fp, fn)
    precision_value = precision(tp, tn, fp, fn)
    ratio_high = total_predict(tp, tn, fp, fn)
    auc = roc_auc_score(scored[col_actual_labels], scored[col_predicted_probability])

    performance_metrics = {
        'AUC': auc,
        'recall': recall_value,
        'precision': precision_value,
        'ratio_high': ratio_high,
        'TP': tp,
        'TN': tn,
        'FP': fp,
        'FN': fn,
    }

    # Blank line, title, then the metrics: one item per output line.
    print("", "Test performance measures:", performance_metrics, sep="\n")
    return performance_metrics
    
def print_model_diagnostics(model, model_settings):
    """
    Output: None
    (prints the number of features of the model and draws a horizontal
    bar chart of its feature importances)
    _____________________________________________________
    Parameters
    model: trained model exposing n_features_ and feature_importances_
    model_settings: (dict) settings used to build the model; reads
        'model_columns' (list of column names), 'add_Fasttext' and
        'add_LDA' (flags telling whether the 100 fasttext features /
        the LDA feature were appended to the model columns)
    """
    import matplotlib.pyplot as plt

    print()
    print("Model has " + str(model.n_features_) + " features.")

    # Importances come from the model passed in, not from a global
    # `trained_model` that may be undefined or refer to another model.
    importances = model.feature_importances_

    # Rebuild the feature names on a copy so model_settings is never modified.
    feature_names = list(model_settings['model_columns'])
    if model_settings['add_Fasttext']:
        feature_names = feature_names + ["fasttext_{}".format(i) for i in range(100)]
    if model_settings['add_LDA']:
        feature_names = feature_names + ["LDA"]

    # Figure height grows with the number of features so labels stay readable.
    fig = plt.figure(figsize=(8, len(feature_names)/4))
    fig.add_subplot(1,1,1)
    plt.barh(feature_names, importances)


In [0]:
def update_columns_to_include(path_to_output = folder_name + 'model_columns.csv'):
    """
    Output: None
    (writes a csv to path_to_output with one row per model column:
    "columns" holds the column name and "is_cat_feature" tells whether
    the column is in the categorical list; both lists are also printed)
    _____________________________________________________
    Parameters
    path_to_output: (string) where the column list is written
    """
    # Keyword columns appear in both lists below, in this order
    # (presumably text-derived indicator flags - confirm against the data prep).
    keyword_columns = [
        "strain", "laceration", "injury", "body", "sprain", "pain",
        "fracture", "contusion", "burn", "crush", "bruise", "abrasion",
        "tear", "cut", "wound", "bite", "anxiety", "broken", "puncture",
        "shock", "whiplash", "fall", "amputation",
    ]

    # Every column fed to the model, in model order.
    columns = [
        "Worker_Age",
        "Weekly_Rate_First13_Indexed",
        "Claim_Loss_Date_Num",
        "Worker_Gender",
        "Worker_Marital_Status",
        "Risk_State",
        "Full/Part_Time_Flag",
        "Policy_Duration",
        "Claim_Notified_Delay",
        "Liability_Delay",
    ] + keyword_columns + [
        "Injury_Mechanism_Grouped",
        "Body_Location_Grouped",
        "Industry",
        "Occupation_Name_Grouped",
        "Liability_Status_Grouped",
        "Weekly_Rate_Known",
        "Work_Resumed_Within_30days_Flag",
        "Duty_Status",
        "SEIFA",
        "Time_Since_Orig_Policy_Inception",
        "incurred_t30_indexed",
    ]

    # Subset of the columns flagged as categorical.
    cat_columns = [
        "Worker_Gender",
        "Worker_Marital_Status",
        "Risk_State",
        "Full/Part_Time_Flag",
        "Injury_Mechanism_Grouped",
        "Body_Location_Grouped",
        "Industry",
        "Occupation_Name_Grouped",
        "Liability_Status_Grouped",
        "Weekly_Rate_Known",
        "Work_Resumed_Within_30days_Flag",
        "Duty_Status",
    ] + keyword_columns + ["SEIFA"]

    categorical_lookup = set(cat_columns)
    is_cat_feature = [name in categorical_lookup for name in columns]

    print(cat_columns)
    print(is_cat_feature)

    pd.DataFrame(
        {'columns': columns, 'is_cat_feature': is_cat_feature}
    ).to_csv(path_to_output, index=False)
    return


# Test
#update_columns_to_include('/dbfs/mnt/augiprojects-20220225workerscomp/for sharing/Model_v14.08.2022/model_columns.csv')
#model_columns  = pd.read_csv('/dbfs/mnt/augiprojects-20220225workerscomp/for sharing/Model_v14.08.2022/model_columns.csv')["columns"].to_list()

