In [1]:
from settings import *

In [15]:
import copy
import json, os
import pickle
import re
import time
from collections import OrderedDict
from itertools import permutations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from xgboost import XGBClassifier

class Genetic_Algorithm:
    def __init__(self, df, options : dict, combi : list , caseid : str, activity : str, ts : str, label : dict, other_features : list, phase):
        self.df = df
        self.options = options
        self.combi = combi
        self.caseid = caseid
        self.activity = activity 
        self.ts = ts
        self.label = label
        self.other_features = other_features
        
             
    def feature_type(self):
        df = self.df
        feature_dict = {}
        for feature in self.other_features:
            for case, group in df.groupby(self.caseid):
                if len(group[feature].unique()) != 1:
                    feature_dict[feature] = 'event'
                    break
                else:
                    feature_dict[feature] = 'case'
        self.feature_types = feature_dict
        
    def add_label(self, df):       
        Label = []
        if 'Activity' in self.label:
            label_col = self.label['Activity']                
            for case, group in df.groupby(self.caseid):
                for i in range(len(group)):
                    if label_col in group[self.activity].tolist():
                        Label.append(1)
                    else:
                        Label.append(0)
            label_df = pd.DataFrame(Label, columns = ['Label'])
            df = pd.concat([df, label_df], axis=1)

        elif 'column' in self.label:
            label_col = self.label['column']
            df = df.rename(columns={label_col : 'Label'})
        self.df = df
        return df
                

    def rand_choice(self, options, key : str):
        if  type(options[key]) == tuple:
            return np.random.randint(options[key][0], options[key][1])

        elif type(options[key]) == list:
            return options[key][np.random.randint(0, len(options[key]))]
               
        
    def initial_populations(self, N : int, rand_state = 2022) -> dict:
        initial_pop = []
        if 'params' in self.combi:
            for model in list(self.options['params'].keys()):
                for n in range(N):
                    result = {}
                    result['bucketing'] = self.rand_choice(self.options, 'bucketing')
                    result['encoding'] = self.rand_choice(self.options, 'encoding')
                    result['drop_act'] = self.rand_choice(self.options, 'drop_act')
                    result[model] = {}
                    for hp in list(self.options['params'][model].keys()):                             
                        result[model][hp] = self.rand_choice(self.options['params'][model], hp)
                    initial_pop.append(result)
        else:
            for n in range(N):
                result = {}
                result['bucketing'] = rand_choice(self.options, 'bucketing')
                result['encoding'] = rand_choice(self.options, 'encoding')
                result['drop_act'] = rand_choice(self.options, 'drop_act')
                initial_pop.append(result)                
        
        self.population = initial_pop
        
        return initial_pop
    
    def select_population(self, population, fitness, N) -> list:
#         population = self.population
        sum_fit = sum(fitness)
        selection_probs = [fitness[c]/sum_fit for c in range(len(population))]

        return list(np.random.choice(population, int(N), p=selection_probs))

    
    # Roulette wheel selection
    def select_param(self, population, fitness, hp : bool) -> list: 
#         population = self.population
        sum_fit = sum(fitness)
        selection_probs = [fitness[c]/sum_fit for c in range(len(fitness))]

        if hp == False:
            return np.random.choice(population, 2, p=selection_probs)

        else:
            p1 = np.random.choice(population, p=selection_probs)
            hp_space = [c for c in range(len(population)) if list(p1.keys())[-1] in list(population[c].keys())]
            hp_sum_fit = sum([fitness[c] for c in hp_space])
            hp_selection_probs = [fitness[c]/hp_sum_fit for c in hp_space]
            p2 = population[np.random.choice(hp_space, p=hp_selection_probs)]
            return [p1, p2]
        
        
    # cp : crossover probability ~ (0,1) -> 0.9
    def crossover(self, population, fitness, num_offering : int, cp : float) -> dict:
#         population = self.population
        result = []
        n = int(num_offering*cp)
        hp_options = [True, False]
        for _ in range(n):
            hp_option = hp_options[np.random.randint(0, 2)]
            child = {}
            p1, p2 = self.select_param(population, fitness, hp=hp_option)
            co_point = np.random.randint(low=0, high=len(p1))
            for idx, key in enumerate(list(p2.keys())):
                if idx < co_point:
                    child[key] = p1[key]
                else:
                    child[key] = p2[key]
            result.append(child)
#         self.population = population.extend(result)
        
        return result
    

    # mp : mutation probability ~ (0,1) -> 0.03
    def mutation(self, population, N, mp : float):
#         population = self.population
        
        n = int(N*mp)

        result = []
        for _ in range(n):
            child = {}
            parent = population[np.random.choice(len(population))]
            params = [list(parent[key].keys())+[key] if key in list(self.options['params'].keys()) 
                      else key for key in list(parent.keys())]
            params.extend(params.pop())

            ml_model = params[-1]
            mut_param = params[np.random.randint(low=0, high=len(params)-2)]
            if mut_param in list(self.options.keys()):
                parent[mut_param] = self.rand_choice(self.options, mut_param)
            else:
                parent[ml_model][mut_param] = self.rand_choice(self.options['params'][ml_model], mut_param)
            result.append(parent)

        return result

    
    
    def fitness(self, tot_score, failure_rate, tot_time):
        highest_acc_pop = np.argmax(tot_score)

        # fitness = (acc + (1-failure_rate) + time_cost + acc_decrease)/4
        tot_fitness = [round((tot_score[i] + (1-failure_rate[i]) + (max(tot_time)-tot_time[i])/(max(tot_time)-min(tot_time)) 
                   + (tot_score[i]-min(tot_score))/(max(tot_score)-min(tot_score)))/4, 2) for i in range(len(tot_score))]
        
        return tot_fitness
    
        
    def indexbased_encoding(self, df, prefix):
        #df = self.df
        new_df = pd.DataFrame()
        for column in df.columns:
            if (column == 'Label') or (column == self.caseid) or ((column in self.feature_types) and (self.feature_types[column] == 'case')):
                case_df = df.groupby(self.caseid, as_index = False).first()[column]
                new_df = pd.concat([new_df, case_df], axis=1)
            elif (column == self.activity) or ((column in self.feature_types) and (self.feature_types[column] == 'event')):
                col_name = [str(column)+str(i+1) for i in range(prefix)]
                col_list = []
                for case, group in df.groupby(self.caseid):
                    group = group.sort_values(by=self.ts, ascending = True).reset_index(drop=True)
                    col_list.append(group[column].tolist())
                col_list = np.array(col_list)
                event_df = pd.DataFrame(col_list, columns = col_name)
                new_df = pd.concat([new_df, event_df], axis=1)

        self.df = new_df
        return self.one_hot_encoding(new_df)
        
    def aggregated_encoding(self, df):
        # df = self.df
        new_df = pd.DataFrame()
        for column in df.columns:       
            if (column == 'Label') or (column == self.caseid) or ((column in self.feature_types) and (self.feature_types[column] == 'case')):
                case_df = df.groupby(self.caseid, as_index = False).first()[column]
                new_df = pd.concat([new_df, case_df], axis=1)
            elif (column == self.activity) or ((column in self.feature_types) and (self.feature_types[column] == 'case')):
                result = []
                if df[column].dtype == int or df[column].dtype == float:
                    if (column in self.feature_types) and (self.feature_types[column] == 'case'):
                        col_name = column
                        case_df = df.groupby(self.caseid, as_index = False).first()
                        not_nan = [num for num in list(case_df[column]) if num != np.nan]
                        fir_point, sec_point = np.percentile(not_nan,[33,67])

                        for val in case_df[column].values:
                            if val < fir_point:
                                result.append('Low')
                            elif fir_point <= val < sec_point:
                                result.append('Medium')
                            elif val >= sec_point:
                                result.append('High')
                            else:
                                result.append('Nan')
                    else:
                        col_name = [str(column)+'-'+point for point in ['Low', 'Medium', 'High', 'Nan']]
                        not_nan = [num for num in list(df[column]) if num != np.nan]
                        fir_point, sec_point = np.percentile(not_nan,[33,67])
                        for case, group in df.groupby(self.caseid):
                            col_list = [0]*len(col_name)
                            for val in group[column].values:
                                if val < fir_point:
                                    col_list[0] += 1
                                elif fir_point <= val < sec_point:
                                    col_list[1] += 1
                                elif val >= sec_point:
                                    col_list[2] += 1
                                else:
                                    col_list[3] += 1
                            result.append(col_list)
                                    
                else:
                    col_name = df[column].unique()

                    for case, group in df.groupby(self.caseid):
                        group = group.sort_values(by=self.ts, ascending = True).reset_index(drop=True)
                        col_list = [0]*len(col_name)
                        key = list(group[column].value_counts().keys())
                        val = group[column].value_counts().values
                        for k in key:
                            col_list[key.index(k)] += val[key.index(k)]
                        result.append(col_list)

                result = np.array(result)
                event_df = pd.DataFrame(result, columns = col_name)
                new_df = pd.concat([new_df, event_df], axis=1)
        self.df = new_df
        return new_df

    
    def last_state_encoding(self, df, window):
        # df = self.df        
        event = {}
        caseid = []
        next_event = []
        for k in range(window):
            dict_index = 'event{}'.format(k+1)
            event[dict_index] = []
            for case, group in df.groupby(self.caseid):
                df1 = list(group[self.activity])
                L = len(df1) - window + k
                for j in range(k,L):
                    event[dict_index].append(df1[j])
                    if k == window -1:
                        caseid.append(case)
                        next_event.append(df1[j+1])
        
        df_1 = pd.DataFrame(caseid, columns = [self.caseid])
        df_2 = pd.DataFrame.from_dict(event)
        df_3 = pd.DataFrame(next_event, columns = ['Label'])
        new_df = pd.concat([df_1, df_2, df_3], axis=1)        
        self.df = new_df
        
        return new_df
    
    
    def encoding(self, df, key, prefix):
        if key == "index":
            return self.indexbased_encoding(df, prefix)
        
        elif key == "aggregate":
            return self.aggregated_encoding(df)
        
        else:
            return self.last_state_encoding(df, 3)
    
    def one_hot_encoding(self, df):
        #df = self.df
        for column in df.columns:
            if not np.issubdtype(df[column], np.number):
                one_hot = pd.get_dummies(df[column], prefix=column, prefix_sep='=')
                #print("Encoded column:{} - Different keys: {}".format(column, one_hot.shape[1]))
                df = df.drop(column, axis=1)
                df = df.join(one_hot)
        #print("Categorical columns encoded")
        self.df = df
        return df
     
    
    def train_test_set_split(self, df, encoding):
        """Split an encoded frame into train/test parts and post-process them.

        The frame is split 80/20 without shuffling and ``Label`` is used as
        the target.

        * ``encoding == 'last_state'``: the features are one-hot encoded
          with an encoder fitted on the training part only (unknown test
          categories are ignored).
        * otherwise: when the training set holds fewer than 0.33 label-1
          rows per label-0 row it is oversampled with SMOTE.

        Args:
            df: Encoded frame containing a ``Label`` column.
            encoding: Name of the encoding that produced ``df``.

        Returns:
            ``(X_train, y_train, X_test, y_test)``; the same four objects
            are stored on the instance under those names.
        """
        df_train, df_test = train_test_split(df, test_size=0.2, random_state=1, shuffle=False)
        X_train = df_train.drop('Label', axis=1)
        y_train = df_train['Label']
        X_test = df_test.drop('Label', axis=1)
        y_test = df_test['Label']

        if encoding == 'last_state':
            try:
                ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            except TypeError:
                # Older scikit-learn releases only know the ``sparse`` keyword.
                ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
            ohe.fit(X_train.values)
            X_train = ohe.transform(X_train.values)
            X_test = ohe.transform(X_test.values)

        else:
            positives = int((y_train == 1).sum())
            negatives = int((y_train == 0).sum())
            # Without label-0 rows there is nothing to balance against;
            # treat the ratio as infinite instead of dividing by zero.
            ratio = positives / negatives if negatives else float('inf')

            # For imbalanced data
            if ratio < 0.33:
                sm = SMOTE(random_state=0)
                X_train, y_train = sm.fit_resample(X_train, y_train)
                print('After OverSampling, the shape of train_X: {}'.format(X_train.shape))
                print('After OverSampling, the shape of train_y: {} \n'.format(y_train.shape))
                print("After OverSampling, counts of label '1': {}".format(int((y_train == 1).sum())))
                print("After OverSampling, counts of label '0': {}".format(int((y_train == 0).sum())))

        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

        return X_train, y_train, X_test, y_test
     
    def model_fit(self):
        """Fit four default classifiers and compare them on the held-out set.

        Uses ``self.X_train`` / ``self.y_train`` for fitting and
        ``self.X_test`` / ``self.y_test`` for scoring.  For every model the
        accuracy, precision, recall, F1 and ROC AUC are printed, collected
        in ``self.tot_score`` (list of lists) and ``self.score_df`` (one row
        per model), and shown as a bar chart.

        The previous loop iterated over the dict's keys and therefore called
        ``.fit`` on the model *name*; it now iterates over the estimators.
        """
        #['Logistic Regression','Decision Tree','Random Forest','LightGBM','Xgboost','CatBoost']
        models = {'Decision Tree' : DecisionTreeClassifier(), 'Random Forest' : RandomForestClassifier(), 'LightGBM' : LGBMClassifier(), 'Xgboost' : XGBClassifier()}
        tot_score = []
        for name, model in models.items():
            model.fit(self.X_train, self.y_train)

            # Predict once and reuse for all label-based metrics.
            predicted = model.predict(self.X_test)
            positive_proba = model.predict_proba(self.X_test)[:, 1]
            score = [
                accuracy_score(self.y_test, predicted),
                precision_score(self.y_test, predicted),
                recall_score(self.y_test, predicted),
                f1_score(self.y_test, predicted),
                roc_auc_score(self.y_test, positive_proba),
            ]

            print(f'Model is {name} \nAccuracy: {score[0]:.4f}, Precision: {score[1]:.4f}, Recall: {score[2]:.4f}, F1: {score[3]:.4f}, AUC: {score[4]:.4f}')
            tot_score.append(score)

        self.tot_score = tot_score

        # plot score df
        score_df = pd.DataFrame(tot_score, index = list(models), columns = ['Accuracy', 'Precision', 'Recall', 'F1 score', 'AUC'])
        score_df.plot(kind="bar",figsize=(9,8))
        plt.xticks(rotation='horizontal')
        plt.show()

        self.score_df = score_df
        return
    
    def decision_tree(self, hp, X_train, y_train, X_test, y_test):
        """Train a decision tree with the hyperparameters in ``hp`` and score it.

        ``hp`` must provide ``'max_depth'``, ``'min_samples_leaf'`` and
        ``'criterion'``.

        Returns:
            Mean of test accuracy and ROC AUC (computed from the
            probability in column 1), rounded to four decimals.
        """
        classifier = DecisionTreeClassifier(
            max_depth=hp['max_depth'],
            min_samples_leaf=hp['min_samples_leaf'],
            criterion=hp['criterion'],
        )
        classifier.fit(X_train, y_train)

        accuracy = accuracy_score(y_test, classifier.predict(X_test))
        auc = roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1])
        return round((accuracy + auc) / 2, 4)
    
    def random_forest(self, hp, X_train, y_train, X_test, y_test):
        """Train a random forest with the hyperparameters in ``hp`` and score it.

        ``hp`` must provide ``'n_estimators'``, ``'max_depth'``,
        ``'max_features'``, ``'bootstrap'`` and ``'criterion'``.

        Returns:
            Mean of test accuracy and ROC AUC (computed from the
            probability in column 1), rounded to four decimals.
        """
        classifier = RandomForestClassifier(
            n_estimators=hp['n_estimators'],
            max_depth=hp["max_depth"],
            max_features=hp["max_features"],
            bootstrap=hp["bootstrap"],
            criterion=hp["criterion"],
        )
        classifier.fit(X_train, y_train)

        accuracy = accuracy_score(y_test, classifier.predict(X_test))
        auc = roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1])
        return round((accuracy + auc) / 2, 4)
    
    def light_gbm(self, hp, X_train, y_train, X_test, y_test):
        """Train a LightGBM classifier with the hyperparameters in ``hp`` and score it.

        ``hp`` must provide ``'max_depth'``, ``'num_leaves'`` and
        ``'min_child_samples'``.

        Returns:
            Mean of test accuracy and ROC AUC (computed from the
            probability in column 1), rounded to four decimals.
        """
        classifier = LGBMClassifier(
            max_depth=hp["max_depth"],
            num_leaves=hp["num_leaves"],
            min_child_samples=hp["min_child_samples"],
        )
        classifier.fit(X_train, y_train)

        accuracy = accuracy_score(y_test, classifier.predict(X_test))
        auc = roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1])
        return round((accuracy + auc) / 2, 4)
    
    def xgboost(self, hp, X_train, y_train, X_test, y_test):
        """Train an XGBoost classifier with the hyperparameters in ``hp`` and score it.

        ``hp`` must provide ``'max_depth'``, ``'n_estimators'`` and
        ``'learning_rate'``.

        Returns:
            Mean of test accuracy and ROC AUC (computed from the
            probability in column 1), rounded to four decimals.
        """
        classifier = XGBClassifier(
            max_depth=hp["max_depth"],
            n_estimators=hp["n_estimators"],
            learning_rate=hp["learning_rate"],
        )
        classifier.fit(X_train, y_train)

        accuracy = accuracy_score(y_test, classifier.predict(X_test))
        auc = roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1])
        return round((accuracy + auc) / 2, 4)
        
    def select_best_model(self):
        score_df = self.score_df
        arg_index = np.argmax(score_df['Accuracy'] + score_df['AUC'])
        print(f'Best model is {score_df.index[arg_index]} \nAccuracy: {score_df.iloc[arg_index][0]:.4f}, Precision: {score_df.iloc[arg_index][1]:.4f}, Recall: {score_df.iloc[arg_index][2]:.4f}, F1: {score_df.iloc[arg_index][3]:.4f}, AUC: {score_df.iloc[arg_index][4]:.4f}')
    
    
    def main(self):
        """Evaluate one freshly generated population bucket by bucket and print the scores.

        NOTE(review): this looks like an early prototype of the experiment
        loops further down the notebook and cannot run as written:
          * ``whole_bucket`` and ``bucketing`` are not defined in the visible
            part of this file — presumably they come from ``settings``;
            confirm.
          * ``whole_bucket(df)`` reads a module-level ``df`` rather than
            ``self.df``.
          * ``self.encoding`` is called with two arguments although it
            requires ``(df, key, prefix)``.
        """
        
        populations = self.initial_populations(20)
        tot_score = []
        result_df, prefix_idx = whole_bucket(df)
        for pop in populations:
            #df1 = drop_activity(df, pop['drop_act'])
            bucket_list = bucketing(result_df, prefix_idx, pop['bucketing'])
            for bucket in bucket_list:
                score = []
                # Each bucket is a (start, stop) pair used to slice the rows of result_df.
                df1 = result_df[bucket[0]:bucket[1]]
                # NOTE(review): missing the ``prefix`` argument -> TypeError at run time.
                df1 = self.encoding(df1, pop['encoding'])
                X_train, y_train, X_test, y_test = self.train_test_set_split(df1, pop['encoding'])
                # The model gene present in the individual decides which trainer runs;
                # anything else falls through to XGBoost.
                if 'Decision Tree' in pop:
                    score.append(self.decision_tree(pop['Decision Tree'], X_train, y_train, X_test, y_test))
                elif 'Random Forest' in pop:
                    score.append(self.random_forest(pop['Random Forest'], X_train, y_train, X_test, y_test))
                elif 'LightGBM' in pop:
                    score.append(self.light_gbm(pop['LightGBM'], X_train, y_train, X_test, y_test))
                else: 
                    score.append(self.xgboost(pop['Xgboost'], X_train, y_train, X_test, y_test))
                # One single-element list is appended per bucket.
                tot_score.append(score)
        print(tot_score)        
        # NOTE(review): the result of this second call is never used.
        new_df, prefix_idx = whole_bucket(df)


In [20]:
# Raw string: the backslashes in this path are literal characters. Without the
# r-prefix, "\G", "\d" and "\B" are invalid escape sequences, which newer Python
# versions warn about. The resulting path is unchanged.
df = pd.read_csv(r'/Users/yeon1\GA_based_AutoML\dataset\dataset\BPIC17.csv')

In [21]:
# These pickle files are created in 'drop_act+prefix_extraction.ipynb'.
# df_list1 .. df_list4 are the lists the experiment loops pick from according to
# the 'drop_act' gene (2, 4, 6 and everything else, respectively).
# NOTE: pickle.load can execute code embedded in the file; only load files you
# produced yourself.

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as f:
        return pickle.load(f)


df_list1 = _load_pickle('./df_list1.pkl')
df_list2 = _load_pickle('./df_list2.pkl')
df_list3 = _load_pickle('./df_list3.pkl')
df_list4 = _load_pickle('./df_list4.pkl')

FileNotFoundError: [Errno 2] No such file or directory: './df_list1.pkl'

# Experiment 1

In [6]:
# Genetic-algorithm settings used by the experiment loops below.
sr = 0.1    # selection rate: share of the population carried over by selection
cr = 0.9    # crossover rate: share of the population produced by crossover
mr = 0.01   # mutation rate: share of the population produced by mutation

max_iter = 20   # upper bound on the number of generations

In [15]:
# Search strategy per pipeline component (GA = Genetic Algorithm, RS = Random Search):
#   phase 1: drop_activity(GA) + encoding(GA) + bucketing(GA) + ML params(GA) + hyperparameter opt(GA)
#   phase 2: drop_activity(GA) + encoding(GA) + bucketing(GA) + ML params(GA) + hyperparameter opt(RS)
#   phase 3: drop_activity(GA) + encoding(GA) + bucketing(GA) + ML params(RS) + hyperparameter opt(RS)
#   phase 4: drop_activity(RS) + encoding(RS) + bucketing(RS) + ML params(RS) + hyperparameter opt(RS)
phase = 1

GA = Genetic_Algorithm(
    df=df,
    options=options,
    combi=combi,
    caseid=caseid,
    activity=activity,
    ts=ts,
    label=label,
    other_features=other_features,
    phase=phase,
)
# Tag each extra feature as case- or event-level before any encoding is run.
GA.feature_type()

In [18]:
# Experiment 1: preprocessing genes and model hyperparameters are all evolved
# by the GA.  The rates sr / cr / mr and max_iter come from the settings cell
# (a sweep over several rate values was tried here at some point and is no
# longer active).
cnt = 0          # number of generations whose mean fitness got worse
best_pop = []    # fittest individual of every generation

populations = GA.initial_populations(5)

generation_fitness = []
generation_score = []
generation_time = []
generation_failure_rate = []

for n_iter in range(max_iter):
    new_population, tot_score, tot_time, failure_rate = [], [], [], []

    for pop in tqdm(populations):
        start_time = time.time()
        new_population.append(pop)
        print(pop)

        # The drop_act gene selects which pre-extracted list is used.
        df_list = {2: df_list1, 4: df_list2, 6: df_list3}.get(pop['drop_act'], df_list4)

        if pop['encoding'] == 'last_state':
            # last_state runs with a bucketing value of at least 5.
            pop['bucketing'] = max(pop['bucketing'], 5)
            bound_list = prefix_bound(pop['bucketing'], drop_low=True)
        else:
            bound_list = prefix_bound(pop['bucketing'])

        score = []
        for bounds in bound_list:
            lower, upper = bounds
            print('bucket : ',(lower, upper))

            # Encode every prefix length of the bucket and stack the results.
            merge_df = pd.DataFrame()
            for idx in range(lower, upper):
                prefix_df = GA.encoding(df_list[idx-2], pop['encoding'], idx)
                merge_df = pd.concat([merge_df, prefix_df], sort=False)
            merge_df = merge_df.fillna(0)

            X_train, y_train, X_test, y_test = GA.train_test_set_split(merge_df, pop['encoding'])

            # The model gene present in the individual decides the trainer;
            # XGBoost is the fallback.
            for gene, evaluate in (('Decision Tree', GA.decision_tree),
                                   ('Random Forest', GA.random_forest),
                                   ('LightGBM', GA.light_gbm)):
                if gene in pop:
                    break
            else:
                gene, evaluate = 'Xgboost', GA.xgboost
            score.append(evaluate(pop[gene], X_train, y_train, X_test, y_test))

        end_time = time.time()
        elapsed_time = round(end_time-start_time,2)

        tot_time.append(elapsed_time)
        tot_score.append(round(sum(score)/len(score),2))

        # Share of buckets whose score stayed below the acceptance threshold.
        min_proba = 0.7
        failure_rate.append(sum(1 for bucket_score in score if bucket_score < min_proba)/len(score))

    fitness = GA.fitness(tot_score, failure_rate, tot_time)
    best_pop.append(populations[np.argmax(fitness)])

    # Next generation = selected survivors + crossover children + mutants.
    N = len(new_population)
    pop1 = GA.select_population(new_population, fitness, N*sr)
    pop2 = GA.crossover(new_population, fitness, N, cr)
    pop3 = GA.mutation(new_population, N, mr)
    populations = pop1 + pop2 + pop3

    generation_fitness.append(round(sum(fitness)/len(fitness),4))
    generation_time.append(sum(tot_time))
    generation_score.append(round(sum(tot_score)/len(tot_score),4))
    generation_failure_rate.append(round(sum(failure_rate)/len(failure_rate),4))

    # Stop after more than five worsening generations, or as soon as the
    # mean fitness changes by less than 0.001.
    if cnt > 5:
        break
    if len(generation_fitness) > 1:
        fitness_change = generation_fitness[-1]-generation_fitness[-2]
        if abs(fitness_change) < 0.001:
            break
        if fitness_change < 0:
            cnt += 1


  0%|          | 0/20 [00:00<?, ?it/s]

{'bucketing': 1, 'encoding': 'index', 'drop_act': 4, 'Decision Tree': {'max_depth': 6, 'min_samples_leaf': 53, 'criterion': 'gini'}}





NameError: name 'prefix_bound' is not defined

In [None]:
# print(generation_fitness)
# print(sum(generation_time))
# print(generation_score)
# print(generation_failure_rate)
# print(best_pop[-1])
# print(tot_score[np.argmax(fitness)])

# Experiment 2

### - Genetic Algorithm

In [None]:
# Search strategy per pipeline component (GA = Genetic Algorithm, RS = Random Search):
#   phase 1: drop_activity(GA) + encoding(GA) + bucketing(GA) + ML params(GA) + hyperparameter opt(GA)
#   phase 2: drop_activity(GA) + encoding(GA) + bucketing(GA) + ML params(GA) + hyperparameter opt(RS)
#   phase 3: drop_activity(GA) + encoding(GA) + bucketing(GA) + ML params(RS) + hyperparameter opt(RS)
#   phase 4: drop_activity(RS) + encoding(RS) + bucketing(RS) + ML params(RS) + hyperparameter opt(RS)
phase = 2

GA = Genetic_Algorithm(
    df=df,
    options=options,
    combi=combi,
    caseid=caseid,
    activity=activity,
    ts=ts,
    label=label,
    other_features=other_features,
    phase=phase,
)
# Tag each extra feature as case- or event-level before any encoding is run.
GA.feature_type()


In [None]:
# Experiment 2, GA part: the GA evolves the preprocessing genes and the model
# choice (pop['models']); every model is trained with its default
# hyperparameters.
cnt = 0          # number of generations whose mean fitness got worse
best_pop = []    # fittest individual of every generation

populations = GA.initial_populations(5)

generation_fitness = []
generation_score = []
generation_time = []
generation_failure_rate = []

for n_iter in range(max_iter):
    new_population, tot_score, tot_time, failure_rate = [], [], [], []

    for pop in tqdm(populations):
        start_time = time.time()
        new_population.append(pop)
        print(pop)

        # The drop_act gene selects which pre-extracted list is used.
        df_list = {2: df_list1, 4: df_list2, 6: df_list3}.get(pop['drop_act'], df_list4)

        if pop['encoding'] == 'last_state':
            # last_state runs with a bucketing value of at least 5.
            pop['bucketing'] = max(pop['bucketing'], 5)
            bound_list = prefix_bound(pop['bucketing'], drop_low=True)
        else:
            bound_list = prefix_bound(pop['bucketing'])

        score = []
        for bounds in bound_list:
            lower, upper = bounds
            print('bucket : ',(lower, upper))

            # Encode every prefix length of the bucket and stack the results.
            merge_df = pd.DataFrame()
            for idx in range(lower, upper):
                prefix_df = GA.encoding(df_list[idx-2], pop['encoding'], idx)
                merge_df = pd.concat([merge_df, prefix_df], sort=False)
            merge_df = merge_df.fillna(0)

            X_train, y_train, X_test, y_test = GA.train_test_set_split(merge_df, pop['encoding'])

            # Default-configured estimator for the chosen model; XGBoost is
            # the fallback for any other name.
            estimator_cls = {
                'Decision Tree': DecisionTreeClassifier,
                'Random Forest': RandomForestClassifier,
                'LightGBM': LGBMClassifier,
            }.get(pop['models'], XGBClassifier)
            model = estimator_cls()
            model.fit(X_train, y_train)

            accuracy = accuracy_score(y_test, model.predict(X_test))
            auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
            score.append(round((accuracy+auc)/2, 4))

        end_time = time.time()
        elapsed_time = round(end_time-start_time,4)

        tot_time.append(elapsed_time)
        tot_score.append(round(sum(score)/len(score),4))

        # Share of buckets whose score stayed below the acceptance threshold.
        min_proba = 0.7
        failure_rate.append(sum(1 for bucket_score in score if bucket_score < min_proba)/len(score))

    fitness = GA.fitness(tot_score, failure_rate, tot_time)
    best_pop.append(populations[np.argmax(fitness)])

    # Next generation = selected survivors + crossover children + mutants.
    N = len(new_population)
    pop1 = GA.select_population(new_population, fitness, N*sr)
    pop2 = GA.crossover(new_population, fitness, N, cr)
    pop3 = GA.mutation(new_population, N, mr)
    populations = pop1 + pop2 + pop3

    generation_fitness.append(round(sum(fitness)/len(fitness),4))
    generation_time.append(sum(tot_time))
    generation_score.append(round(sum(tot_score)/len(tot_score),4))
    generation_failure_rate.append(round(sum(failure_rate)/len(failure_rate),4))

    # Stop after more than five worsening generations, or as soon as the
    # mean fitness changes by less than 0.001.
    if cnt > 5:
        break
    if len(generation_fitness) > 1:
        fitness_change = generation_fitness[-1]-generation_fitness[-2]
        if abs(fitness_change) < 0.001:
            break
        if fitness_change < 0:
            cnt += 1
            

### - Random Search

In [None]:
# Experiment 2, random-search part: tune the hyperparameters of the model
# chosen by the GA (best_pop[-1]) separately for every prefix bucket.
#
# Changes over the previous version of this cell:
#   * the estimator that is tuned and refitted is the one named by
#     pop['models']; it used to be RandomForestClassifier regardless of the
#     model, although the search space belongs to pop['models'];
#   * tuple entries of the search space are expanded to the integer range they
#     stand for in Genetic_Algorithm.rand_choice, i.e. (low, high) -> low..high-1;
#   * 'last_state' individuals use prefix_bound(..., drop_low=True), as in the
#     GA loop that produced them;
#   * RandomizedSearchCV is imported at the top of the notebook.
model = {'Decision Tree' : DecisionTreeClassifier(), 'Random Forest' : RandomForestClassifier(), 'LightGBM' : LGBMClassifier(), 'Xgboost' : XGBClassifier()}

start_time = time.time()

pop = best_pop[-1]

best_params = []

# Resolve the estimator class now: the name ``model`` is rebound to the fitted
# estimator inside the loop.
estimator_cls = type(model[pop['models']])

df_list = {2: df_list1, 4: df_list2, 6: df_list3}.get(pop['drop_act'], df_list4)

if pop['encoding'] == 'last_state':
    bound_list = prefix_bound(pop['bucketing'], drop_low=True)
else:
    bound_list = prefix_bound(pop['bucketing'])

# The search space does not depend on the bucket, so build it once.
space = {
    hp: list(range(choices[0], choices[1])) if isinstance(choices, tuple) else choices
    for hp, choices in options['params'][pop['models']].items()
}

score = []
for bounds in bound_list:
    lower, upper = bounds
    merge_df = pd.DataFrame()
    print('bucket : ',(lower, upper))
    for idx in range(lower, upper):
        prefix_df = df_list[idx-2]
        prefix_df = GA.encoding(prefix_df, pop['encoding'], idx)
        merge_df = pd.concat([merge_df, prefix_df], sort=False)

    merge_df = merge_df.fillna(0)

    X_train, y_train, X_test, y_test = GA.train_test_set_split(merge_df, pop['encoding'])

    search = RandomizedSearchCV(estimator_cls(), space, n_iter=20, scoring='roc_auc', n_jobs=-1, random_state=1)
    result = search.fit(X_train, y_train)
    random_search_params = result.best_params_
    best_params.append(random_search_params)

    # print('Best Score: %s' % result.best_score_)
    print('Best Hyperparameters: %s' % result.best_params_)

    # Refit with the best setting and score on the held-out part.
    model = estimator_cls(**random_search_params, random_state = 42)
    model.fit(X_train, y_train)
    score.append(round((accuracy_score(y_test, model.predict(X_test))+roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))/2, 4))


end_time = time.time()
elapsed_time = round(end_time-start_time,4)

# print(elapsed_time)
# print(score)

# Experiment 3
### - Genetic Algorithm

In [None]:
# Search strategy per pipeline component (GA = Genetic Algorithm, RS = Random Search):
#   phase 1: drop_activity(GA) + encoding(GA) + bucketing(GA) + ML params(GA) + hyperparameter opt(GA)
#   phase 2: drop_activity(GA) + encoding(GA) + bucketing(GA) + ML params(GA) + hyperparameter opt(RS)
#   phase 3: drop_activity(GA) + encoding(GA) + bucketing(GA) + ML params(RS) + hyperparameter opt(RS)
#   phase 4: drop_activity(RS) + encoding(RS) + bucketing(RS) + ML params(RS) + hyperparameter opt(RS)
phase = 3

GA = Genetic_Algorithm(
    df=df,
    options=options,
    combi=combi,
    caseid=caseid,
    activity=activity,
    ts=ts,
    label=label,
    other_features=other_features,
    phase=phase,
)
# Tag each extra feature as case- or event-level before any encoding is run.
GA.feature_type()


In [None]:
# Genetic search: evaluate every individual of each generation, then build the
# next generation from selection, crossover and mutation.
populations = GA.initial_populations(20)

# Per-generation history.
best_pop = []
generation_fitness = []
generation_score = []
generation_time = []
generation_failure_rate = []

# A bucket counts as a failure when its score is below this threshold.
min_proba = 0.7

for n_iter in range(max_iter):
    new_population, tot_score, tot_time, failure_rate = [], [], [], []

    for pop in tqdm(populations):
        new_population.append(pop)
        start_time = time.time()
        print(pop)

        # Prefix-log list that matches the 'drop_act' value; anything else uses df_list4.
        df_list = {2: df_list1, 4: df_list2, 6: df_list3}.get(pop['drop_act'], df_list4)

        # 'last_state' encoding asks prefix_bound for drop_low=True.
        bound_list = (
            prefix_bound(pop['bucketing'], drop_low=True)
            if pop['encoding'] == 'last_state'
            else prefix_bound(pop['bucketing'])
        )

        score = []
        for lower, upper in bound_list:
            print('bucket : ',(lower, upper))

            # Stack the encoded prefixes that belong to this bucket.
            merge_df = pd.DataFrame()
            for idx in range(lower, upper):
                prefix_df = GA.encoding(df_list[idx-2], pop['encoding'], idx)
                merge_df = pd.concat([merge_df, prefix_df], sort=False)
            merge_df = merge_df.fillna(0)

            X_train, y_train, X_test, y_test = GA.train_test_set_split(merge_df, pop['encoding'])

            # Every individual is scored with a default XGBoost classifier.
            model = XGBClassifier()
            model.fit(X_train, y_train)
            acc = accuracy_score(y_test, model.predict(X_test))
            auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
            # Bucket score = mean of accuracy and ROC AUC, rounded to 4 decimals.
            score.append(round((acc + auc)/2, 4))

        end_time = time.time()
        elapsed_time = round(end_time-start_time,2)

        tot_time.append(elapsed_time)
        tot_score.append(round(sum(score)/len(score),2))
        # Share of buckets whose score fell below the threshold.
        failure_rate.append(sum(1 for s in score if s < min_proba)/len(score))

    fitness = GA.fitness(tot_score, failure_rate, tot_time)
    best_pop.append(populations[np.argmax(fitness)])

    # Next generation = selected + crossed-over + mutated individuals.
    N = len(new_population)
    pop1 = GA.select_population(new_population, fitness, N*sr)
    pop2 = GA.crossover(new_population, fitness, N, cr)
    pop3 = GA.mutation(new_population, N, mr)
    populations = pop1 + pop2 + pop3

    # Generation-level summaries (means rounded to 2 decimals, time summed).
    generation_fitness.append(round(sum(fitness)/len(fitness),2))
    generation_time.append(sum(tot_time))
    generation_score.append(round(sum(tot_score)/len(tot_score),2))
    generation_failure_rate.append(round(sum(failure_rate)/len(failure_rate),2))

### - Random Search

In [None]:
# random search
# Tune all four candidate models per bucket for the best GA individual.
models = {'Decision Tree' : DecisionTreeClassifier(), 'Random Forest' : RandomForestClassifier(), 'LightGBM' : LGBMClassifier(), 'Xgboost' : XGBClassifier()}
# Short labels used as keys of `score` and `best_param`.
short_name = {'Decision Tree' : 'DT', 'Random Forest' : 'RF', 'LightGBM' : 'LGBM', 'Xgboost' : 'XGB'}

start_time = time.time()

# Individual recorded for the most recent generation.
pop = best_pop[-1]

# Choose the prefix-log list that matches the individual's 'drop_act' setting.
if pop['drop_act'] == 2:
    df_list = df_list1
elif pop['drop_act'] == 4:
    df_list = df_list2
elif pop['drop_act'] == 6:
    df_list = df_list3
else:
    df_list = df_list4

# NOTE(review): the GA loop calls prefix_bound(..., drop_low=True) when the
# encoding is 'last_state'; confirm whether the same buckets should be used here.
bound_list = prefix_bound(pop['bucketing'])

score = {'DT':[], 'RF':[], 'LGBM':[], 'XGB':[]}       # per-model score of every bucket
best_param = {'DT':[], 'RF':[], 'LGBM':[], 'XGB':[]}  # per-model tuned parameters of every bucket
avg_score = []                                        # per-bucket mean over the models

for lower, upper in bound_list:
    merge_df = pd.DataFrame()
    print('bucket : ',(lower, upper))
    for idx in range(lower, upper):
        # Read from the list selected above; this used to be hard-coded to
        # df_list1, which ignored the individual's 'drop_act' choice.
        prefix_df = df_list[idx-2]
        prefix_df = GA.encoding(prefix_df, pop['encoding'], idx)
        merge_df = pd.concat([merge_df, prefix_df], sort=False)

    # Columns missing from some prefixes become NaN after concat; fill them with 0.
    merge_df = merge_df.fillna(0)

    X_train, y_train, X_test, y_test = GA.train_test_set_split(merge_df, pop['encoding']) 
    avg = 0
    # One pass per model replaces four copy-pasted branches that differed only
    # in the model name, the short label and the estimator class.
    for model_name, estimator in models.items():
        space = options['params'][model_name]
        search = RandomizedSearchCV(estimator, space, n_iter=20, scoring='roc_auc', n_jobs=-1, random_state=1)
        result = search.fit(X_train, y_train)
        random_search_params = result.best_params_
        best_param[short_name[model_name]].append(random_search_params)

        # Refit a fresh estimator of the same type with the tuned parameters.
        model = type(estimator)(**random_search_params, random_state = 42)
        model.fit(X_train, y_train)
        # Score = mean of accuracy and ROC AUC on the test split, rounded to 4 decimals.
        sc = round((accuracy_score(y_test, model.predict(X_test))+roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))/2, 4)
        score[short_name[model_name]].append(sc)
        avg += sc

    avg_score.append(round(avg/len(models), 4))

end_time = time.time()
elapsed_time = round(end_time-start_time,4)

print(elapsed_time)
print(score)
# `score` is a dict of lists, which np.mean cannot average (it raises
# TypeError); report the mean of the per-bucket averages instead.
print(np.mean(avg_score))

# Experiment 4
### - Random search

In [None]:
"""
GA : Genetic Algorithm
RS : Random Search
"""
# Which parts of the pipeline are tuned by GA and which by RS, per phase:
#   phase 1 : drop_activity(GA) + encoding(GA) + bucketing(GA) + ML params(GA) + hyperparameter opt(GA)
#   phase 2 : drop_activity(GA) + encoding(GA) + bucketing(GA) + ML params(GA) + hyperparameter opt(RS)
#   phase 3 : drop_activity(GA) + encoding(GA) + bucketing(GA) + ML params(RS) + hyperparameter opt(RS)
#   phase 4 : drop_activity(RS) + encoding(RS) + bucketing(RS) + ML params(RS) + hyperparameter opt(RS)
phase = 4

# Build the search object for this phase, then classify every extra feature
# as case-level or event-level.
GA = Genetic_Algorithm(
    df=df,
    options=options,
    combi=combi,
    caseid=caseid,
    activity=activity,
    ts=ts,
    label=label,
    other_features=other_features,
    phase=phase,
)
GA.feature_type()

In [None]:
# Pure random search: evaluate 100 randomly generated individuals and keep the best one.
score = []       # every bucket-level score, across all individuals
pop_score = []   # one mean score per individual, aligned with `populations`

start_time = time.time()

populations = GA.initial_populations(100)

for pop in tqdm(populations):
    print(pop)
    # Choose the prefix-log list that matches the individual's 'drop_act' setting.
    if pop['drop_act'] == 2:
        df_list = df_list1
    elif pop['drop_act'] == 4:
        df_list = df_list2
    elif pop['drop_act'] == 6:
        df_list = df_list3
    else:
        df_list = df_list4

    # NOTE(review): the GA loop calls prefix_bound(..., drop_low=True) when the
    # encoding is 'last_state'; confirm whether the same buckets should be used here.
    bound_list = prefix_bound(pop['bucketing'])

    bucket_scores = []
    for lower, upper in bound_list:
        merge_df = pd.DataFrame()
        print('bucket : ',(lower, upper))
        for idx in range(lower, upper):
            prefix_df = df_list[idx-2]
            prefix_df = GA.encoding(prefix_df, pop['encoding'], idx)
            merge_df = pd.concat([merge_df, prefix_df], sort=False)

        # Columns missing from some prefixes become NaN after concat; fill them with 0.
        merge_df = merge_df.fillna(0)

        X_train, y_train, X_test, y_test = GA.train_test_set_split(merge_df, pop['encoding'])

        #start = time.time()
        # The individual carries its model as a key whose value is that model's parameters.
        if 'Decision Tree' in pop:
            sc = GA.decision_tree(pop['Decision Tree'], X_train, y_train, X_test, y_test)
        elif 'Random Forest' in pop:
            sc = GA.random_forest(pop['Random Forest'], X_train, y_train, X_test, y_test)
        elif 'LightGBM' in pop:
            sc = GA.light_gbm(pop['LightGBM'], X_train, y_train, X_test, y_test)
        else: 
            sc = GA.xgboost(pop['Xgboost'], X_train, y_train, X_test, y_test)
        bucket_scores.append(sc)

    score.extend(bucket_scores)
    # An individual that produced no bucket can never be selected as the best.
    pop_score.append(sum(bucket_scores)/len(bucket_scores) if bucket_scores else float('-inf'))

end_time = time.time()
tot_time = round(end_time-start_time,4)
tot_score = (round(sum(score)/len(score),4))

# `score` holds one entry per bucket, so its argmax is not an index into
# `populations`; pick the best individual from the per-individual means.
best_idx = int(np.argmax(pop_score))
best_pop = populations[best_idx]
best_score = round(pop_score[best_idx], 4)

print('best instance is ', best_pop)        
print('best score is ', best_score)        
print('total time is ', tot_time)