## Generate Data

### generate_customer_profiles_table

In [1]:
def generate_customer_profiles_table(n_customers, random_state=42):
    """Build a table of randomly generated customer profiles.

    Parameters
    ----------
    n_customers : int
        Number of profiles (rows) to generate; IDs run from 0 to
        n_customers - 1.
    random_state : int
        Seed passed to ``np.random.seed`` so the table is reproducible.

    Returns
    -------
    pandas.DataFrame
        One row per customer with columns ``customer_id``,
        ``x_customer_id``, ``y_customer_id``, ``mean_amount``,
        ``std_amount`` and ``mean_nb_tx_per_day``.
    """
    # Note: this reseeds numpy's *global* generator.
    np.random.seed(random_state)

    column_names = [
        'customer_id',
        'x_customer_id', 'y_customer_id',
        'mean_amount', 'std_amount',
        'mean_nb_tx_per_day',
    ]

    def draw_profile(customer_id):
        # The draw order (x, y, mean amount, daily rate) is part of the
        # contract: changing it would change the seeded output.
        x_coord = np.random.uniform(0, 100)
        y_coord = np.random.uniform(0, 100)
        mean_amount = np.random.uniform(5, 100)  # arbitrary but sensible range
        daily_rate = np.random.uniform(0, 4)     # arbitrary but sensible range
        # The standard deviation is fixed at half of the mean amount.
        return [customer_id,
                x_coord, y_coord,
                mean_amount, mean_amount / 2,
                daily_rate]

    profiles = [draw_profile(customer_id) for customer_id in range(n_customers)]

    return pd.DataFrame(profiles, columns=column_names)

### generate_terminal_profiles_table

In [2]:
def generate_terminal_profiles_table(n_terminals, random_state=42):
    """Build a table of randomly located payment terminals.

    Parameters
    ----------
    n_terminals : int
        Number of terminals (rows) to generate; IDs run from 0 to
        n_terminals - 1.
    random_state : int
        Seed passed to ``np.random.seed`` so the table is reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``terminal_id``, ``x_terminal_id`` and ``y_terminal_id``,
        with both coordinates drawn uniformly from [0, 100).
    """
    # Note: this reseeds numpy's *global* generator.
    np.random.seed(random_state)

    def draw_location(terminal_id):
        # x is drawn before y; the order matters for seeded reproducibility.
        x_coord = np.random.uniform(0, 100)
        y_coord = np.random.uniform(0, 100)
        return [terminal_id, x_coord, y_coord]

    locations = [draw_location(terminal_id) for terminal_id in range(n_terminals)]

    return pd.DataFrame(
        locations,
        columns=['terminal_id', 'x_terminal_id', 'y_terminal_id'],
    )

### get_list_terminals_within_radius

In [3]:
def get_list_terminals_within_radius(customer_profile, x_y_terminals, r):
    """Return the positions of all terminals closer than ``r`` to a customer.

    Parameters
    ----------
    customer_profile : mapping/Series
        Must provide ``x_customer_id`` and ``y_customer_id``.
    x_y_terminals : numpy.ndarray
        Terminal coordinates, one (x, y) row per terminal.
    r : float
        Radius; a terminal is kept only when its Euclidean distance to the
        customer is strictly less than ``r``.

    Returns
    -------
    list
        Row positions in ``x_y_terminals`` of the terminals within reach.
    """
    # Work on plain numpy arrays to keep the distance computation fast.
    customer_xy = (customer_profile[['x_customer_id', 'y_customer_id']]
                   .values
                   .astype(float))

    # Per-terminal coordinate offsets, then squared Euclidean distance
    # (sum of squared offsets along each row) and its square root.
    offsets = customer_xy - x_y_terminals
    distances = np.sqrt(np.square(offsets).sum(axis=1))

    # Positions of the terminals strictly inside the radius.
    return list(np.flatnonzero(distances < r))
    

### generate_transactions_table

In [4]:
def generate_transactions_table(customer_profile, nb_days=10, start_date=None):
    """Simulate the transactions of one customer over ``nb_days`` days.

    Parameters
    ----------
    customer_profile : pandas.Series
        One customer row providing ``customer_id``, ``mean_amount``,
        ``std_amount``, ``mean_nb_tx_per_day`` and ``available_terminals``.
    nb_days : int
        Number of consecutive days to simulate, numbered from 0.
    start_date : str or datetime-like, optional
        Origin used to turn ``tx_time_seconds`` into ``tx_datetime``.
        When omitted, the module-level ``start_date`` variable is used,
        which is what this function always relied on before the parameter
        existed.

    Returns
    -------
    pandas.DataFrame
        Columns ``tx_datetime``, ``customer_id``, ``terminal_id``,
        ``tx_amount``, ``tx_time_seconds``, ``tx_time_days``. When no
        transaction was generated, the frame is empty and has no
        ``tx_datetime`` column.

    Raises
    ------
    NameError
        If transactions were generated, ``start_date`` was not passed and
        no module-level ``start_date`` exists.
    """
    # Both generators are seeded with the customer ID so that each
    # customer's transactions are reproducible independently of the others.
    seed = int(customer_profile['customer_id'])
    random.seed(seed)
    np.random.seed(seed)

    customer_transactions = []

    for day in range(nb_days):

        # Random number of transactions for that day
        nb_tx = np.random.poisson(customer_profile.mean_nb_tx_per_day)

        for _ in range(nb_tx):

            # Time of day in seconds: centred on noon with a std of 20000 s,
            # so that most transactions occur during the day.
            time_tx = int(np.random.normal(86400 / 2, 20000))

            # Draws falling outside the day are discarded
            if not 0 < time_tx < 86400:
                continue

            amount = np.random.normal(
                customer_profile.mean_amount,
                customer_profile.std_amount
            )

            # A negative amount is replaced by a uniform draw
            if amount < 0:
                amount = np.random.uniform(
                    0, customer_profile.mean_amount * 2
                )

            amount = np.round(amount, decimals=2)

            # A customer without any reachable terminal cannot transact
            if len(customer_profile.available_terminals) > 0:

                terminal_id = random.choice(
                    customer_profile.available_terminals
                )

                customer_transactions.append([
                    time_tx + day * 86400,
                    day,
                    customer_profile['customer_id'],
                    terminal_id,
                    amount
                ])

    customer_transactions = pd.DataFrame(
        customer_transactions,
        columns=[
            'tx_time_seconds', 'tx_time_days',
            'customer_id', 'terminal_id', 'tx_amount'
        ])

    if len(customer_transactions) > 0:

        # The origin is resolved lazily: as before, a missing start date only
        # matters once there is at least one transaction to timestamp.
        if start_date is None:
            try:
                start_date = globals()['start_date']
            except KeyError:
                raise NameError(
                    "start_date was not passed and no module-level "
                    "'start_date' is defined"
                ) from None

        customer_transactions['tx_datetime'] = \
            pd.to_datetime(
                customer_transactions["tx_time_seconds"],
                unit='s', origin=start_date
            )

        customer_transactions = \
            customer_transactions[[
                'tx_datetime', 'customer_id', 'terminal_id',
                'tx_amount', 'tx_time_seconds', 'tx_time_days'
            ]]

    return customer_transactions

### add_frauds 

In [5]:
def add_frauds(customer_profiles_table, terminal_profiles_table, data):
    """Label transactions as fraudulent according to three scenarios.

    ``data`` is modified in place (columns ``tx_fraud`` and
    ``tx_fraud_scenario`` are added, and ``tx_amount`` is altered by
    scenario 3) and is also returned.

    Scenarios
    ---------
    1. Any transaction with an amount above 220 is a fraud.
    2. For each day, two terminals are sampled; every transaction on them
       during the following 28 days is a fraud.
    3. For each day, three customers are sampled; one third of their
       transactions in the following 14 days get their amount multiplied
       by 5 and are marked as frauds.

    The day loops run over ``range(data['tx_time_days'].max())``, i.e. the
    last day present in the data is not used as a starting day.
    """
    # By default, all transactions are genuine
    data['tx_fraud'] = 0
    data['tx_fraud_scenario'] = 0

    # Scenario 1: amount threshold
    above_threshold = data['tx_amount'] > 220
    data.loc[above_threshold, 'tx_fraud'] = 1
    data.loc[above_threshold, 'tx_fraud_scenario'] = 1

    tx_days = data['tx_time_days']
    last_day = tx_days.max()

    # Scenario 2: compromised terminals
    for day in range(last_day):

        # The day number seeds the sampling, so the choice is reproducible
        compromised_terminals = \
            terminal_profiles_table['terminal_id'].sample(n=2,
                                                          random_state=day)

        affected = data[
            (tx_days >= day)
            & (tx_days < day + 28)
            & data['terminal_id'].isin(compromised_terminals)
        ].index

        data.loc[affected, 'tx_fraud'] = 1
        data.loc[affected, 'tx_fraud_scenario'] = 2

    # Scenario 3: compromised customers
    for day in range(last_day):

        compromised_customers = (customer_profiles_table['customer_id']
                                 .sample(n=3, random_state=day)
                                 .values)

        candidates = data[
            (tx_days >= day)
            & (tx_days < day + 14)
            & data['customer_id'].isin(compromised_customers)
        ]

        # One third of the candidate transactions (rounded down) are altered
        random.seed(day)
        fraud_index = random.sample(
            list(candidates.index.values),
            k=int(len(candidates) / 3)
        )

        data.loc[fraud_index, 'tx_amount'] = \
            data.loc[fraud_index, 'tx_amount'] * 5
        data.loc[fraud_index, 'tx_fraud'] = 1
        data.loc[fraud_index, 'tx_fraud_scenario'] = 3

    return data

### generate_dataset

In [6]:
def generate_dataset(
        start_date, random_state=42, n_customers=10000,
        n_terminals=1000000, nb_days=90, r=5):
    """Generate customer profiles, terminal profiles and their transactions.

    Parameters
    ----------
    start_date : str or datetime-like
        Date of day 0; ``tx_datetime`` in the returned transactions is
        ``start_date`` plus ``tx_time_seconds``.
    random_state : int
        Seed for the customer table; the terminal table uses
        ``random_state + 1``.
    n_customers, n_terminals : int
        Number of customers and terminals to generate.
    nb_days : int
        Number of days of transactions to simulate per customer.
    r : float
        Radius within which a terminal is available to a customer.

    Returns
    -------
    tuple
        ``(customer_profiles_table, terminal_profiles_table, df)`` where
        ``df`` holds all transactions sorted by ``tx_datetime`` with a
        ``transaction_id`` column numbered from 0.
    """
    customer_profiles_table = generate_customer_profiles_table(
        n_customers, random_state=random_state)

    terminal_profiles_table = generate_terminal_profiles_table(
        n_terminals, random_state=random_state + 1)

    # Plain numpy array of terminal coordinates for fast distance computation
    x_y_terminals = (
        terminal_profiles_table[['x_terminal_id', 'y_terminal_id']]
        .values
        .astype(float))

    customer_profiles_table['available_terminals'] = \
        customer_profiles_table.apply(
            lambda profile: get_list_terminals_within_radius(
                profile,
                x_y_terminals=x_y_terminals,
                r=r
            ),
            axis=1)

    customer_profiles_table['nb_terminals'] = \
        customer_profiles_table.available_terminals.apply(len)

    # One group per customer; the single row of each group is the profile
    df = (customer_profiles_table
          .groupby('customer_id')
          .apply(lambda x: generate_transactions_table(x.iloc[0], nb_days))
          .reset_index(drop=True))

    # generate_transactions_table is called without a start date and takes
    # its time origin from a module-level variable. Recompute the timestamps
    # here so that the start_date argument of this function is what the
    # returned data actually reflects.
    if len(df) > 0:
        df['tx_datetime'] = pd.to_datetime(
            df['tx_time_seconds'], unit='s', origin=start_date)

    # Sort transactions chronologically
    df = df.sort_values('tx_datetime')
    # Renumber rows from 0 and expose that numbering as transaction_id
    df.reset_index(inplace=True, drop=True)
    df.reset_index(inplace=True)
    df.rename(columns={'index': 'transaction_id'}, inplace=True)

    return customer_profiles_table, terminal_profiles_table, df

### read_from_files

First use in [Chapter 3, Baseline Feature Transformation](Baseline_Feature_Transformation).

In [7]:
def read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE, sort='transaction_id'):
    """Load a range of pickled DataFrames and return them as one sorted frame.

    Parameters
    ----------
    DIR_INPUT : str
        Folder containing the ``<name>.pkl`` files.
    BEGIN_DATE, END_DATE : str
        Inclusive bounds; a file is loaded when its name compares (as a
        string) between ``BEGIN_DATE + '.pkl'`` and ``END_DATE + '.pkl'``.
    sort : str or list of str
        Column(s) passed to ``sort_values`` on the combined frame.

    Returns
    -------
    pandas.DataFrame
        The concatenated data, sorted by ``sort``, with a fresh 0-based
        index and every -1 replaced by 0.

    Raises
    ------
    ValueError
        If no file in ``DIR_INPUT`` falls within the requested range.
    """
    first_name = BEGIN_DATE + '.pkl'
    last_name = END_DATE + '.pkl'

    # os.listdir returns names in arbitrary order; sorting makes the load
    # order (and therefore the result for tied sort keys) deterministic.
    selected = sorted(
        name for name in os.listdir(DIR_INPUT)
        if first_name <= name <= last_name
    )

    if not selected:
        raise ValueError(
            'No .pkl file between {} and {} found in {}'.format(
                first_name, last_name, DIR_INPUT)
        )

    # NOTE: unpickling executes arbitrary code from the file; only point
    # this function at trusted directories.
    frames = [pd.read_pickle(os.path.join(DIR_INPUT, name))
              for name in selected]

    df_final = pd.concat(frames)

    df_final = df_final.sort_values(sort)
    df_final.reset_index(drop=True, inplace=True)

    #  Note: -1 are missing values for real world data
    df_final = df_final.replace([-1], 0)

    return df_final

### save_object


In [8]:
def save_object(obj, filename):
    """Save ``obj`` as a pickle file at ``filename``.

    The file is opened in binary write mode (an existing file is
    overwritten) and the highest available pickle protocol is used.
    """
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


## Data Transformation

### is_weekend, is_night

In [9]:
def is_weekend(tx_datetime):
    """Return 1 if ``tx_datetime`` falls on a Saturday or Sunday, else 0."""
    # weekday() numbers the days from 0 (Monday) to 6 (Sunday),
    # so the weekend is 5 and 6.
    return 1 if tx_datetime.weekday() >= 5 else 0

In [10]:
def is_night(tx_datetime):
    """Return 1 if the transaction hour is 6 or earlier, else 0.

    The test is on the hour component only, so every time from 00:00 up
    to and including 06:59 counts as night.
    """
    return 1 if tx_datetime.hour <= 6 else 0

### get_customer_spending_behaviour_features

In [11]:
def get_customer_spending_behaviour_features(customer_transactions,
                                             windows_size_in_days=[1,7,30]):
    """Add rolling count and average-amount features for one customer.

    For every window size ``w`` (in days) two columns are added:
    ``customer_id_nb_tx_<w>day_window`` (number of transactions in the
    window ending at the current transaction) and
    ``customer_id_avg_amount_<w>day_window`` (their mean amount).

    Returns a new DataFrame sorted by ``tx_datetime`` and indexed by
    ``transaction_id``; the input frame is left untouched because
    ``sort_values`` returns a copy.
    """
    # Chronological order with the timestamp as index is what lets
    # rolling() use time-based windows.
    ordered = customer_transactions.sort_values('tx_datetime')
    ordered.index = ordered['tx_datetime']

    amounts = ordered['tx_amount']

    for window_size in windows_size_in_days:

        window = amounts.rolling(str(window_size) + 'd')

        # The count is always > 0 because the current transaction is part
        # of its own window, so the division below is safe.
        tx_count = window.count()
        avg_amount = window.sum() / tx_count

        count_column = 'customer_id_nb_tx_' + str(window_size) + 'day_window'
        avg_column = 'customer_id_avg_amount_' + str(window_size) + 'day_window'

        # Assign positionally (raw values) rather than by index label
        ordered[count_column] = tx_count.values
        ordered[avg_column] = avg_amount.values

    # Reindex according to transaction IDs
    ordered.index = ordered['transaction_id']

    return ordered

### get_count_risk_rolling_window

In [12]:
def get_count_risk_rolling_window(terminal_transactions,
                                  delay_period=7,
                                  windows_size_in_days=[1,7,30],
                                  feature='terminal_id'):
    """Add delayed rolling transaction-count and fraud-risk features.

    For every window size ``w`` the window covers the ``w`` days that
    precede the most recent ``delay_period`` days. It is obtained by
    subtracting the rolling statistics over ``delay_period`` days from
    those over ``delay_period + w`` days. Two columns are added per
    window: ``<feature>_nb_tx_<w>day_window`` and
    ``<feature>_risk_<w>day_window`` (frauds divided by transactions).

    Returns a new DataFrame sorted by ``tx_datetime`` and indexed by
    ``transaction_id``. All NA values in the frame are replaced by 0,
    which covers the risk scores that are undefined when the window
    holds no transaction.
    """
    ordered = terminal_transactions.sort_values('tx_datetime')
    ordered.index = ordered['tx_datetime']

    fraud_flags = ordered['tx_fraud']

    # Statistics over the delay period alone, shared by every window size
    delay_roll = fraud_flags.rolling(str(delay_period) + 'd')
    nb_fraud_delay = delay_roll.sum()
    nb_tx_delay = delay_roll.count()

    for window_size in windows_size_in_days:

        # Statistics over delay period + window, then remove the delay part
        full_roll = fraud_flags.rolling(str(delay_period + window_size) + 'd')

        nb_fraud_window = full_roll.sum() - nb_fraud_delay
        nb_tx_window = full_roll.count() - nb_tx_delay
        risk_window = nb_fraud_window / nb_tx_window

        count_column = feature + '_nb_tx_' + str(window_size) + 'day_window'
        risk_column = feature + '_risk_' + str(window_size) + 'day_window'

        # Assign positionally (raw values) rather than by index label
        ordered[count_column] = nb_tx_window.values
        ordered[risk_column] = risk_window.values

    ordered.index = ordered['transaction_id']

    # Replace NA values with 0 (undefined risk scores where the count is 0)
    ordered.fillna(0, inplace=True)

    return ordered

### get_train_test_set

In [13]:
def get_train_test_set(
        data, start_date_training, delta_train=7, 
        delta_delay=7, delta_test=7, sampling_ratio=1.0, random_state=42):
    """Split transactions into a training set and a delayed test set.

    The training set holds the transactions whose ``tx_datetime`` lies in
    the ``delta_train`` days starting at ``start_date_training``. The test
    set covers ``delta_test`` days that begin ``delta_delay`` days after
    the end of the training period, selected through ``tx_time_days``.

    Customers known to be defrauded are removed from each test day. The
    pool starts with the customers having a fraud in the training set and,
    for each test day, grows with the customers having a fraud on day
    ``test_day - delta_delay - 1`` (the extra day reflects that the frauds
    of "today minus delay" are not yet fully known).

    When ``sampling_ratio < 1`` the fraudulent and genuine training rows
    are each subsampled with that fraction, using ``random_state``.

    Returns ``(train_df, test_df)``, both sorted by ``transaction_id``.
    """
    end_date_training = start_date_training + dt.timedelta(days=delta_train)

    # Training set: selected on timestamps
    in_training_period = (
        (data['tx_datetime'] >= start_date_training)
        & (data['tx_datetime'] < end_date_training)
    )
    train_df = data[in_training_period].copy()

    # Customers already known as defrauded when the test period starts
    known_defrauded_customers = set(
        train_df.loc[train_df['tx_fraud'] == 1, 'customer_id'])

    # Relative day numbers are easier to work with than timestamps for
    # collecting the test days.
    first_training_day = train_df['tx_time_days'].min()
    first_test_day = first_training_day + delta_train + delta_delay

    daily_test_frames = []

    for day in range(delta_test):

        test_day = first_test_day + day

        # Frauds of "yesterday minus the delay period" become known today
        newly_reported_day = test_day - delta_delay - 1
        reported = data[data['tx_time_days'] == newly_reported_day]
        known_defrauded_customers |= set(
            reported.loc[reported['tx_fraud'] == 1, 'customer_id'])

        # Keep that day's transactions of customers not yet known as defrauded
        test_day_df = data[data['tx_time_days'] == test_day]
        still_unknown = ~test_day_df['customer_id'].isin(known_defrauded_customers)
        daily_test_frames.append(test_day_df[still_unknown])

    test_df = pd.concat(daily_test_frames)

    # Optional subsampling, frauds first then genuine transactions
    if sampling_ratio < 1:
        sampled_parts = [
            train_df[train_df['tx_fraud'] == label]
            .sample(frac=sampling_ratio, random_state=random_state)
            for label in (1, 0)
        ]
        train_df = pd.concat(sampled_parts)

    # Sort data sets by ascending order of transaction ID
    return (train_df.sort_values('transaction_id'),
            test_df.sort_values('transaction_id'))

### prequentialSplit

First use in [Chapter 5, Validation Strategies](Validation_Strategies).

In [14]:
def prequentialSplit(
        transactions_df, start_date_training, n_folds=4,
        delta_train=7, delta_delay=7, delta_assessment=7):
    """Build the (train, test) index pairs of a prequential validation.

    Fold ``k`` starts its training period ``k * delta_assessment`` days
    before ``start_date_training``; fold 0 therefore uses
    ``start_date_training`` itself.

    Returns a list of ``n_folds`` tuples ``(indices_train, indices_test)``,
    each a list of index labels taken from the frames returned by
    ``get_train_test_set``.
    """
    def fold_indices(fold):
        # Shift the training start back by `fold` assessment periods
        fold_start = (start_date_training
                      - dt.timedelta(days=fold * delta_assessment))

        train_df, test_df = get_train_test_set(transactions_df,
                                               fold_start,
                                               delta_train=delta_train,
                                               delta_delay=delta_delay,
                                               delta_test=delta_assessment)

        return list(train_df.index), list(test_df.index)

    return [fold_indices(fold) for fold in range(n_folds)]

### card_precision_top_k_day

In [15]:
def card_precision_top_k_day(df_day, top_k):
    """Compute the card precision among the ``top_k`` riskiest cards of a day.

    ``df_day`` must provide ``customer_id``, ``predictions`` and
    ``tx_fraud``. Each customer is summarised by the column-wise maximum
    of its rows, and customers are ranked by decreasing ``predictions``.

    Returns ``(detected_cards, precision)`` where ``detected_cards`` lists
    the customer IDs among the top ``top_k`` whose ``tx_fraud`` is 1, and
    ``precision`` is ``len(detected_cards) / top_k``.
    """
    # One row per customer: highest prediction and highest fraud label
    per_customer = df_day.groupby('customer_id').max()

    ranked = (per_customer
              .sort_values(by="predictions", ascending=False)
              .reset_index(drop=False))

    # The top k most suspicious cards
    most_suspicious = ranked.head(top_k)

    detected_cards = list(
        most_suspicious.loc[most_suspicious['tx_fraud'] == 1, 'customer_id'])

    # The denominator is top_k even when fewer cards are present that day
    return detected_cards, len(detected_cards) / top_k

### card_precision_top_k

In [16]:
def card_precision_top_k(
        predictions_df, top_k,
        remove_detected_compromised_cards=True):
    """Compute the daily and mean card precision top-k over several days.

    Days (``tx_time_days``) are processed in increasing order. When
    ``remove_detected_compromised_cards`` is true, cards detected on one
    day are excluded from all later days.

    Returns a tuple ``(nb_compromised_cards_per_day,
    card_precision_top_k_per_day_list, mean_card_precision_top_k)``.
    """
    list_days = sorted(predictions_df['tx_time_days'].unique())

    # Cards detected so far; stays empty when removal is disabled
    already_detected = []

    nb_compromised_cards_per_day = []
    card_precision_top_k_per_day_list = []

    for day in list_days:

        is_day = predictions_df['tx_time_days'] == day
        df_day = predictions_df.loc[
            is_day, ['predictions', 'customer_id', 'tx_fraud']]

        # Drop the cards that were already detected on previous days
        df_day = df_day[~df_day['customer_id'].isin(already_detected)]

        # Number of distinct compromised cards remaining that day
        fraud_cards = df_day[df_day['tx_fraud'] == 1]['customer_id']
        nb_compromised_cards_per_day.append(len(fraud_cards.unique()))

        detected_today, precision_today = \
            card_precision_top_k_day(df_day, top_k)

        card_precision_top_k_per_day_list.append(precision_today)

        if remove_detected_compromised_cards:
            already_detected.extend(detected_today)

    mean_card_precision_top_k = \
        np.array(card_precision_top_k_per_day_list).mean()

    return (nb_compromised_cards_per_day,
            card_precision_top_k_per_day_list,
            mean_card_precision_top_k)

### card_precision_top_k_custom

First use in [Chapter 5, Validation Strategies](Validation_Strategies).

In [17]:
def card_precision_top_k_custom(y_true, y_pred, top_k, data):
    """Scorer-style wrapper returning the mean card precision top-k.

    The rows of ``data`` matching the current fold are selected
    positionally with ``data.iloc[y_true.index.values]``, so the index
    values of ``y_true`` are used as row positions in ``data``.
    NOTE(review): this presumably expects ``data`` to carry a default
    0..n-1 index; confirm against the callers.
    """
    # Transactions of the current fold, with their predictions attached
    fold_df = data.iloc[y_true.index.values].copy()
    fold_df['predictions'] = y_pred

    # Only the mean (third element of the result) is of interest here
    _, _, mean_card_precision_top_k = card_precision_top_k(fold_df, top_k)

    return mean_card_precision_top_k

### prequential_grid_search

In [18]:
def prequential_grid_search(
        transactions_df, classifier, parameters, features, target,
        scoring, start_date_training, subset='Test', preprop_list=[],
        n_folds=4, delta_train=7, delta_delay=7, delta_assessment=7,
        performance_metrics_list_grid=['roc_auc'], performance_metrics_list=['AUC ROC'],
        search_type='grid', n_iter=None, random_state=42, n_jobs=-1):
    """Run a grid or randomized search with prequential validation splits.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Transactions; ``features`` and ``target`` select X and y from it.
    classifier, parameters
        Estimator placed last in the pipeline under the step name
        ``'clf'``, and the parameter grid/distributions to search.
    scoring
        Passed unchanged to the search object.
    start_date_training, n_folds, delta_train, delta_delay, delta_assessment
        Forwarded to ``prequentialSplit`` to build the folds.
    subset : str
        Label inserted in the result column names (e.g. 'Test').
    preprop_list : list of (name, transformer)
        Pipeline steps placed before the classifier; it is not modified.
    performance_metrics_list_grid, performance_metrics_list : list of str
        Scorer names as they appear in ``cv_results_`` and the matching
        display names, in the same order.
    search_type : {'grid', 'random'}
        Selects ``GridSearchCV`` or ``RandomizedSearchCV``.
    n_iter, random_state
        Only used for the randomized search.
        NOTE(review): ``n_iter`` defaults to None and is passed through
        as is; confirm callers set it for ``search_type='random'``.
    n_jobs : int
        Parallelism of the search.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination with ``<metric> <subset>`` and
        ``<metric> <subset> Std`` columns, plus ``Parameters``,
        ``Fit time`` and ``Score time``.

    Raises
    ------
    ValueError
        If ``search_type`` is neither 'grid' nor 'random'.
    """
    # Reject an unknown search type before doing any expensive work;
    # previously this surfaced later as an unbound-variable error.
    if search_type not in ('grid', 'random'):
        raise ValueError(
            "search_type must be 'grid' or 'random', got {!r}".format(search_type)
        )

    # Build a new list so the caller's (and the default) list is untouched
    estimators = list(preprop_list) + [('clf', classifier)]
    pipe = Pipeline(estimators)

    prequential_split_indices = prequentialSplit(
        transactions_df,
        start_date_training=start_date_training,
        n_folds=n_folds,
        delta_train=delta_train,
        delta_delay=delta_delay,
        delta_assessment=delta_assessment
    )

    # Arguments common to both search flavours; refit=False because only
    # the cross-validation results are used.
    search_kwargs = dict(scoring=scoring,
                         cv=prequential_split_indices,
                         refit=False,
                         n_jobs=n_jobs)

    if search_type == 'grid':
        cv = GridSearchCV(pipe, parameters, **search_kwargs)
    else:
        cv = RandomizedSearchCV(pipe,
                                parameters,
                                n_iter=n_iter,
                                random_state=random_state,
                                **search_kwargs)

    X = transactions_df[features]
    y = transactions_df[target]

    cv.fit(X, y)

    results = cv.cv_results_
    performances_df = pd.DataFrame()

    for i, grid_name in enumerate(performance_metrics_list_grid):

        # Display name paired with this scorer (same position in both lists)
        display_name = performance_metrics_list[i]

        metric_col = display_name + ' ' + subset
        metric_col_std = display_name + ' ' + subset + ' Std'

        performances_df[metric_col] = results['mean_test_' + grid_name]
        performances_df[metric_col_std] = results['std_test_' + grid_name]

    performances_df['Parameters'] = results['params']
    performances_df['Fit time'] = results['mean_fit_time']
    performances_df['Score time'] = results['mean_score_time']

    return performances_df

### grid_create

In [19]:
def grid_create(
        clf, params, preprop_list, search_type='grid', n_iter=None,
        random_state=42, key_params_idxs=0, time_exec=False):
    """Bundle the settings of one model search into a dictionary.

    Only the arguments of this function are taken from the caller. The
    other entries (``features``, ``target``, ``scoring``,
    ``train_start_valid``, ``train_start_test``, ``n_folds``,
    ``delta_train``, ``delta_delay``, ``delta_assessment``,
    ``metrics_grid``, ``metrics`` and ``n_jobs``) are read from names
    defined outside this function, which must exist when it is called.
    The dataset itself is not stored in the dictionary.
    """
    return dict(
        clf=clf,
        params=params,
        features=features,
        target=target,
        scoring=scoring,
        train_start_valid=train_start_valid,
        train_start_test=train_start_test,
        preprop_list=preprop_list,
        n_folds=n_folds,
        delta_train=delta_train,
        delta_delay=delta_delay,
        delta_assessment=delta_assessment,
        metrics_list_grid=metrics_grid,
        metrics_list=metrics,
        search_type=search_type,
        n_jobs=n_jobs,
        n_iter=n_iter,
        random_state=random_state,
        key_params_idxs=key_params_idxs,
        time_exec=time_exec,
    )

### model_performance_CV

In [20]:
def model_performance_CV(
        data, clf, params, features, target,
        preprop_list, scoring, train_start_valid, train_start_test,
        n_folds, delta_train, delta_delay, delta_assessment,
        metrics_list_grid, metrics_list, search_type, n_jobs,
        n_iter=None, random_state=42, key_params_idxs=0, time_exec=True):
    """Run the prequential search on the validation and the test periods.

    ``prequential_grid_search`` is called twice with the same settings,
    once starting at ``train_start_valid`` (columns labelled
    'Validation') and once at ``train_start_test`` (columns labelled
    'Test'). The two result tables are placed side by side.

    Parameters
    ----------
    key_params_idxs : int or iterable of int
        Position(s), in ``params.keys()``, of the parameters whose values
        make up the ``Parameters summary`` column (joined with '/').
        A single int is accepted and treated as a one-element list.
    time_exec : bool
        When true, print the total execution time.

    All other parameters are forwarded to ``prequential_grid_search``.

    Returns
    -------
    pandas.DataFrame
        Test columns (including ``Parameters``, ``Fit time`` and
        ``Score time``), validation metric columns, and
        ``Parameters summary``.
    """
    t_start = time.time()

    # The default (0) is a bare int, which cannot be iterated over below.
    # Normalise it up front so the failure does not occur after both
    # searches have already run.
    if isinstance(key_params_idxs, (int, np.integer)):
        key_params_idxs = [key_params_idxs]

    def run_search(start_date, subset):
        # Same search, different training start date and column label
        return prequential_grid_search(
            data,
            clf,
            params,
            features,
            target,
            scoring,
            start_date_training=start_date,
            preprop_list=preprop_list,
            n_folds=n_folds,
            subset=subset,
            delta_train=delta_train,
            delta_delay=delta_delay,
            delta_assessment=delta_assessment,
            performance_metrics_list_grid=metrics_list_grid,
            performance_metrics_list=metrics_list,
            search_type=search_type,
            n_iter=n_iter,
            random_state=random_state,
            n_jobs=n_jobs)

    performance_validation = run_search(train_start_valid, 'Validation')
    performance_test = run_search(train_start_test, 'Test')

    # Keep a single copy of the bookkeeping columns (the test one)
    cols_drop = ['Parameters', 'Fit time', 'Score time']
    performance_validation.drop(columns=cols_drop, inplace=True)

    performance = pd.concat([performance_test, performance_validation], axis=1)

    # Names of the parameters that make up the short summary
    params_keys_list = list(params.keys())
    summary_params = [params_keys_list[i] for i in key_params_idxs]

    performance['Parameters summary'] = [
        '/'.join(str(params_value[name]) for name in summary_params)
        for params_value in performance['Parameters']
    ]

    if time_exec:
        e_time = time.time() - t_start
        e_time_format = str(dt.timedelta(seconds=np.round(e_time)))
        print('Execution time: {}'.format(e_time_format))

    return performance

### get_summary_performances

First use in [Chapter 5, Model Selection](Model_Selection).

In [21]:
def get_summary_performances(performances_df, metrics):
    """Summarise, per metric, the selected and the optimal parameters.

    For each metric (e.g. 'AUC ROC') the table reports the parameters
    with the best validation score together with their validation and
    test performance, and the parameters with the best test score
    together with that test performance. Performances are formatted as
    ``'<mean>+/-<std>'`` with three decimals.

    ``performances_df`` must contain ``'<metric> Validation'``,
    ``'<metric> Validation Std'``, ``'<metric> Test'``,
    ``'<metric> Test Std'`` and ``'Parameters summary'``. Its index is
    reset in place.

    Returns a DataFrame with one column per metric and the rows
    'Best estimated parameters', 'Validation performance',
    'Test performance', 'Optimal parameter(s)' and
    'Optimal test performance'.
    """
    performances_df.reset_index(drop=True, inplace=True)

    def best_row(column):
        # Row with the highest value in `column` (first one on ties)
        return performances_df.index[np.argmax(performances_df[column].values)]

    def mean_and_std(column, row):
        mean_value = performances_df[column].iloc[row]
        std_value = performances_df[column + ' Std'].iloc[row]
        return '%.3f+/-%.3f' % (round(mean_value, 3), round(std_value, 3))

    def parameters_at(row):
        return performances_df['Parameters summary'].iloc[row]

    # Rows selected on the validation score, and on the test score
    selected_rows = [best_row(metric + ' Validation') for metric in metrics]
    optimal_rows = [best_row(metric + ' Test') for metric in metrics]

    performances_results = pd.DataFrame(columns=metrics)

    performances_results.loc["Best estimated parameters"] = [
        parameters_at(row) for row in selected_rows]
    performances_results.loc["Validation performance"] = [
        mean_and_std(metric + ' Validation', row)
        for metric, row in zip(metrics, selected_rows)]
    performances_results.loc["Test performance"] = [
        mean_and_std(metric + ' Test', row)
        for metric, row in zip(metrics, selected_rows)]

    performances_results.loc["Optimal parameter(s)"] = [
        parameters_at(row) for row in optimal_rows]
    performances_results.loc["Optimal test performance"] = [
        mean_and_std(metric + ' Test', row)
        for metric, row in zip(metrics, optimal_rows)]

    return performances_results

### conf_matrix

In [22]:
def conf_matrix(y_true, y_score, save=False, filename='0'):
    """Draw a confusion-matrix heatmap and return its four cell counts.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, passed as the first argument of
        ``confusion_matrix``.
    y_score : array-like
        Predictions, passed unchanged as the second argument of
        ``confusion_matrix``.
    save : bool, default False
        When true, the figure is written to ``img/<filename>.png`` before
        it is shown.
    filename : str, default '0'
        Base name (without extension) of the saved image.

    Returns
    -------
    tuple
        ``(tn, fp, fn, tp)`` read from cells ``[0][0]``, ``[0][1]``,
        ``[1][0]`` and ``[1][1]`` of the matrix.
    """
    matrix = confusion_matrix(y_true, y_score)

    # Row index = actual class, column index = predicted class.
    tn, fp = matrix[0][0], matrix[0][1]
    fn, tp = matrix[1][0], matrix[1][1]

    plt.subplots(figsize=(4, 2))

    tick_names = ['NO', 'YES']
    # NOTE(review): vmin and vmax are both 0, presumably so that every cell
    # gets the same colour -- confirm this is intended.
    heatmap_options = {
        'cmap': gradient,
        'vmin': 0,
        'vmax': 0,
        'annot': True,
        'annot_kws': {'size': 12},
        'fmt': 'g',
        'cbar': False,
        'cbar_kws': {'shrink': 1},
        'xticklabels': tick_names,
        'yticklabels': tick_names,
        'linewidths': 0.5,
        'linecolor': '0.75',
    }
    ax = sns.heatmap(matrix, **heatmap_options)

    plt.xticks(size=9, rotation=0, y=-0.03)
    plt.yticks(size=9, rotation=0, x=-0.01)

    ax.set_ylabel('Actual', fontsize=9)
    ax.set_xlabel('Predicted', fontsize=9)
    ax.tick_params(left=True, bottom=True)

    # Frame the heatmap with a thin grey border on all four sides.
    for spine in ax.spines.values():
        spine.set_visible(True)
        spine.set_linewidth(1)
        spine.set_color('0.35')

    if save:
        target = 'img/{}.png'.format(filename)
        plt.savefig(fname=target, bbox_inches='tight')

    plt.show()

    return tn, fp, fn, tp

### threshold_range

In [23]:
def threshold_range(y_true, y_score_prob, lower=0.2,
                    upper=0.5, step=0.05, kind='prob'):
    """Tabulate classification metrics over a range of decision thresholds.

    For every threshold in ``np.arange(lower, upper, step)`` the scores are
    binarised with ``score >= threshold`` and precision, recall, F1 and the
    confusion-matrix counts are recorded.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_score_prob : array-like
        For ``kind='prob'`` a 2-D array whose column 1 holds the scores
        that are thresholded. For ``kind='func'`` the array is thresholded
        directly.
    lower, upper, step : float
        Arguments of ``np.arange`` defining the thresholds (``upper`` is
        excluded).
    kind : {'prob', 'func'}, default 'prob'
        How ``y_score_prob`` is interpreted, see above.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with columns ``Threshold``, ``Precision``,
        ``Recall``, ``F1-score``, ``FP``, ``FN``, ``TP`` and ``TN``.
        Precision, recall and F1 are rounded to 2 decimals; for
        ``kind='func'`` the reported threshold is rounded to 4 decimals.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'prob'`` nor ``'func'``.
    """
    if kind == 'prob':
        scores = y_score_prob[:, 1]
    elif kind == 'func':
        scores = y_score_prob
    else:
        raise ValueError(
            "kind must be 'prob' or 'func', got {!r}".format(kind)
        )

    columns = ['Threshold', 'Precision', 'Recall', 'F1-score',
               'FP', 'FN', 'TP', 'TN']
    count_columns = ['FP', 'FN', 'TP', 'TN']

    rows = []

    for threshold in np.arange(lower, upper, step):
        y_pred = scores >= threshold

        # Built-in round() works whether the metric comes back as a Python
        # float or as a NumPy scalar.
        p = round(float(precision_score(y_true, y_pred)), 2)
        r = round(float(recall_score(y_true, y_pred)), 2)
        f1 = round(float(f1_score(y_true, y_pred)), 2)

        matrix = confusion_matrix(y_true, y_pred)

        fp = matrix[0][1]
        fn = matrix[1][0]
        tp = matrix[1][1]
        tn = matrix[0][0]

        # Only the 'func' variant reports a rounded threshold; the
        # comparison above always uses the exact value.
        shown = round(threshold, 4) if kind == 'func' else threshold

        # Every row carries all eight columns, so both kinds produce a
        # frame with the same schema.
        rows.append([shown, p, r, f1, fp, fn, tp, tn])

    df = pd.DataFrame(rows, columns=columns)

    for col in count_columns:
        df[col] = df[col].astype(np.int64)

    return df

### results_df_create

In [24]:
def results_df_create(model_list, model_performance_list,
                      model_exec_time_list, hyper_dict=None, drop_hyper=None):
    """Summarise the selected test performance of several models.

    For each model the row of its performance table with the highest
    ``'Average Precision Validation'`` is selected, and its test metrics
    and timings are copied into one summary row.

    Parameters
    ----------
    model_list : iterable
        Model names; they become the index of the result.
    model_performance_list : iterable of pandas.DataFrame
        One performance table per model. Each must contain the columns
        ``'Average Precision Validation'``, ``'<metric> Test'`` and
        ``'<metric> Test Std'`` for the metrics ``AUC ROC``,
        ``Average Precision`` and ``Card Precision@100``, plus
        ``'Fit time'`` and ``'Score time'``.
    model_exec_time_list : iterable
        One value per model, stored in the ``'Tuning Time'`` column.
    hyper_dict : dict of dict, optional
        Maps a model key to its hyper-parameters. When given, a leading
        ``'Parameters'`` column is added that holds each entry's values
        joined with ``'/'``, in the iteration order of ``hyper_dict``.
    drop_hyper : int, optional
        Number of trailing hyper-parameter values to leave out of the
        ``'Parameters'`` string.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by model name. Metrics are rounded to
        3 decimals and their standard deviations to 4.
    """
    metric_names = ['AUC ROC', 'Average Precision', 'Card Precision@100']

    columns = ['index']
    for metric in metric_names:
        columns.extend([metric, metric + ' Std'])
    columns.extend(['Fit Time', 'Score Time', 'Tuning Time'])

    rows = []

    models = zip(model_list, model_performance_list, model_exec_time_list)

    for name, perf, tuning_time in models:

        # Model selection is done on the validation score only.
        best = perf.sort_values('Average Precision Validation',
                                ascending=False).iloc[0]

        row = [name]

        for metric in metric_names:
            row.append(np.round(best[metric + ' Test'], 3))
            row.append(np.round(best[metric + ' Test Std'], 4))

        row.extend([best['Fit time'], best['Score time'], tuning_time])
        rows.append(row)

    final_df = pd.DataFrame(rows, columns=columns)
    final_df = final_df.set_index('index', drop=True)
    final_df.index.name = None

    if hyper_dict:

        # The strings depend only on hyper_dict, so they are built once
        # here rather than once per model.
        hyper_strings = []

        for params in hyper_dict.values():
            values = list(params.values())
            if drop_hyper:
                values = values[:-drop_hyper]
            hyper_strings.append('/'.join(map(str, values)))

        cols = ['Parameters'] + list(final_df.columns)
        final_df['Parameters'] = hyper_strings
        final_df = final_df[cols]

    return final_df

### extract_hyperparams

In [25]:
def extract_hyperparams(dict, model, kind='value'):
    """Return a copy of the hyper-parameters stored for one model.

    Parameters
    ----------
    dict : mapping
        Maps a model key to its hyper-parameter mapping.
    model : hashable
        Key of the model to look up.
    kind : str, default 'value'
        With ``'list'`` every value is wrapped in a one-element list;
        any other value returns the parameters unchanged.

    Returns
    -------
    mapping
        A shallow copy of ``dict[model]``; the input is not modified.
    """
    params = dict[model].copy()

    if kind != 'list':
        return params

    # Snapshot the items so the copy is not iterated while being rewritten.
    for name, value in list(params.items()):
        params[name] = [value]

    return params

## Plotting

### data_smoothing

In [None]:
def data_smoothing(data, x_var, y_vars, k=3, smooth_length=300):
    """Resample columns of ``data`` on a dense x grid with B-spline fits.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table containing ``x_var`` and every column in ``y_vars``.
    x_var : str
        Name of the abscissa column.
    y_vars : iterable of str
        Names of the columns to interpolate.
    k : int, default 3
        Spline degree passed to ``make_interp_spline``.
    smooth_length : int, default 300
        Number of evenly spaced points between the minimum and maximum
        of ``data[x_var]``.

    Returns
    -------
    pandas.DataFrame
        One interpolated column per entry of ``y_vars`` followed by the
        ``x_var`` column holding the dense grid. With an empty ``y_vars``
        only the grid column is returned.
    """
    # The grid depends only on x, so it is computed once up front; this
    # also keeps it defined when y_vars is empty.
    x = data[x_var].values
    x_smooth = np.linspace(x.min(), x.max(), smooth_length)

    df = pd.DataFrame()

    for col in y_vars:
        spl = make_interp_spline(x, data[col].values, k=k)
        df[col] = spl(x_smooth)

    df[x_var] = x_smooth

    return df

In [None]:
def data_smoothing(data, x_var, y_vars, k=3, smooth_num=300):
    """Interpolate columns of ``data`` onto an evenly spaced x grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table containing ``x_var`` and every column in ``y_vars``.
    x_var : str
        Name of the abscissa column.
    y_vars : iterable of str
        Names of the columns to interpolate.
    k : int, default 3
        Spline degree passed to ``make_interp_spline``.
    smooth_num : int, default 300
        Number of grid points between the minimum and maximum of
        ``data[x_var]``.

    Returns
    -------
    pandas.DataFrame
        The grid in column ``x_var`` followed by one interpolated column
        per entry of ``y_vars``.
    """
    x_values = data[x_var].values
    grid = np.linspace(x_values.min(), x_values.max(), smooth_num)

    # Insertion order of the dict fixes the column order: grid first.
    smoothed = {x_var: grid}

    for name in y_vars:
        spline = make_interp_spline(x_values, data[name].values, k=k)
        smoothed[name] = spline(grid)

    return pd.DataFrame(smoothed)

### plot_performance

In [26]:
def plot_performance(data, metrics, subsets,
                     title=None, xlabel=None, colors=None,
                     rotation=None, save=False, filename='0',
                     n_folds=4, t_value=3.18):
    """Plot each metric against the hyper-parameter value, one panel each.

    Every panel shows one line per subset with a shaded band of
    ``mean -/+ t_value * std / sqrt(n_folds)``, plus a dashed vertical
    line at the parameter value with the highest ``'<metric> Validation'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'Parameters summary'`` (used as x values),
        ``'<metric> <subset>'`` and ``'<metric> <subset> Std'`` for every
        metric/subset pair, and ``'<metric> Validation'`` for every metric.
    metrics : sequence of str
        Metric names; one panel is drawn per entry.
    subsets : sequence of str
        Subset names, e.g. the suffixes of the metric columns.
    title : str, optional
        Figure title.
    xlabel : str, optional
        Label of the x axes; defaults to ``'Hyperparameter Value'``.
    colors : sequence, optional
        One colour per subset; defaults to the module-level ``palette``.
    rotation : float, optional
        Rotation of the x tick labels. When set, the legend is also
        anchored lower.
    save : bool, default False
        When true, the figure is written to ``img/<filename>.png``.
    filename : str, default '0'
        Base name (without extension) of the saved image.
    n_folds : int, default 4
        Sample size used in the standard-error term of the band.
    t_value : float, default 3.18
        Multiplier of the standard error (the source comment calls it
        the Student t value).

    Returns
    -------
    None
    """
    if not colors:
        colors = palette

    if not xlabel:
        xlabel = 'Hyperparameter Value'

    cols = len(metrics)

    # squeeze=False keeps ``axs`` an array even for a single metric, so
    # the positional indexing below works for any number of panels.
    fig, axs = plt.subplots(nrows=1, ncols=cols, sharex=True,
                            figsize=(11.5, 2.5), squeeze=False)
    axs = axs[0]

    fig.suptitle(title, weight='normal', size=11, y=1.1)

    abscissa = data['Parameters summary']

    # Rotated tick labels need more room, so the legend moves further down.
    anchor = (-0.33, -0.5) if rotation else (-0.33, -0.32)

    for ax_idx, metric in enumerate(metrics):

        ax = axs[ax_idx]
        is_last_panel = ax_idx == cols - 1

        for subset_idx, subset in enumerate(subsets):

            metric_subset = metric + ' ' + subset
            metric_subset_std = metric_subset + ' Std'

            half_width = t_value * (data[metric_subset_std] / n_folds**0.5)
            conf_min = data[metric_subset] - half_width
            conf_max = data[metric_subset] + half_width

            # Only the last panel carries legend entries, so each subset
            # appears once in the figure legend.
            legend = 'brief' if is_last_panel else None
            label = subset if is_last_panel else None

            try:
                sns.lineplot(
                    data=data,
                    x=abscissa,
                    y=metric_subset,
                    legend=legend,
                    label=label,
                    color=colors[subset_idx],
                    ax=ax
                )

            except TypeError:
                print('Specify colors')
                plt.close()
                return

            ax.fill_between(abscissa,
                            conf_min,
                            conf_max,
                            color=colors[subset_idx],
                            alpha=0.07)

        # Optimum: parameter value that maximises the validation metric.
        metric_name = metric + ' ' + 'Validation'
        best_index = data[metric_name].idxmax()

        best_parameter = data.loc[best_index, 'Parameters summary']
        best_performance = data.loc[best_index, metric_name]

        ymin = ax.get_ylim()[0]

        # Dashed line from the bottom of the panel up to the optimum.
        ax.vlines(
            best_parameter,
            ymin,
            best_performance,
            linestyles='--',
            color=palette[-1]
        )

        ax.set_title(metric,
                     weight='bold',
                     size=9, y=1.03,
                     loc='center')

        ax.set_xlabel(xlabel=xlabel,
                      weight='normal',
                      style='italic',
                      labelpad=10)

        ax.set_ylabel(None)

        if rotation:
            ax.tick_params(axis='x', rotation=rotation)

    plt.legend(
        labels=None, bbox_to_anchor=anchor,
        ncols=2, prop={'size': 9})

    plt.subplots_adjust(wspace=0.2)

    if save:
        plt.savefig(
            fname='img/{}.png'.format(filename),
            bbox_inches='tight'
        )

    plt.show()

### plot_metrics

In [27]:
def plot_metrics(data, metrics, models_list, palette, limits_list, save=False, filename='0'):
    """Draw one bar chart per metric with an interval line on each bar.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per model (the index supplies the bar positions) with a
        column ``'<metric>'`` and ``'<metric> Std'`` for every metric.
    metrics : sequence of str
        Metric names; one panel is drawn per entry.
    models_list : sequence of str
        Legend labels, one per model, in the same order as ``palette``.
    palette : sequence
        Bar colours. ``palette[i]`` is used for the legend marker of
        ``models_list[i]`` and ``palette[-1]`` for the interval lines.
    limits_list : sequence
        For every metric a ``(ymin, ymax, step)`` triple that sets the
        y limits and the tick positions ``np.arange(ymin, ymax, step)``.
    save : bool, default False
        When true, the figure is written to ``img/<filename>.png``.
    filename : str, default '0'
        Base name (without extension) of the saved image.

    Returns
    -------
    None
    """
    width = 0.55
    fig_width = 11
    fig_height = 3
    # Interval half-width is t * std / sqrt(n_folds).
    t = 3.18
    n_folds = 4

    # One panel per metric; squeeze=False keeps ``axs`` indexable even
    # when a single metric is requested.
    fig, axs = plt.subplots(nrows=1, ncols=len(metrics),
                            figsize=(fig_width, fig_height), squeeze=False)
    axs = axs[0]

    for metric_index, metric in enumerate(metrics):

        ax = axs[metric_index]

        # Bar positions come from the ``data`` argument itself rather
        # than from a module-level frame.
        sns.barplot(
            data=data,
            x=data.index,
            y=metric,
            width=width,
            palette=palette,
            ax=ax
        )

        ymin = limits_list[metric_index][0]
        ymax = limits_list[metric_index][1]
        step = limits_list[metric_index][2]

        # Model names go into the legend, so the x tick labels are hidden.
        ax.tick_params(
            axis='x',
            which='both',
            bottom=False,
            top=False,
            labelbottom=False)

        ax.set_ylim(ymin, ymax)
        ax.set_yticks(ticks=np.arange(ymin, ymax, step))
        ax.yaxis.set_tick_params(labelsize=8)
        ax.set_title(metric, size=9)
        ax.set_ylabel(None)

        metric_std_col = metric + ' Std'

        for model in data.index:

            mean = data.loc[model, metric]
            std = data.loc[model, metric_std_col]

            half_width = t * (std / n_folds**0.5)

            ax.plot([model, model],
                    [mean - half_width, mean + half_width],
                    color=palette[-1],
                    linestyle='-',
                    linewidth=1.5)

    # One square marker per model, coloured like its bar.
    handles = [
        mlines.Line2D([], [], color=palette[position], marker='s',
                      linestyle='None', markersize=5, label=model_name)
        for position, model_name in enumerate(models_list)
    ]

    plt.legend(
        labels=None, loc='lower center', bbox_to_anchor=(-0.8, -0.35), ncols=2,
        prop={'size': 9}, labelcolor='0.3', handles=handles
    )

    plt.subplots_adjust(wspace=0.3)

    if save:
        plt.savefig(
            fname='img/{}.png'.format(filename),
            bbox_inches='tight'
        )

    plt.show()

### plot_roc_curve

In [28]:
def plot_roc_curve(
        y_true, y_score_prob_list, name_list,
        palette, save=True, filename='0'):
    """Plot the ROC curves of several models on one figure.

    Each model gets its ROC curve (labelled with its AUC) and a hollow
    marker at the curve point whose threshold is closest to 0.5. A dashed
    diagonal is drawn as the random-model reference.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_score_prob_list : sequence of array-like
        One score vector per model, passed to ``roc_curve``.
    name_list : sequence of str
        Model names used in the legend.
    palette : sequence
        Curve colours, one per model; ``palette[-1]`` is used for the
        diagonal and for the threshold markers.
    save : bool, default True
        When true, the figure is written to ``img/<filename>.png``.
    filename : str, default '0'
        Base name (without extension) of the saved image.

    Returns
    -------
    None
    """
    markers = ['o', 'v', 's', 'D']

    plt.figure(figsize=(8, 4))

    models = zip(y_score_prob_list, name_list, palette)

    for position, (score, name, color) in enumerate(models):

        # Marker shapes repeat after the fourth model so that no model
        # is dropped from the plot.
        marker = markers[position % len(markers)]

        fpr, tpr, thresholds = roc_curve(y_true, score)
        roc_auc = auc(fpr, tpr)
        label = '{0} (AUC = {1:.3f})'.format(name, roc_auc)

        plt.plot(fpr, tpr, label=label, color=color)

        # Curve point whose threshold is nearest to the default 0.5 cut.
        close_default = np.argmin(np.abs(thresholds - 0.5))

        sns.scatterplot(
            x=[fpr[close_default]],
            y=[tpr[close_default]],
            marker=marker,
            s=50,
            color=palette[-1],
            edgecolor=palette[-1],
            linewidth=1.5,
            facecolor='None',
            label='0.5 Predict Probability Threshold'
        )

    # Random-model reference and axis decoration do not depend on the
    # model, so they are drawn once.
    plt.plot([0, 1], [0, 1],
             color=palette[-1],
             linestyle='--',
             linewidth=0.6)

    plt.xlabel(
        xlabel='False Positive Rate (1-Specificity)', fontsize=10,
        weight='normal', color='0.3'
    )

    plt.ylabel(
        ylabel='True Positive Rate (Recall)', fontsize=10,
        weight='normal', color='0.3'
    )

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.08])
    plt.xticks(size=9)
    plt.yticks(size=9)

    plt.legend(
        labels=None, loc='upper right', bbox_to_anchor=(1.435, 1),
        prop={'size': 9}, labelcolor='0.3'
    )

    if save:
        plt.savefig(
            fname='img/{}.png'.format(filename),
            bbox_inches='tight'
        )

    plt.show()

### plot_prec_rec

In [29]:
def plot_prec_rec(
        y_true, y_score_prob_list, name_list,
        palette, save=False, filename='0'):
    """Plot precision-recall curves and mark each model's best-F1 point.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_score_prob_list : sequence of array-like
        One score vector per model, passed to ``precision_recall_curve``.
    name_list : sequence of str
        Model names used in the legend.
    palette : sequence
        Curve colours, one per model; ``palette[-1]`` is used for the
        optimum markers.
    save : bool, default False
        When true, the figure is written to ``img/<filename>.png``.
    filename : str, default '0'
        Base name (without extension) of the saved image.

    Returns
    -------
    list of list
        For every model ``[precision, recall, threshold]`` at the curve
        point with the highest F-score, each rounded to 2 decimals.
    """
    markers = ['o', 'v', 's', 'D']

    plt.figure(figsize=(8, 4))
    metrics_opt = []

    models = zip(y_score_prob_list, name_list, palette)

    for position, (score, name, color) in enumerate(models):

        # Marker shapes repeat after the fourth model so that no model
        # is dropped from the plot.
        marker = markers[position % len(markers)]

        precision, recall, thresholds = precision_recall_curve(y_true, score)

        # F-score per curve point. Where precision + recall is 0 the
        # ratio is 0/0; it is set to 0 there so that argmax cannot pick
        # a NaN entry.
        denominator = precision + recall
        f = np.divide(
            2 * precision * recall,
            denominator,
            out=np.zeros_like(denominator, dtype=float),
            where=denominator > 0
        )

        # Index of the largest F-score.
        x = np.argmax(f)

        precision_opt = round(precision[x], 2)
        recall_opt = round(recall[x], 2)
        threshold_opt = round(thresholds[x], 2)

        ap = average_precision_score(y_true, score)

        metrics_opt.append([precision_opt, recall_opt, threshold_opt])

        plt.plot(recall, precision,
                 label='{0} (AP = {1:.3f})'.format(name, ap), color=color)

        sns.scatterplot(
            x=[recall_opt],
            y=[precision_opt],
            marker=marker,
            s=50,
            color=palette[-1],
            edgecolor=palette[-1],
            linewidth=1.5,
            facecolor='None',
            label='Optimal Threshold'
        )

    # Axis decoration does not depend on the model, so it is set once.
    plt.xlabel('Recall', fontsize=10, weight='normal', color='0.3')
    plt.ylabel('Precision', fontsize=10, weight='normal', color='0.3')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.08])
    plt.xticks(size=9)
    plt.yticks(size=9)

    plt.legend(
        labels=None, loc='upper right', bbox_to_anchor=(1.425, 1),
        prop={'size': 9}, labelcolor='0.3'
    )

    if save:
        plt.savefig(
            fname='img/{}.png'.format(filename),
            bbox_inches='tight'
        )

    plt.show()

    return metrics_opt

### plot_times

In [30]:
def plot_times(data, times, palette, figsize=(8,6), save=False, filename='0'):
    """Draw one horizontal bar chart per timing column, stacked vertically.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per model; the index supplies the bar labels and every
        entry of ``times`` names a column of durations.
    times : sequence of str
        Timing columns to plot, one panel each.
    palette : sequence or str
        Bar colours, passed to ``sns.barplot``.
    figsize : tuple, default (8, 6)
        Figure size passed to ``plt.subplots``.
    save : bool, default False
        When true, the figure is written to ``img/<filename>.png``.
    filename : str, default '0'
        Base name (without extension) of the saved image.

    Returns
    -------
    None
    """
    width = 0.6
    size = 9

    # squeeze=False always returns a 2-D array of axes, so a single
    # timing column needs no special case.
    fig, axs = plt.subplots(nrows=len(times), ncols=1,
                            figsize=figsize, squeeze=False)

    # Pairing axes and columns positionally gives every entry its own
    # panel, including repeated column names.
    for ax, time_col in zip(axs[:, 0], times):

        sns.barplot(
            data=data,
            x=time_col,
            y=data.index,
            palette=palette,
            width=width,
            ax=ax
        )

        ax.set_xlabel(None)
        ax.set_title(
            label='{0} (sec)'.format(time_col), weight='bold', size=size
        )

    plt.subplots_adjust(hspace=0.6)

    if save:
        plt.savefig(
            fname='img/{}.png'.format(filename),
            bbox_inches='tight'
        )

    plt.show()

### plot_unbalanced_model

In [31]:
def plot_unbalanced_model(
        data, metrics_list, colors,
        n_folds, limits_list, save=False, filename='0'):
    """Plot each metric across the rows of ``data`` in a two-column grid.

    Every panel shows the metric as a faint line with solid points. When
    ``data`` also has a ``'<metric> Std'`` column, a vertical interval of
    ``mean -/+ 3.18 * std / sqrt(n_folds)`` is drawn at every point.

    Parameters
    ----------
    data : pandas.DataFrame
        The index supplies the x positions; every entry of
        ``metrics_list`` names a column.
    metrics_list : sequence of str
        Metrics to plot, one panel each.
    colors : sequence
        One colour per metric, matched by position.
    n_folds : int
        Sample size used in the standard-error term of the interval.
    limits_list : sequence
        For every metric a ``(start, stop, step)`` triple; the ticks are
        ``np.arange(start, stop, step)`` and the y limits are its first
        and last tick.
    save : bool, default False
        When true, the figure is written to ``img/<filename>.png``.
    filename : str, default '0'
        Base name (without extension) of the saved image.

    Returns
    -------
    None
    """
    linewidth = 1.5
    alpha = 0.25
    t = 3.18

    plt.figure(figsize=(11.5, 5))

    rows = int(np.ceil(len(metrics_list) / 2))
    cols = 2

    for metric_index, metric in enumerate(metrics_list):

        color = colors[metric_index]

        plt.subplot(rows, cols, metric_index + 1)
        plt.title(metric, size=9)

        sns.lineplot(
            data=data,
            x=data.index,
            y=metric,
            linewidth=linewidth,
            alpha=alpha,
            color=color
        )

        sns.scatterplot(
            data=data,
            x=data.index,
            y=metric,
            s=25,
            alpha=1,
            color=color
        )

        metric_std = metric + ' Std'

        # Intervals are drawn only for metrics that come with a Std column.
        if metric_std in data.columns:

            for model in data.index:

                mean = data.loc[model, metric]
                std = data.loc[model, metric_std]

                half_width = t * (std / n_folds**0.5)

                plt.plot([model, model],
                         [mean - half_width, mean + half_width],
                         color=color,
                         linewidth=1.4,
                         alpha=0.75)

        # Ticks are built once; the y limits are the first and last tick.
        limits = limits_list[metric_index]
        ticks = np.arange(limits[0], limits[1], limits[2])

        plt.ylim(ticks[0], ticks[-1])
        plt.yticks(ticks)
        plt.ylabel(None)

    plt.subplots_adjust(wspace=0.2, hspace=0.5)

    if save:
        plt.savefig(
            fname='img/{}.png'.format(filename),
            bbox_inches='tight'
        )

    plt.show()

### plot_unbalanced_metric

In [1]:
def plot_unbalanced_metric(data, metric, colors, models_list, save=False, filename='0'):
    """Plot one metric across the rows of ``data``, one series per model.

    Parameters
    ----------
    data : pandas.DataFrame
        The index supplies the x positions; must contain the ``metric``
        column and a ``'model'`` column used as the hue.
    metric : str
        Name of the column to plot; also used as the y-axis label.
    colors : sequence
        Series colours passed to seaborn as the palette; ``colors[i]`` is
        used for the legend marker of ``models_list[i]``.
    models_list : sequence of str
        Legend labels, one per model.
    save : bool, default False
        When true, the figure is written to ``img/<filename>.png``.
    filename : str, default '0'
        Base name (without extension) of the saved image.

    Returns
    -------
    None
    """
    plt.figure(figsize=(8, 4))

    # Faint connecting lines; the legend is built manually below.
    sns.lineplot(
        data=data,
        x=data.index,
        y=metric,
        hue='model',
        legend=False,
        palette=colors,
        alpha=0.25,
        linewidth=1.5
    )

    sns.scatterplot(
        data=data,
        x=data.index,
        y=metric,
        hue='model',
        palette=colors,
        marker='s',
        s=40,
        alpha=1
    )

    # One square marker per entry of models_list, however many there are.
    handles = [
        mlines.Line2D([], [], color=colors[position], marker='s',
                      linestyle='None', markersize=5, label=model_name)
        for position, model_name in enumerate(models_list)
    ]

    plt.legend(
        labels=None, loc='upper right', bbox_to_anchor=(1.42, 1.025),
        prop={'size': 9}, labelcolor='0.3', handles=handles
    )

    plt.ylabel(metric, weight='normal', size=10)

    if save:
        plt.savefig(
            fname='img/{}.png'.format(filename),
            bbox_inches='tight'
        )

    plt.show()