In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import log_loss, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

import warnings
warnings.filterwarnings('ignore')

In [2]:
def choose_best_threshold(probas, target):
    ''' iterates over thresholds to choose the one with the highest f1 score '''
    results = dict()
    precision, recall, thresholds = precision_recall_curve(target, probas)
    f1_scores = 2 * recall * precision / (recall + precision)
    best_f1_ind = np.argmax(f1_scores)
    results['best_threshold'] = thresholds[best_f1_ind]
    results['validation_precision']      = precision[best_f1_ind]
    results['validation_recall']         = recall[best_f1_ind]
    results['validation_f1_score']  = np.max(f1_scores)
    return results 
    
def train_model_and_do_validation_predict(df_train, df_valid, target):
    ''' trains the model and predicts on validation set
        returns model and validation set predictions'''
    X, y = df_train.drop(columns=target), df_train[target]
    X_valid = df_valid.drop(columns=target)
    model = RandomForestClassifier(n_jobs=-1, n_estimators=256, min_samples_leaf=20, random_state=42).fit(X, y)
    return model, model.predict_proba(X_valid)[:, 1]

def score_production_predictions(df_prod, target, model, threshold):
    ''' computes the f1 score on the "production" dataset with the provided threshold '''
    X_prod = df_prod.drop(columns=target)
    P_prod = model.predict_proba(X_prod)[:, 1] > threshold
    return f1_score(df_prod[target], P_prod)


def train_optimize_and_predict_on_prod(train_val, df_prod, target):
    ''' the full test, validate, tune threhsold, and predict on "production data" sequence '''
    train, val = train_test_split(train_val, test_size=0.2, random_state=42)
    model, P_val = train_model_and_do_validation_predict(train, val, target)
    results = choose_best_threshold(P_val, val[target])
    prod_f1_score = score_production_predictions(df_prod, target, model, results['best_threshold'])
    results['prod_f1_score'] = prod_f1_score
    return results

def run_experiment(df, target):
    # set aside test set for final evaluation
    #     held out "test set" is called "df_prod" to denote that it's
    #     our best representation of what the model will be encountering "in prod"
    #     having the "untouched" class balance 
    train_val, df_prod = train_test_split(df, test_size=0.2, random_state=42) 

    # resampler - try different resamplers if you like
    resampler = RandomUnderSampler(sampling_strategy=1.0, random_state=42) # 50/50 balanced undersampling
    train_val_resampled, _ = resampler.fit_resample(train_val, train_val[target])
        
    # experiment 1: 
    #  - train model on resampled data
    #  - pick optimal threshold based on validation set f1 score
    #  - predict on "production_data" and compare to our expectations
    #    as defined by validation set results
    resampled_results = train_optimize_and_predict_on_prod(train_val_resampled, df_prod, target)
    
    # experiment 2: 
    #   - same as experiment 1 but do not resample the data first
    raw_results = train_optimize_and_predict_on_prod(train_val, df_prod, target)
    
    results = pd.DataFrame([resampled_results, raw_results], index=['resampled', 'non-resampled']).T.round(2)
    return results

In [3]:
df = pd.read_csv('mapped-v2_imputed_mf5iter.csv')
target = 'DC201'
df.head(3)

Unnamed: 0,DC216,DC220,DC142a,DC024,DC025,DC205,DC206,DC207,DC208,DC209,...,DC237e,DC237f,DC241,DC242,DC244,DC246,DC252,DC270a,DC109,DC201
0,1.0,80.0,60.0,0,1,6.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0,1.0,1.0
1,2.0,44.0,70.0,0,1,13.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1,3.0,1.0
2,2.0,35.0,75.0,0,1,2.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3,0.0,1.0


In [4]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
scaler = MinMaxScaler()
# scaler = StandardScaler()

NUMERICAL_DATA = ['DC216', 'DC220', 'DC142a']
CATEGORICAL_DATA = [column for column in df.columns if column != target and column not in NUMERICAL_DATA]

df_copy = df.drop(target, axis=1)
X_all = pd.DataFrame(scaler.fit_transform(df_copy))

numerical_df = df[NUMERICAL_DATA].copy()
categorical_df = df[CATEGORICAL_DATA].astype('category').copy()

norm_numerical_df = pd.DataFrame(scaler.fit_transform(numerical_df), columns=NUMERICAL_DATA)
# encoded_df = pd.get_dummies(categorical_df)
encoded_df = categorical_df.copy()

merged_df = pd.concat([norm_numerical_df, encoded_df], axis=1)

y = np.array(df[target])
X = np.array(merged_df)
X_all

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,0.05,0.788235,0.060120,0.0,1.0,0.266667,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.50,0.00,0.125
1,0.10,0.364706,0.070140,0.0,1.0,0.733333,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,1.0,1.0,0.25,0.25,0.375
2,0.10,0.258824,0.075150,0.0,1.0,0.000000,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.000000,0.0,1.0,1.0,0.00,0.75,0.000
3,0.15,0.776471,0.085170,0.0,1.0,0.000000,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.333333,0.0,1.0,0.0,0.25,0.00,0.000
4,0.10,0.282353,0.065130,0.0,1.0,0.000000,1.0,0.0,1.0,1.0,...,0.5,0.0,0.0,0.000000,0.0,0.0,1.0,0.00,0.25,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35965,0.05,0.094118,0.009018,1.0,0.0,0.333333,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.00,0.00,0.250
35966,0.05,0.329412,0.012024,1.0,0.0,0.000000,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,0.00,0.25,0.000
35967,0.10,0.458824,0.048096,1.0,0.0,0.000000,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.000000,1.0,0.0,1.0,0.25,0.00,0.000
35968,0.10,0.529412,0.039078,1.0,0.0,0.000000,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,0.00,0.25,0.000


In [5]:
run_experiment(df=X_all, target=y)

: 

: 