In [3]:
import csv
import numpy as np
import pandas as pd

In [4]:
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score, average_precision_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
#import xgboost as xgb

In [6]:
df = pd.read_csv('Pfizer_11-5-21.csv', converters={'POS_LEVEL1': lambda x: str(x), 'POS_LEVEL2':lambda x: str(x),
                                          'POS_LEVEL3': lambda x: str(x), 'POS_LEVEL4': lambda x: str(x),
                                          'POS_LEVEL5': lambda x: str(x)})

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
df = df.drop(columns=['CALYRMO', 'CATEGORY', 'VOL_INVOL', 'ACTIONDT', 'ACTION', 'ACTIONCD', 'REASON', 'RSNCD',
                     'Country.Level.Manufacturing',
       'Country.Level.Sales', 'Country.Level.R.D', 'Country.Level.Finance',
       'Country.Level.Legal', 'Country.Level.HR', 'Country.Level.IT',
       'Country.Level.Pharma.Other'])

In [8]:
df = df.dropna()

In [9]:
df.shape

(279505, 45)

In [10]:
scorers = {
    'kappa': make_scorer(cohen_kappa_score),
    'f1': make_scorer(f1_score)
}

In [11]:
y = df['Status']

In [12]:
chosen_vars = [
       'POSITIONLVL',
       'LGTHOFSVC', 'TIMEINJOB', 'TIMEINPOSITION', 'COMPARATIO',
       'Status', 'Country.Level.Cost.of.Living.Index',
       'Country.Level.Traffic.Index', 'Country.Level.Time.Index..in.minutes.',
       'Country.Level.Time.Ex.Index', 'Country.Level.Inefficiencies.Index',
       'Country.Level.Unemployment.Rate....',
       'Country.Level.Corruption.Perception.Index',
       'Country.Level.Inflation....',
       'Country.Level.GDP..nominal...in.USD.billion.',
       'Country.Level.Pharma.Industry.Growth',
       'Country.Level.Labor.Market.Risk.Index.Score',
       'Country.Level.Political.Risk.Index.Score',
       'Country.Level.Power.distance', 'Country.Level.Individualism',
       'Country.Level.Masculinity', 'Country.Level.Uncertainty.avoidance',
       'Country.Level.Long.term.orientation', 'Country.Level.Indulgence',
       'COMPARATIO_IMPUTED', 'demand_press']

In [13]:
X = df[chosen_vars]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=8)

In [15]:
scaler = MinMaxScaler()
X_train_mm = scaler.fit_transform(X_train)
X_test_mm = scaler.transform(X_test)

In [16]:
X_train = pd.DataFrame(X_train_mm, columns=[chosen_vars])

In [17]:
X_test = pd.DataFrame(X_test_mm, columns=[chosen_vars])

In [18]:
X_train.describe()

Unnamed: 0,POSITIONLVL,LGTHOFSVC,TIMEINJOB,TIMEINPOSITION,COMPARATIO,Status,Country.Level.Cost.of.Living.Index,Country.Level.Traffic.Index,Country.Level.Time.Index..in.minutes.,Country.Level.Time.Ex.Index,...,Country.Level.Labor.Market.Risk.Index.Score,Country.Level.Political.Risk.Index.Score,Country.Level.Power.distance,Country.Level.Individualism,Country.Level.Masculinity,Country.Level.Uncertainty.avoidance,Country.Level.Long.term.orientation,Country.Level.Indulgence,COMPARATIO_IMPUTED,demand_press
count,195653.0,195653.0,195653.0,195653.0,195653.0,195653.0,195653.0,195653.0,195653.0,195653.0,...,195653.0,195653.0,195653.0,195653.0,195653.0,195653.0,195653.0,195653.0,195653.0,195653.0
mean,0.459525,0.272513,0.153664,0.055004,0.44039,0.000618,0.488116,0.472614,0.380586,0.285403,...,0.70698,0.669008,0.600893,0.34813,0.812729,0.656832,0.601273,0.453311,0.156251,0.575917
std,0.21452,0.210042,0.128043,0.134511,0.064456,0.024861,0.314143,0.320929,0.306208,0.29999,...,0.312788,0.276682,0.3466,0.244266,0.22216,0.342348,0.349511,0.344109,0.363094,0.2428
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.333333,0.073756,0.044037,0.003552,0.398293,0.0,0.243449,0.09622,0.150888,0.104825,...,0.5,0.666667,0.235294,0.222222,0.753846,0.292308,0.369565,0.0,0.0,0.382886
50%,0.5,0.264494,0.115876,0.00888,0.442742,0.0,0.342043,0.493518,0.262382,0.119444,...,0.785714,0.666667,0.558824,0.277778,0.753846,0.707692,0.543478,0.5625,0.0,0.505007
75%,0.666667,0.409834,0.231681,0.019655,0.481586,0.0,0.65058,0.742034,0.589969,0.45328,...,1.0,0.833333,1.0,0.277778,1.0,1.0,1.0,0.791667,0.0,0.825775
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
clf_dict = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=3000, random_state=8),
    'Naive Bayes': GaussianNB(priors=[0.113, 0.887]), 
    'Random Forest': RandomForestClassifier(criterion='entropy', class_weight='balanced', random_state=8),
    'SV Classification': SVC(class_weight='balanced', random_state=8), 
    #'XG Boost' : xgb.XGBClassifier(scale_pos_weight=7.85, use_label_encoder=False, seed=8)    
}

In [22]:
def batch_clf(X_train, y_train, X_test, y_test, clf_dict, verbose=True):
    '''
    Fits a dictionary of algorithms, tests them and returns metrics
    '''
    train_f1_scores = []
    test_f1_scores = []
    train_cohen_scores = []
    test_cohen_scores = []
    train_pre_scores = []
    test_pre_scores = []
    train_acc_scores = []
    test_acc_scores = []
    
    # Loop through dictionary items
    for key, clf in clf_dict.items():
   
        # Fit classifier
        clf_fitted = clf.fit(X_train, y_train)
        
        # Get predictions
        train_preds = clf_fitted.predict(X_train)
        test_preds = clf_fitted.predict(X_test)

        #Get F1 scores
        train_f1 = f1_score(y_train, train_preds, average='binary')
        train_f1_scores.append(round(train_f1, 4))
        test_f1 = f1_score(y_test, test_preds, average='binary')
        test_f1_scores.append(round(test_f1, 4))
        
        # Get Cohen's kappa
        train_cohen = cohen_kappa_score(y_train, train_preds)
        train_cohen_scores.append(round(train_cohen, 4))
        test_cohen = cohen_kappa_score(y_test, test_preds)
        test_cohen_scores.append(round(test_cohen, 4))
        
        # Get AUC of precision-recall curves
        train_pre = average_precision_score(y_train, train_preds, average='macro')
        train_pre_scores.append(round(train_pre, 4))
        test_pre = average_precision_score(y_test, test_preds, average='macro')
        test_pre_scores.append(round(test_pre, 4))        
                
        #Get accuracy scores
        train_acc = accuracy_score(y_train, train_preds)
        train_acc_scores.append(round(train_acc,4))
        test_acc = accuracy_score(y_test, test_preds)
        test_acc_scores.append(round(test_acc,4))
        
    # Create results dataframe
    results = pd.DataFrame({'Model': list(clf_dict.keys()), 
                            'Train F1': train_f1_scores,
                            'Test F1': test_f1_scores,
                            "Train Cohen's kappa" : train_cohen_scores,
                            "Test Cohen's kappa" : test_cohen_scores,
                            'Train PR-AUC': train_pre_scores,
                            'Test PR-AUC': test_pre_scores,
                            'Train Accuracy': train_acc_scores,
                            'Test Accuracy': test_acc_scores,
                            })

    return results

In [23]:
results = batch_clf(X_train, y_train, X_test, y_test, clf_dict)
results

Unnamed: 0,Model,Train F1,Test F1,Train Cohen's kappa,Test Cohen's kappa,Train PR-AUC,Test PR-AUC,Train Accuracy,Test Accuracy
0,Logistic Regression,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,Naive Bayes,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,Random Forest,1.0,0.9903,1.0,0.9903,1.0,0.9808,1.0,1.0
3,SV Classification,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
