In [None]:
import warnings
warnings.filterwarnings('ignore')

from scipy import stats
import pandas as pd
from pandas.tseries.offsets import DateOffset
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import time
import shap
from sklearn.model_selection import GridSearchCV, ShuffleSplit, cross_validate, train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix, classification_report,ConfusionMatrixDisplay
import xgboost as xgb
from pandas_profiling import ProfileReport
from sklearn.preprocessing import OneHotEncoder
import sys
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
import os


pd.options.display.max_rows = 2000
pd.options.display.max_columns = 100

# EDA Functions and Execution

In [None]:
def ttest(df,target, feature):
    """Print a Welch two-sample t-test of ``feature`` between the two target classes.

    Rows where ``df[target] == 1`` form the positive sample and rows where
    ``df[target] == 0`` form the negative sample. Missing values in ``feature``
    are dropped from each sample first; otherwise a single NaN makes
    ``stats.ttest_ind`` report ``nan`` for both the statistic and the p-value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both ``target`` and ``feature`` columns.
    target : str
        Name of the class column (compared against 1 and 0).
    feature : str
        Name of the numeric column to test.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    pos_class = df.loc[df[target] == 1, feature].dropna()
    neg_class = df.loc[df[target] == 0, feature].dropna()
    # equal_var=False -> Welch's t-test (does not assume equal class variances)
    tstat, pval = stats.ttest_ind(pos_class, neg_class, equal_var=False)
    print('t-statistic: {:.1f}, p-value: {:.3}'.format(tstat, pval))
    
def describe_cont_feature(df,target,feature):
    """Print summary statistics and a t-test for ``feature`` split by ``target``.

    Output consists of a banner line, the ``describe()`` table of ``feature``
    grouped by ``target``, and the line printed by ``ttest``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    target : str
        Name of the grouping / class column.
    feature : str
        Name of the continuous column being described.
    """
    print('\n*** Results for {} ***'.format(feature))
    print(df.groupby(target)[feature].describe())
    # ttest prints its own result and returns None, so call it directly;
    # wrapping it in print() emitted a stray "None" line.
    ttest(df, target, feature)

def quad_plot(df, feature, target, logistic = True):
    """Draw a 2x2 grid of plots relating one feature to the target.

    Panels:
      * top left     - boxplot overlaid with a violin plot of ``feature`` per target value
      * top right    - KDE of ``feature`` per target value, stacked to fill the axis
      * bottom left  - ``sns.regplot`` of ``target`` on ``feature`` (logistic fit if ``logistic``)
      * bottom right - overlaid density histograms + KDE of ``feature`` for
                       ``target != 1`` (teal) and ``target == 1`` (purple)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    feature : str
        Column plotted as the continuous variable.
    target : str
        Column used as the class / hue variable.
    logistic : bool, default True
        Whether the bottom-left regression is logistic or ordinary linear.
    """
    f, axes = plt.subplots(2, 2,figsize=(15,15))
    strictly_positive = df[feature].min() > 0

    # top left: boxplot with a violin plot on top, both made translucent
    sns.boxplot(x=df[target], y=df[feature], palette = "viridis", ax = axes[0][0])
    sns.violinplot(x=df[target], y=df[feature], color="turquoise", ax = axes[0][0])
    plt.setp(axes[0][0].collections, alpha=.3)

    # top right: per-class density, normalised so the classes fill the axis
    sns.kdeplot(data = df, x = feature, hue = target, palette= "viridis_r", multiple = "fill", ax = axes[0][1])
    axes[0][1].legend([target,"Not {}".format(target)])
    if strictly_positive:
        # KDE tails extend below the data minimum; clip them for positive-only features
        axes[0][1].set_xlim(left = 0)

    # bottom left: regression plot, either logistic or regular
    sns.regplot(data = df, x= feature,y= target,logistic = bool(logistic),color = 'teal', ax = axes[1][0])

    # bottom right: compare the feature distribution of the negative and positive class.
    # sns.distplot is deprecated; this is the histplot equivalent of its default look.
    class_masks = ((df[target] != 1, 'teal'), (df[target] == 1, 'purple'))
    for mask, colour in class_masks:
        sns.histplot(x = df.loc[mask, feature], kde = True, stat = "density", kde_kws = dict(cut=3),
                     color = colour, alpha = .4, edgecolor = (1, 1, 1, .4), ax = axes[1][1])
    if strictly_positive:
        axes[1][1].set_xlim(left = 0)
    plt.show()
    
def univariate_breakdown(df, feature, target, logistic = True):
    """Run the full single-feature report: printed statistics, then the 2x2 plot grid.

    ``logistic`` is forwarded unchanged to ``quad_plot``.
    """
    describe_cont_feature(df=df, target=target, feature=feature)
    quad_plot(df=df, feature=feature, target=target, logistic=logistic)

def cols_analysis(df, target, ignore_cols = None):
    """Print the Pearson correlation and run ``univariate_breakdown`` for every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the ``target`` column.
    target : str
        Name of the target column.
    ignore_cols : iterable of column labels, optional
        Columns to skip. The target itself is always skipped, whether or not
        it is listed here, so it is never analysed against itself.
    """
    skipped = set(ignore_cols) if ignore_cols is not None else set()
    skipped.add(target)
    continuous_cols = [col for col in df.columns if col not in skipped]
    for col in continuous_cols:
        print("Pearson correlation of {} to target {}: {}"
              .format(col, target, df[col].corr(df[target])))
        univariate_breakdown(df, col ,target)
        

In [None]:
# continuous variable EDA
# specify continuous columns
cont_cols = []
eda_target = 'not_working'
# cols_analysis reads df[target], so the target column must be part of the
# frame that is passed in; dict.fromkeys de-duplicates while keeping order
# in case the target is already listed in cont_cols.
eda_cols = list(dict.fromkeys(cont_cols + [eda_target]))
cols_analysis(df= model_df[eda_cols], target = eda_target, ignore_cols = ['UnitID','not_working','dow'])

In [None]:
def univariate_barplot(df, target, feat2):
    """Draw a horizontal seaborn bar plot with ``target`` on y and ``feat2`` on x.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    target : str
        Column placed on the y axis.
    feat2 : str
        Column placed on the x axis.
    """
    # NOTE(review): the original referenced an undefined ``feat1`` for the y axis;
    # ``target`` is assumed to be its intended replacement - confirm the axis mapping.
    plt.figure(figsize = (12,8))
    sns.barplot(y = df[target], x = df[feat2], palette = "viridis", orient = "h")
    plt.show()

# Categorical variable EDA
# specify categorical columns
cat_cols = []
for col in cat_cols:
    # TODO: set the target column name before running
    univariate_barplot(model_df, target = '', feat2 = col)

# Modeling Functions

In [282]:
def get_train_test_holdout(model_df,target, split = 0.3, validation_set = False, categorical_cols = None,
                           val_split = 0.125):
    """Shuffle ``model_df`` and split it into train / test (and optionally validation) sets.

    The returned ``X_*`` frames are built from the whole of ``model_df``, so they
    still contain the ``target`` column; callers must drop it before fitting.

    Parameters
    ----------
    model_df : pandas.DataFrame
        Modelling frame, including the target column.
    target : str
        Name of the target column.
    split : float, default 0.3
        Fraction of rows assigned to the test set.
    validation_set : bool, default False
        When True, a validation set is additionally carved out of the training rows.
    categorical_cols : list of str, optional
        Columns to one-hot encode with ``pd.get_dummies(drop_first=True)``.
    val_split : float, default 0.125
        Fraction of the *training* rows moved to the validation set. Its share
        of the whole data set is ``val_split * (1 - split)``, e.g.
        0.125 x 0.8 = 0.1 when ``split=0.2``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, target_weight)``, or
        ``(X_train, y_train, X_test, y_test, X_val, y_val, target_weight)`` when
        ``validation_set`` is True. ``y_*`` are ``(n, 1)`` numpy arrays.
        ``target_weight`` is ``(len(y_train) - sum(y_train)) / sum(y_train)``,
        i.e. the negative:positive ratio for a 0/1 target.
    """
    # randomize data
    shuffled = model_df.sample(frac=1, random_state = 33).reset_index(drop=True)
    if categorical_cols is not None and len(categorical_cols) > 0:
        X = pd.get_dummies(shuffled, drop_first = True, columns = categorical_cols)
    else:
        X = shuffled.copy()
    y = shuffled[[target]].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= split, random_state= 33 )
    # the validation set is taken from the training rows only, never from test
    if validation_set:
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= val_split, random_state= 33 )
    # computed after the optional validation split so it reflects the final training rows
    positives = y_train[:,0].sum()
    target_weight = (len(y_train) - positives) / positives
    if validation_set:
        return X_train, y_train, X_test, y_test, X_val, y_val, target_weight
    return X_train, y_train, X_test, y_test, target_weight

def reg_imputer(X_train, X_test, X_val = None, drop_cols = None):
    """Impute missing values with ``IterativeImputer`` fitted on the training set only.

    Columns listed in ``drop_cols`` are excluded from imputation and re-attached,
    unchanged, as the last columns of each returned frame. The returned frames
    have a fresh ``RangeIndex``; row order is preserved.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training and test frames; the imputer is fitted on ``X_train``.
    X_val : pandas.DataFrame, optional
        Validation frame, transformed with the already-fitted imputer.
    drop_cols : list of str, optional
        Columns to leave out of the imputation (e.g. the target).

    Returns
    -------
    tuple
        ``(X_train_reg, X_test_reg, X_val_reg)``; ``X_val_reg`` is ``None``
        when no ``X_val`` was given.
    """
    # if iterative imputer is throwing errors because it is imputing impossibly large values,
    # use min_value = __ and max_value = ___ as parameters when initializing class object
    drop_cols = list(drop_cols) if drop_cols is not None else []
    imp_reg = IterativeImputer(random_state=0)
    x_cols = [col for col in X_train if col not in drop_cols]

    def _impute(frame, fit = False):
        # Impute the feature columns, then append the untouched drop_cols positionally.
        features = frame.drop(columns = drop_cols)
        values = imp_reg.fit_transform(features) if fit else imp_reg.transform(features)
        imputed = pd.DataFrame(values, columns = x_cols)
        for col in drop_cols:
            # to_numpy() sidesteps index alignment: the imputed frame has a new RangeIndex
            imputed[col] = frame[col].to_numpy()
        return imputed

    X_train_reg = _impute(X_train, fit = True)
    X_test_reg = _impute(X_test)
    # previously left unbound (UnboundLocalError on return) when X_val was omitted
    X_val_reg = _impute(X_val) if X_val is not None else None
    return X_train_reg, X_test_reg, X_val_reg
    
def model_summary(model, X_train, y_train, X_test, y_test, threshold = 0.5):
    """Print and plot a classification report for a fitted binary classifier.

    Class labels are obtained by thresholding the positive-class probability
    (``predict_proba(...)[:, 1] > threshold``). AUC is computed from the
    probabilities themselves, not from the thresholded labels, so it does not
    depend on ``threshold``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``
    X_train, y_train : training features and labels
    X_test, y_test : evaluation features and labels (reported as "Test Set")
    threshold : float, default 0.5
        Probability above which an observation is labelled positive.
    """
    # train set predictions
    train_pred_prob = model.predict_proba(X_train)[:,1]
    train_pred = (train_pred_prob > threshold).astype("int")
    # test set predictions
    test_pred_prob = model.predict_proba(X_test)[:,1]
    test_pred = (test_pred_prob > threshold).astype("int")
    # model report
    test_cm = confusion_matrix(y_test, test_pred)
    print("Confusion Matrix for Test Set")
    print(test_cm)
    disp = ConfusionMatrixDisplay(test_cm)
    disp.plot()

    print("\n Classification Report for Test Set")
    print(classification_report(y_test, test_pred))

    print("Accuracy Train: {:.4f}".format(accuracy_score(y_train, train_pred)))
    # roc_auc_score expects scores/probabilities; hard labels collapse the curve to one point
    print("AUC Train: {}".format(roc_auc_score(y_train, train_pred_prob)))
    print("Accuracy Test: {:.4f}".format(accuracy_score(y_test, test_pred)))
    print("AUC Test: {}".format(roc_auc_score(y_test, test_pred_prob)))
    
def plot_roc_curve(model, X_test, y_test):
    """Plot the ROC curve of a fitted binary classifier on ``(X_test, y_test)``.

    Both the curve and the AUC shown in the legend are computed from the
    positive-class probabilities (``predict_proba(...)[:, 1]``), so the legend
    value is the area under the plotted curve.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``
    X_test : evaluation features
    y_test : true binary labels
    """
    pos_prob = model.predict_proba(X_test)[:,1]
    # previously computed from model.predict() hard labels, which did not match the curve
    test_auc = roc_auc_score(y_test, pos_prob)
    fpr, tpr, _ = roc_curve(y_test, pos_prob)
    plt.figure()
    plt.plot(fpr, tpr, label = "AUC = {:.3f}".format(test_auc))
    # diagonal reference line = random classifier
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([0,1])
    plt.ylim([0,1.05])
    plt.title("ROC Plot")
    plt.legend(loc = "lower right")
    plt.show()

In [283]:
# TODO: set the name of the target column
target = ''
# add anything else here
# columns kept in the X frames for reference but excluded from model fitting
drop_cols = [target] + []
# the keyword is `categorical_cols`; the previous `cat_cols=` raised a TypeError
X_train, y_train, X_test, y_test, X_val, y_val, target_weight = get_train_test_holdout(model_df,
                                                                                       target,
                                                                                       split = 0.2,
                                                                                       validation_set = True,
                                                                                       categorical_cols = [])
# impute nulls
X_train_reg, X_test_reg, X_val_reg = reg_imputer(X_train, X_test, X_val, drop_cols)

# Logistic Regression

In [None]:
# Feature matrices for the logistic model: imputed frames without drop_cols.
lr_train_features = X_train_reg.drop(columns = drop_cols)
lr_test_features = X_test_reg.drop(columns = drop_cols)

clf = LogisticRegression(random_state=0)
clf.fit(lr_train_features, y_train)
log_pred_prob = clf.predict_proba(lr_test_features)[:,1]
log_pred = clf.predict(lr_test_features)

print("Logistic Regression Result")
model_summary(clf, lr_train_features, y_train, lr_test_features, y_test)
plot_roc_curve(clf, lr_test_features, y_test)

## XGBoost

In [None]:
# baseline XGB
# Fitted on the non-imputed frames (minus drop_cols); scale_pos_weight uses the
# class ratio computed by get_train_test_holdout.
seed = 30
xgb_train_features = X_train.drop(columns = drop_cols)
xgb_test_features = X_test.drop(columns = drop_cols)

simple_xgb = xgb.XGBClassifier(random_state = seed, scale_pos_weight = target_weight)
simple_xgb.fit(xgb_train_features, y_train)
simple_xgb_pred_prob = simple_xgb.predict_proba(xgb_test_features)[:,1]
simple_xgb_pred = simple_xgb.predict(xgb_test_features)

print("Simple XGBoost Result")
model_summary(simple_xgb, xgb_train_features, y_train, xgb_test_features, y_test)
plot_roc_curve(simple_xgb, xgb_test_features, y_test)

# Validation Set results 

In [None]:
# holdout set results
# Note: model_summary labels its output "Test Set"; here that refers to the validation rows.
val_features = X_val.drop(columns = drop_cols)
val_train_features = X_train.drop(columns = drop_cols)

val_pred = simple_xgb.predict(val_features)
val_pred_proba = simple_xgb.predict_proba(val_features)[:,1]

model_summary(simple_xgb, val_train_features, y_train, val_features, y_val)
plot_roc_curve(simple_xgb, val_features, y_val)

# SHAP Values

In [None]:
# shap results for model
# Take the underlying Booster from the fitted XGBClassifier wrapper.
mybooster = simple_xgb.get_booster()
# Serialise the booster and discard the first four bytes of the raw buffer.
# NOTE(review): presumably strips a header that the installed shap version cannot
# parse (a shap/xgboost version mismatch) - confirm this is still needed, since
# slicing the buffer on other library versions may corrupt the model bytes.
model_bytearray = mybooster.save_raw()[4:]
def myfunc(self = None):
    # Replacement for save_raw: always returns the pre-trimmed buffer captured above.
    return model_bytearray
# Monkey-patch this booster instance so later save_raw() calls get the trimmed bytes.
mybooster.save_raw = myfunc

In [None]:
# Explain the patched booster on the test features (drop_cols removed).
shap_features = X_test.drop(columns = drop_cols)
explainer = shap.TreeExplainer(mybooster)
simple_shap_values = explainer.shap_values(shap_features)
# max_display = number of distinct column names, so every feature is shown
shap.summary_plot(simple_shap_values, shap_features,
                  max_display = shap_features.columns.nunique())

# Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Standardise the features, then fit an ordinary least-squares baseline.
# Use the imputed frames without drop_cols, as the logistic-regression cell does:
# the raw X_train still contains the target column (leakage) and may contain
# NaNs, which LinearRegression rejects.
lin_train_features = X_train_reg.drop(columns = drop_cols)
lin_test_features = X_test_reg.drop(columns = drop_cols)
sc_X = StandardScaler()
# fit the scaler on the training rows only, then reuse it for the test rows
X_train_std = sc_X.fit_transform(lin_train_features)
X_test_std = sc_X.transform(lin_test_features)
regressor = LinearRegression()
regressor.fit(X_train_std, y_train)
coef = regressor.coef_
y_pred = regressor.predict(X_test_std)

In [None]:
# residuals
# seaborn requires x/y as keywords (positional vectors are rejected from 0.12 on);
# y_test and y_pred are (n, 1) column arrays here, so flatten them to 1-D first.
sns.residplot(x = np.ravel(y_test), y = np.ravel(y_pred));

In [None]:
# XGBoost regressor with early stopping on MAE.
xgb_reg_train = X_train.drop(columns = drop_cols)
xgb_reg_test = X_test.drop(columns = drop_cols)
# 'reg:linear' is the deprecated alias of 'reg:squarederror'
mod = xgb.XGBRegressor(objective = 'reg:squarederror')
# early stopping monitors the LAST entry of eval_set (the test rows here)
eval_set = [(xgb_reg_train, y_train), (xgb_reg_test, y_test)]
# NOTE: xgboost >= 2.0 no longer accepts eval_metric / early_stopping_rounds in fit();
# on those versions pass them to the XGBRegressor constructor instead.
mod.fit(xgb_reg_train, y_train, eval_set = eval_set, eval_metric = 'mae', early_stopping_rounds = 30)