In [1]:
#Load the necessary Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import plotly.express as px
import datetime
import pytz
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import statsmodels.api as sm

# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings("ignore")

### Linear Regression Custom Functions

In [2]:
# Build a linear model custom function

def lrm_vif(X_train_lm, y_train, addconstant=True):
    """Fit a statsmodels OLS model, print its report and VIF table, and return the fit.

    Parameters
    ----------
    X_train_lm
        Predictor table. It is handed to ``sm.OLS`` and then to ``checkvif``,
        which reads its ``.columns``, ``.values`` and ``.shape``.
    y_train
        Response values handed to ``sm.OLS``.
    addconstant : bool, default True
        When truthy, ``sm.add_constant`` is applied to the predictors before
        fitting; pass ``False``/``0`` to fit without adding a constant.

    Returns
    -------
    The fitted results object produced by ``sm.OLS(...).fit()``.
    """
    # ANSI escape sequences that switch bold text on and off in the output.
    bold_on, bold_off = '\033[1m', '\033[0m'

    design = sm.add_constant(X_train_lm) if addconstant else X_train_lm
    fitted = sm.OLS(y_train, design).fit()

    print(bold_on + '\n\n LR parameters \n' + bold_off, fitted.params)
    print(bold_on + '\n\n\t\t\t\t LR Summary\n\n\n' + bold_off, fitted.summary())

    # The VIF table is computed on exactly the matrix that was fitted
    # (i.e. including the constant column when one was added).
    checkvif(design)
    return fitted
    

In [3]:
# custom code for VIF

def checkvif(X_train_lm):
    """Print the variance inflation factor of every column, largest first.

    Parameters
    ----------
    X_train_lm
        Table of predictors; its ``.columns``, ``.values`` and ``.shape`` are
        read. One VIF is computed per column position.

    Returns
    -------
    None. The sorted table (columns ``Features`` and ``VIF``, VIF rounded to
    two decimals) is printed.
    """
    n_features = X_train_lm.shape[1]
    matrix = X_train_lm.values

    report = pd.DataFrame()
    report['Features'] = X_train_lm.columns
    report['VIF'] = [variance_inflation_factor(matrix, pos) for pos in range(n_features)]

    ranked = (
        report.assign(VIF=report['VIF'].round(2))
              .sort_values(by="VIF", ascending=False)
    )
    print('\033[1m' + '\n\n VIF Values\n' + '\033[0m', ranked)

In [4]:
# Run RFE; the number of features to select defaults to 1 when none is given

def lrm_rfe(X_train, y_train, feature_count=1, addconstant=True):
    """Select features with RFE on a linear regression, then fit OLS on them.

    Parameters
    ----------
    X_train
        Predictor table; its ``.columns`` are indexed with the RFE support mask.
    y_train
        Response values.
    feature_count : int, default 1
        Number of features RFE should keep.
    addconstant : bool, default True
        Forwarded to ``lrm_vif``.

    Returns
    -------
    Whatever ``lrm_vif`` returns for the selected columns.
    """
    # n_features_to_select is passed by keyword: newer scikit-learn releases
    # make it keyword-only, so the positional form raises TypeError there.
    # The estimator is handed over unfitted; RFE fits it itself.
    selector = RFE(LinearRegression(), n_features_to_select=feature_count)
    selector = selector.fit(X_train, y_train)

    col = X_train.columns[selector.support_]
    print('\033[1m' + 'columns selected are\n' + '\033[0m', col)
    print('\033[1m' + 'columns rejected are\n' + '\033[0m', X_train.columns[~selector.support_])

    return lrm_vif(X_train[col], y_train, addconstant)

   

### Logistic regression custom functions

In [5]:
def confusion_metrics(Actual,Predicted):
    """Print the confusion matrix and the classification metrics derived from it.

    Parameters
    ----------
    Actual
        True labels, passed as the first argument to the scikit-learn metrics.
    Predicted
        Predicted labels, passed as the second argument.

    Notes
    -----
    Cell ``[1, 1]`` of the confusion matrix is treated as the true positives
    and cell ``[0, 0]`` as the true negatives. Every value is rounded to two
    decimals before printing. Nothing is returned.
    """
    from sklearn import metrics

    def _show(label, value):
        # Single place for the "label, value rounded to 2 dp" print format.
        print(label, round(value, 2))

    cm = metrics.confusion_matrix(Actual, Predicted)
    print("\n confusion matrix is \n", cm)

    _show("Accuracy=", metrics.accuracy_score(Actual, Predicted))

    # Individual cells of the matrix.
    true_pos = cm[1, 1]
    true_neg = cm[0, 0]
    false_pos = cm[0, 1]
    false_neg = cm[1, 0]

    # Share of actual positives that were predicted positive.
    _show("Sensitivity=", true_pos / float(true_pos + false_neg))

    # Share of actual negatives that were predicted negative.
    _show("Specificity=", true_neg / float(true_neg + false_pos))

    # Share of actual negatives that were predicted positive.
    _show("False postive rate=", false_pos / float(true_neg + false_pos))

    # Share of positive predictions that were correct.
    _show("Positive predictive value =", true_pos / float(true_pos + false_pos))

    # Share of negative predictions that were correct.
    _show("Negative predictive value=", true_neg / float(true_neg + false_neg))

    # Precision and recall computed by hand from the matrix cells ...
    _show("Precision using confusion matrix=", true_pos / (false_pos + true_pos))
    _show("Recall using confusion matrix=", true_pos / (false_neg + true_pos))

    # ... and the same two quantities as reported by scikit-learn.
    _show("Precision using sklearn=", metrics.precision_score(Actual, Predicted))
    _show("Recall using sklearn=", metrics.recall_score(Actual, Predicted))

In [6]:
# ROC curve for logistic regression
def draw_roc( actual, probs ):
    """Plot the ROC curve for the given labels and scores.

    Parameters
    ----------
    actual
        True labels, forwarded to ``roc_curve`` and ``roc_auc_score``.
    probs
        Scores for the same observations, forwarded to the same functions.

    Returns
    -------
    None. A 5x5 inch figure is shown with the AUC in the legend.
    """
    from sklearn.metrics import roc_auc_score, roc_curve

    false_pos_rate, true_pos_rate, _ = roc_curve(actual, probs, drop_intermediate=False)
    area = roc_auc_score(actual, probs)

    fig = plt.figure(figsize=(5, 5))
    ax = fig.gca()
    ax.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % area)
    # Dashed black diagonal from (0, 0) to (1, 1): the reference line where
    # the true positive rate equals the false positive rate.
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate or [1 - True Negative Rate]')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()

In [7]:
def bestroc(y_train_pred_final):
    """Tabulate and plot accuracy, sensitivity and specificity per probability cutoff.

    Parameters
    ----------
    y_train_pred_final
        Frame with an ``'Actual'`` column (true labels) and a ``'Prob'``
        column (predicted scores).

    Side effects
    ------------
    For every cutoff 0.00, 0.01, ..., 0.99 a column labelled with that float
    is added to ``y_train_pred_final`` itself, holding 1 where ``Prob`` is
    greater than the cutoff and 0 otherwise.

    Returns
    -------
    None. The per-cutoff table is printed and shown as a plotly line chart.
    """
    from sklearn.metrics import confusion_matrix

    cutoffs = [step / 100 for step in range(100)]

    # Pass 1: one 0/1 prediction column per cutoff, written into the caller's frame.
    for cutoff in cutoffs:
        y_train_pred_final[cutoff] = y_train_pred_final['Prob'].map(
            lambda score, limit=cutoff: 1 if score > limit else 0
        )

    # Pass 2: score every cutoff column against the true labels.
    # Matrix cells: [1, 1] true positives, [0, 0] true negatives,
    # [0, 1] false positives, [1, 0] false negatives.
    cutoff_df = pd.DataFrame(columns=['prob', 'accuracy', 'sensi', 'speci'])
    for cutoff in cutoffs:
        cm = confusion_matrix(y_train_pred_final['Actual'], y_train_pred_final[cutoff])
        n_obs = cm.sum()
        acc = (cm[0, 0] + cm[1, 1]) / n_obs
        specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
        sensitivity = cm[1, 1] / (cm[1, 0] + cm[1, 1])
        cutoff_df.loc[cutoff] = [cutoff, acc, sensitivity, specificity]
    print(cutoff_df)

    # All three metrics against the cutoff on one chart.
    chart = px.line(
        data_frame=cutoff_df,
        x='prob',
        y=['accuracy', 'sensi', 'speci'],
        title='Best Probabilty curve',
    )
    chart.show()

In [8]:
def logreg(X_train,y_train,probaility=0.5):
    """Fit a binomial GLM (logistic regression) and report its training diagnostics.

    Steps: add a constant column, fit ``sm.GLM`` with a Binomial family, print
    the summary and the VIF table, score the training rows, threshold the
    scores at ``probaility``, then print the confusion-matrix metrics, draw
    the ROC curve, show the per-cutoff accuracy/sensitivity/specificity table
    and plot precision and recall against the decision threshold.

    Parameters
    ----------
    X_train
        Predictor table (``checkvif`` reads its ``.columns`` and ``.values``).
    y_train
        Binary response exposing ``.values`` and ``.index``.
    probaility : float, default 0.5
        Cutoff above which a predicted score is labelled 1. The parameter
        name keeps its original spelling so existing keyword calls still work.

    Returns
    -------
    tuple
        ``(res, X_train)``: the fitted GLM results and the predictor table
        with the constant column added.
    """
    from sklearn.metrics import precision_recall_curve

    # Add the constant once and reuse the same matrix for the fit, the VIF
    # table and the predictions.
    X_train = sm.add_constant(X_train)
    logm1 = sm.GLM(y_train, X_train, family=sm.families.Binomial())
    res = logm1.fit()
    print(res.summary())
    checkvif(X_train)

    # Predicted scores on the training rows, flattened to a 1-D array.
    y_train_pred = res.predict(X_train)
    y_train_pred = y_train_pred.values.reshape(-1)

    # Actual label, predicted score and original row id side by side.
    y_train_pred_final = pd.DataFrame({'Actual': y_train.values, 'Prob': y_train_pred})
    y_train_pred_final['ID'] = y_train.index
    print(y_train_pred_final.head())

    # Hard 0/1 prediction at the requested cutoff.
    y_train_pred_final['predicted'] = y_train_pred_final.Prob.map(lambda x: 1 if x > probaility else 0)

    # The confusion-matrix metrics are computed on the thresholded labels ...
    confusion_metrics(y_train_pred_final['Actual'], y_train_pred_final['predicted'])

    # ... but the ROC curve needs the continuous scores: drawing it from the
    # 0/1 labels collapses it to a single operating point and misstates the AUC.
    draw_roc(y_train_pred_final['Actual'], y_train_pred_final['Prob'])

    # Accuracy / sensitivity / specificity across cutoffs. Note that this
    # call adds one column per cutoff to y_train_pred_final.
    bestroc(y_train_pred_final)

    # Precision and recall as functions of the decision threshold; the last
    # element of p and r has no matching threshold, hence the [:-1].
    p, r, thresholds = precision_recall_curve(y_train_pred_final.Actual, y_train_pred_final.Prob)
    plt.plot(thresholds, p[:-1], "g-")
    plt.plot(thresholds, r[:-1], "r-")
    plt.title('Precision vs recall -precision is green and recall is red line')
    plt.show()

    return res,X_train

In [9]:
def logreg_rfe(X_train, y_train,fcount=1):
    """Run RFE with a logistic regression and return the selected column labels.

    Parameters
    ----------
    X_train
        Predictor table; its ``.columns`` are indexed with the RFE support mask.
    y_train
        Response values.
    fcount : int, default 1
        Number of features RFE should keep.

    Returns
    -------
    The labels of the columns RFE selected (``X_train.columns[support_]``).
    Before returning, the ``(column, selected, rank)`` triples and the
    selected labels are printed.
    """
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression

    # n_features_to_select is passed by keyword: newer scikit-learn releases
    # make it keyword-only, so the positional form raises TypeError there.
    # The estimator gets its own local name so it does not shadow the
    # notebook's logreg() helper inside this function.
    selector = RFE(LogisticRegression(), n_features_to_select=fcount)
    selector = selector.fit(X_train, y_train)

    print(list(zip(X_train.columns, selector.support_, selector.ranking_)))
    col = X_train.columns[selector.support_]
    print(col)
    return col

