In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn / SciPy deprecation and
# convergence warnings raised by the cells below — consider narrowing the filter.
warnings.simplefilter('ignore')

import pandasql as ps
import sqlite3
import random
import numpy as np
from shapely.geometry import Point
import pandas as pd
from geopandas import GeoDataFrame
import geopandas as gpd
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
import graphviz
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta, date
import gc
# Display all columns of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
import scipy.stats as st

from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [None]:
# A function to read the file and store it as a Pandas Dataframe.
def read_dataframe(path, filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains the file; joined to ``filename`` with "/".
    filename : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The file contents parsed with ``pd.read_csv`` defaults.
    """
    csv_location = "/".join((path, filename))
    return pd.read_csv(csv_location)

In [None]:
# A function that drops specific columns.
def drop_columns(df, columns):
    """Remove one or more columns from ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated, nothing is returned.
    columns : label or list of labels
        Column name(s) to remove.
    """
    df.drop(columns=columns, inplace=True)

In [None]:
# A function that drops columns with all null values.
def drop_all_null_columns(df):
    """Remove, in place, every column of ``df`` whose values are all missing.

    Columns that hold at least one non-null value are kept. Returns ``None``.
    """
    df.dropna(axis='columns', how='all', inplace=True)

In [None]:
# A function that drops rows with null values in a particular column.
def drop_rows(df, column_checker):
    """Remove, in place, the rows of ``df`` where ``column_checker`` is null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated, nothing is returned.
    column_checker : label
        Column whose missing values decide which rows are removed.
    """
    missing_mask = df[column_checker].isna().to_numpy()
    df.drop(index=df.index[missing_mask], inplace=True)

In [None]:
# A function that fills NaN or NA column with median of that column.
def fill_column_with_median(df, column):
    """Replace the missing entries of ``df[column]`` with that column's median.

    The column is reassigned on ``df``; nothing is returned.
    """
    series = df[column]
    median_value = series.median()
    df[column] = series.fillna(median_value)

In [None]:
# A function that fills NaN or NA column with some value.
def fill_column_with_values(df, column, value):
    """Replace the missing entries of ``df[column]`` with ``value``.

    The column is reassigned on ``df``; nothing is returned.
    """
    filled = df[column].fillna(value)
    df[column] = filled

In [None]:
# A function that converts a string object to datetime-format.
def convert_column_to_datetime(df, column):
    """Parse ``df[column]`` with ``pd.to_datetime`` and store the result back.

    The column is reassigned on ``df``; nothing is returned.
    """
    parsed = pd.to_datetime(df[column])
    df[column] = parsed

In [None]:
# A function that extracts the year from a datetime column into a new column.
def extract_year_to_new_column(df, column, new_column):
    """Store the calendar year of the datetime column ``column`` in ``new_column``.

    ``df`` is modified in place; nothing is returned.
    """
    timestamps = df[column]
    df[new_column] = timestamps.dt.year

In [None]:
# A function that extracts the month from a datetime column and stores it in a new column as a string.
def extract_month_to_new_column(df, column, new_column):
    """Store the month number of the datetime column ``column`` in ``new_column``.

    The month is written as a string (e.g. ``'5'``, ``'12'``), not as an
    integer. ``df`` is modified in place; nothing is returned.
    """
    month_numbers = df[column].dt.month
    df[new_column] = month_numbers.astype(str)

In [None]:
# A function that makes new column with some default value. For example, a column with all 0's.
def make_new_column(df , new_column , default_values):
    """Add (or overwrite) ``new_column`` so that every row holds ``default_values``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place; nothing is returned.
    new_column : label
        Name of the column to create.
    default_values : object
        Value written to every row.
    """
    # Build the Series on df's own index. pandas aligns on index labels when a
    # Series is assigned to a column, so a Series carrying a default 0..n-1
    # index would leave NaN on every row whose label is not in that range
    # (for example after rows have been dropped).
    df[new_column] = pd.Series(
        [default_values for _ in range(len(df.index))],
        index=df.index,
    )

In [None]:
# A function that sets some entries of a column to a particular value based on some condition.
def set_conditional_values_column(df, column, condition, value):
    """Write ``value`` into ``column`` for the rows selected by ``condition``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place; nothing is returned.
    column : label
        Column that receives the value.
    condition : row indexer accepted by ``DataFrame.loc``
        Typically a boolean mask aligned with ``df``.
    value : object
        Value assigned to the selected rows.
    """
    row_selector = condition
    df.loc[row_selector, column] = value

In [None]:
# A function that gets a dummy column for a categorical column, and attaches them to the dataframe.
def get_dummies_for_categorical_columns(df, column):
    """One-hot encode ``column`` and return a frame with the indicators appended.

    Side effect: ``column`` is removed from ``df`` itself (in place). The
    returned object is a new frame holding the remaining columns of ``df``
    followed by the indicator columns, which are named ``<column>_<value>``.
    """
    indicator_frame = pd.get_dummies(df[column], prefix=column)
    # The source column is dropped from the caller's frame, not from a copy.
    df.drop(column, axis=1, inplace=True)
    combined = pd.concat([df, indicator_frame], axis=1)
    return combined

In [None]:
# A function that makes 0/1 labels from a dataframe's column.
def make_labels(df , column , value_to_zero , value_to_one):
    """Build a one-column DataFrame of numeric labels from ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    column : label
        Column holding the raw class values.
    value_to_zero : object
        Raw value that is encoded as ``0``.
    value_to_one : object
        Raw value that is encoded as ``1``.

    Returns
    -------
    pandas.DataFrame
        A single column named ``column`` on the same index as ``df``,
        converted with ``pd.to_numeric``.

    Raises
    ------
    ValueError
        Raised by ``pd.to_numeric`` if a value other than the two given ones
        remains and cannot be parsed as a number.
    """
    # astype(object) returns an independent copy that can hold the raw values
    # and the integer codes side by side while the replacement is in progress.
    labels = df[[column]].astype(object)
    # One .loc assignment per value: chained indexing (frame[col][mask] = x)
    # is not guaranteed to write through and is a no-op under Copy-on-Write.
    labels.loc[labels[column] == value_to_zero, column] = 0
    labels.loc[labels[column] == value_to_one, column] = 1
    return labels.apply(pd.to_numeric)

In [None]:
# A function that creates dummy columns for multiple categorical columns and adds them to a dataframe.
def add_dummy_columns_to_df(df , list_of_cols):
    """Return a copy of ``df`` with each listed column one-hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is left untouched.
    list_of_cols : iterable of labels
        Categorical columns to encode, processed in the given order.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the indicator columns of
        each encoded column, in the order of ``list_of_cols``.
    """
    # Start from a copy: the helper drops the encoded column from the frame it
    # is given, and only the first call would otherwise hit the caller's
    # object (later calls operate on the concatenated result). Copying keeps
    # the input intact, so the call can safely be re-run or retried.
    df_temp = df.copy()
    for col in list_of_cols:
        df_temp = get_dummies_for_categorical_columns(df_temp , col)
    return df_temp

In [None]:
# A function that makes a dictionary from two one-to-one lists, and sorts it by absolute value in an ascending fashion
def make_and_sort_dictionary_by_values(lk , lv):
    """Pair keys with values and order the pairs by ascending absolute value.

    Parameters
    ----------
    lk : list
        Keys; element ``i`` is paired with ``lv[i]``.
    lv : list of numbers
        Values. The list is only read, never modified.

    Returns
    -------
    dict
        ``{key: value}`` in insertion order from the smallest ``abs(value)``
        to the largest. Extra elements of the longer list are ignored.
    """
    # zip pairs position by position and stops at the shorter input.
    paired = dict(zip(lk, lv))
    keys = list(paired.keys())
    values = list(paired.values())
    magnitudes = [abs(value) for value in values]
    # A stable sort keeps entries of equal magnitude in their input order, so
    # the result is deterministic.
    order = np.argsort(magnitudes, kind='stable')
    return {keys[i]: values[i] for i in order}

In [None]:
# A function that normalizes all numeric columns of a dataframe. 
def normalize_all_numeric_columns( df ):
    """Standardise every numeric column of ``df`` in place (z-score).

    Each numeric column becomes ``(x - mean) / std`` using pandas' default
    sample standard deviation. When the standard deviation is zero or
    undefined (constant column, single row), the column is only centred, so
    it holds zeros instead of NaN/inf. Non-numeric columns are not touched.
    Nothing is returned.
    """
    numeric_columns = list( df.select_dtypes(include='number').columns )
    for column in numeric_columns:
        centered = df[column] - df[column].mean()
        spread = df[column].std()
        # Dividing by a zero/NaN spread would fill the column with NaN or inf.
        if pd.isna(spread) or spread == 0:
            df[column] = centered
        else:
            df[column] = centered / spread

In [None]:
# A function that runs a balanced logistic regression num_of_sims times using a test-size picked randomly from a list. It also stores a certain number of the most important coefficients, ranked by their weighted size
def logistic_regression_simulation( num_of_sims , list_test_size , num_of_coefs_stored ,
                                    features = None , targets = None , seed = None ):
    """Repeatedly fit a class-balanced logistic regression on random splits.

    Parameters
    ----------
    num_of_sims : int
        Number of train/test splits to run.
    list_test_size : sequence of float
        Candidate test-set fractions; one is drawn at random per simulation.
    num_of_coefs_stored : int
        How many of the largest weighted coefficients to keep per simulation.
    features : pandas.DataFrame, optional
        Model inputs. Defaults to the notebook-level ``df``.
    targets : pandas.DataFrame or Series, optional
        Class labels aligned with ``features``. Defaults to the notebook-level
        ``labels``.
    seed : int, optional
        Seeds a private random generator so the drawn test sizes and split
        seeds are reproducible. When omitted, the module-level ``random``
        state is used.

    Returns
    -------
    list of list
        One entry per simulation:
        ``[test_size, random_state, confusion_matrix, accuracy, roc_auc, top]``
        where ``top`` maps feature name -> coefficient * column mean for the
        ``num_of_coefs_stored`` largest magnitudes, largest first.
    """
    # Fall back to the notebook-level frames so three-argument calls still work.
    if features is None:
        features = df
    if targets is None:
        targets = labels
    rng = random if seed is None else random.Random(seed)

    feature_names = list(features.columns)
    n_rows = len(features)
    # Column totals are identical for every simulation; compute them once.
    column_totals = {name: features[name].sum() for name in feature_names}

    data = []
    for _ in range(num_of_sims):
        ts = rng.choice(list_test_size)
        rs = rng.randint(1, 10000)
        X_train, X_test, y_train, y_test = train_test_split(
            features , targets , test_size = ts , random_state = rs )

        logmodel = LogisticRegression(class_weight='balanced')
        # scikit-learn expects a 1-d target; flatten a single-column frame.
        logmodel.fit(X_train, np.ravel(y_train))

        predictions_log = logmodel.predict(X_test)
        cm = confusion_matrix(y_test, predictions_log)
        accuracy = accuracy_score(y_test, predictions_log)
        lr_auc = roc_auc_score(y_test, logmodel.predict_proba(X_test)[:, 1])

        # Rank raw coefficients, weight each by its column mean, then rank
        # again on the weighted magnitude (ascending).
        coefficients = logmodel.coef_.flatten()
        ranked = make_and_sort_dictionary_by_values( list(feature_names) , list(coefficients) )
        weighted = { name: coef * column_totals[name] / n_rows
                     for name, coef in ranked.items() }
        ranked_weighted = make_and_sort_dictionary_by_values(
            list(weighted.keys()) , list(weighted.values()) )

        # Keep the largest magnitudes, largest first.
        res = dict( list(ranked_weighted.items())[::-1][:num_of_coefs_stored] )
        data.append([ts , rs , cm , accuracy , lr_auc, res])
    return data

In [None]:
# A function that spits out various performance metrics and their confidence interval
def performance_metrics(data , confidence_int):
    """Print the mean and a Student-t confidence interval of each metric.

    Parameters
    ----------
    data : list
        One entry per simulation. Entry ``[2]`` is read as a 2x2 confusion
        matrix indexed as ``cm[row, col]``, ``[3]`` as the accuracy and
        ``[4]`` as the ROC-AUC.
    confidence_int : float
        Confidence level between 0 and 1 (e.g. ``0.9``).

    Returns
    -------
    None
        Results are printed. Every mean except ROC-AUC is shown as a
        percentage; the intervals are on the raw 0-1 scale.
    """
    matrices = [pt[2] for pt in data]

    # NOTE(review): the labels below treat index 0 of the confusion matrix as
    # the positive class — assuming rows are true labels and columns are
    # predictions; confirm against the producer of ``data``.
    # NOTE(review): under that reading the last ratio is the negative
    # predictive value rather than recall (recall would equal the TPR row).
    # The printed label is kept unchanged; confirm the intended metric.
    report = [
        # (headline, interval prefix, samples, show mean as percentage)
        ("Accuracy is " , "Accuracy's " , [pt[3] for pt in data] , True),
        ("ROC-AUC is " , "ROC-AUC's " , [pt[4] for pt in data] , False),
        ("TPR is " , "TPR's " ,
         [cm[0,0] / (cm[0,0] + cm[0,1]) for cm in matrices] , True),
        ("TNR is " , "TNR's " ,
         [cm[1,1] / (cm[1,0] + cm[1,1]) for cm in matrices] , True),
        ("Type I error is " , "Type I's " ,
         [cm[1,0] / (cm[1,0] + cm[1,1]) for cm in matrices] , True),
        ("Type II error is " , "Type II's " ,
         [cm[0,1] / (cm[0,0] + cm[0,1]) for cm in matrices] , True),
        ("Precision is " , "Precision's " ,
         [cm[0,0] / (cm[0,0] + cm[1,0]) for cm in matrices] , True),
        ("Recall is " , "Recall's " ,
         [cm[1,1] / (cm[0,1] + cm[1,1]) for cm in matrices] , True),
    ]

    for headline, interval_prefix, samples, as_percent in report:
        average = sum(samples) / len(samples)
        print( headline , average * 100 if as_percent else average )
        # The confidence level is passed positionally: its keyword was renamed
        # from ``alpha`` to ``confidence`` across SciPy releases.
        interval = st.t.interval(confidence_int, df=len(samples)-1,
                                 loc=np.mean(samples), scale=st.sem(samples))
        print(interval_prefix + str(confidence_int*100) + "% confidence interval is" , interval)
        print("\n")

In [None]:
# A function that spits out the most important coefficients that appeared in all simulations and their confidence interval

def important_coefs(data , confidence_int):
    """Print the mean and confidence interval of coefficients kept in every run.

    Parameters
    ----------
    data : list
        One entry per simulation; entry ``[5]`` is a dict mapping a
        coefficient name to its value in that simulation.
    confidence_int : float
        Confidence level between 0 and 1 (e.g. ``0.9``).

    Returns
    -------
    None
        Results are printed, one block per coefficient that appears in all
        simulations, in the order of the first simulation's dict. Nothing is
        printed when ``data`` is empty.
    """
    if not data:
        return

    # Use the simulations actually present in ``data`` rather than an
    # externally defined simulation count.
    coef_maps = [run[5] for run in data]
    shared = set(coef_maps[0]).intersection(*coef_maps[1:])
    # Follow the first run's key order so the output order is deterministic.
    list_of_coefs = [name for name in coef_maps[0] if name in shared]

    for coef in list_of_coefs:
        d1 = [coef_map[coef] for coef_map in coef_maps]
        print( "Average of " + coef + " is " , sum(d1) / len(d1) )
        # The confidence level is passed positionally: its keyword was renamed
        # from ``alpha`` to ``confidence`` across SciPy releases.
        interval = st.t.interval(confidence_int , df=len(d1)-1 ,
                                 loc=np.mean(d1) , scale=st.sem(d1))
        print( str(confidence_int*100) + "% confidence interval of " + coef + " is " , interval )
        print("\n")
    

In [None]:
# Load Downloads/loan_data.csv; the path is relative to the kernel's working directory.
df = read_dataframe("Downloads" , "loan_data.csv")

In [None]:
# Remove fully duplicated rows in place.
df.drop_duplicates(inplace = True)

In [None]:
# Discard columns that contain no data at all.
drop_all_null_columns(df)

In [None]:
# Drop the rows whose 'term' value is missing.
drop_rows(df , 'term')

In [None]:
# Treat a missing 'emp_length' as '0 years'.
fill_column_with_values(df , 'emp_length' , '0 years')

In [None]:
# Impute missing 'mths_since_last_delinq' values with the column median.
fill_column_with_median(df , 'mths_since_last_delinq')

In [None]:
# Parse 'earliest_cr_line' into datetimes so its year and month can be extracted.
convert_column_to_datetime(df , 'earliest_cr_line')

In [None]:
# Derive 'year_earliest_cr_line' from the parsed date.
extract_year_to_new_column(df , 'earliest_cr_line' , 'year_earliest_cr_line')

In [None]:
# Derive 'month_earliest_cr_line'; it is stored as a string and one-hot encoded further down.
extract_month_to_new_column(df , 'earliest_cr_line' , 'month_earliest_cr_line')

In [None]:
# Drop the listed columns; 'earliest_cr_line' has already been split into the year/month columns above.
drop_columns(df , ['int_rate2' , 'int_rate3' , 'earliest_cr_line' , 'id' , 'loan_amnt'])

In [None]:
# Rebuild a contiguous 0..n-1 index after the duplicate and row drops above.
df = df.reset_index(drop=True)

In [None]:
# Start with every loan labelled 'good'; selected statuses are relabelled 'bad' in a later cell.
make_new_column(df , 'loan_good_or_bad' , 'good')

In [None]:
# Display how many loans fall under each 'loan_status' value.
df.groupby(['loan_status']).size()

In [None]:
# Relabel loans whose status is 'Charged Off' or 'Default' as 'bad'.
set_conditional_values_column(
    df,
    'loan_good_or_bad',
    df['loan_status'].isin(['Charged Off', 'Default']),
    'bad',
)


In [None]:
# The status has been summarised in 'loan_good_or_bad', so remove the raw column.
drop_columns(df , 'loan_status')

In [None]:
# One-hot encode the categorical columns; each listed column is replaced by
# its indicator columns.
df = add_dummy_columns_to_df( df , [ 'term' , 
                                     'emp_length' , 
                                     'home_ownership' , 
                                     'purpose' , 
                                     'addr_state' , 
                                     'month_earliest_cr_line' ] )


In [None]:
# Build the numeric target: 'good' -> 0, 'bad' -> 1.
labels = make_labels(df , 'loan_good_or_bad' , 'good' , 'bad')

In [None]:
# Remove the target column from df, which is used as the model input below.
drop_columns(df , 'loan_good_or_bad')

In [None]:
# normalize_all_numeric_columns( df )

In [None]:
# Candidate test-set fractions; one is drawn at random for each simulation.
list_of_test_size = [0.1 , 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]

# Number of random train/test splits to fit.
num_of_sims = 1000

# Number of top weighted coefficients kept per simulation.
coefs_stored = 10

# Run the simulations; the function reads the notebook-level df and labels.
data = logistic_regression_simulation(num_of_sims , list_of_test_size , coefs_stored )

In [None]:
# Confidence level (0.9 = 90%) passed to both reporting functions below.
confidence_int = 0.9

In [None]:
# Print the averaged performance metrics and their confidence intervals.
performance_metrics(data, confidence_int)

In [None]:
# Print the coefficients that were kept in every simulation, with confidence intervals.
important_coefs(data , confidence_int)