In [None]:
# -*- coding: utf-8 -*-

"""
Purdue University - Krannert School of Management
MS BAIM Program - Summer 2021
MGMT-58600-B03 - Python Programming
Final Project - Group 6
@authors: Su Tien Lee; Chayadeepsai Cherukupalli; Sri Manogna Gurijala; Alejandro Brillembourg Cuenca

"""

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_recall_curve,precision_score,classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
def file_exists(file_name):
    '''Check whether a path refers to an existing regular file.

    Parameters:
        file_name: path of the file to look for.

    Returns:
        True when the path exists and is a file, False otherwise
        (including when the path is a directory).
    '''
    return os.path.isfile(file_name)

def create_df(file_name):
    '''Load a CSV file into a pandas dataframe.

    Prints a confirmation message and then waits for the user to press
    Enter before handing the dataframe back.

    Parameters:
        file_name: path of the csv file to read.

    Returns:
        The dataframe produced by pd.read_csv.
    '''
    loaded_frame = pd.read_csv(file_name)
    print("Dataframe file has been created.\n")
    # pause so the user can read the message before the next step
    input("Please press 'Enter' to continue.\n")
    return loaded_frame

def null_check(df):
    '''Print the number of null values found in every column.

    Parameters:
        df: pandas dataframe to inspect.

    Returns:
        The same dataframe, unchanged.

    The counts are computed in one pass with df.isna().sum() and the
    columns are walked by position, so column labels that are not strings
    (or that are repeated) are reported correctly instead of failing a
    df[str(label)] lookup.
    '''
    null_counts = df.isna().sum()
    for label, count in null_counts.items():
        print(str(label), "has", count, "null values.")
        print()
    # pause so the user can read the report before the next step
    input("Please press 'Enter' to continue.\n")
    return df

def dup_check(df):
    '''Flag the rows of a dataframe that repeat an earlier row.

    Parameters:
        df: pandas dataframe to inspect.

    Returns:
        The boolean Series produced by df.duplicated(); nothing is printed
        here, the caller decides how to summarise it.
    '''
    return df.duplicated()

def drop_col(df):
    '''Interactively drop columns from a dataframe by positional index.

    The function lists every column with its index, then keeps asking for
    an index until the user types 'stop'. The selected columns are removed
    from df in place.

    Parameters:
        df: pandas dataframe to modify (mutated in place).

    Returns:
        The same dataframe object, after the selected columns are dropped.
    '''
    headers = list(df.columns.values)
    col_to_drop = []

    # enumerate gives the true position even when two columns share a name
    # (list.index would always report the first match)
    pp_headers = "".join(
        "- Column item '" + str(name) + "' is in index # " + str(position) + "\n"
        for position, name in enumerate(headers)
    )
    print(pp_headers)

    while True:
        done = input("Please enter the index of all columns you wish to drop, one at a time. "
                     +"Once you are done, please enter 'stop': ").strip()
        if done.lower() == "stop":
            break
        # isdecimal() only accepts characters int() can parse; valid positions
        # run from 0 to len(headers) - 1, so the upper bound is exclusive
        if done.isdecimal() and int(done) < len(headers):
            col_to_drop.append(int(done))
        else:
            print("That is not a valid index. Please try again.")
    print("\nCOLUMN(S) DROPPED:")
    print("You have successfully dropped the following columns",col_to_drop,"\n")
    # the positions are translated to labels, so the drop is label based
    df.drop(df.columns[col_to_drop], axis = 'columns', inplace = True)
    input("Please press 'Enter' to continue.\n")
    return df

def show_shape(df):
    '''Print how many rows and columns a dataframe has.

    Parameters:
        df: pandas dataframe to describe.

    Returns:
        None. The sentence is printed and the function waits for Enter.
    '''
    n_rows, n_cols = df.shape
    print(f"The file contains {n_rows} rows and {n_cols} columns.\n")
    input("Please press 'Enter' to continue.\n")

def head_tail(df):
    '''Print the first and last rows of a dataframe.

    The user is asked how many rows to show; the question is repeated
    until the answer is a whole number no larger than the number of rows
    in df.

    Parameters:
        df: pandas dataframe to preview.

    Returns:
        None. The preview is printed and the function waits for Enter.
    '''
    while True:
        rows = input("How many rows would you like to review for the top and bottom of the file?: ").strip()
        # isdecimal() only accepts characters int() can parse, so odd
        # "numeric" input such as a superscript digit is rejected, not raised
        if rows.isdecimal() and int(rows) <= df.shape[0]:
            rows = int(rows)
            print("\nTOP ROWS")
            print("See below the top",rows,"row(s):\n",df.head(rows))
            print()
            print("\nBOTTOM ROWS")
            print("See below the bottom",rows,"row(s):\n",df.tail(rows))
            print()
            break
        print("\nPlease enter a valid number of rows.")
    input("Please press 'Enter' to continue.\n")
    return None

def dis_val_count(df):
    '''Print the value counts of every column in a dataframe.

    Parameters:
        df: pandas dataframe to summarise.

    Returns:
        The same dataframe, unchanged.

    Each column is summarised with Series.value_counts(). The columns are
    walked with df.items(), so labels that are not strings (or that are
    repeated) are handled. No sorting of the dataframe is attempted: the
    sorted copy was never used, and sorting fails on columns that mix
    value types.
    '''
    for _, column in df.items():
        print(column.value_counts())
        print()
    # pause so the user can read the report before the next step
    input("Please press 'Enter' to continue.\n")
    return df

def enc_bin(df):
    '''Encode the two-valued categorical columns as 0/1.

    'Gender' becomes 1 for 'F' and 0 for 'M'; 'Attrition_Flag' becomes 1
    for 'Existing Customer' and 0 for 'Attrited Customer'. Values outside
    these mappings are left as they are.

    Parameters:
        df: pandas dataframe holding both columns (modified in place).

    Returns:
        The same dataframe object with the two columns re-assigned.
    '''
    binary_codes = {
        'Gender': {'F':1, 'M':0},
        'Attrition_Flag': {'Existing Customer':1, 'Attrited Customer':0},
    }
    for column_name, codes in binary_codes.items():
        df[column_name] = df[column_name].replace(codes)
    return df

def enc_ord(df):
    '''Encode 'Card_Category' as an ordinal number.

    The tiers are ranked Blue=1, Silver=2, Gold=3, Platinum=4.

    Parameters:
        df: pandas dataframe holding a 'Card_Category' column.

    Returns:
        A new dataframe produced by df.replace with the tier names swapped
        for their rank.
    '''
    tiers_low_to_high = ('Blue', 'Silver', 'Gold', 'Platinum')
    tier_rank = {tier: rank for rank, tier in enumerate(tiers_low_to_high, start=1)}
    return df.replace({'Card_Category': tier_rank})

def enc_dum(df):
    '''One-hot encode the multi-valued categorical columns.

    'Education_Level', 'Marital_Status' and 'Income_Category' are each
    replaced by dummy columns, with the first category of each dropped.
    The prefixes already end in an underscore and pandas adds its own
    separator, so the new columns are named like 'EDU_LVL__<value>'.

    Parameters:
        df: pandas dataframe holding the three columns.

    Returns:
        A new dataframe with the dummy columns appended.
    '''
    dummy_prefixes = (
        ("Education_Level", "EDU_LVL_"),
        ("Marital_Status", "MAR_ST_"),
        ("Income_Category", "INC_CAT_"),
    )
    # encode one column at a time, in this order, so the dummy columns are
    # appended in the same order
    for source_column, prefix in dummy_prefixes:
        df = pd.get_dummies(df, columns = [source_column], prefix = [prefix], drop_first = True)
    return df

def show_types(df):
    '''Print the data type of every column in a dataframe.

    Parameters:
        df: pandas dataframe to describe.

    Returns:
        None. One sentence per column is printed and the function waits
        for Enter.

    The types are read from df.dtypes, so column labels that are not
    strings (or that are repeated) are reported correctly instead of
    failing a df[str(label)] lookup.
    '''
    for label, dtype in df.dtypes.items():
        print("Column ",str(label)," contains data of the '",dtype,"' data type.\n", sep = "")
    input("Please press 'Enter' to continue.\n")
    return None

def set_X(df):
    '''Build the feature matrix by removing the first column.

    Parameters:
        df: pandas dataframe whose column at position 0 is to be excluded.

    Returns:
        A new dataframe without that column; df itself is not modified.
    '''
    first_label = df.columns[0]
    features = df.drop(columns = first_label)
    return features

def set_y(df):
    '''Pick out the target column for the models.

    Parameters:
        df: pandas dataframe holding an 'Attrition_Flag' column.

    Returns:
        The 'Attrition_Flag' column as a pandas Series.
    '''
    target_column = 'Attrition_Flag'
    return df[target_column]


In [None]:
def pre_processing(dim_file='Customer-Churn_Dataset.csv'):
    '''Run the interactive data preprocessing that feeds the models.

    Steps, in order: load the csv, let the user drop columns, show the
    shape, null counts, duplicate counts, head/tail and value counts,
    encode the categorical columns, split into train/test sets (80/20,
    random_state=100) and standardise the features with a scaler fitted on
    the training split only.

    Parameters:
        dim_file: path of the csv file to load. Defaults to
            'Customer-Churn_Dataset.csv'.

    Returns:
        A 4-tuple (X_scaled_inp, X_scaled_inp_test, y_train, y_test): the
        scaled train features, the scaled test features, the train labels
        and the test labels.

    Raises:
        FileNotFoundError: when dim_file does not exist, so the caller gets
            a clear error instead of failing to unpack a None result.

    Side effects:
        Sets the module-level name X_train to the unscaled training
        features dataframe.
    '''
    # X_train is published at module level because the feature-importance
    # plot at the end of the notebook reads X_train.columns for its labels
    global X_train

    # if file is not found, stop program
    if not file_exists(dim_file):
        raise FileNotFoundError("The program will not be able to run as the Microsoft Excel Comma" +
                                "\nSeparated Values File '" + dim_file + "'\ncannot be found.")

    # create data frame and continue
    print("CREATE FILE:\n")
    dim_file_df = create_df(dim_file)

    # drop unnecessary columns
    print("\nCOLUMN INDEX:")
    dim_file_df = drop_col(dim_file_df)

    # show df rows and columns
    print("\nDATAFRAME SHAPE:")
    show_shape(dim_file_df)

    # check for null values
    print("\nNULL VALUE CHECK: \nPlease see below the counts of all null values per feature.\n")
    null_check(dim_file_df)

    # check for duplicate values
    print("\nDUPLICATE VALUE CHECK: \nPlease see below the result after checking for duplicate rows.\n")
    print(dup_check(dim_file_df).value_counts())
    input("Please press 'Enter' to continue.\n")

    # show top and bottom rows of df
    print("\nHEAD AND TAIL:")
    head_tail(dim_file_df)

    # display value counts
    print("\nVALUE COUNTS PER FEATURE: \nPlease see below the value counts per all features.\n")
    dis_val_count(dim_file_df)
    print()

    # encode categorical values and show data before and after encoding
    print("ENCODING CATEGORICAL VALUES: \n")
    print("\nPlease see below the column data types before categorical encoding.\n")
    # show_types and head_tail print their own report and return None, so
    # they are called directly rather than wrapped in print()
    show_types(dim_file_df)
    print("Please see below the top 5 rows for column data types before categorical encoding.")
    print(dim_file_df[['Attrition_Flag','Gender','Card_Category','Education_Level',
                       'Marital_Status','Income_Category']].head(5))
    dim_file_df = enc_bin(dim_file_df)
    print("\nGender and Attrition Flag Categories Encoded Successfully.\n")
    dim_file_df = enc_ord(dim_file_df)
    print("\nCard Category Encoded Successfully.\n")
    dim_file_df = enc_dum(dim_file_df)
    print("\nEducation Level, Marital Status, and Income Categories Encoded Successfully.\n")
    print("Please see below the rows of data after categorical encoding.")
    head_tail(dim_file_df)
    print("\nPlease see below the column data types after categorical encoding.\n")
    show_types(dim_file_df)

    # define X and y for statistical models
    print("\nSTATISTICAL MODEL: \n")
    df_X = set_X(dim_file_df)
    df_y = set_y(dim_file_df)
    print("X and y axis have been defined for logistic regression and decision tree models.\n")

    # split X and y dataframes into test and train data
    X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.2, random_state = 100)

    # show X and y train and test shapes
    print("Train and test data has been created with a test dataset of 20% and a train dataset of 80%.\n")
    print("Please see below the shape of each set:\n")
    print("X train data shape:", X_train.shape)
    print("y train data shape:", y_train.shape)
    print("X test data shape:", X_test.shape)
    print("y test data shape:", y_test.shape)
    input("Please press 'Enter' to continue.\n")

    # scale datasets: the scaler is fitted on the training split only and
    # then applied to both splits
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_scaled_inp = scaler.transform(X_train)
    X_scaled_inp_test = scaler.transform(X_test)
    print("Train and test data has been successfully scaled for logistic regression and decision tree models.\n")

    return X_scaled_inp,X_scaled_inp_test, y_train, y_test

In [None]:
# Run the interactive pre_processing pipeline and unpack its four results:
# scaled train features, scaled test features, train labels and test labels.
# When prompted for columns to drop, enter 0, 21 and 22 (one at a time), followed by 'stop'.

train_X, test_X, train_y, test_y = pre_processing()


In [None]:
def _report_metrics(header, y_true, y_pred):
    '''Print accuracy, confusion matrix, recall and precision for one model.

    Parameters:
        header: title line printed before the metrics.
        y_true: true labels of the test set.
        y_pred: labels predicted by the model for the test set.

    Recall and precision are computed with scikit-learn's defaults, i.e.
    for the class labelled 1.
    '''
    print(header)
    print("Accuracy Score:", "{:.2%}".format(accuracy_score(y_true, y_pred)))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Recall Score:", "{:.2%}".format(recall_score(y_true, y_pred)))
    print("Precision Score:", "{:.2%}".format(precision_score(y_true, y_pred)))


# build model #1: logistic regression

model_1 = LogisticRegression()
model_1.fit(train_X, train_y)
y_pred_1 = model_1.predict(test_X)

# build model #2: decision tree

model_2 = DecisionTreeClassifier(random_state=100)
model_2.fit(train_X, train_y)
y_pred_2 = model_2.predict(test_X)

# build model #3: random forest
model_3=RandomForestClassifier(random_state=100)
model_3.fit(train_X, train_y)
y_pred_3 = model_3.predict(test_X)

input("models trained press 'Enter' to continue to view results.\n")

# show accuracy score, confusion matrix, and recall & precision scores
# (one shared helper keeps the three reports in the same format)
_report_metrics("\n Evaluation Metrics for model_1 \n", test_y, y_pred_1)
_report_metrics("\n Evaluation metrics for model_2 \n", test_y, y_pred_2)
_report_metrics("\n Evaluation metrics for model_3 \n", test_y, y_pred_3)

# selecting model_3 (random forest) due to better model stats and displaying feature importance
print("\nFeature Importance - Random Forest\n")
plt.figure(figsize=(10,7))
# the models were fitted on scaled arrays, which carry no column names, so the
# labels come from X_train, the unscaled training dataframe set by pre_processing
feat_importances = pd.Series(model_3.feature_importances_, index = X_train.columns)
ax = feat_importances.nlargest(7).plot(kind='barh')
ax.set(title="Top 7 feature importances - Random Forest",
       xlabel="Importance", ylabel="Feature")
plt.show()