### DSC 4320 Final Project
Brock Carey, Emily Liau

**Environment Set-Up**

In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import category_encoders as ce
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn import metrics

**Data Pre-Processing: Exploratory Data Analysis**

In [None]:
# load the call-prediction CSV into a DataFrame (comma is read_csv's default delimiter)
data = pd.read_csv('C:/Users/bacar/4320/Final Project/callpredict/data.csv')
data

In [None]:
# check data types of columns: `data_types` holds the per-column dtype Series,
# and the bare name on the last line displays it as the cell output
data_types = data.dtypes
data_types

In [None]:
# convert 'DATE_FOR' from object to datetime format, then expose its calendar
# parts as separate numeric columns (appended in the order YEAR, MONTH, DAY)
data['DATE_FOR'] = pd.to_datetime(data['DATE_FOR'])
data['YEAR'] = data['DATE_FOR'].dt.year
data['MONTH'] = data['DATE_FOR'].dt.month
data['DAY'] = data['DATE_FOR'].dt.day
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line
data.info()

In [None]:
# drop the original 'DATE_FOR' column in place to prevent redundancy with the
# YEAR / MONTH / DAY columns derived from it
data.drop(columns='DATE_FOR', inplace=True)
data

In [None]:
# visualize class imbalance in target variable: one pie slice per Call_Flag value
call_flag_counts = data['Call_Flag'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    call_flag_counts,
    labels=call_flag_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('Class Imbalance in Call_Flag Column')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
# identify categorical variables: the columns stored with the object ('O') dtype
categorical_vars = [name for name, dtype in data.dtypes.items() if dtype == 'O']
print("There are {} categorical variables.".format(len(categorical_vars)))
print("The categorical variables are: ", categorical_vars)

In [None]:
# check frequency counts of categorical variables: absolute counts, counts as a
# fraction of all rows, and the number of distinct labels (NaN counts as one)
n_rows = float(len(data))
for var in categorical_vars:
    counts = data[var].value_counts()
    print(counts)
    print(counts / n_rows)
    print(var, " contains ", data[var].nunique(dropna=False), " labels.")

In [None]:
# count the NA values in each categorical variable
print(data[categorical_vars].isna().sum())

In [None]:
# identify numerical variables: every column whose dtype is not object ('O')
numerical_vars = [name for name, dtype in data.dtypes.items() if dtype != 'O']
print("There are {} numerical variables.".format(len(numerical_vars)))
print("The numerical variables are: ", numerical_vars)

In [None]:
# count the NA values in each numerical variable
print(data[numerical_vars].isna().sum())

In [None]:
# identify outliers in numerical variables: summary statistics rounded to two
# decimals (a max far above the 75% quartile points at outliers)
print(data[numerical_vars].describe().round(2))

In [None]:
# visualize outliers in numerical variables: one boxplot per column, placed
# left-to-right, top-to-bottom on a 5x5 subplot grid
boxplot_cols = [
    'Tenure', 'Age',
    'CHANNEL1_6M', 'CHANNEL2_6M', 'CHANNEL3_6M', 'CHANNEL4_6M', 'CHANNEL5_6M',
    'METHOD1_6M', 'RECENT_PAYMENT', 'PAYMENTS_6M',
    'CHANNEL1_3M', 'CHANNEL2_3M', 'CHANNEL3_3M', 'CHANNEL4_3M', 'CHANNEL5_3M',
    'METHOD1_3M', 'PAYMENTS_3M',
    'NOT_DI_3M', 'NOT_DI_6M',
    'EVENT2_90_SUM', 'LOGINS',
]

plt.figure(figsize=(15,15))

for position, col in enumerate(boxplot_cols, start=1):
    plt.subplot(5, 5, position)
    ax = data.boxplot(column=col)
    ax.set_ylabel(col)

In [None]:
# count the NA values per column to find the columns that need imputation
data.isnull().sum()

In [None]:
# apply median imputation on NA values for all relevant columns
na_cols = ['CHANNEL1_6M', 'CHANNEL2_6M', 'CHANNEL3_6M', 'CHANNEL4_6M', 'CHANNEL5_6M', 'METHOD1_6M',
          'RECENT_PAYMENT', 'PAYMENTS_6M']

# DataFrame.median() yields one median per column; passing that Series to
# fillna() fills every column's NaNs with its own median in one vectorised
# step. Assigning the result back avoids the chained `inplace` fill on a column
# selection, which does not reliably write through to `data`.
data[na_cols] = data[na_cols].fillna(data[na_cols].median())

In [None]:
# visualize the distribution of the numerical variables: one 10-bin histogram
# per column, placed left-to-right, top-to-bottom on a 5x5 subplot grid
hist_cols = [
    'Tenure', 'Age',
    'CHANNEL1_6M', 'CHANNEL2_6M', 'CHANNEL3_6M', 'CHANNEL4_6M', 'CHANNEL5_6M',
    'METHOD1_6M', 'RECENT_PAYMENT', 'PAYMENTS_6M',
    'CHANNEL1_3M', 'CHANNEL2_3M', 'CHANNEL3_3M', 'CHANNEL4_3M', 'CHANNEL5_3M',
    'METHOD1_3M', 'PAYMENTS_3M',
    'NOT_DI_3M', 'NOT_DI_6M',
    'EVENT1_30_FLAG', 'EVENT2_90_SUM',
]

plt.figure(figsize=(15,15))

for position, col in enumerate(hist_cols, start=1):
    plt.subplot(5, 5, position)
    ax = data[col].hist(bins=10)
    ax.set_xlabel(col)

**Data Pre-Processing: Feature Engineering**

In [None]:
# separate the target ('Call_Flag') from the independent variables
y = data['Call_Flag']
X = data.drop(columns=['Call_Flag'])

In [None]:
# view the categorical columns of the feature matrix X
# (all rows: no train/test split has been made at this point)
print(X[categorical_vars])

In [None]:
# encode categorical variables with category_encoders' BinaryEncoder; only the
# four columns named in `cols` are targeted, and X is rebound to the encoded frame
encoder = ce.BinaryEncoder(cols=['RTD_ST_CD', 'CustomerSegment', 'MART_STATUS', 'GENDER'])
X = encoder.fit_transform(X)
# preview the first rows of the encoded frame
print(X.head())

In [None]:
# Perform SMOTE oversampling to address class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_resampled.shape

In [None]:
# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [None]:
# Cap outliers
def cap_outliers(columns, cap_lower=False):
    """Cap outliers of the given columns in the global X_train / X_test, in place.

    For each column the Tukey fences (Q1 - 1.5 * IQR and Q3 + 1.5 * IQR) are
    computed once from X_train, and those same fences are applied to both
    X_train and X_test. Both splits therefore receive the identical
    transformation and no statistic is derived from the test split.

    Args:
        columns: iterable of column names present in both X_train and X_test.
        cap_lower: when True, values below the lower fence are raised to it as
            well. Defaults to False, i.e. only the upper side is capped.

    Returns:
        None. X_train and X_test are modified in place; the fences of every
        column are printed.
    """
    for col in columns:
        q1 = X_train[col].quantile(0.25)
        q3 = X_train[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - (1.5 * iqr)
        upper = q3 + (1.5 * iqr)
        print(f'{col} outliers are < {lower} and > {upper}.')

        for dat in (X_train, X_test):
            capped = np.where(dat[col] > upper, upper, dat[col])
            if cap_lower:
                capped = np.where(capped < lower, lower, capped)
            dat[col] = capped
        

# numeric columns whose outliers are capped; cap_outliers() modifies the
# global X_train and X_test in place
cols = ['CHANNEL1_6M', 'CHANNEL2_6M', 'CHANNEL3_6M', 'CHANNEL4_6M', 'CHANNEL5_6M',
       'METHOD1_6M', 'RECENT_PAYMENT', 'PAYMENTS_6M', 'CHANNEL1_3M', 'CHANNEL2_3M',
       'Tenure', 'Age', 'CHANNEL3_3M', 'CHANNEL4_3M', 'CHANNEL5_3M', 'METHOD1_3M',
       'PAYMENTS_3M', 'NOT_DI_3M', 'NOT_DI_6M', 'EVENT1_30_FLAG', 'EVENT2_90_SUM']
cap_outliers(cols)

In [None]:
# normalization with MinMaxScaler: the scaler is fitted on the training split
# only, and the fitted ranges are then applied to both splits
column_names = X_train.columns

scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# the scaler returns plain arrays, so wrap them back into DataFrames that carry
# the original column labels
X_train, X_test = (
    pd.DataFrame(scaled, columns=column_names)
    for scaled in (X_train_scaled, X_test_scaled)
)

**Model Training**

In [None]:
# convert the feature frames and the label series into torch tensors
X_train, X_test = torch.tensor(X_train.to_numpy()), torch.tensor(X_test.to_numpy())
y_train, y_test = torch.tensor(y_train.to_numpy()), torch.tensor(y_test.to_numpy())

In [None]:
# Implement neural network model
class NN(nn.Module):
    """Feed-forward binary classifier with three hidden layers.

    Every hidden layer is Linear -> activation -> Dropout. The single output
    unit is passed through a sigmoid, so forward() returns one probability-like
    value per input row, with shape (batch, 1).

    Args:
        input_dim: number of input features.
        hidden_dim1: width of the first hidden layer.
        hidden_dim2: width of the second hidden layer.
        hidden_dim3: width of the third hidden layer.
        act: name of the hidden activation, 'relu' or 'leaky_relu'.
        dropout_rate: dropout probability applied after each hidden layer.

    Raises:
        ValueError: if `act` is not one of the supported names.
    """

    # supported activation names -> functional implementation
    _ACTIVATIONS = {
        'relu': nn.functional.relu,
        'leaky_relu': nn.functional.leaky_relu,
    }

    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, act, dropout_rate):
        super().__init__()
        # fail fast on an unknown name instead of leaving self.act undefined,
        # which would only surface later as an AttributeError in forward()
        if act not in self._ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {act!r}; expected one of {sorted(self._ACTIVATIONS)}"
            )
        self.layer1 = nn.Linear(input_dim, hidden_dim1)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.layer2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.layer3 = nn.Linear(hidden_dim2, hidden_dim3)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.output = nn.Linear(hidden_dim3, 1)
        self.act = self._ACTIVATIONS[act]

    def forward(self, x):
        """Return the sigmoid output for a float tensor of shape (batch, input_dim)."""
        x = self.dropout1(self.act(self.layer1(x)))
        x = self.dropout2(self.act(self.layer2(x)))
        x = self.dropout3(self.act(self.layer3(x)))
        # torch.sigmoid replaces the deprecated nn.functional.sigmoid
        return torch.sigmoid(self.output(x))

**Model Evaluation**

In [None]:
# Train NN based on passed values
def train_NN(data, epochs, dim1, dim2, dim3, act, dropout, lr):
    """Train a fresh NN with full-batch Adam and score it on the test split.

    Args:
        data: sequence whose first four items are the tensors
            X_train, X_test, y_train, y_test (in that order).
        epochs: number of full-batch optimisation steps.
        dim1: width of the first hidden layer.
        dim2: width of the second hidden layer.
        dim3: width of the third hidden layer.
        act: activation name forwarded to NN.
        dropout: dropout probability forwarded to NN.
        lr: Adam learning rate.

    Returns:
        [mean_accuracy, mean_f1]: test-split accuracy and F1 score, each
        averaged over all epochs (not the values of the final epoch).
    """
    X_train, X_test, y_train, y_test = data[:4]

    # cast once up front instead of on every epoch
    X_train = X_train.float()
    X_test = X_test.float()
    y_train = y_train.float()
    y_true = y_test.numpy()

    # the network's input width is the number of feature columns
    input_dim = X_train.shape[1]
    model = NN(input_dim, dim1, dim2, dim3, act, dropout)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    acc_list = []
    f1_list = []
    for _ in range(epochs):
        # training step: dropout enabled
        model.train()
        optimizer.zero_grad()
        pred = model(X_train)
        # squeeze(1) drops only the output dimension, so a batch of one row
        # still matches the shape of y_train
        loss = nn.functional.binary_cross_entropy(pred.squeeze(1), y_train)
        loss.backward()
        optimizer.step()

        # evaluation step: dropout disabled and no autograd bookkeeping, so the
        # test metrics are deterministic for the current weights
        model.eval()
        with torch.no_grad():
            pred_labels = (model(X_test).squeeze(1) > 0.5).int().numpy()

        acc_list.append(accuracy_score(y_true, pred_labels))
        f1_list.append(f1_score(y_true, pred_labels))

    return [np.mean(acc_list), np.mean(f1_list)]

In [None]:
# Generate search space array for grid search
def search_space(data, param_grid):
    """Run train_NN for every hyperparameter combination and tabulate the scores.

    Args:
        data: passed through unchanged to train_NN.
        param_grid: mapping with the keys 'epochs', 'hidden_dim1',
            'hidden_dim2', 'hidden_dim3', 'act', 'dropout' and 'lr', each
            holding the list of candidate values. Other keys are ignored.

    Returns:
        DataFrame with one row per combination and the float columns
        Epochs, Dim1, Dim2, Dim3, Activation, Dropout, LR, Acc, F1.
        'Activation' holds the position of the activation name inside
        param_grid['act'], because the table is numeric.
    """
    from itertools import product  # local import: only this function needs it

    columns = ['Epochs', 'Dim1', 'Dim2', 'Dim3',
               'Activation', 'Dropout', 'LR', 'Acc', 'F1']

    # one axis per hyperparameter, outermost first; product() walks them in the
    # same order as equivalent nested for-loops would
    axes = [
        param_grid['epochs'],
        param_grid['hidden_dim1'],
        param_grid['hidden_dim2'],
        param_grid['hidden_dim3'],
        list(enumerate(param_grid['act'])),  # (index, name) pairs
        param_grid['dropout'],
        param_grid['lr'],
    ]

    # total number of combinations, only used for the progress message
    space_rows = 1
    for axis in axes:
        space_rows *= len(axis)

    rows = []
    for index, (epochs, dim1, dim2, dim3, (idx, act), drop, lr) in enumerate(product(*axes)):
        print(f'Iteration: {index}/{space_rows}')
        result = train_NN(data, epochs, dim1, dim2, dim3, act, drop, lr)
        rows.append([epochs, dim1, dim2, dim3, idx, drop, lr, result[0], result[1]])

    # the reshape keeps the column count correct even when the grid is empty
    space = np.array(rows, dtype=float).reshape(-1, len(columns))
    resultDf = pd.DataFrame(space, columns=columns)
    return resultDf

In [None]:
# Perform grid search over every combination of the candidate values below
param_grid = {
    'epochs': [100, 500, 1000],
    'hidden_dim1': [32, 64],
    'hidden_dim2': [16, 32],
    'hidden_dim3': [8, 16],
    'act': ['relu', 'leaky_relu'],
    'dropout': [0.1, 0.2],
    'lr': [0.01, 0.001],
}
# NOTE: this rebinds `data`, which until now referred to the raw DataFrame
data = [X_train, X_test, y_train, y_test]
resultDf = search_space(data, param_grid)

In [None]:
# View summary statistics of the grid-search results
# (the 'max' row shows the best accuracy / F1 reached)
resultDf.describe()

In [None]:
# View the hyperparameter combinations whose accuracy exceeds 92%
resultDf.loc[resultDf['Acc'] > 0.92]

In [None]:
# Perform final training loop based on hyperparameters with best results from grid search
epochs = 1000
dim1 = 64
dim2 = 16
dim3 = 8
activation = 'relu'
dropout = 0.1
lr = 0.01

# the network's input width is the number of feature columns
input_dim = X_train.shape[1]
model = NN(input_dim, dim1, dim2, dim3, activation, dropout)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# per-epoch loss histories, plotted after training
train_loss = []
test_loss = []

# cast once up front instead of on every epoch
X_train_f, X_test_f = X_train.float(), X_test.float()
y_train_f, y_test_f = y_train.float(), y_test.float()

for i in range(epochs):
    # training step: dropout enabled
    model.train()
    optimizer.zero_grad()
    pred = model(X_train_f)
    loss = nn.functional.binary_cross_entropy(pred.squeeze(1), y_train_f)
    loss.backward()
    optimizer.step()
    train_loss.append(loss.item())

    # evaluation step: dropout disabled and no autograd bookkeeping, so the
    # validation loss and metrics are deterministic for the current weights
    model.eval()
    with torch.no_grad():
        pred_test = model(X_test_f).squeeze(1)
        val_loss = nn.functional.binary_cross_entropy(pred_test, y_test_f)
    pred_labels = (pred_test > 0.5).int()
    test_loss.append(val_loss.item())

    if i % 50 == 0:
        # zero_division=0 on every metric: early epochs may predict one class only
        accuracy = accuracy_score(y_test, pred_labels)
        precision = precision_score(y_test, pred_labels, zero_division=0)
        recall = recall_score(y_test, pred_labels, zero_division=0)
        f1 = f1_score(y_test, pred_labels, zero_division=0)

        print(f'Epoch: {i}, Loss: {loss.item()}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}')

In [None]:
# Find test loss at final epoch (last entry of the per-epoch test_loss history)
test_loss[-1]

In [None]:
# Generate classification report for model
model.eval()  # disable dropout so the predictions are deterministic
with torch.no_grad():
    y_prob = model(X_test.float()).squeeze(1).numpy()

# threshold the probabilities at 0.5 to obtain hard 0/1 labels
y_pred = (y_prob > 0.5).astype(int)

print(metrics.classification_report(y_test, y_pred))

In [None]:
# Calculate area under ROC curve from the predicted probabilities; thresholded
# 0/1 labels would reduce the curve to a single operating point
model.eval()  # disable dropout so the scores are deterministic
with torch.no_grad():
    y_score = model(X_test.float()).squeeze(1).numpy()

print(f'ROC AUC: {metrics.roc_auc_score(y_test, y_score)}')

In [None]:
# Plot ROC curve
model.eval()  # disable dropout so the scores are deterministic
with torch.no_grad():
    y_pred = model(X_test.float())

fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred.squeeze(1).numpy())

roc_df = pd.DataFrame({"False Positive Rate": fpr, "True Positive Rate": tpr})

# the seaborn style has to be set before the axes is created to apply to it
sns.set_style("darkgrid")
fig, ax = plt.subplots(figsize=(8, 5))
sns.lineplot(x="False Positive Rate", y="True Positive Rate", data=roc_df, color='crimson', ax=ax)
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
plt.show()

In [None]:
# Plot the per-epoch training and validation loss curves on one axes
fig, ax = plt.subplots(figsize=(8, 5))
sns.set_style("darkgrid")

curves = (
    (train_loss, 'Training Loss', 'crimson'),
    (test_loss, 'Validation Loss', 'midnightblue'),
)
for losses, label, color in curves:
    sns.lineplot(losses, label=label, color=color)

ax.set_title('Train / Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')