# Import Statements

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels as smd
from astropy.table import Table, Column
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.svm import SVC # "Support Vector Classifier" 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest

# Uploading a Dataset

In [3]:
# Colab-only cell: bring the dataset file(s) into the notebook runtime.
# The recorded output of this cell shows "bank-test.csv" being saved.
# NOTE(review): `uploaded` holds whatever files.upload() returns -- presumably
# a mapping of file name to file contents; confirm against the Colab docs.
from google.colab import files
uploaded = files.upload()

Saving bank-test.csv to bank-test.csv


In [0]:
# Load the training and test sets from the working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('bank-train.csv', 'bank-test.csv')
)

# Splitting Numerical and Categorical Data

In [0]:
# Names of the numeric columns, and the sub-frame holding just those columns.
nums = list(train.select_dtypes(include=[np.number]).columns)
numerical = train.loc[:, nums]

# Sub-frame of the object-dtype (string) columns, i.e. the categorical fields.
cat = train.loc[
    :,
    list(train.select_dtypes(include=['object']).columns),
]

# Imputing unknown/missing values

In [0]:
# Treat the literal string 'unknown' as the missing marker and replace it with
# the most frequent value of its column.
imp = SimpleImputer(strategy = 'most_frequent', missing_values = 'unknown')
# The imputer's output is wrapped in a DataFrame with default integer column
# labels; the column names of `cat` are not carried over.
imp_cat = pd.DataFrame(data = imp.fit_transform(cat))

# Encoding nominal categorical variables

In [0]:
# Read each CSV from disk once. `training` / `testing` are independent copies
# of the same data, so there is no need to parse the files a second time.
train0 = pd.read_csv("bank-train.csv")
test0 = pd.read_csv("bank-test.csv")
training = train0.copy()
testing = test0.copy()

# First column of each file, kept aside before the frames are merged.
id_train = train0.iloc[:, 0]
id_test = test0.iloc[:, 0]

# Stack the training rows on top of the test rows. A plain concat keeps each
# frame's own row labels (two frames labelled 0-9 and 0-4 would give
# 0-9 followed by 0-4 again), so ignore_index=True rebuilds one continuous
# 0..n-1 index for the combined frame.
df = pd.concat([train0, test0], sort = False, ignore_index = True)

# Everything except the last column is a feature; the last column is the target.
X0 = df.iloc[:, :-1]
#X0 = X0.drop(columns = "duration") # duration is not known before a call is made (future leak); this will influence results
Y = df.iloc[:, -1]

# The per-file frames are no longer needed once merged.
del train0, test0

In [0]:
# Encode categorical variables (nominal)

def onehotencode(df, field):
    """One-hot encode a single categorical column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode.
    field : str
        Name of the column in ``df`` to encode.

    Returns
    -------
    pandas.DataFrame
        One 0/1 integer column per distinct non-null value of ``df[field]``,
        named ``"<field> <value>"`` and ordered by sorted value. Rows are in
        the same order as ``df`` but carry a fresh 0..n-1 index.
    """
    column = df[field]
    # Read the categories straight from the column. Deriving them through
    # ``df.groupby(field).count().iloc[:, 0]`` requires a second column to
    # exist (it raises IndexError on a one-column frame) and counts every
    # column of the frame only to obtain the group labels.
    categories = sorted(column.dropna().unique())
    indicators = {
        # f-string formatting also copes with non-string category values.
        f"{field} {category}": (column == category).to_numpy().astype(int)
        for category in categories
    }
    # Explicit index so that a column with no categories still yields a frame
    # with one (empty) row per input row.
    return pd.DataFrame(indicators, index = pd.RangeIndex(len(df)))
    
# Build one indicator frame per nominal column of X0, in a fixed order.
(
    job_encoded,
    marital_encoded,
    education_encoded,
    default_encoded,
    housing_encoded,
    loan_encoded,
    contact_encoded,
    poutcome_encoded,
) = (
    onehotencode(X0, nominal_field)
    for nominal_field in (
        "job",
        "marital",
        "education",
        "default",
        "housing",
        "loan",
        "contact",
        "poutcome",
    )
)

# Encoding ordinal categorical variables

In [0]:
# Encode categorical variables (ordinal)

def encode_month(df=None):
    """Encode the ``month`` column as calendar numbers (jan=1 ... dec=12).

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame holding a ``month`` column of lowercase three-letter month
        abbreviations. Defaults to the module-level ``X0``.

    Returns
    -------
    pandas.DataFrame
        A single ``"month num"`` column with exactly one row per input row,
        in input order, on a fresh 0..n-1 index. Values that are not one of
        the twelve abbreviations (including missing values) become NaN.
    """
    source = X0 if df is None else df
    month_numbers = {
        name: number
        for number, name in enumerate(
            ("jan", "feb", "mar", "apr", "may", "jun",
             "jul", "aug", "sep", "oct", "nov", "dec"),
            start=1,
        )
    }
    # map() yields one output per input row. Appending only on a recognised
    # value would silently shorten the column, and every later row would then
    # line up with the wrong record when the result is concatenated
    # column-wise with the other features.
    codes = source["month"].map(month_numbers)
    # to_numpy() discards the source index so the result is labelled 0..n-1.
    return pd.DataFrame({"month num": codes.to_numpy()})

def encode_day_of_week(df=None):
    """Encode the ``day_of_week`` column as weekday numbers (mon=1 ... fri=5).

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame holding a ``day_of_week`` column of lowercase three-letter
        weekday abbreviations. Defaults to the module-level ``X0``.

    Returns
    -------
    pandas.DataFrame
        A single ``"day of week num"`` column with exactly one row per input
        row, in input order, on a fresh 0..n-1 index. Values other than
        ``mon``..``fri`` (including missing values) become NaN.
    """
    source = X0 if df is None else df
    day_numbers = {
        name: number
        for number, name in enumerate(("mon", "tue", "wed", "thu", "fri"), start=1)
    }
    # map() yields one output per input row. Appending only on a recognised
    # value would silently shorten the column, and every later row would then
    # line up with the wrong record when the result is concatenated
    # column-wise with the other features.
    codes = source["day_of_week"].map(day_numbers)
    # to_numpy() discards the source index so the result is labelled 0..n-1.
    return pd.DataFrame({"day of week num": codes.to_numpy()})

# Ordinal encodings of the two calendar fields, built from the current X0.
months_encoded = encode_month()
days_encoded = encode_day_of_week()

# Append every encoded frame to the right of the original columns.
X = pd.concat(
    [
        X0,
        job_encoded,
        marital_encoded,
        education_encoded,
        default_encoded,
        housing_encoded,
        loan_encoded,
        contact_encoded,
        poutcome_encoded,
        months_encoded,
        days_encoded,
    ],
    axis = 1,
)

# Release the intermediate frames and the pre-encoding X0.
del (
    job_encoded,
    marital_encoded,
    education_encoded,
    default_encoded,
    housing_encoded,
    loan_encoded,
    contact_encoded,
    poutcome_encoded,
    months_encoded,
    days_encoded,
    X0,
)

# The raw categorical columns are redundant now that their encodings exist.
raw_categoricals = [
    "job", "marital", "education", "default", "housing",
    "loan", "contact", "month", "day_of_week", "poutcome",
]
# Dummy variable trap: drop one level per nominal variable so the indicator
# columns of a variable are not perfectly collinear.
reference_levels = [
    "job unknown", "marital unknown", "education unknown", "default unknown",
    "housing unknown", "loan unknown", "contact telephone", "poutcome nonexistent",
]
X = X.drop(columns = raw_categoricals).drop(columns = reference_levels)
del raw_categoricals, reference_levels

# Hand the finished feature frame back under the name X0 and free the name X.
X0 = X
del X


# Feature Scaling

In [0]:
# Standardise every feature column and keep the original column labels.
# The scaler is fitted on all rows of X0 at once.
sc_X = StandardScaler()
scaled_values = sc_X.fit_transform(X0)
X = pd.DataFrame(data = scaled_values, columns = X0.columns)
del scaled_values

# Multicollinearity

In [24]:
# Variance inflation factor of every feature column, labelled by column name.
multicollTable = pd.Series(
    data = [
        variance_inflation_factor(X.values, column_position)
        for column_position in range(X.shape[1])
    ],
    index = X.columns,
)

"You want to remove columns with VIF > 2.5"

  vif = 1. / (1. - r_squared_i)


'You want to remove columns with VIF > 2.5'

# Feature Selection

In [0]:
# -----------------------------------------------------------------------------
# Train/test split
# -----------------------------------------------------------------------------

# Keep only the first 32950 rows of the features and of the target.
# NOTE(review): presumably 32950 is the row count of bank-train.csv (the rows
# that have a label) -- confirm before reusing with another file.
labelled_rows = 32950
X = X.iloc[:labelled_rows, :]
Y = Y[:labelled_rows]
del labelled_rows

from sklearn.model_selection import train_test_split

# Hold out 20% of the kept rows; the fixed seed makes the split repeatable.
X_train0, X_test0, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state = 0,
    test_size = 0.2,
)

In [0]:
# Number of features SelectKBest keeps. 41 is an arbitrary starting point:
# tune it by trial and error to find the column count that maximises F1.
n = 41

# NOTE(review): this rebinds `test`, discarding the DataFrame read from
# 'bank-test.csv' that was stored under the same name earlier in the notebook.
test = SelectKBest(k = n)
fit = test.fit(X_train0, Y_train)

# Apply the fitted selector to both partitions.
X_train, X_test = (
    fit.transform(partition)
    for partition in (X_train0, X_test0)
)