In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
def initialSetup(fl_path: str) -> pd.DataFrame:
    """This is the initial setup of the program
    """

    # Read the data based on its filepath
    data = pd.read_csv(fl_path, delimiter=";")

    # Copy data for sanity check
    copy_data = data.copy()

    # Get duplicates by looping through the data until there is no duplicate
    n_duplicates = data.duplicated(keep=False).sum()
    while n_duplicates != 0:
        print(f"There are duplicates. Total duplicates: {n_duplicates}\n"
              +"Cleaning Duplicates....\n")
        data.drop_duplicates(keep="first", inplace=True)

        # Re-evaluate number of duplicates
        n_duplicates = data.duplicated(keep=False).sum()

    print("No duplicates in the data")

    # Running Sanity Check for data shape
    return data


In [3]:
def splittingData(data: pd.DataFrame, target: str):
    """Splitting Input and Output Data
    
    We split the dataset into independent and dependent variable.
    Then create data training and data test from sklearn train_test_split()
    """

    margin_error = 0.05
    test_size = 0.25

    # Get total variable from the entire data
    total_variables = len(data.columns)

    # Separate Independent Variable (Input) and target variable (output)
    # Common convention X is input, y is output
    y = data[target]
    X = data.drop(target, axis=1)

    # Sanity Check Input-Output
    # output data should only contain one column
    # input data should contain the rest of variable (total_variable-1)
    if (len(X.columns) == total_variables-1) \
        or (len(y.shape) == 1):
        print("Splitting Input-Output success")
    else:
        print("Splitting Input-Ouput failed")
        return None

    # Use the input and output data to split train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        random_state=123)

    # Sanity Check for Train Split Data
    print("Sanity Check for Train and Test data shape...\n"
          +f"X_train data shape: {X_train.shape}\n"
          +f"y_train data shape: {X_test.shape}\n")
    
    # Create margin error of splitting result
    low_threshold = test_size-(test_size*margin_error)
    upper_threshold = test_size+(test_size*margin_error)

    # Define the test size ratio after data is splitted
    test_size_ratio = X_test.shape[0]/X.shape[0]

    # Sanity Check for Ratio
    print("Sanity Check for Train and Test data proportion....")
    if (test_size_ratio > low_threshold) \
        or (test_size_ratio < upper_threshold):
        print(f"Defined ratio: {test_size}\n"
              f"Ratio after splitting: {test_size_ratio}\n"
              +"Splitting proportion is similar")
    else:
        print(f"Defined ratio: {test_size}\n"
              f"Ratio after splitting: {test_size_ratio}\n"
              +"Splitting proportion is different")

    return X_train, X_test, y_train, y_test

In [4]:
def numericalImputer(data: pd.DataFrame, 
                     numerical_columns: list ) -> tuple:
    """Function to impute numerical data"""

    # Create numerical data
    numerical_data = data[numerical_columns]

    # Create imputer
    numerical_imputer = SimpleImputer(missing_values=np.nan,
                                      strategy="median")
    numerical_imputer.fit(numerical_data)

    # Transform
    imputed_data = numerical_imputer.transform(numerical_data)
    imputed_numerical_data = pd.DataFrame(imputed_data)

    # Assigned columns and index value from the original data for consistency
    imputed_numerical_data.columns = numerical_columns
    imputed_numerical_data.index = numerical_data.index

    return imputed_numerical_data, numerical_imputer

In [5]:
def categoricalImputer(data: pd.DataFrame, 
                       categorical_columns: list,
                       experiment="ohe"):
    """Impute and Clean Missing Value"""

    # Create categorical data
    categorical_data = data[categorical_columns]

    # Finding Missing value
    n_missing_value = categorical_data.isnull().any().sum()
    if n_missing_value > 0:
        categorical_data.fillna("Unknown", inplace=True)
    else:
        pass

    # experiment (?)
    if experiment == "label":
        pass
    elif experiment == "ohe":
        categorical_ohe = pd.get_dummies(categorical_data)
    elif experiment == "mix":
        pass

    return categorical_ohe

In [6]:
def dataStandarization(data: pd.DataFrame) -> tuple:

    # Get initial columns and index for data consistency    
    data_columns = data.columns
    data_index = data.index

    # Create scaler
    scaler = StandardScaler()
    scaler.fit(data)

    # Transform
    scaler_transformation = scaler.transform(data)
    standardized_data = pd.DataFrame(scaler_transformation)

    # Assign the initial columns and index
    standardized_data.columns = data_columns
    standardized_data.index = data_index

    return standardized_data, scaler

In [35]:
def testDataProcessing(data: pd.DataFrame,
                       numerical_columns: list,
                       categorical_columns: list,
                       numerical_imputer: SimpleImputer,
                       scaler: StandardScaler) -> tuple:
    """Test Data Preprocessing"""

    # Processing numerical data
    numerical_data = data[numerical_columns]
    numerical_data = pd.DataFrame(numerical_imputer.transform(numerical_data))
    numerical_data.columns = numerical_columns
    numerical_data.index = data.index

    # Categorical Data Preprocessing
    categorical_data = categoricalImputer(data, categorical_columns)

    # merge_data
    data_concat = pd.concat(
        [numerical_data, categorical_data], axis=1
    )

    # standardized data to get clean data
    clean_data = pd.DataFrame(scaler.transform(data_concat))
    clean_data.columns = data_concat.columns

    return clean_data
    

In [7]:
bank_df = initialSetup("bank-additional-full.csv")

There are duplicates. Total duplicates: 24
Cleaning Duplicates....

No duplicates in the data


In [8]:
## As per dataset recommendation we would like to drop duration variable
bank_df.drop(columns=["duration"], inplace=True)
bank_df.shape

(41176, 20)

In [9]:
test = bank_df["age"]

In [10]:
len(test.shape)

1

In [11]:
# Train Test Split
X_train, X_test, y_train, y_test = splittingData(data=bank_df, target="y")

Splitting Input-Output success
Sanity Check for Train and Test data shape...
X_train data shape: (30882, 19)
y_train data shape: (10294, 19)

Sanity Check for Train and Test data proportion....
Defined ratio: 0.25
Ratio after splitting: 0.25
Splitting proportion is similar


In [12]:
X_train.head(2)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
38852,35,self-employed,single,university.degree,no,yes,no,cellular,nov,mon,1,999,0,nonexistent,-3.4,92.649,-30.1,0.714,5017.5
17462,47,housemaid,married,basic.4y,unknown,no,no,cellular,jul,mon,3,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1


In [13]:
y_train.head(2)

38852    no
17462    no
Name: y, dtype: object

In [14]:
# Data Preprocessing

In [15]:
# separating categorical and numerical data
categorical_columns = [
    "job", "marital", "education", "default", "housing",
    "loan", "contact", "month", "day_of_week", "poutcome",
]

In [16]:
numerical_columns = [
    "age", "campaign", "pdays", "previous", "emp.var.rate", 
    "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed",
]

In [17]:
# handling numerical data
X_train_numerical, numerical_imputer = numericalImputer(data=X_train,
                                                        numerical_columns=numerical_columns)

In [18]:
X_train_numerical.head(2)

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
38852,35.0,1.0,999.0,0.0,-3.4,92.649,-30.1,0.714,5017.5
17462,47.0,3.0,999.0,0.0,1.4,93.918,-42.7,4.962,5228.1


In [19]:
# Handling categorical data
X_train_categorical = categoricalImputer(data=X_train,
                                         categorical_columns=categorical_columns)

In [20]:
X_train_categorical.head(2)

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
38852,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
17462,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [21]:
X_train_concat = pd.concat(
    [X_train_numerical, X_train_categorical], axis=1
)

In [22]:
X_train_concat.head(2)

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_admin.,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
38852,35.0,1.0,999.0,0.0,-3.4,92.649,-30.1,0.714,5017.5,0,...,0,0,0,1,0,0,0,0,1,0
17462,47.0,3.0,999.0,0.0,1.4,93.918,-42.7,4.962,5228.1,0,...,0,0,0,1,0,0,0,0,1,0


In [23]:
X_train_concat.isnull().any().sum()

0

In [24]:
# Standardized Variable
X_train_clean, scaler = dataStandarization(X_train_concat)

In [25]:
X_train_clean.head()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_admin.,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
38852,-0.484465,-0.566932,0.194257,-0.348581,-2.227561,-1.607972,2.249942,-1.685432,-2.082961,-0.584554,...,-0.134533,-0.115852,-0.485362,1.957191,-0.516827,-0.493089,-0.493647,-0.338574,0.396749,-0.185191
17462,0.672423,0.159255,0.194257,-0.348581,0.834933,0.586558,-0.475762,0.768882,0.842806,-0.584554,...,-0.134533,-0.115852,-0.485362,1.957191,-0.516827,-0.493089,-0.493647,-0.338574,0.396749,-0.185191
6254,0.961645,0.159255,0.194257,-0.348581,0.643528,0.717987,0.88709,0.708218,0.327393,-0.584554,...,-0.134533,-0.115852,-0.485362,-0.510936,-0.516827,2.028032,-0.493647,-0.338574,0.396749,-0.185191
31366,-0.388058,-0.203839,0.194257,1.676795,-1.206729,-1.186014,-1.232902,-1.327223,-0.94933,1.710704,...,-0.134533,-0.115852,-0.485362,-0.510936,-0.516827,-0.493089,2.025739,2.953566,-2.520487,-0.185191
27748,-0.195243,-0.566932,0.194257,-0.348581,-1.206729,-1.272481,-2.054939,-1.082831,-0.94933,-0.584554,...,-0.134533,-0.115852,-0.485362,-0.510936,1.934885,-0.493089,-0.493647,-0.338574,0.396749,-0.185191


In [26]:
# Define Baseline
y_train.value_counts(normalize=True)

no     0.887831
yes    0.112169
Name: y, dtype: float64

##### **Model Fitting**

In [27]:
# Fitting Model with KNN

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train_clean, y_train)

In [28]:
# Fitting Model with Decision Tree

dec_tree = DecisionTreeClassifier(criterion="entropy",
                                  max_depth=10,
                                  random_state=0)
dec_tree.fit(X_train_clean, y_train)

##### **Model Prediction**

In [29]:
# KNN Prediction
knn_score = knn.score(X_train_clean, y_train)
knn_score

0.9044750987630334

In [30]:
dec_tree_score = dec_tree.score(X_train_clean, y_train)
dec_tree_score

0.9143514021112622

##### **Test Prediction**

**<p>Test Data Preprocessing<p>**

In [36]:
X_test_clean = testDataProcessing(data=X_test,
                                  numerical_columns=numerical_columns,
                                  categorical_columns=categorical_columns,
                                  numerical_imputer=numerical_imputer,
                                  scaler=scaler)

In [37]:
X_test_clean.head(2)

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_admin.,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,-1.06291,-0.566932,0.194257,-0.348581,0.643528,0.717987,0.88709,0.709373,0.327393,-0.584554,...,-0.134533,-0.115852,-0.485362,-0.510936,-0.516827,-0.493089,2.025739,-0.338574,0.396749,-0.185191
1,1.443681,0.159255,0.194257,-0.348581,0.834933,0.586558,-0.475762,0.768882,0.842806,-0.584554,...,-0.134533,-0.115852,-0.485362,-0.510936,-0.516827,2.028032,-0.493647,-0.338574,0.396749,-0.185191


In [38]:
X_train_clean.head(2)

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_admin.,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
38852,-0.484465,-0.566932,0.194257,-0.348581,-2.227561,-1.607972,2.249942,-1.685432,-2.082961,-0.584554,...,-0.134533,-0.115852,-0.485362,1.957191,-0.516827,-0.493089,-0.493647,-0.338574,0.396749,-0.185191
17462,0.672423,0.159255,0.194257,-0.348581,0.834933,0.586558,-0.475762,0.768882,0.842806,-0.584554,...,-0.134533,-0.115852,-0.485362,1.957191,-0.516827,-0.493089,-0.493647,-0.338574,0.396749,-0.185191


**<p>Evaluate Model Performance</p>**

In [39]:
knn_test_score = knn.score(X_test_clean, y_test)
knn_test_score

0.8945016514474451

In [40]:
dt_test_score = dec_tree.score(X_test_clean, y_test)
dt_test_score

0.8943073635127259