## Business Understanding

Goal(s):

Predict if the customer will subscribe (yes/no) to a term deposit (variable y)

Success Metric(s):

Hit %81 or above accuracy by evaluating with 5-fold cross validation and reporting the average performance score.

## Data Understanding

In [36]:
## Import libraries / modules
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

%matplotlib inline

In [52]:
"""
Load data from CSV
"""
def load_data(csv_file_path):
    df = pd.read_csv(csv_file_path)
    return df

"""
Auto EDA of dataset
"""
def auto_eda(df, title):
    profile = ProfileReport(df, title=title)
    profile.to_file(output_file='../reports/report.html')
    
"""
Change column names
"""
def change_column_name(df, col_index_label):
    column_names = df.columns.tolist()
    for key, value in col_index_label.items():
        df = df.rename(columns={column_names[key]: value})
    return df
    
"""
drop unnecessary columns
"""
def drop_columns_by_index(df, indices):
    df = df.drop(df.columns[indices], axis=1)
    return df

"""
drop rows
"""
def drop_rows(df, indices):
    df = df.drop(indices, axis=0)
    return df

"""
drop duplicate row ignoring numeric columns
"""
def drop_duplicate_rows(df):
    duplicated_rows = df.duplicated(keep='first')
    row_numbers = np.where(duplicated_rows)[0]
    df = drop_rows(df, row_numbers)
    print(pd.DataFrame(row_numbers))
    return df
    
"""
Scale numeric columns
"""
def scale_numeric_data(df, col_indices):
    scaler = MinMaxScaler()
    df.iloc[:, col_indices] = scaler.fit_transform(df.iloc[:, col_indices])
    return df

"""
Encode categorical columns
"""
def encode_cat_columns(df, col_names):
    df_encoded = pd.get_dummies(df, columns = col_names)
    return df_encoded

"""
Return columns
"""
def get_col_names(df):
    return df.columns

"""
Return column indices by name
"""
def get_col_indices(df, col_names):
    column_indices = [df.columns.get_loc(col) for col in col_names]
    return column_indices

"""
Get Test train split. default = 75%
"""
def get_test_train_split(df, target_col_index, random_state=42):
    X = df.drop(df.columns[target_col_index], axis=1)
    y = pd.DataFrame(df.iloc[:, target_col_index])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
    return [X_train, X_test, y_train, y_test]

"""
Return X and y only
"""
def get_X_y(df, target_col_index):
    X = df.drop(df.columns[target_col_index], axis=1)
    y = pd.DataFrame(df.iloc[:, target_col_index])
    return [X, y]
    

"""
Return Forest model for feature importance
"""

def get_forest_model(X_train, y_train):
    forest = RandomForestClassifier(random_state=0)
    forest.fit(X_train, y_train)
    return forest

"""
Retturn LinearSVC model
"""
def get_feature_selection_model(X_train, y_train):
    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train)
    model = SelectFromModel(lsvc, prefit=True)
    return model

In [38]:
df = load_data("../data/raw/term-deposit-marketing-2020.csv")
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,53,technician,married,tertiary,no,395,no,no,cellular,3,jun,107,1,no
39996,30,management,single,tertiary,no,3340,no,no,cellular,3,jun,238,3,yes
39997,54,admin,divorced,secondary,no,200,no,no,cellular,3,jun,170,1,yes
39998,34,management,married,tertiary,no,1047,no,no,cellular,3,jun,342,1,no


In [39]:
df2 = change_column_name(df, {4: 'has_credit_default', 6: 'has_housing_loan', 7: 'has_personal_loan', 
                                             8: 'contact_mode', 12: 'num_of_contacts'})
df2.head()

Unnamed: 0,age,job,marital,education,has_credit_default,balance,has_housing_loan,has_personal_loan,contact_mode,day,month,duration,num_of_contacts,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,no


### AutoEDA Report generation

In [7]:
auto_eda(df, "Term Deposit Marketing Report")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Data cleaning

In [40]:
## Drop day and month column as they do not affect target variable
df_ = drop_columns_by_index(df2, [9,10])
df_

Unnamed: 0,age,job,marital,education,has_credit_default,balance,has_housing_loan,has_personal_loan,contact_mode,duration,num_of_contacts,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,198,1,no
...,...,...,...,...,...,...,...,...,...,...,...,...
39995,53,technician,married,tertiary,no,395,no,no,cellular,107,1,no
39996,30,management,single,tertiary,no,3340,no,no,cellular,238,3,yes
39997,54,admin,divorced,secondary,no,200,no,no,cellular,170,1,yes
39998,34,management,married,tertiary,no,1047,no,no,cellular,342,1,no


### Feature Scaling

In [41]:
## Normalize numeric columns
df_scaled = scale_numeric_data(df_, [0, 5, 9, 10])
df_scaled

Unnamed: 0,age,job,marital,education,has_credit_default,balance,has_housing_loan,has_personal_loan,contact_mode,duration,num_of_contacts,y
0,0.513158,management,married,tertiary,no,0.092259,yes,no,unknown,0.053070,0.000000,no
1,0.328947,technician,single,secondary,no,0.073067,yes,no,unknown,0.030704,0.000000,no
2,0.184211,entrepreneur,married,secondary,no,0.072822,yes,yes,unknown,0.015453,0.000000,no
3,0.368421,blue-collar,married,unknown,no,0.086476,yes,no,unknown,0.018707,0.000000,no
4,0.184211,unknown,single,unknown,no,0.072812,no,no,unknown,0.040260,0.000000,no
...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0.447368,technician,married,tertiary,no,0.076390,no,no,cellular,0.021757,0.000000,no
39996,0.144737,management,single,tertiary,no,0.103127,no,no,cellular,0.048394,0.032258,yes
39997,0.460526,admin,divorced,secondary,no,0.074619,no,no,cellular,0.034567,0.000000,yes
39998,0.197368,management,married,tertiary,no,0.082309,no,no,cellular,0.069540,0.000000,no


In [15]:
## Remove duplicates

## Find indices of duplicate rows while ignoring numeric column and then drop the duplicate rows from dataframe
df_cleaned = drop_duplicate_rows(df_scaled)
df_cleaned


       0
0  26091
1  38952


Unnamed: 0,age,job,marital,education,has_credit_default,balance,has_housing_loan,has_personal_loan,contact_mode,duration,num_of_contacts,y
0,0.513158,management,married,tertiary,no,0.092259,yes,no,unknown,0.053070,0.000000,no
1,0.328947,technician,single,secondary,no,0.073067,yes,no,unknown,0.030704,0.000000,no
2,0.184211,entrepreneur,married,secondary,no,0.072822,yes,yes,unknown,0.015453,0.000000,no
3,0.368421,blue-collar,married,unknown,no,0.086476,yes,no,unknown,0.018707,0.000000,no
4,0.184211,unknown,single,unknown,no,0.072812,no,no,unknown,0.040260,0.000000,no
...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0.447368,technician,married,tertiary,no,0.076390,no,no,cellular,0.021757,0.000000,no
39996,0.144737,management,single,tertiary,no,0.103127,no,no,cellular,0.048394,0.032258,yes
39997,0.460526,admin,divorced,secondary,no,0.074619,no,no,cellular,0.034567,0.000000,yes
39998,0.197368,management,married,tertiary,no,0.082309,no,no,cellular,0.069540,0.000000,no


In [42]:
df_cleaned.job.unique()

array(['management', 'technician', 'entrepreneur', 'blue-collar',
       'unknown', 'retired', 'admin', 'services', 'self-employed',
       'unemployed', 'housemaid', 'student'], dtype=object)

#### One-hot encoding categorical columns

In [43]:
col_names = ['job', 'marital', 'education', 'has_credit_default', 'has_housing_loan', 'has_personal_loan', 'contact_mode', 'y']
df_encoded = encode_cat_columns(df_cleaned, col_names)
df_encoded

Unnamed: 0,age,balance,duration,num_of_contacts,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,...,has_credit_default_yes,has_housing_loan_no,has_housing_loan_yes,has_personal_loan_no,has_personal_loan_yes,contact_mode_cellular,contact_mode_telephone,contact_mode_unknown,y_no,y_yes
0,0.513158,0.092259,0.053070,0.000000,0,0,0,0,1,0,...,0,0,1,1,0,0,0,1,1,0
1,0.328947,0.073067,0.030704,0.000000,0,0,0,0,0,0,...,0,0,1,1,0,0,0,1,1,0
2,0.184211,0.072822,0.015453,0.000000,0,0,1,0,0,0,...,0,0,1,0,1,0,0,1,1,0
3,0.368421,0.086476,0.018707,0.000000,0,1,0,0,0,0,...,0,0,1,1,0,0,0,1,1,0
4,0.184211,0.072812,0.040260,0.000000,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0.447368,0.076390,0.021757,0.000000,0,0,0,0,0,0,...,0,1,0,1,0,1,0,0,1,0
39996,0.144737,0.103127,0.048394,0.032258,0,0,0,0,1,0,...,0,1,0,1,0,1,0,0,0,1
39997,0.460526,0.074619,0.034567,0.000000,1,0,0,0,0,0,...,0,1,0,1,0,1,0,0,0,1
39998,0.197368,0.082309,0.069540,0.000000,0,0,0,0,1,0,...,0,1,0,1,0,1,0,0,1,0


In [55]:
## Drop column with binary classes
columns = get_col_names(df_encoded)
print(columns)
#df_dropped_binary = drop_columns_by_name(df_encoded,  ['y_no', 'has_personal_loan_no', 'has_housing_loan_no', 'has_credit_default_no'])

Index(['age', 'balance', 'duration', 'num_of_contacts', 'job_admin',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'has_credit_default_no', 'has_credit_default_yes',
       'has_housing_loan_no', 'has_housing_loan_yes', 'has_personal_loan_no',
       'has_personal_loan_yes', 'contact_mode_cellular',
       'contact_mode_telephone', 'contact_mode_unknown', 'y_no', 'y_yes'],
      dtype='object')


In [56]:
binary_cols = ['y_no', 'has_personal_loan_no', 'has_housing_loan_no', 'has_credit_default_no']
col_indices = get_col_indices(df_encoded, binary_cols)
col_indices

[32, 27, 25, 23]

In [57]:
df_binary_dropped = drop_columns_by_index(df_encoded, col_indices)
df_binary_dropped

Unnamed: 0,age,balance,duration,num_of_contacts,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,...,education_secondary,education_tertiary,education_unknown,has_credit_default_yes,has_housing_loan_yes,has_personal_loan_yes,contact_mode_cellular,contact_mode_telephone,contact_mode_unknown,y_yes
0,0.513158,0.092259,0.053070,0.000000,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,1,0
1,0.328947,0.073067,0.030704,0.000000,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
2,0.184211,0.072822,0.015453,0.000000,0,0,1,0,0,0,...,1,0,0,0,1,1,0,0,1,0
3,0.368421,0.086476,0.018707,0.000000,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
4,0.184211,0.072812,0.040260,0.000000,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0.447368,0.076390,0.021757,0.000000,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
39996,0.144737,0.103127,0.048394,0.032258,0,0,0,0,1,0,...,0,1,0,0,0,0,1,0,0,1
39997,0.460526,0.074619,0.034567,0.000000,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
39998,0.197368,0.082309,0.069540,0.000000,0,0,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0


In [58]:
cols_after_clean = get_col_names(df_binary_dropped)
cols_after_clean

Index(['age', 'balance', 'duration', 'num_of_contacts', 'job_admin',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'has_credit_default_yes', 'has_housing_loan_yes',
       'has_personal_loan_yes', 'contact_mode_cellular',
       'contact_mode_telephone', 'contact_mode_unknown', 'y_yes'],
      dtype='object')

### L1-based feature selection
ref. : https://scikit-learn.org/stable/modules/feature_selection.html#:~:text=L1%2Dbased%20feature%20selection,-Linear%20models%20penalized&text=With%20SVMs%20and%20logistic%2Dregression,parameter%2C%20the%20fewer%20features%20selected.

In [65]:
X, y = get_X_y(df_binary_dropped, 29)

In [66]:
X.shape

(39998, 29)

In [67]:
## Get feature selection model 

feature_selection_model = get_feature_selection_model(X, y)
X_new = feature_selection_model.transform(X)
new_data = pd.DataFrame(X_new)
new_data

  y = column_or_1d(y, warn=True)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.053070,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
1,0.030704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.015453,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
3,0.018707,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,0.040260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
39993,0.021757,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
39994,0.048394,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
39995,0.034567,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39996,0.069540,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [76]:
count_1, count_0 = y['y_yes'].value_counts()[1], y['y_yes'].value_counts()[0]
count_1, count_0

(2896, 37102)

### Undersampling negative class