# - Classification -

# Data Preparation

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('../input/bank-marketing-data-set/bank-additional-full.csv', sep=';')
print(df.head())

# Data Cleansing

In [3]:
print(df.info())

In [4]:
print(df.describe())

In [5]:
print(df.isnull().sum())

# Data Transformation and Visualization

In [6]:
df['job'] = df['job'].replace('\.+', '', regex=True)
df['education'] = df['education'].replace('\.+', ' ', regex=True).str.title()

obj_col_lc = list(df.select_dtypes(include=['object']).columns)
for i in range(len(obj_col_lc)):
    df[obj_col_lc[i]] = df[obj_col_lc[i]].str.title()

In [7]:
print(df.head())

In [8]:
import matplotlib.pyplot as plt

In [9]:
plt.pie(df.groupby(['y']).y.count().reset_index(name='counts').counts, labels=df.groupby(['y']).y.count().reset_index(name='counts').y, 
        autopct='%1.1f%%')
plt.title('Proportion of Client Subscribed a Term Deposit')
plt.legend()
plt.show()

In [10]:
df_jc = df.pivot_table(index='y', columns='job', values='campaign', aggfunc='count', fill_value=0)
df_jc = df_jc.reindex(columns=df_jc.count().sort_values(ascending=False).index)
df_jc.plot.pie(subplots=True, 
               figsize=(10, 10),
               layout=(-1, 3),
               autopct='%1.0f%%',
               title='Proportion Client by Education')
plt.tight_layout()
plt.show()

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
df['month'] = df['month'].replace({'Mar': 2,'Apr': 3, 'May': 4, 'Jun': 5, 'Jul': 6, 'Aug': 7, 'Sep': 8, 'Oct': 9, 'Nov': 10, 'Dec': 11})
df['day_of_week'] = df['day_of_week'].replace({'Mon': 0, 'Tue': 1, 'Wed': 2, 'Thu': 3, 'Fri': 4})

In [13]:
le = LabelEncoder()
for i in range(len(obj_col_lc)):
    df[obj_col_lc[i]] = le.fit_transform(df[obj_col_lc[i]])

## Correlation
Here, I will use Pearson's Correlation to select Independent Variable that have impact on the Dependent Variable

In [14]:
import seaborn as sns

In [15]:
plt.figure(figsize=(15,8))
sns.heatmap(round(df.corr(),2), annot=True)
plt.show()

In [16]:
from scipy.stats import pearsonr

In [17]:
def calculate_pvalues(df):
    df = df.dropna()._get_numeric_data()
    dfcols = pd.DataFrame(columns=df.columns)
    pvalues = dfcols.transpose().join(dfcols, how='outer')
    for r in df.columns:
        for c in df.columns:
            pvalues[r][c] = round(pearsonr(df[r], df[c])[1], 4)
    return pvalues

In [18]:
calculate_pvalues(df)['y']

From the results of the Pearson correlation calculation above, it shows that the variable housing, loan, and day_of_week have no impact on the dependent variable, because: 
$$ p-value > 0.05 $$
So H0 rejected. For that, drop that columns.

In [19]:
df = df.drop(columns=['housing','loan','day_of_week'])

In [20]:
print(df.head())

# Modelling

Here data is indicated imbalance (See ouput [9]). I create a model for this data using XGBoost Algorithm with imbalanace method.

In [21]:
X = df.drop(columns=['y'])
y = df.y

In [22]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score

In [23]:
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=X[['default']])

In [24]:
from imblearn.over_sampling import SMOTE

In [25]:
resample = SMOTE(n_jobs=-1, random_state=42, k_neighbors=2)
X_train_res, y_train_res = resample.fit_resample(X_train, y_train)

In [26]:
def models(X_train, y_train, X_test, y_test):
    model = XGBClassifier(n_jobs = -1, random_state = 0, 
                          n_estimators = 1000, use_label_encoder = False, 
                          learning_rate = 0.2, gamma = 0.0, 
                          reg_alpha = 1, reg_lambda = 2)
    model.fit(X_train, y_train, eval_metric='error', 
              eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)
    best_model = model.get_booster().best_iteration
    y_pred = model.predict(X_test, iteration_range=(0, best_model))
    accuracy = accuracy_score(y_test, y_pred)
    precision_0 = precision_score(y_test, y_pred, pos_label=0)
    precision_1 = precision_score(y_test, y_pred, pos_label=1)
    confusionmatrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    
    return {'y_pred': y_pred, 'accuracy': accuracy, 'precision': (precision_0+precision_1)/2, 
            'confusionmatrix': confusionmatrix, 'best_iterations': best_model}

In [27]:
XGBmodel_res = models(X_train_res, y_train_res, X_test, y_test)

In [28]:
print('XGB Model with Resample')
print()
print('Confusion Matrix:')
print()
print(XGBmodel_res['confusionmatrix'])
print()
print('Accuracy Model:', '{:.2%}'.format(XGBmodel_res['accuracy']))
print()
print('Avg Precision Model:', '{:.2%}'.format(XGBmodel_res['precision']))

In [29]:
output = pd.DataFrame()
output['actual'] = y_test
output['y_pred'] = XGBmodel_res['y_pred']

In [30]:
print(output.head())

# Conclusion
The data used for this classification analysis turns out to be a data imbalance. Can be seen the output [9] "Proportion of Client Subscribed a Term Deposit". For that, I use SMOTE to solve the problem for data imbalance. Proportion of this Data Imbalance, inlcuded in moderate category 1-20% ([Shirazinia, 2020](https://medium.com/analytics-ai-swedbank/imbalanced-learning-in-banking-1bd3868a496d)). And the results of modelling using the imbalance method show good results in predicting the success of bank telemarketing. The accuracy obtained is **90.91%** and average precision from two labels is **78.13%**.