# Bank Marketing Dataset Analysis

### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

### Membaca Data Train

In [None]:
# The file is ';'-delimited with double-quoted text fields. Passing sep=';'
# lets pandas split the columns and strip the quotes itself, instead of
# reading every row as one string and splitting / unquoting it by hand.
# Numeric columns are parsed as numbers rather than left as strings.
data_train = pd.read_csv(
    "/content/sample_data/bank-additional-full.csv",
    sep=';',
    na_values=['NA'],
)
# Header names as a plain list; the test-data cell reuses `columns`.
columns = list(data_train.columns)

FileNotFoundError: ignored

In [None]:
# Preview the first rows of the training frame as a parsing sanity check.
data_train.head()

### Membaca Data Test

In [None]:
# NOTE(review): this is the same path the training cell reads, so the "test"
# frame duplicates the training rows -- confirm which file was intended.
# The file is ';'-delimited with double-quoted text fields. Passing sep=';'
# lets pandas split the columns and strip the quotes itself, replacing the
# manual row splitting and per-column quote removal. Column names come from
# the file's own header row.
data_test = pd.read_csv(
    "/content/sample_data/bank-additional-full.csv",
    sep=';',
    na_values=['NA'],
)


In [None]:
# Preview the first rows of the test frame as a parsing sanity check.
data_test.head()

### Preprocessing Data

In [None]:
def categorize(df):
    """Return a copy of ``df`` with its categorical columns label-encoded.

    Each column named below (the target ``y`` included) is replaced by the
    integer codes returned by ``LabelEncoder.fit_transform``. The input
    frame itself is not modified.

    Raises:
        KeyError: if any of the expected columns is missing from ``df``.
    """
    label_columns = (
        'job', 'marital', 'education', 'default', 'housing', 'month',
        'loan', 'contact', 'day_of_week', 'poutcome', 'y',
    )
    encoder = preprocessing.LabelEncoder()
    encoded = df.copy()
    for name in label_columns:
        # The encoder is refit for every column, so codes are per-column.
        encoded[name] = encoder.fit_transform(encoded[name])
    return encoded

In [None]:
# Stack both frames into one. ignore_index=True gives the result a fresh
# 0..n-1 index instead of repeating each input frame's row labels.
data = pd.concat([data_train, data_test], ignore_index=True)
# Collapse the three "basic.*" schooling levels into a single 'basic' value.
# The replacement is limited to 'education' so no other column is touched.
data['education'] = data['education'].replace(
    ['basic.6y', 'basic.4y', 'basic.9y'], 'basic'
)


### Memeriksa nilai null

In [None]:
# Count missing values in each column of the combined frame.
data.isnull().sum()

In [None]:
# Same per-column null count again (this cell repeats the previous one).
data.isnull().sum()

### Menampilkan Visualisasi Data

In [None]:
# Apply seaborn's "ticks" theme, then plot how many rows fall in each job category.
sns.set(style="ticks", color_codes=True)
sns.countplot(y='job', data=data)


In [None]:
# Keep only the rows whose job is known.
data = data.loc[data['job'] != 'unknown']

In [None]:
# Plot how many rows fall in each marital-status category.
sns.countplot(y='marital', data=data)

In [None]:
# Tabulate the number of rows per marital-status value.
data.marital.value_counts()

In [None]:
# Keep only the rows where both the marital status and the loan flag are known.
data = data[(data['marital'] != 'unknown') & (data['loan'] != 'unknown')]

In [None]:
# Plot how many rows fall in each education level.
sns.countplot(y='education', data=data)

In [None]:
# Discard the rows whose education level is 'illiterate'.
data = data.loc[data['education'] != 'illiterate']

In [None]:
# Summary statistics of the filtered frame.
data.describe()

In [None]:
# Plot the number of rows per value of the target column 'y' (class balance).
sns.countplot(y='y', data=data)

Dari gambar di atas, kita dapat mengatakan bahwa datanya tidak seimbang.

In [None]:
# Replace the categorical columns (and the target) with integer label codes.
data = categorize(data)
# data = data.convert_objects(convert_numeric)

### Memeriksa outlier menggunakan boxplots

In [None]:
# Box plot of 'duration' values, one box per target value.
sns.boxplot(x='y', y='duration', data=data)

In [None]:
# Box plot of 'education' per target value; the column holds label codes
# at this point, not the original category names.
sns.boxplot(x='y', y='education', data=data)

In [None]:
# Box plot of 'housing' per target value; the column holds label codes
# at this point, not the original category names.
sns.boxplot(x='y', y='housing', data=data)

In [None]:
# sns.boxplot(data['y'],data['age'])

In [None]:
# sns.boxplot(data['y'],data['job'])

In [None]:
# sns.boxplot(data['y'],data['campaign'])


### Menghilangkan outliers

In [None]:
def remove_outliers(df, column, minimum, maximum):
    """Return a copy of ``df`` with out-of-range values in ``column`` replaced.

    Despite the name, no rows are dropped: every value strictly below
    ``minimum`` or strictly above ``maximum`` is overwritten with the mean of
    the whole column (outliers included in that mean). Values equal to either
    bound are kept.

    Args:
        df: Source DataFrame. It is not modified.
        column: Name of the numeric column to clean.
        minimum: Lowest value that is kept as-is.
        maximum: Highest value that is kept as-is.

    Returns:
        A new DataFrame with the cleaned ``column``.
    """
    # Work on a copy so the caller's frame (often a filtered slice of a
    # larger frame) is never mutated behind its back.
    cleaned = df.copy()
    col_values = cleaned[column].values
    out_of_range = np.logical_or(col_values < minimum, col_values > maximum)
    cleaned[column] = np.where(out_of_range, col_values.mean(), col_values)
    return cleaned

In [None]:
# min_val = data["duration"].min()
# max_val = 1500
# data = remove_outliers(df=data, column='duration' , minimum = min_val, maximum = max_val)

# min_val = data["age"].min()
# max_val = 80
# data = remove_outliers(df=data, column='age' , minimum=min_val, maximum=max_val)

# min_val = data["campaign"].min()
# max_val = 6
# data = remove_outliers(df=data, column='campaign' , minimum=min_val, maximum=max_val)


### Menghilangkan kolom yang tidak diperlukan

In [None]:
# Row counts per education code, split by target value.
sns.countplot(x='education',hue='y',data=data)

In [None]:
# Row counts per 'default' code, split by target value.
sns.countplot(x='default',hue='y',data=data)

In [None]:
# Remove the 'default' column from the feature set.
data = data.drop('default',axis=1)

In [None]:
# Row counts per 'poutcome' code, split by target value.
sns.countplot(x='poutcome',hue='y',data=data)

In [None]:
# Remove the 'poutcome' column from the feature set.
data = data.drop('poutcome',axis=1)

In [None]:
# Row counts per 'loan' code, split by target value.
sns.countplot(x='loan',hue='y',data=data)

In [None]:
# Row counts per 'contact' code, split by target value.
sns.countplot(x='contact',hue='y',data=data)

In [None]:
# Remove the 'contact' column from the feature set.
data = data.drop('contact',axis=1)

In [None]:
# Remove the five listed columns from the feature set in one call.
data = data.drop(['emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed'],axis=1)

In [None]:
# Show the remaining columns, their dtypes and non-null counts.
data.info()

In [None]:
# Preview the first rows of the final modelling frame.
data.head()

### Membagi data train dan data test

In [None]:
# Target vector: the 'y' column.
y = data['y'].values
# Feature matrix: every column except the target.
X = data.drop('y', axis=1).values
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

In [None]:
# Standardize the training features. fit_transform() both learns the
# per-feature mean/variance and applies them, so a separate fit() call
# beforehand is redundant.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [None]:
# Project the scaled training features onto 10 principal components.
# fit_transform() both fits the components and projects the data, so a
# separate fit() call beforehand is redundant.
pca = PCA(n_components=10)
X_train = pca.fit_transform(X_train)

In [None]:
# Shape of the training matrix after PCA (10 columns, per n_components=10).
X_train.shape

### Membangun beberapa model dan memvalidasinya dengan validasi silang 10-fold (10-fold cross-validation)

In [None]:
# Candidate classifiers as (label, estimator) pairs, compared in the
# cross-validation cell. Estimators use default settings unless shown.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('Decison-Tree', DecisionTreeClassifier()),
    ('Gaussian', GaussianNB()),
    ('SVM', SVC()),
    ('RandForest', RandomForestClassifier(max_depth=8, n_estimators=120)),
    ('ADA', AdaBoostClassifier(n_estimators=120)),
]

In [None]:
# One shuffled 10-fold splitter shared by every model, so all candidates are
# scored on identical folds. shuffle=True is required for random_state to
# take effect; recent scikit-learn raises ValueError if a seed is given
# without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)

results = []  # per-model arrays of fold accuracies
names = []    # model labels, index-aligned with `results`
for name, model in models:
    cv_results = model_selection.cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring='accuracy'
    )
    results.append(cv_results)
    names.append(name)
    # Report the mean accuracy across the 10 folds.
    msg = "{}: {}".format(name, cv_results.mean())
    print(msg)

Logistic Regression memperoleh akurasi tertinggi dengan runtime yang lebih singkat dan hasil yang lebih stabil. SVM dan Random Forest memperoleh akurasi yang hampir sama, tetapi runtime-nya lebih lama dibandingkan Logistic Regression.

In [None]:
sns.set(rc={'figure.figsize':(10,8)})
# Build a wide frame with one column of fold accuracies per model; given a
# wide frame through data=, seaborn draws one box per column. Passing the
# label list and score arrays positionally is not accepted by current seaborn.
sns.boxplot(data=pd.DataFrame(dict(zip(names, results))))

### Pengujian dengan data test

In [None]:
# Apply the scaler that was fitted on the training split. Refitting on
# X_test would standardize the test rows with their own mean/variance, which
# leaks test statistics and puts them on a different scale from the data the
# models were trained on.
X_test = scaler.transform(X_test)

In [None]:
# Project the test rows with the PCA fitted on the training split. Refitting
# on X_test would produce different components, so the model would be scored
# on features that do not match the ones it was trained on.
X_test = pca.transform(X_test)

In [None]:
# Train logistic regression on the training split, predict the held-out
# split, then print accuracy, confusion matrix and per-class report.
lr = LogisticRegression()
lr.fit(X_train, Y_train)
predictions = lr.predict(X_test)

report_sections = (
    ("Accuracy : ", accuracy_score),
    ("Confusion Matrix : \n", confusion_matrix),
    ("Classification Report: \n", classification_report),
)
for heading, metric in report_sections:
    print(heading, metric(Y_test, predictions))