In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import jaccard_score
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the semicolon-separated training data and preview the first rows
df_raw = pd.read_csv('train.csv', sep=';', encoding='UTF-8')
df_raw.head(5)

In [None]:
# Summary statistics for every column (numeric and non-numeric alike)
df_raw.describe(include = 'all')

Results from EDA above:
Numeric Data:
1) Age ranges from 18 to 95, mean = 40, std = 10.6
2) Balance: mean = 1362, std = 3044
Binary Data:
1) Default history - Yes / No
2) Housing loan - Yes / No
3) Personal Loan - Yes / No
Categorical Data:
1) Job - 12 categories
2) Marital - 3 categories
3) Education - 4 categories

9) contact: contact communication type (categorical: "unknown","telephone","cellular")
10) day: last contact day of the month (numeric)
11) month: last contact month of year (categorical: "jan", "feb", "mar", …, "nov", "dec")
12) duration: last contact duration, in seconds (numeric)
13) campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
14) pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
15) previous: number of contacts performed before this campaign and for this client (numeric)
16) poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")

In [None]:
# Inspect the dtype pandas inferred for each column
df_raw.dtypes

Binary variables to be mapped to 0 (no) and 1 (yes):

1) Default
2) Housing
3) Loan

In [None]:
# Encode the yes/no columns as integers.
# Assigning the mapped column back (instead of a chained `df[col].replace(..., inplace=True)`)
# guarantees df_raw itself is updated and yields a proper integer dtype.
# The identity entries (0 -> 0, 1 -> 1) keep this cell safe to re-run on already-encoded data.
binary_map = {'no': 0, 'yes': 1, 0: 0, 1: 1}
for col in ['default', 'housing', 'loan']:
    df_raw[col] = df_raw[col].map(binary_map)

**Copying the data frame to a new data frame before applying the `get_dummies` method for one-hot encoding**

In [None]:
# Work on an independent (deep) copy so df_raw stays untouched by later steps
df_work = df_raw.copy(deep=True)
df_work.head(5)

*Note that month column has <u>not</u> been converted to dummy values*

In [None]:
# One-hot encode the categorical columns; drop_first removes one level per column
categorical_cols = ['job','marital','education','contact','poutcome']
df_dummy = pd.get_dummies(data=df_work, columns=categorical_cols, drop_first=True)

In [None]:
# Single StandardScaler instance, re-fitted for each column it is applied to below
scaler = StandardScaler()

*Copying data to a new df again <u>after applying one-hot encoding</u> and before applying standardization*

In [None]:
# Standardize the continuous features on a copy, so df_dummy keeps the unscaled values
df_standardized = df_dummy.copy()
numeric_features = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
for feature in numeric_features:
    # the shared scaler is re-fitted per column; after the loop it holds the 'previous' statistics
    df_standardized[feature] = scaler.fit_transform(df_standardized[[feature]])
df_standardized.head()

In [None]:
# Give the campaign-related columns more descriptive names (one rename call, one mapping)
readable_names = {
    'campaign': 'number of contacts',
    'previous': 'previous # contacts',
    'pdays': '#days gap b/w contact',
}
df_standardized.rename(columns=readable_names, inplace=True)

In [None]:
## Replacing the '-1' which meant not contacted to '0'
# At this point 'pdays' has already been renamed to '#days gap b/w contact' and standardized,
# so looking it up under the old name raises KeyError and the raw -1 value no longer exists.
# Rebuild the column from the unscaled df_dummy instead: replace the sentinel first, then scale.
pdays_clean = df_dummy[['pdays']].replace({-1: 0})
df_standardized['#days gap b/w contact'] = scaler.fit_transform(pdays_clean)

Initialise and populate the training variables, i.e. x_train and y_train

In [None]:
# Column layout shared by the train and test frames: plain features first,
# then the one-hot dummies grouped by source column, and the target 'y' last.
base_features = [
    'age', 'default', 'balance', 'housing', 'loan', 'day', 'month', 'duration',
    'number of contacts', '#days gap b/w contact', 'previous # contacts',
]
job_dummies = [
    'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management',
    'job_retired', 'job_self-employed', 'job_services', 'job_student',
    'job_technician', 'job_unemployed', 'job_unknown',
]
marital_dummies = ['marital_married', 'marital_single']
education_dummies = ['education_secondary', 'education_tertiary', 'education_unknown']
contact_dummies = ['contact_telephone', 'contact_unknown']
poutcome_dummies = ['poutcome_other', 'poutcome_success', 'poutcome_unknown']

columns_order = (
    base_features
    + job_dummies
    + marital_dummies
    + education_dummies
    + contact_dummies
    + poutcome_dummies
    + ['y']
)

In [None]:
# Put the training columns into the agreed order (raises KeyError if one is missing)
df_standardized = df_standardized.loc[:, columns_order]

In [None]:
# Feature matrix: everything except the target column
x_train = df_standardized.drop(columns='y')

In [None]:
# Preview the training feature matrix
x_train.head()

In [None]:
# Target vector: the 'y' column of the training frame
y_train = df_standardized['y']

In [None]:
# Display the target before it is encoded
y_train

In [None]:
# Encode the target as integers (no -> 0, yes -> 1).
# Re-deriving it from the frame with `map` yields an integer Series and keeps the cell
# re-runnable, instead of relying on in-place `replace` and implicit dtype downcasting.
y_train = df_standardized['y'].map({'no': 0, 'yes': 1})

In [None]:
# Display the target again after the 0/1 encoding step
y_train

In [None]:
# Load the semicolon-separated test data and preview the first rows
df_test = pd.read_csv('test.csv', sep=';', encoding='UTF-8')
df_test.head(5)

In [None]:
# Encode the *test* frame. The previous call encoded df_work (the training data), which
# silently replaced the test set with the training set and made every test metric an
# evaluation on data the models were fitted on.
# 1) map the yes/no columns to 0/1 (identity entries keep already-encoded values intact)
binary_map = {'no': 0, 'yes': 1, 0: 0, 1: 1}
for col in ['default', 'housing', 'loan']:
    df_test[col] = df_test[col].map(binary_map)
# 2) one-hot encode, then align to the training layout: reindexing on df_dummy's columns
#    drops the reference level of each categorical (absent from df_dummy because of
#    drop_first=True) and adds any dummy missing from the test data as an all-zero column.
df_test = pd.get_dummies(
    df_test, columns=['job', 'marital', 'education', 'contact', 'poutcome']
).reindex(columns=df_dummy.columns, fill_value=0)

In [None]:
# Fit on the unscaled training column (df_dummy) and only transform the test column,
# so test data is standardized with the same statistics as the training features.
df_test['age'] = scaler.fit(df_dummy[['age']]).transform(df_test[['age']])

In [None]:
# Fit on the unscaled training column (df_dummy) and only transform the test column,
# so test data is standardized with the same statistics as the training features.
df_test['balance'] = scaler.fit(df_dummy[['balance']]).transform(df_test[['balance']])

In [None]:
# Fit on the unscaled training column (df_dummy) and only transform the test column,
# so test data is standardized with the same statistics as the training features.
df_test['duration'] = scaler.fit(df_dummy[['duration']]).transform(df_test[['duration']])

In [None]:
# Fit on the unscaled training column (df_dummy) and only transform the test column,
# so test data is standardized with the same statistics as the training features.
df_test['campaign'] = scaler.fit(df_dummy[['campaign']]).transform(df_test[['campaign']])

In [None]:
# Match the training frame's column name for the campaign contact count
campaign_alias = {'campaign': 'number of contacts'}
df_test.rename(columns=campaign_alias, inplace=True)

In [None]:
# Replace the -1 sentinel with 0 before scaling. Assign the result back: a chained
# `df[col].replace(..., inplace=True)` may act on a temporary copy and leave df_test unchanged.
df_test['pdays'] = df_test['pdays'].replace({-1: 0})

In [None]:
# Fit on the unscaled training column (with the same -1 -> 0 replacement applied) and only
# transform the test column, so test data is not standardized with its own statistics.
df_test['pdays'] = scaler.fit(df_dummy[['pdays']].replace({-1: 0})).transform(df_test[['pdays']])

In [None]:
# Standardize 'previous' exactly once (this cell used to be duplicated), fitting on the
# unscaled training column (df_dummy) and only transforming the test column.
df_test['previous'] = scaler.fit(df_dummy[['previous']]).transform(df_test[['previous']])

In [None]:
# Match the training frame's column name for the days-since-last-contact feature
pdays_alias = {'pdays': '#days gap b/w contact'}
df_test.rename(columns=pdays_alias, inplace=True)

In [None]:
# Preview the test frame after encoding and scaling
df_test.head()

In [None]:
# Match the training frame's column name for the previous-contacts count
previous_alias = {'previous': 'previous # contacts'}
df_test.rename(columns=previous_alias, inplace=True)

In [None]:
# Put the test columns into the same order as the training frame (KeyError if one is missing)
df_test = df_test.loc[:, columns_order]

In [None]:
# Test feature matrix: everything except the target column
x_test = df_test.drop(columns='y')

In [None]:
# Test target vector: the 'y' column of the test frame
y_test = df_test['y']

In [None]:
# Encode the test target as integers (no -> 0, yes -> 1).
# Re-deriving it from the frame with `map` yields an integer Series and keeps the cell
# re-runnable, instead of relying on in-place `replace` and implicit dtype downcasting.
y_test = df_test['y'].map({'no': 0, 'yes': 1})

In [None]:
# Sanity check: number of training labels
y_train.shape

Starting the ML model building process

In [None]:
# Logistic-regression classifier created with scikit-learn's default settings
LR = LogisticRegression()

In [None]:
# 'month' was never encoded numerically, so it is excluded from the training features
x_train = x_train.drop(columns='month')

In [None]:
# Fit the logistic regression on the training features and labels
LR.fit(X=x_train, y=y_train)

In [None]:
# Drop 'month' from the test features as well, so they match the training features
x_test = x_test.drop(columns='month')

In [None]:
# Predicted class labels for the test features
yhat = LR.predict(X=x_test)

Checking Model Accuracy For Logistic Regression

In [None]:
# Jaccard similarity between the true labels and the logistic-regression predictions
lr_jaccard = jaccard_score(y_test, yhat)
print (f'The Jaccard Score for this test is {lr_jaccard}')

Confusion Matrix

In [None]:
# Confusion matrix of true labels versus logistic-regression predictions
confusion_matrix(y_true=y_test, y_pred=yhat)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision / recall / F1 for the logistic-regression predictions
lr_report = classification_report(y_test, yhat)
print (lr_report)

K Nearest Neighbours Algorithm

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Evaluate k-nearest-neighbours for k = 1 .. Ks-1 and record the accuracy of each run
Ks = 10
mean_acc = np.zeros(Ks - 1)
std_acc = np.zeros(Ks - 1)

for slot, n in enumerate(range(1, Ks)):
    # train with n neighbours and predict the test features
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(x_train, y_train)
    yhat = neigh.predict(x_test)

    # accuracy, plus the standard error of the per-sample correctness indicator
    is_correct = yhat == y_test
    mean_acc[slot] = accuracy_score(y_test, yhat)
    std_acc[slot] = np.std(is_correct) / np.sqrt(yhat.shape[0])

mean_acc

We will select k = 3, as this gives us the highest accuracy, and print the classification report again

In [None]:
# Final k-nearest-neighbours model using the selected k.
# k must be passed explicitly: KNeighborsClassifier() without arguments uses its
# default n_neighbors (5), so the chosen value was previously ignored.
k = 3
Knn = KNeighborsClassifier(n_neighbors=k)
Knn.fit(x_train, y_train)
yhat = Knn.predict(x_test)

In [None]:
# Per-class precision / recall / F1 for the k-nearest-neighbours predictions
knn_report = classification_report(y_test, yhat)
print (knn_report)

We find that K-Nearest Neighbors is better at classifying the given dataset, based on the accuracy in the classification report

We will now work on SVM model to check accuracy

In [None]:
# Support-vector classifier with a sigmoid kernel and regularisation parameter C = 1
SVM_df = svm.SVC(kernel='sigmoid', C=1)
    

In [None]:
# Cross-validated search over the SVC kernel.
# Each kernel is listed once: the duplicate 'rbf' entry made the search fit and
# cross-validate the same configuration twice without changing the outcome.
parameters = {'kernel': ('linear', 'rbf', 'poly', 'sigmoid')}

grid = GridSearchCV(SVM_df, param_grid=parameters)

grid.fit(x_train, y_train)

# predictions come from the grid search object fitted above
yhat = grid.predict(x_test)

print("tuned hpyerparameters :(best parameters) ",grid.best_params_)
# best_score_ is the grid search's cross-validation score on the training data,
# not the accuracy on x_test; the report below covers the test predictions
print("accuracy :",grid.best_score_)
print (classification_report(y_test, yhat))

In [None]:
# Fit the standalone sigmoid-kernel SVC on the training data
SVM_df.fit(X=x_train, y=y_train)

In [None]:
# Predicted class labels from the sigmoid-kernel SVC
yhat = SVM_df.predict(X=x_test)

In [None]:
# Per-class precision / recall / F1 for the sigmoid-kernel SVC predictions
svm_report = classification_report(y_test, yhat)
print (svm_report)