In [1]:
ls

Banking Marketing Targets.ipynb  train.csv
test.csv                         train.numbers


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.preprocessing import StandardScaler

In [3]:
df_raw = pd.read_csv('train.csv', delimiter=';', encoding='UTF-8')
df_raw.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
df_raw.describe(include = 'all')

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
count,45211.0,45211,45211,45211,45211,45211.0,45211,45211,45211,45211.0,45211,45211.0,45211.0,45211.0,45211.0,45211,45211
unique,,12,3,4,2,,2,2,3,,12,,,,,4,2
top,,blue-collar,married,secondary,no,,yes,no,cellular,,may,,,,,unknown,no
freq,,9732,27214,23202,44396,,25130,37967,29285,,13766,,,,,36959,39922
mean,40.93621,,,,,1362.272058,,,,15.806419,,258.16308,2.763841,40.197828,0.580323,,
std,10.618762,,,,,3044.765829,,,,8.322476,,257.527812,3.098021,100.128746,2.303441,,
min,18.0,,,,,-8019.0,,,,1.0,,0.0,1.0,-1.0,0.0,,
25%,33.0,,,,,72.0,,,,8.0,,103.0,1.0,-1.0,0.0,,
50%,39.0,,,,,448.0,,,,16.0,,180.0,2.0,-1.0,0.0,,
75%,48.0,,,,,1428.0,,,,21.0,,319.0,3.0,-1.0,0.0,,


Results from EDA above:
Numeric Data:
1) Age range from 18 - 95, mean = 40, std = 10.6
2) Balance - 1362 mean, std = 3044
Binary Data:
1) Default history - Yes / No
2) Housing loan - Yes / No
3) Personal Loan - Yes / No
Categorical Data:
1) Job - 12 categories
2) Marital - 3 categories
3) Education - 4 categories

9) contact: contact communication type (categorical: "unknown","telephone","cellular")
10) day: last contact day of the month (numeric)
11) month: last contact month of year (categorical: "jan", "feb", "mar", …, "nov", "dec")
12) duration: last contact duration, in seconds (numeric)
13) campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
14) pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
15) previous: number of contacts performed before this campaign and for this client (numeric)
16) poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")

In [5]:
df_raw.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

Binary variables to be mapped to 0 & 1 respectively

1) Default
2) Housing
3) Loan

In [6]:
df_raw['default'].value_counts()

no     44396
yes      815
Name: default, dtype: int64

In [7]:
df_raw['default'].replace({'no':0, 'yes':1},inplace=True)

In [8]:
df_raw['default'].value_counts()

0    44396
1      815
Name: default, dtype: int64

In [9]:
df_raw['housing'].value_counts()

yes    25130
no     20081
Name: housing, dtype: int64

In [10]:
df_raw['housing'].replace({'yes': 1, 'no': 0}, inplace=True)

In [11]:
df_raw['housing'].value_counts()

1    25130
0    20081
Name: housing, dtype: int64

In [12]:
df_raw['loan'].value_counts()

no     37967
yes     7244
Name: loan, dtype: int64

In [13]:
df_raw['loan'].replace({'no':0, 'yes':1},inplace=True)

In [14]:
df_raw['loan'].value_counts()

0    37967
1     7244
Name: loan, dtype: int64

In [15]:
df_raw['job'].value_counts()

blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: job, dtype: int64

In [16]:
df_work = df_raw.copy()

In [17]:
df_work.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,0,2143,1,0,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,0,29,1,0,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,0,2,1,1,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,0,1506,1,0,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,0,1,0,0,unknown,5,may,198,1,-1,0,unknown,no


Note that month column has not been converted to dummy values

In [18]:
df_dummy = pd.get_dummies(df_work, columns=['job','marital','education','contact','poutcome'], drop_first=True)

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
scaler = StandardScaler()

In [21]:
df_standardized = df_dummy.copy()

In [22]:
df_standardized['age'] = scaler.fit(df_standardized[['age']]).transform(df_standardized[['age']])

In [23]:
df_standardized.head()

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,...,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,contact_telephone,contact_unknown,poutcome_other,poutcome_success,poutcome_unknown
0,1.606965,0,2143,1,0,5,may,261,1,-1,...,1,0,0,1,0,0,1,0,0,1
1,0.288529,0,29,1,0,5,may,151,1,-1,...,0,1,1,0,0,0,1,0,0,1
2,-0.747384,0,2,1,1,5,may,76,1,-1,...,1,0,1,0,0,0,1,0,0,1
3,0.571051,0,1506,1,0,5,may,92,1,-1,...,1,0,0,0,1,0,1,0,0,1
4,-0.747384,0,1,0,0,5,may,198,1,-1,...,0,1,0,0,1,0,1,0,0,1


In [24]:
df_standardized['balance'] = scaler.fit(df_standardized[['balance']]).transform(df_standardized[['balance']])

In [25]:
df_standardized.head()

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,...,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,contact_telephone,contact_unknown,poutcome_other,poutcome_success,poutcome_unknown
0,1.606965,0,0.256419,1,0,5,may,261,1,-1,...,1,0,0,1,0,0,1,0,0,1
1,0.288529,0,-0.437895,1,0,5,may,151,1,-1,...,0,1,1,0,0,0,1,0,0,1
2,-0.747384,0,-0.446762,1,1,5,may,76,1,-1,...,1,0,1,0,0,0,1,0,0,1
3,0.571051,0,0.047205,1,0,5,may,92,1,-1,...,1,0,0,0,1,0,1,0,0,1
4,-0.747384,0,-0.447091,0,0,5,may,198,1,-1,...,0,1,0,0,1,0,1,0,0,1


In [26]:
df_standardized['duration'] = scaler.fit(df_standardized[['duration']]).transform(df_standardized[['duration']])

In [27]:
df_standardized.head()

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,...,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,contact_telephone,contact_unknown,poutcome_other,poutcome_success,poutcome_unknown
0,1.606965,0,0.256419,1,0,5,may,0.011016,1,-1,...,1,0,0,1,0,0,1,0,0,1
1,0.288529,0,-0.437895,1,0,5,may,-0.416127,1,-1,...,0,1,1,0,0,0,1,0,0,1
2,-0.747384,0,-0.446762,1,1,5,may,-0.707361,1,-1,...,1,0,1,0,0,0,1,0,0,1
3,0.571051,0,0.047205,1,0,5,may,-0.645231,1,-1,...,1,0,0,0,1,0,1,0,0,1
4,-0.747384,0,-0.447091,0,0,5,may,-0.23362,1,-1,...,0,1,0,0,1,0,1,0,0,1


In [28]:
df_standardized['campaign'] = scaler.fit(df_standardized[['campaign']]).transform(df_standardized[['campaign']])

In [29]:
df_standardized.head()

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,...,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,contact_telephone,contact_unknown,poutcome_other,poutcome_success,poutcome_unknown
0,1.606965,0,0.256419,1,0,5,may,0.011016,-0.569351,-1,...,1,0,0,1,0,0,1,0,0,1
1,0.288529,0,-0.437895,1,0,5,may,-0.416127,-0.569351,-1,...,0,1,1,0,0,0,1,0,0,1
2,-0.747384,0,-0.446762,1,1,5,may,-0.707361,-0.569351,-1,...,1,0,1,0,0,0,1,0,0,1
3,0.571051,0,0.047205,1,0,5,may,-0.645231,-0.569351,-1,...,1,0,0,0,1,0,1,0,0,1
4,-0.747384,0,-0.447091,0,0,5,may,-0.23362,-0.569351,-1,...,0,1,0,0,1,0,1,0,0,1


In [30]:
df_standardized.rename(columns={'campaign':'number of contacts'}, inplace=True)

In [31]:
df_standardized.head()

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,number of contacts,pdays,...,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,contact_telephone,contact_unknown,poutcome_other,poutcome_success,poutcome_unknown
0,1.606965,0,0.256419,1,0,5,may,0.011016,-0.569351,-1,...,1,0,0,1,0,0,1,0,0,1
1,0.288529,0,-0.437895,1,0,5,may,-0.416127,-0.569351,-1,...,0,1,1,0,0,0,1,0,0,1
2,-0.747384,0,-0.446762,1,1,5,may,-0.707361,-0.569351,-1,...,1,0,1,0,0,0,1,0,0,1
3,0.571051,0,0.047205,1,0,5,may,-0.645231,-0.569351,-1,...,1,0,0,0,1,0,1,0,0,1
4,-0.747384,0,-0.447091,0,0,5,may,-0.23362,-0.569351,-1,...,0,1,0,0,1,0,1,0,0,1


In [32]:
## Replacing the '-1' which meant not contacted to '0'
df_standardized['pdays'].replace({-1: 0}, inplace=True)

In [33]:
df_standardized['pdays'] = scaler.fit(df_standardized[['pdays']]).transform(df_standardized[['pdays']])

In [34]:
df_standardized['previous'] = scaler.fit(df_standardized[['previous']]).transform(df_standardized[['previous']])

In [35]:
df_standardized.rename(columns={'previous': 'previous # contacts'}, inplace=True)

In [36]:
df_standardized.head()

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,number of contacts,pdays,...,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,contact_telephone,contact_unknown,poutcome_other,poutcome_success,poutcome_unknown
0,1.606965,0,0.256419,1,0,5,may,0.011016,-0.569351,-0.411009,...,1,0,0,1,0,0,1,0,0,1
1,0.288529,0,-0.437895,1,0,5,may,-0.416127,-0.569351,-0.411009,...,0,1,1,0,0,0,1,0,0,1
2,-0.747384,0,-0.446762,1,1,5,may,-0.707361,-0.569351,-0.411009,...,1,0,1,0,0,0,1,0,0,1
3,0.571051,0,0.047205,1,0,5,may,-0.645231,-0.569351,-0.411009,...,1,0,0,0,1,0,1,0,0,1
4,-0.747384,0,-0.447091,0,0,5,may,-0.23362,-0.569351,-0.411009,...,0,1,0,0,1,0,1,0,0,1


In [37]:
df_standardized.rename(columns={'pdays': '#days gap b/w contact'}, inplace=True)

Initiate and populate the train variables i.e. x_train and y_train

In [38]:
columns_order = df_standardized.columns.to_list()

In [39]:
columns_order = ['age',
 'default',
 'balance',
 'housing',
 'loan',
 'day',
 'month',
 'duration',
 'number of contacts',
 '#days gap b/w contact',
 'previous # contacts',
 'job_blue-collar',
 'job_entrepreneur',
 'job_housemaid',
 'job_management',
 'job_retired',
 'job_self-employed',
 'job_services',
 'job_student',
 'job_technician',
 'job_unemployed',
 'job_unknown',
 'marital_married',
 'marital_single',
 'education_secondary',
 'education_tertiary',
 'education_unknown',
 'contact_telephone',
 'contact_unknown',
 'poutcome_other',
 'poutcome_success',
 'poutcome_unknown',
'y']

In [40]:
df_standardized = df_standardized[columns_order]

In [41]:
x_train = df_standardized.drop('y', axis=1)

In [42]:
x_train.head()

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,number of contacts,#days gap b/w contact,...,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,contact_telephone,contact_unknown,poutcome_other,poutcome_success,poutcome_unknown
0,1.606965,0,0.256419,1,0,5,may,0.011016,-0.569351,-0.411009,...,1,0,0,1,0,0,1,0,0,1
1,0.288529,0,-0.437895,1,0,5,may,-0.416127,-0.569351,-0.411009,...,0,1,1,0,0,0,1,0,0,1
2,-0.747384,0,-0.446762,1,1,5,may,-0.707361,-0.569351,-0.411009,...,1,0,1,0,0,0,1,0,0,1
3,0.571051,0,0.047205,1,0,5,may,-0.645231,-0.569351,-0.411009,...,1,0,0,0,1,0,1,0,0,1
4,-0.747384,0,-0.447091,0,0,5,may,-0.23362,-0.569351,-0.411009,...,0,1,0,0,1,0,1,0,0,1


In [43]:
y_train = df_standardized['y']

In [44]:
y_train

0         no
1         no
2         no
3         no
4         no
        ... 
45206    yes
45207    yes
45208    yes
45209     no
45210     no
Name: y, Length: 45211, dtype: object

In [45]:
y_train.replace({'no':0, 'yes':1}, inplace=True)

In [46]:
y_train

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

In [47]:
df_test = pd.read_csv('test.csv', encoding='UTF-8', delimiter=';')
df_test.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [48]:
df_test = pd.get_dummies(df_work, columns=['job','marital','education','contact','poutcome'], drop_first=True)

In [49]:
df_test['age'] = scaler.fit(df_test[['age']]).transform(df_test[['age']])

In [50]:
df_test['balance'] = scaler.fit(df_test[['balance']]).transform(df_test[['balance']])

In [51]:
df_test['duration'] = scaler.fit(df_test[['duration']]).transform(df_test[['duration']])

In [52]:
df_test['campaign'] = scaler.fit(df_test[['campaign']]).transform(df_test[['campaign']])

In [53]:
df_test.rename(columns={'campaign':'number of contacts'}, inplace=True)

In [54]:
df_test['pdays'].replace({-1: 0}, inplace=True)

In [55]:
df_test['pdays'] = scaler.fit(df_test[['pdays']]).transform(df_test[['pdays']])

In [56]:
df_test['previous'] = scaler.fit(df_test[['previous']]).transform(df_test[['previous']])

In [57]:
df_test['previous'] = scaler.fit(df_test[['previous']]).transform(df_test[['previous']])

In [58]:
df_test.rename(columns={'pdays': '#days gap b/w contact'}, inplace=True)

In [59]:
df_test.head()

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,number of contacts,#days gap b/w contact,...,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,contact_telephone,contact_unknown,poutcome_other,poutcome_success,poutcome_unknown
0,1.606965,0,0.256419,1,0,5,may,0.011016,-0.569351,-0.411009,...,1,0,0,1,0,0,1,0,0,1
1,0.288529,0,-0.437895,1,0,5,may,-0.416127,-0.569351,-0.411009,...,0,1,1,0,0,0,1,0,0,1
2,-0.747384,0,-0.446762,1,1,5,may,-0.707361,-0.569351,-0.411009,...,1,0,1,0,0,0,1,0,0,1
3,0.571051,0,0.047205,1,0,5,may,-0.645231,-0.569351,-0.411009,...,1,0,0,0,1,0,1,0,0,1
4,-0.747384,0,-0.447091,0,0,5,may,-0.23362,-0.569351,-0.411009,...,0,1,0,0,1,0,1,0,0,1


In [60]:
df_test.rename(columns={'previous': 'previous # contacts'}, inplace=True)

In [61]:
df_test = df_test[columns_order]

In [62]:
x_test = df_test.drop('y', axis = 1)

In [63]:
y_test = df_test['y']

In [64]:
y_test.replace({'no':0, 'yes':1}, inplace=True)

In [65]:
y_train.shape

(45211,)

Starting the ML model building process

In [66]:
from sklearn.linear_model import LogisticRegression

In [67]:
LR = LogisticRegression()

In [68]:
x_train.drop('month', inplace=True, axis=1)

In [69]:
LR.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [70]:
x_test.drop('month', inplace=True, axis =1)

In [71]:
yhat = LR.predict(x_test)

Checking Model Accuracy For Logistic Regression

In [72]:
from sklearn.metrics import jaccard_score
from sklearn.metrics import confusion_matrix

In [73]:
print (f'The Jaccard Score for this test is {jaccard_score(y_test,yhat)}')

The Jaccard Score for this test is 0.2761566983717556


Confusion Matrix

In [74]:
confusion_matrix(y_test, yhat)

array([[39008,   914],
       [ 3576,  1713]])

In [75]:
from sklearn.metrics import classification_report

In [76]:
print (classification_report(y_test, yhat))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95     39922
           1       0.65      0.32      0.43      5289

    accuracy                           0.90     45211
   macro avg       0.78      0.65      0.69     45211
weighted avg       0.89      0.90      0.89     45211



K Nearest Neighbours Algorithm

In [77]:
from sklearn.neighbors import KNeighborsClassifier

In [78]:
from sklearn.metrics import accuracy_score

In [79]:
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))

for n in range(1,Ks):
    
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = n).fit(x_train,y_train)
    yhat=neigh.predict(x_test)
    mean_acc[n-1] = accuracy_score(y_test, yhat)

    
    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])

mean_acc

array([1.        , 0.92853509, 0.93198558, 0.91515339, 0.91860388,
       0.91079605, 0.91322908, 0.90849572, 0.9102652 ])

We will select k = 3 as this gives us the highest accuracy and again print classification report

In [80]:
k = 4
Knn = KNeighborsClassifier()
Knn.fit(x_train, y_train)
yhat = Knn.predict(x_test)

In [81]:
print (classification_report(y_test, yhat))

              precision    recall  f1-score   support

           0       0.93      0.98      0.96     39922
           1       0.75      0.46      0.57      5289

    accuracy                           0.92     45211
   macro avg       0.84      0.72      0.76     45211
weighted avg       0.91      0.92      0.91     45211



We find that KNearestNeighbors is better at classifying the given dataset basis classification report accuracy

We will now work on SVM model to check accuracy

In [83]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [85]:
SVM_df = svm.SVC()
    

In [89]:
parameters = {'kernel':('linear', 'rbf','poly','rbf', 'sigmoid'),
              'C': np.logspace(-3, 3, 5),
              'gamma':np.logspace(-3, 3, 5)}

In [90]:
grid = GridSearchCV(SVM_df, param_grid=parameters)

In [91]:
grid.fit(x_train, y_train)

In [None]:
yhat = grid.predict(x_test)

In [None]:
print("tuned hpyerparameters :(best parameters) ",grid.best_params_)
print("accuracy :",grid.best_score_)
print (classification_report(y_test, yhat))