In [1]:
# Importing the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Importing the data

df = pd.read_csv('bank-marketing.csv')
df.head()

Unnamed: 0,age,job,salary,marital,education,targeted,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,response
0,58,management,100000,married,tertiary,yes,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,60000,single,secondary,yes,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,120000,married,secondary,yes,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,20000,married,unknown,no,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,0,single,unknown,no,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
# Renaming yes and no in different columns to dintinguish them

df['targeted'].replace(['yes','no'],['t_yes','t_no'], inplace = True)
df['default'].replace(['yes','no'],['d_yes','d_no'], inplace = True)
df['housing'].replace(['yes','no'],['h_yes','h_no'], inplace = True)
df['loan'].replace(['yes','no'],['l_yes','l_no'], inplace = True)

In [4]:
df['response'] = df.response.replace(['yes','no'],[1,0])

In [5]:
# Creating list of the best features after analysis

cols = ['education', 'salary', 'age', 'marital', 'default', 'job', 'balance', 'previous', 'housing','response', 'duration', 'campaign']

In [6]:
# Creating dataframe with required cols
df1 = df[cols]

In [7]:
df1.head()

Unnamed: 0,education,salary,age,marital,default,job,balance,previous,housing,response,duration,campaign
0,tertiary,100000,58,married,d_no,management,2143,0,h_yes,0,261,1
1,secondary,60000,44,single,d_no,technician,29,0,h_yes,0,151,1
2,secondary,120000,33,married,d_no,entrepreneur,2,0,h_yes,0,76,1
3,unknown,20000,47,married,d_no,blue-collar,1506,0,h_yes,0,92,1
4,unknown,0,33,single,d_no,unknown,1,0,h_no,0,198,1


In [8]:
# Encoding the data with 1 and 0 for Machine Learning Models

def ohe(x,df):
    temp = pd.get_dummies(df[x], drop_first = True)
    df = pd.concat([df, temp], axis = 1)
    df.drop([x], axis = 1, inplace = True)
    return df

df1 = ohe('job',df1)
df1 = ohe('marital',df1)
df1 = ohe('education',df1)
df1 = ohe('default',df1)
df1 = ohe('housing',df1)

In [9]:
# X for train and Y for Test

x = df1.drop('response', axis = 1)
y = df1.response

In [10]:
# Train test split
X_train,X_test,y_train,y_test = train_test_split(x,y, test_size = 0.3, random_state = 50)

In [11]:
# Scaling the data
scaler =  StandardScaler()

In [12]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [13]:
# Random Forest Model

rf1 = RandomForestClassifier(n_estimators = 100,max_depth = 10, max_features = 'sqrt')

In [14]:
rf1.fit(X_train,y_train)
rf_predict = rf1.predict(X_test)

In [15]:
accuracy_score(y_test,rf_predict)

0.8909613683279268

In [16]:
print(classification_report(y_test,rf_predict))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94     12025
           1       0.58      0.14      0.22      1539

    accuracy                           0.89     13564
   macro avg       0.74      0.56      0.58     13564
weighted avg       0.86      0.89      0.86     13564



In [28]:
# Create a pickle file using steralization
import pickle
pickle_out = open("rf1.pkl","wb")
pickle.dump(rf1,pickle_out)
pickle_out.close()

In [18]:
c_names = x.columns

In [19]:
tdf = pd.DataFrame(X_test, columns = c_names)
tdf.head()
tdf.columns

Index(['salary', 'age', 'balance', 'previous', 'duration', 'campaign',
       'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired',
       'self-employed', 'services', 'student', 'technician', 'unemployed',
       'unknown', 'married', 'single', 'secondary', 'tertiary', 'unknown',
       'd_yes', 'h_yes'],
      dtype='object')

In [25]:
rf1.predict([[70000,25,2000,2,100,5,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1]])

array([0], dtype=int64)

In [21]:
x.head()

Unnamed: 0,salary,age,balance,previous,duration,campaign,blue-collar,entrepreneur,housemaid,management,...,technician,unemployed,unknown,married,single,secondary,tertiary,unknown.1,d_yes,h_yes
0,100000,58,2143,0,261,1,0,0,0,1,...,0,0,0,1,0,0,1,0,0,1
1,60000,44,29,0,151,1,0,0,0,0,...,1,0,0,0,1,1,0,0,0,1
2,120000,33,2,0,76,1,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
3,20000,47,1506,0,92,1,1,0,0,0,...,0,0,0,1,0,0,0,1,0,1
4,0,33,1,0,198,1,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0


In [27]:
#test_data = x.iloc[:10,:]
#test_data.to_csv('test_data.csv')

In [26]:
#test_data1 = x.iloc[-10:-1,:]
#test_data1.to_csv('test_data1.csv')