<a href="https://colab.research.google.com/github/addicted-ai/bank-marketing-classification/blob/main/Bank_Marketing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
url='https://raw.githubusercontent.com/addicted-ai/bank-marketing-classification/main/train.csv'
df = pd.read_csv(url,delimiter=';')

In [4]:
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [5]:
df.shape

(45211, 17)

In [6]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [8]:
df.rename(columns={'y':'output'}, inplace=True)

We can see that all 16 columns have `45211` non-null values that's also total row no. So there is no need to check null values or imputation.

## Data Analysis

### Checking Object Columns

In [9]:
obj_cols = list(df.dtypes[df.dtypes==np.object].keys())

In [10]:
for i in obj_cols:
    print('----------', i, '----------\n', df[i].value_counts(),'\n-----------------------------', sep='')

----------job----------
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: job, dtype: int64
-----------------------------
----------marital----------
married     27214
single      12790
divorced     5207
Name: marital, dtype: int64
-----------------------------
----------education----------
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: education, dtype: int64
-----------------------------
----------default----------
no     44396
yes      815
Name: default, dtype: int64
-----------------------------
----------housing----------
yes    25130
no     20081
Name: housing, dtype: int64
-----------------------------
----------loan----------
no     37967
yes     7244
Name: loan, dtype: int64
-----------------------------
----------contact-----

- We can see the column `output` i.e our output column as 2 levels yes & no.
- `loan` & `default` column has also 2 values yes & no.
- `job` column has `288` unknown value. `education` column has `1857` unknow value. These can be imputed.
- `poutcome` has `36959` unknown value, that's fairly large no. 

In [11]:
#pip install fancyimpute


## Dummy Variable Creation:

In [12]:
obj_cols

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome',
 'output']

## Dummy Variable Creation:

In [22]:
#variable to map

var =  ['default','housing','loan','output']

# Defining the map function
def binary_map(x):
    return x.map({'yes    ': 1, "no     o": 0})

# Applying the function 
df[var] = df[var].apply(binary_map)


In [23]:
df[var]

Unnamed: 0,default,housing,loan,output
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
45206,,,,
45207,,,,
45208,,,,
45209,,,,


In [15]:
# Creating a dummy variable for some of the categorical variables and dropping the first one.
dummy = pd.get_dummies(df[['job', 'marital', 'education','contact','month','poutcome']], drop_first=True)

# Adding the results to the master dataframe
df = pd.concat([df, dummy], axis=1)

In [16]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,output,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,contact_telephone,contact_unknown,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,58,management,married,tertiary,,2143,,,unknown,5,may,261,1,-1,0,unknown,,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,44,technician,single,secondary,,29,,,unknown,5,may,151,1,-1,0,unknown,,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
2,33,entrepreneur,married,secondary,,2,,,unknown,5,may,76,1,-1,0,unknown,,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
3,47,blue-collar,married,unknown,,1506,,,unknown,5,may,92,1,-1,0,unknown,,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
4,33,unknown,single,unknown,,1,,,unknown,5,may,198,1,-1,0,unknown,,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
