In [8]:
# Import Data manipulation Libraries
import numpy as np
import pandas as pd

# Import Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import Filter Warning Libraries
import warnings
warnings.filterwarnings('ignore')

# Import Logging files 
import logging 
logging.basicConfig(level= logging.INFO,
                    filemode= 'w',
                    filename= 'model.log',
                    format= '%(asctime)s - %(levelname)s -  %(message)s', force = True) 
# import stats scipy library
import scipy.stats as stats                   

In [9]:
# Data Input Using Pandas function 

url = 'https://raw.githubusercontent.com/adnansayyedd/BankMarketing_MLModel/refs/heads/main/BankTelemarketing.csv' 

df = pd.read_csv(url,sep = ';') 

df.sample(frac = 1) #shuffle dataset

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
22558,47,services,married,secondary,no,2322,no,no,cellular,22,aug,618,1,-1,0,unknown,no
20236,33,management,single,tertiary,no,0,no,no,cellular,11,aug,699,7,-1,0,unknown,yes
20760,53,technician,married,secondary,no,230,no,yes,cellular,13,aug,112,2,-1,0,unknown,no
9613,58,management,single,unknown,no,1873,no,no,unknown,6,jun,49,2,-1,0,unknown,no
18418,35,blue-collar,single,primary,no,-759,yes,no,cellular,31,jul,142,2,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30551,42,technician,married,unknown,no,906,yes,no,cellular,5,feb,434,2,-1,0,unknown,no
3250,36,blue-collar,single,secondary,no,-175,yes,no,unknown,15,may,180,2,-1,0,unknown,no
6477,32,management,single,tertiary,no,1410,yes,no,unknown,27,may,175,5,-1,0,unknown,no
33528,56,blue-collar,divorced,secondary,no,9698,yes,no,cellular,20,apr,189,2,-1,0,unknown,no


In [10]:
df.shape,df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


((45211, 17), None)

In [11]:
logging.info('Dataset Uploaded Succesfully...')

In [12]:
# Split The Data Into Numerical_Data And Categorical_Data

Numerical_Data = df.select_dtypes(exclude = 'object')

Categorical_Data = df.select_dtypes(include = 'object')

In [13]:
# Checking Numerical_Data
Numerical_Data

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0
...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0
45207,71,1729,17,456,2,-1,0
45208,72,5715,17,1127,5,184,3
45209,57,668,17,508,4,-1,0


In [14]:
# Checking Categorical Data
Categorical_Data

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,management,married,tertiary,no,yes,no,unknown,may,unknown,no
1,technician,single,secondary,no,yes,no,unknown,may,unknown,no
2,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown,no
3,blue-collar,married,unknown,no,yes,no,unknown,may,unknown,no
4,unknown,single,unknown,no,no,no,unknown,may,unknown,no
...,...,...,...,...,...,...,...,...,...,...
45206,technician,married,tertiary,no,no,no,cellular,nov,unknown,yes
45207,retired,divorced,primary,no,no,no,cellular,nov,unknown,yes
45208,retired,married,secondary,no,no,no,cellular,nov,success,yes
45209,blue-collar,married,secondary,no,no,no,telephone,nov,unknown,no


In [27]:
# Checking Descriptive Stats: Numerical_data abd Categorical_Data

from collections import OrderedDict

stats = []

for i in Numerical_Data:

    Numerical_stats = OrderedDict({
        'feature' : i,
        'Maximum': Numerical_Data[i].max(),
        'Minimum': Numerical_Data[i].min(),
        'mean': Numerical_Data[i].mean(),
        'median': Numerical_Data[i].median(),
        '25': Numerical_Data[i].quantile(0.25),
        '75': Numerical_Data[i].quantile(0.75),
        'kurtosis':Numerical_Data[i].kurt(),
        'skewness': Numerical_Data[i].skew(),
        'standard Deviation': Numerical_Data[i].std()
    })

    stats.append(Numerical_stats)
    report = pd.DataFrame(stats)

report




Unnamed: 0,feature,Maximum,Minimum,mean,median,25,75,kurtosis,skewness,standard Deviation
0,age,95,18,40.93621,39.0,33.0,48.0,0.31957,0.684818,10.618762
1,balance,102127,-8019,1362.272058,448.0,72.0,1428.0,140.751547,8.360308,3044.765829
2,day,31,1,15.806419,16.0,8.0,21.0,-1.059897,0.093079,8.322476
3,duration,4918,0,258.16308,180.0,103.0,319.0,18.153915,3.144318,257.527812
4,campaign,63,1,2.763841,2.0,1.0,3.0,39.249651,4.89865,3.098021
5,pdays,871,-1,40.197828,-1.0,-1.0,-1.0,6.935195,2.615715,100.128746
6,previous,275,0,0.580323,0.0,0.0,0.0,4506.86066,41.846454,2.303441


In [28]:
logging.info('The above datset id non normsl distributed dataset')

In [29]:
#CHecking Categorical Dataset stats
for i in Categorical_Data:
    print(Categorical_Data[i].value_counts())
    print('*'*40)

job
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: count, dtype: int64
****************************************
marital
married     27214
single      12790
divorced     5207
Name: count, dtype: int64
****************************************
education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64
****************************************
default
no     44396
yes      815
Name: count, dtype: int64
****************************************
housing
yes    25130
no     20081
Name: count, dtype: int64
****************************************
loan
no     37967
yes     7244
Name: count, dtype: int64
****************************************
contact
cellular     29285
unknown      13020
telephone     2906
Name: count, dtype