In [77]:
import numpy as np
import pandas as pd
import scipy.stats as stats

In [78]:
# Read the semicolon-separated CSV into the notebook-level frame `df`
# and show its first rows as the cell output.
csv_path = 'bank-full.csv'
df = pd.read_csv(csv_path, sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


- Mapping the y column so we can make calculations on it.

In [79]:
# Encode the target column as 1 ('yes') / 0 ('no') so its mean can be compared
# between groups. The dict is deliberately NOT called `map`, which would shadow
# the Python builtin for the rest of the kernel session.
y_encoding = {'yes': 1, 'no': 0}

# Only encode while `y` still holds the raw labels: running Series.map a second
# time on the already-encoded 0/1 values would turn every entry into NaN.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = df['y'].map(y_encoding)

#### Admin jobs tend to have higher y-yes values than the rest of the jobs. (Reject the null hypothesis)
- null = rest jobs have higher or equal y-yes values than admin
- alternative = rest jobs have lower y-yes values than admin

In [80]:
# Split the encoded target y into admin clients and everyone else.
mask = df['job'].eq('admin.')
admin = df.loc[mask, 'y']
rest_jobs = df.loc[~mask, 'y']

# One-sided test; alternative='less' means mean(rest_jobs) < mean(admin).
statistic, p_value = stats.ttest_ind(
    rest_jobs,
    admin,
    alternative='less',
)

print(f'statistic is : {statistic:.003f}, p-value: {p_value:.003f}')

statistic is : -6.381, p-value: 0.000


#### Married status tends to have higher y-yes values than the rest of the marital statuses. We will test the married marital status against the rest of the marital statuses and against the single marital status, because married clients account for more than half of the recorded data.
1. (Fail to reject the null Hypothesis)
    - null = rest marital statuses have higher or equal y-yes values than married
    - alternative = rest marital status have lower y-yes values than married
2. (Reject the null Hypothesis)
    - null = Married have higher or equal y-yes values than Single
    - alternative = Married status have lower y-yes values than Single
3. (Reject the null Hypothesis)
    - null = other marital statuses have higher or equal y-yes values than single
    - alternative = rest marital status have lower y-yes values than single

In [81]:
# Split the encoded target y into married clients and all other marital statuses.
mask = df['marital'].eq('married')

married = df.loc[mask, 'y']
other_marital = df.loc[~mask, 'y']

# One-sided test; alternative='less' means mean(other_marital) < mean(married).
statistic, p_value = stats.ttest_ind(
    other_marital,
    married,
    alternative='less',
)

print(f'statistic is : {statistic:.003f}, p-value: {p_value:.003f}')

statistic is : 8.816, p-value: 1.000


In [82]:
# Compare married clients directly against single clients (other statuses are left out).
mask1 = df['marital'].eq('married')
mask2 = df['marital'].eq('single')

married = df.loc[mask1, 'y']
single = df.loc[mask2, 'y']

# One-sided test; alternative='less' means mean(married) < mean(single).
statistic, p_value = stats.ttest_ind(
    married,
    single,
    alternative='less',
)

print(f'statistic is : {statistic:.003f}, p-value: {p_value:.003f}')

statistic is : -10.786, p-value: 0.000


In [83]:
# Split the encoded target y into single clients and all other marital statuses.
mask = df['marital'].eq('single')

single = df.loc[mask, 'y']
other_marital = df.loc[~mask, 'y']

# One-sided test; alternative='less' means mean(other_marital) < mean(single).
statistic, p_value = stats.ttest_ind(
    other_marital,
    single,
    alternative='less',
)

print(f'statistic is : {statistic:.003f}, p-value: {p_value:.003f}')

statistic is : -11.002, p-value: 0.000


#### University degree holders tend to have higher y-yes values than the rest of the education levels. (Reject the null hypothesis)
- null = rest of degrees have higher or equal y-yes values than University degree.
- alternative = rest of degrees have lower y-yes values than University degree.

In [84]:
# Split the encoded target y into university-degree clients and all other education levels.
mask = df['education'].eq('university.degree')

university = df.loc[mask, 'y']
other_degrees = df.loc[~mask, 'y']

# One-sided test; alternative='less' means mean(other_degrees) < mean(university).
statistic, p_value = stats.ttest_ind(
    other_degrees,
    university,
    alternative='less',
)

print(f'statistic is : {statistic:.003f}, p-value: {p_value:.003f}')

statistic is : -10.234, p-value: 0.000


#### Cellular contact tends to have higher y-yes values than telephone contact. (Reject the null hypothesis)
- null = telephones have higher or equal y-yes values than cellular.
- alternative = telephone have lower y-yes values than cellular.

In [85]:
# Rows contacted by cellular vs. every other row; the complement is labelled
# `telephone` (assumes `contact` has no other values - TODO confirm).
mask = df['contact'].eq('cellular')

cellular = df.loc[mask, 'y']
telephone = df.loc[~mask, 'y']

# One-sided test; alternative='less' means mean(telephone) < mean(cellular).
statistic, p_value = stats.ttest_ind(
    telephone,
    cellular,
    alternative='less',
)

print(f'statistic is : {statistic:.003f}, p-value: {p_value:.003f}')

statistic is : -29.694, p-value: 0.000


#### [may, jun, jul, aug] Summer months tend to have higher y-yes values than other months. (Fail to reject the null hypothesis)
- null = other months have higher than or equal y-yes values than summer months.
- alternative = other months have lower y-yes values than summer months.

In [86]:
# Rows whose month is one of may/jun/jul/aug vs. rows from every other month.
mask = df['month'].isin(['may', 'jun', 'jul', 'aug'])

summer = df.loc[mask, 'y']
other_months = df.loc[~mask, 'y']

# One-sided test; alternative='less' means mean(other_months) < mean(summer).
statistic, p_value = stats.ttest_ind(
    other_months,
    summer,
    alternative='less',
)

print(f'statistic is : {statistic:.003f}, p-value: {p_value:.003f}')

statistic is : 35.006, p-value: 1.000


#### Success poutcomes tend to have higher y-yes values than Failure poutcomes. (Reject the null hypothesis)
- null = Failure poutcomes have higher or equal y-yes values than Success poutcomes.
- alternative = Failure poutcomes have lower y-yes values than success poutcomes.

In [87]:
# Compare only rows whose poutcome is 'success' or 'failure'; other values are left out.
mask1 = df['poutcome'].eq('success')
mask2 = df['poutcome'].eq('failure')
success = df.loc[mask1, 'y']
failure = df.loc[mask2, 'y']

# One-sided test; alternative='less' means mean(failure) < mean(success).
statistic, p_value = stats.ttest_ind(
    failure,
    success,
    alternative='less',
)

print(f'statistic is : {statistic:.003f}, p-value: {p_value:.003f}')

statistic is : -42.647, p-value: 0.000


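- The tests below are one-sided, with the alternative "the column's mean for y-yes rows is lower than for y-no rows". A p-value near 0.0 (negative statistic) means the column is significantly lower for y-yes rows, i.e. a negative association with y. A p-value near 1.0 (positive statistic) only means there is no evidence for "lower"; the large positive statistics suggest a positive association, which should be confirmed by re-running the test with `alternative='greater'`.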

In [105]:
def hypothesis_testing_for_relations(col, data=None, alternative='less'):
    """Run a two-sample t-test on ``col`` between y == 1 rows and all other rows.

    Prints one formatted line with the statistic and p-value, and also returns
    them so callers can collect the results instead of reading stdout.

    Parameters
    ----------
    col : str
        Name of the column to compare between the two groups.
    data : pandas.DataFrame, optional
        Frame holding ``col`` and a ``y`` column where 1 marks the "yes" group.
        Defaults to the notebook-level ``df`` so existing calls keep working.
    alternative : str, optional
        Forwarded to ``scipy.stats.ttest_ind``. The default ``'less'`` tests
        whether the mean of ``col`` in the y == 1 group is lower than in the
        remaining rows.

    Returns
    -------
    tuple
        ``(statistic, p_value)`` as returned by ``scipy.stats.ttest_ind``.
    """
    if data is None:
        # Fall back to the notebook's global frame (original behaviour).
        data = df

    is_yes = data['y'] == 1

    yes = data.loc[is_yes, col]
    no = data.loc[~is_yes, col]

    statistic, p_value = stats.ttest_ind(yes, no, alternative=alternative)

    print(f'{col:15s} | statistic is : {statistic:.003f}, p-value: {p_value:.003f}')

    return statistic, p_value

In [106]:
# Keep every non-object column except the target `y`, then run the helper on each one.
numerical_data = df.select_dtypes(exclude='object').drop(columns='y')

for col in numerical_data:
    hypothesis_testing_for_relations(col)

age             | statistic is : 6.172, p-value: 1.000
duration        | statistic is : 89.967, p-value: 1.000
campaign        | statistic is : -13.497, p-value: 0.000
pdays           | statistic is : -69.722, p-value: 0.000
previous        | statistic is : 48.003, p-value: 1.000
emp.var.rate    | statistic is : -63.434, p-value: 0.000
cons.price.idx  | statistic is : -27.903, p-value: 0.000
cons.conf.idx   | statistic is : 11.154, p-value: 1.000
euribor3m       | statistic is : -65.647, p-value: 0.000
nr.employed     | statistic is : -76.984, p-value: 0.000
