In [53]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

In [54]:
df = pd.read_excel('../data/bank_combined.xlsx', sheet_name = 'Sheet1')

In [5]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,contact_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y_no,y_yes,marital_divorced,marital_married,marital_single
0,49,unemployed,divorced,3.0,no,780,no,no,cellular,8,...,0,1,0,0,0,1,0,1,0,0
1,37,management,divorced,3.0,no,488,yes,no,cellular,17,...,0,1,0,0,0,0,1,1,0,0
2,35,management,single,3.0,no,151,no,no,unknown,20,...,1,1,0,0,0,1,0,0,0,1
3,31,housemaid,married,2.0,no,243,yes,no,cellular,23,...,0,1,0,0,0,0,1,0,1,0
4,43,blue-collar,married,2.0,no,408,yes,no,unknown,14,...,1,0,1,0,0,1,0,0,1,0


### Two-Sample T-Test for y_no/y_yes vs balance 

In [35]:
yes_balance = df['balance'][df['y_yes'] == 1].reset_index(drop=True)
no_balance = df['balance'][df['y_no'] == 1].reset_index(drop=True)

df_balance = pd.DataFrame({
    'yes_balance': yes_balance,
    'no_balance': no_balance
})

In [36]:
df_balance.describe()

Unnamed: 0,yes_balance,no_balance
count,5289.0,39922.0
mean,1804.267915,1303.714969
std,3501.104777,2974.195473
min,-3058.0,-8019.0
25%,210.0,58.0
50%,733.0,417.0
75%,2159.0,1345.0
max,81204.0,102127.0


In [12]:
df_balance.head()

Unnamed: 0,yes_balance,no_balance
0,488.0,780
1,243.0,151
2,320.0,408
3,1547.0,384
4,2048.0,1387


In [37]:
filtered_yes_balance = df_balance['yes_balance'].dropna()

num_samples = 1000
sample_size = 5289

bootstrap_samples = []

for _ in range(num_samples):
    sample = np.random.choice(filtered_yes_balance, size=sample_size, replace=True)
    bootstrap_samples.append(sample)

bootstrap_samples_yes_df = pd.DataFrame(bootstrap_samples)

bootstrap_samples_yes_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5279,5280,5281,5282,5283,5284,5285,5286,5287,5288
0,52.0,262.0,22.0,1612.0,-39.0,10613.0,312.0,6281.0,513.0,0.0,...,816.0,1636.0,2311.0,132.0,3417.0,893.0,70.0,1451.0,0.0,751.0
1,205.0,2948.0,684.0,2991.0,313.0,1090.0,133.0,3840.0,5561.0,751.0,...,29.0,0.0,196.0,942.0,247.0,318.0,870.0,1536.0,1538.0,265.0
2,973.0,926.0,-454.0,1506.0,93.0,506.0,740.0,0.0,382.0,3754.0,...,611.0,0.0,20.0,5275.0,653.0,507.0,200.0,2303.0,1003.0,87.0
3,701.0,376.0,48.0,1396.0,3115.0,58.0,3301.0,366.0,428.0,40.0,...,76.0,2892.0,469.0,185.0,52.0,0.0,403.0,6403.0,694.0,317.0
4,533.0,76.0,284.0,122.0,4987.0,978.0,3676.0,4256.0,3064.0,706.0,...,699.0,384.0,1188.0,4189.0,313.0,25.0,551.0,137.0,927.0,15.0


In [39]:
bootstrap_samples_yes_df.shape

(1000, 5289)

In [41]:
# with open('../data/bootstrap_samples_yes_df.pkl', 'wb') as f:
#     pickle.dump(bootstrap_samples_yes_df, f)

In [42]:
with open('../data/bootstrap_samples_yes_df.pkl', 'rb') as f:
    bootstrap_samples_yes_df = pickle.load(f)

In [43]:
yes_balance_sample_means = bootstrap_samples_yes_df.mean(axis=1)
yes_balance_sample_means.describe()

count    1000.000000
mean     1802.247006
std        49.742424
min      1658.646058
25%      1769.967196
50%      1801.720647
75%      1832.746502
max      1960.099263
dtype: float64

In [44]:
filtered_no_balance = df_balance['no_balance'].dropna()

num_samples = 1000
sample_size = 39922

bootstrap_samples = []

for _ in range(num_samples):
    sample = np.random.choice(filtered_no_balance, size=sample_size, replace=True)
    bootstrap_samples.append(sample)

bootstrap_samples_no_df = pd.DataFrame(bootstrap_samples)

bootstrap_samples_no_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39912,39913,39914,39915,39916,39917,39918,39919,39920,39921
0,1055,0,1537,3072,1405,2605,0,147,1023,457,...,-205,2818,2089,99,805,1042,1874,-617,0,2364
1,142,289,23,-438,622,1303,4344,-199,409,446,...,-252,382,2089,1192,195,214,849,477,1905,3409
2,399,595,1286,0,215,228,216,1030,87,225,...,786,1480,0,105,2472,-196,884,6,265,1661
3,1840,143,6770,777,2430,741,228,566,-328,355,...,3083,300,306,809,1137,393,5041,5317,5666,130
4,-5,1098,1259,240,-29,623,518,2185,7668,2352,...,140,0,17,3850,506,1028,122,0,0,11


In [47]:
# file_path = '../data/bootstrap_samples_no_df.pkl'
# with open(file_path, 'wb') as f:
#     pickle.dump(bootstrap_samples_no_df, f)

In [48]:
file_path = '../data/bootstrap_samples_no_df.pkl'
with open(file_path, 'rb') as f:
    bootstrap_samples_no_df = pickle.load(f)

In [49]:
no_balance_sample_means = bootstrap_samples_no_df.mean(axis=1)
no_balance_sample_means.describe()

count    1000.000000
mean     1303.799456
std        14.459256
min      1264.242874
25%      1294.160263
50%      1303.697172
75%      1314.565158
max      1350.859301
dtype: float64

In [51]:
t_stat, p_value = ttest_ind(no_balance_sample_means, yes_balance_sample_means)

print("T-statistic:", t_stat)
print("P-value:", p_value)

alpha = 0.05 
if p_value < alpha:
    print("Reject the null hypothesis: There is a significant difference between the means.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference between the means.")

T-statistic: -304.28354650264765
P-value: 0.0
Reject the null hypothesis: There is a significant difference between the means.


#### Conclusion: there is a signifant difference in average annual bank balances between those who sign up for a term deposit plan and those who do not. 

### Chi-Square Tests
#### House loan (yes/no) vs. term deposits (yes/no)

#### Previous outcome (y/n/unknown) v. term deposits (yes/no)