In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [5]:
bankcsv_df = pd.read_csv(r'C:\Users\Ashish\Desktop\machine learning\bank.csv')

In [7]:
bankcsv_df.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing-loan,personal-loan,current-campaign,previous-campaign,subscribed
0,30,unemployed,married,primary,no,1787,no,no,1,0,no
1,33,services,married,secondary,no,4789,yes,yes,1,4,no
2,35,management,single,tertiary,no,1350,yes,no,1,1,no
3,30,management,married,tertiary,no,1476,yes,yes,4,0,no
4,59,blue-collar,married,secondary,no,0,yes,no,1,0,no
5,35,management,single,tertiary,no,747,no,no,2,3,no
6,36,self-employed,married,tertiary,no,307,yes,no,1,2,no
7,39,technician,married,secondary,no,147,yes,no,2,0,no
8,41,entrepreneur,married,tertiary,no,221,yes,no,2,0,no
9,43,services,married,primary,no,-88,yes,yes,1,2,no


In [11]:
bankcsv_df.subscribed.value_counts()

subscribed
no     4000
yes     521
Name: count, dtype: int64

In [13]:
bankcsv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                4521 non-null   int64 
 1   job                4521 non-null   object
 2   marital            4521 non-null   object
 3   education          4521 non-null   object
 4   default            4521 non-null   object
 5   balance            4521 non-null   int64 
 6   housing-loan       4521 non-null   object
 7   personal-loan      4521 non-null   object
 8   current-campaign   4521 non-null   int64 
 9   previous-campaign  4521 non-null   int64 
 10  subscribed         4521 non-null   object
dtypes: int64(4), object(7)
memory usage: 388.7+ KB


In [17]:
X_features = list(bankcsv_df.columns)
X_features.remove('subscribed')
X_features

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing-loan',
 'personal-loan',
 'current-campaign',
 'previous-campaign']

In [19]:
encoded_bank_df = pd.get_dummies(bankcsv_df[X_features], drop_first  =True)

In [60]:
boolean_columns = encoded_bank_df.select_dtypes(include = ['bool']).columns
for col in boolean_columns:
    encoded_bank_df[col] = encoded_bank_df[col].astype(int)

In [62]:
Y = bankcsv_df.subscribed.map(lambda x: int(x == 'yes'))
X = sm.add_constant(encoded_bank_df)

In [54]:
encoded_bank_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   age                  4521 non-null   int64
 1   balance              4521 non-null   int64
 2   current-campaign     4521 non-null   int64
 3   previous-campaign    4521 non-null   int64
 4   job_blue-collar      4521 non-null   int32
 5   job_entrepreneur     4521 non-null   int32
 6   job_housemaid        4521 non-null   int32
 7   job_management       4521 non-null   int32
 8   job_retired          4521 non-null   int32
 9   job_self-employed    4521 non-null   int32
 10  job_services         4521 non-null   int32
 11  job_student          4521 non-null   int32
 12  job_technician       4521 non-null   int32
 13  job_unemployed       4521 non-null   int32
 14  job_unknown          4521 non-null   int32
 15  marital_married      4521 non-null   int32
 16  marital_single       452

In [64]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, train_size = 0.8 ,random_state = 42)

In [66]:
logit_model = sm.Logit(train_Y, train_X).fit()
logit_model.summary2()

Optimization terminated successfully.
         Current function value: 0.337098
         Iterations 7


0,1,2,3
Model:,Logit,Method:,MLE
Dependent Variable:,subscribed,Pseudo R-squared:,0.066
Date:,2025-01-14 12:03,AIC:,2485.8911
No. Observations:,3616,BIC:,2634.5260
Df Model:,23,Log-Likelihood:,-1218.9
Df Residuals:,3592,LL-Null:,-1304.9
Converged:,1.0000,LLR p-value:,9.2436e-25
No. Iterations:,7.0000,Scale:,1.0000

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-1.9024,0.4242,-4.4846,0.0000,-2.7339,-1.0710
age,0.0113,0.0064,1.7525,0.0797,-0.0013,0.0239
balance,-0.0000,0.0000,-0.2306,0.8176,-0.0000,0.0000
current-campaign,-0.0977,0.0264,-3.6952,0.0002,-0.1495,-0.0459
previous-campaign,0.1401,0.0231,6.0535,0.0000,0.0947,0.1854
job_blue-collar,-0.3901,0.2227,-1.7517,0.0798,-0.8265,0.0464
job_entrepreneur,-0.7912,0.4094,-1.9326,0.0533,-1.5936,0.0112
job_housemaid,0.0446,0.3565,0.1251,0.9005,-0.6541,0.7433
job_management,-0.0912,0.2244,-0.4066,0.6843,-0.5311,0.3486


In [70]:
def get_significant_values(lm):
    var_p_vals = pd.DataFrame(lm.pvalues)
    var_p_vals['vars'] = var_p_vals.index
    var_p_vals.columns = ['pvals', 'vars']
    return list(var_p_vals[var_p_vals.pvals <= 0.05]['vars'])

In [72]:
significant_vars = get_significant_values(logit_model)
significant_vars

['const',
 'current-campaign',
 'previous-campaign',
 'marital_married',
 'education_tertiary',
 'housing-loan_yes',
 'personal-loan_yes']

In [74]:
X_features = ['const',
 'current-campaign',
 'previous-campaign',
 'marital_married',
 'education_tertiary',
 'housing-loan_yes',
 'personal-loan_yes']

In [80]:
X =sm.add_constant(X[X_features])

In [84]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, train_size = 0.8 ,random_state = 42)

In [86]:
logit_model2 = sm.Logit(train_Y, train_X).fit()

Optimization terminated successfully.
         Current function value: 0.341929
         Iterations 7


In [88]:
logit_model2.summary2()

0,1,2,3
Model:,Logit,Method:,MLE
Dependent Variable:,subscribed,Pseudo R-squared:,0.052
Date:,2025-01-14 12:35,AIC:,2486.8315
No. Observations:,3616,BIC:,2530.1834
Df Model:,6,Log-Likelihood:,-1236.4
Df Residuals:,3609,LL-Null:,-1304.9
Converged:,1.0000,LLR p-value:,4.4294e-27
No. Iterations:,7.0000,Scale:,1.0000

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-1.3434,0.1213,-11.0722,0.0000,-1.5813,-1.1056
current-campaign,-0.0987,0.0263,-3.7603,0.0002,-0.1502,-0.0473
previous-campaign,0.1417,0.0229,6.1747,0.0000,0.0967,0.1866
marital_married,-0.3061,0.1069,-2.8646,0.0042,-0.5156,-0.0967
education_tertiary,0.2405,0.1116,2.1556,0.0311,0.0218,0.4591
housing-loan_yes,-0.7186,0.1078,-6.6637,0.0000,-0.9300,-0.5072
personal-loan_yes,-0.6351,0.1811,-3.5063,0.0005,-0.9901,-0.2801


In [100]:
Y_pred = logit_model2.predict(test_X)

In [118]:
y_pred_df = pd.DataFrame({'actual': train_Y, 'predicted_prob': logit_model2.predict(test_X)})

In [116]:
sorted_predict_df = y_pred_df[['predicted_prob', 'actual']].sort_values('predicted_prob', ascending=False)

In [120]:
num_per_decile = int(len(sorted_predict_df)/10)
print("Number of observation per decile:", num_per_decile)

Number of observation per decile: 452


In [122]:
def get_decile(df):
    df['decile']= 1
    idx = 0
    for each_d in range(0,10):
        df.iloc[idx:idx+num_per_decile, df.columns.get_loc('decile')] = each_d
        idx += num_per_decile
    df['decile'] = df['decile'] + 1
    return df

In [124]:
deciles_predict_df = get_decile(sorted_predict_df)

In [128]:
deciles_predict_df[0:10]

Unnamed: 0,predicted_prob,actual,decile
3471,0.436354,,1
1904,0.423424,,1
2277,0.417849,,1
1108,0.379106,,1
3612,0.356159,,1
4093,0.346374,,1
1106,0.335516,,1
800,0.289417,,1
1181,0.289401,,1
2486,0.289401,,1
