In [None]:
import pandas as pd
import numpy as np
import sklearn

In [None]:
# Load the cleaned training data.
# NOTE(review): absolute '/content/...' path — presumably a Colab session; confirm before running elsewhere.
train = pd.read_csv('/content/cleaned.csv')

In [None]:
# Remove the identifier column, then one-hot encode the non-numeric columns
# (the first level of each is dropped).
X = pd.get_dummies(train.drop(columns=['customer_id']), drop_first=True)

In [None]:
from sklearn import preprocessing

In [None]:
# Shared MinMaxScaler instance; it is re-fit for every column that gets scaled further down.
normalizer=preprocessing.MinMaxScaler()

In [None]:
# Min-max scale each numeric feature on its own. The shared scaler is re-fit per
# column, so after the loop it only holds the range of the last column.
# NOTE(review): the scaler is fit on every row here, i.e. before the train/test split.
for column in ('customer_age', 'num_contacts_in_campaign', 'balance',
               'day_of_month', 'last_contact_duration'):
    X[column] = normalizer.fit_transform(X[column].to_numpy().reshape(-1, 1))

In [None]:
# Discard the 'Unnamed: 0' column from the feature frame.
X = X.drop(columns='Unnamed: 0')

In [None]:
import imblearn

In [None]:
# The target is read from the raw frame; the feature matrix loses that column.
y = train['term_deposit_subscribed']
X = X.drop(columns='term_deposit_subscribed')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
#Create an instance

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

In [None]:
# 5-fold splitter with shuffling and a fixed seed.
kf =KFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
# Print how many rows fall on the train and test side of every fold.
for fold_no, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{fold_no}, Train set: {len(train_index)}, Test set:{len(test_index)}')

Fold:1, Train set: 19018, Test set:4755
Fold:2, Train set: 19018, Test set:4755
Fold:3, Train set: 19018, Test set:4755
Fold:4, Train set: 19019, Test set:4754
Fold:5, Train set: 19019, Test set:4754


In [None]:
# Baseline: cross-validated accuracy of a default random forest over the `kf` folds.
# The forest is seeded so that re-running the cell reproduces the same scores.
score = cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=kf, scoring="accuracy")

In [None]:
# Per-fold accuracies returned by cross_val_score.
score

array([0.90473186, 0.90368034, 0.90431125, 0.90492217, 0.89945309])

In [None]:
# Sweep the number of trees and print the mean cross-validated accuracy for each setting.
n_estimators = [50, 100, 150, 200, 250, 300, 350]

for val in n_estimators:
    forest = RandomForestClassifier(n_estimators=val, random_state=42)
    score = cross_val_score(forest, X, y, cv=kf, scoring="accuracy")
    print(f'Average score({val}): {score.mean():.3f}')

Average score(50): 0.902
Average score(100): 0.903
Average score(150): 0.904
Average score(200): 0.904
Average score(250): 0.904
Average score(300): 0.904
Average score(350): 0.904


In [None]:
pip install hyperopt



In [None]:
from xgboost import XGBClassifier

In [None]:
from hyperopt import hp,STATUS_OK

In [None]:
# Hyperopt search space for the XGBoost classifier.
# quniform entries are sampled in steps of 1; plain values are passed through as constants.
space = dict(
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=180,
    seed=0,
)

In [None]:
def objective (space):
  """Hyperopt objective: fit one XGBoost model and score it on the hold-out split.

  Args:
    space: dict of sampled hyperparameters with the keys `n_estimators`,
      `max_depth`, `gamma`, `reg_alpha`, `reg_lambda`, `min_child_weight`,
      `colsample_bytree` and `seed`.

  Returns:
    dict with `loss` (negative accuracy, since hyperopt minimises the loss)
    and `status`.

  NOTE(review): (X_test, y_test) is used both for early stopping and for the
  reported accuracy, so the score is optimistic.
  """
  clf = XGBClassifier(
      n_estimators=space['n_estimators'],
      max_depth=int(space['max_depth']),
      gamma=space['gamma'],
      reg_alpha=int(space['reg_alpha']),
      # Sampled by the search space but previously never handed to the model.
      reg_lambda=space['reg_lambda'],
      min_child_weight=int(space['min_child_weight']),
      # Must stay a float: int() truncated every draw below 1 to 0.
      colsample_bytree=space['colsample_bytree'],
      # Seed the booster so a given parameter set always yields the same score.
      random_state=space['seed'])
  evaluation = [(X_train, y_train), (X_test, y_test)]
  clf.fit(X_train, y_train, eval_set=evaluation, eval_metric="auc",
          early_stopping_rounds=10, verbose=False)
  # predict() already returns class labels, so no thresholding is needed.
  pred = clf.predict(X_test)
  accuracy = accuracy_score(y_test, pred)
  print ("SCORE: ", accuracy)
  return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
import hyperopt

In [None]:
# Trials object handed to fmin to record the evaluations of the search.
trials = hyperopt.Trials()

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# SMOTE oversampler targeting a minority/majority ratio of 0.75.
# Seeded so the synthetic samples (and everything trained on them) are reproducible.
oversampler=SMOTE(sampling_strategy=0.75, random_state=42)

In [None]:
# Oversample the training split only; X_train/y_train are overwritten with the
# resampled data, while X_test/y_test are left untouched.
X_train,y_train=oversampler.fit_resample(X_train, y_train)

In [None]:
# Class labels and their counts in the resampled training target.
np.unique(y_train,return_counts=True)

(array([0, 1]), array([16975, 12731]))

In [None]:
from hyperopt import fmin, tpe

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
# Run up to 100 TPE-suggested evaluations of `objective`, recording them in `trials`.
# NOTE(review): no `rstate` is passed, so the search presumably varies between runs — confirm.
best_hyperparams=fmin(fn=objective,space=space,algo=tpe.suggest,max_evals=100,trials=trials)

SCORE: 
0.8958990536277602
SCORE: 
0.8963196635120926
SCORE: 
0.8969505783385909
SCORE: 
0.895478443743428
SCORE: 
0.8956887486855941
SCORE: 
0.895478443743428
SCORE: 
0.8963196635120926
SCORE: 
0.8956887486855941
SCORE: 
0.8956887486855941
SCORE: 
0.8963196635120926
SCORE: 
0.8975814931650894
SCORE: 
0.895478443743428
SCORE: 
0.8958990536277602
SCORE: 
0.8969505783385909
SCORE: 
0.8973711882229233
SCORE: 
0.8956887486855941
SCORE: 
0.8956887486855941
SCORE: 
0.8975814931650894
SCORE: 
0.8956887486855941
SCORE: 
0.8950578338590957
SCORE: 
0.8990536277602523
SCORE: 
0.8973711882229233
SCORE: 
0.8988433228180862
SCORE: 
0.8980021030494216
SCORE: 
0.8980021030494216
SCORE: 
0.8982124079915877
SCORE: 
0.8965299684542587
SCORE: 
0.894006309148265
SCORE: 
0.897160883280757
SCORE: 
0.8986330178759201
SCORE: 
0.8982124079915877
SCORE: 
0.895478443743428
SCORE: 
0.8975814931650894
SCORE: 
0.8967402733964248
SCORE: 
0.8956887486855941
SCORE: 
0.8948475289169295
SCORE: 
0.8961093585699264
SCORE: 

In [None]:
# Show the parameter values returned by fmin.
best_hyperparams

{'colsample_bytree': 0.8270036744423783,
 'gamma': 2.8835931053514643,
 'max_depth': 3.0,
 'min_child_weight': 5.0,
 'reg_alpha': 177.0,
 'reg_lambda': 0.8009277610301544}

In [None]:
# Build the final model from the search result rather than from hard-coded numbers
# that do not match `best_hyperparams`, so this cell always reflects the latest search.
# max_depth comes back from hyperopt as a float and is cast to the int XGBoost expects.
# NOTE(review): n_estimators is left at the library default here, while the search used
# space['n_estimators'] — confirm which is intended.
clf_best_model = XGBClassifier(
    colsample_bytree=best_hyperparams['colsample_bytree'],
    gamma=best_hyperparams['gamma'],
    max_depth=int(best_hyperparams['max_depth']),
    min_child_weight=best_hyperparams['min_child_weight'],
    reg_alpha=best_hyperparams['reg_alpha'],
    reg_lambda=best_hyperparams['reg_lambda'])

In [None]:
# Fit the final model on the (resampled) training split.
clf_best_model.fit(X_train,y_train)

XGBClassifier(colsample_bytree=0.6983247169581334, gamma=4.89914799141072,
              max_depth=17, min_child_weight=7.0, reg_alpha=40.0,
              reg_lambda=0.6523469701368025)

In [None]:
# Class predictions for the hold-out split.
pred=clf_best_model.predict(X_test)

In [None]:
# Per-class precision/recall/F1 on the hold-out split.
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      4270
           1       0.49      0.40      0.44       485

    accuracy                           0.90      4755
   macro avg       0.71      0.68      0.69      4755
weighted avg       0.89      0.90      0.89      4755



In [None]:
# Confusion matrix on the hold-out split (rows: true class, columns: predicted class).
print(confusion_matrix(y_test,pred))

[[4068  202]
 [ 292  193]]


In [None]:
# Load the data to be scored (relative path, unlike the training file).
test = pd.read_csv('cleaned_test.csv')

In [None]:
# Preview the first rows of the scoring data.
test.head()

Unnamed: 0.1,Unnamed: 0,customer_id,customer_age,job_type,marital,education,default,balance,housing_loan,personal_loan,communication_type,last_contact_duration,day_of_month,month,num_contacts_in_campaign,num_contacts_prev_campaign,prev_campaign_outcome
0,0,id_43823,28.0,management,single,tertiary,no,285.0,yes,no,unknown,849,26,jun,4.0,0,unknown
1,1,id_10523,46.0,technician,married,secondary,no,656.0,no,no,cellular,990,5,feb,4.0,0,unknown
2,2,id_43951,34.0,services,single,secondary,no,2.0,yes,no,unknown,886,20,may,3.0,0,unknown
3,3,id_12681,65.0,retired,married,primary,no,2880.0,no,no,cellular,913,23,apr,1.0,0,unknown
4,4,id_2153,52.0,blue-collar,married,tertiary,no,1779.0,no,no,cellular,703,19,nov,3.0,1,failure


In [None]:
# Discard the 'Unnamed: 0' column from the scoring data.
test = test.drop(columns='Unnamed: 0')

In [None]:
# Model input for scoring: everything except the identifier column.
final = test.drop(columns=['customer_id'])

In [None]:
# One-hot encode the scoring data the same way as the training data (first level dropped).
# NOTE(review): the dummy columns depend on the categories present in this file — confirm they match training.
final = pd.get_dummies(final,drop_first=True)

In [None]:
# Preview the encoded scoring data.
final.head()

Unnamed: 0,customer_age,balance,last_contact_duration,day_of_month,num_contacts_in_campaign,num_contacts_prev_campaign,job_type_blue-collar,job_type_entrepreneur,job_type_housemaid,job_type_management,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,prev_campaign_outcome_other,prev_campaign_outcome_success,prev_campaign_outcome_unknown
0,28.0,285.0,849,26,4.0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1
1,46.0,656.0,990,5,4.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,34.0,2.0,886,20,3.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,65.0,2880.0,913,23,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,52.0,1779.0,703,19,3.0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
# Scale the scoring data with the min/max of the TRAINING columns. Re-fitting a
# scaler on this file would map the same raw value to a different scaled value
# than the one the model was trained on.
# `train` still holds the unscaled columns: only the encoded copy `X` was scaled.
for column in ('customer_age', 'num_contacts_in_campaign', 'balance',
               'day_of_month', 'last_contact_duration'):
    column_scaler = preprocessing.MinMaxScaler().fit(train[[column]].to_numpy())
    final[column] = column_scaler.transform(final[[column]].to_numpy()).ravel()

In [None]:
# Align the scoring matrix with the exact feature columns (and order) of the training
# matrix instead of a hand-maintained list. Dummy columns missing from this file
# (a category that never occurs here) are added as all-zero; extra columns are dropped.
final = final.reindex(columns=X.columns, fill_value=0)

In [None]:
# Class predictions for the scoring data; this overwrites the hold-out `pred` from earlier.
pred = clf_best_model.predict(final)

In [None]:
# Predicted class labels and how often each occurs.
np.unique(pred,return_counts=True)

(array([0, 1]), array([4982, 2742]))

In [None]:
# Class probabilities for the scoring data (one row per sample, one column per class).
pred_proba=clf_best_model.predict_proba(final)

In [None]:
# Attach the predicted class to the scoring frame.
test['term_deposit']=pred

In [None]:
# Inspect the probability array.
pred_proba

array([[0.68852437, 0.31147563],
       [0.6254612 , 0.37453878],
       [0.5824297 , 0.4175703 ],
       ...,
       [0.6791896 , 0.32081038],
       [0.22057807, 0.7794219 ],
       [0.9248178 , 0.07518219]], dtype=float32)

In [None]:
# Inspect the predicted classes.
pred

array([0, 0, 0, ..., 0, 1, 0])

In [None]:
# Keep only the second probability column (index 1) of every row, as a plain list.
proba_yes = list(pred_proba[:, 1])

In [None]:
# Attach the extracted probability to the scoring frame.
test['proba_yes'] = proba_yes

In [None]:
# Preview the scoring frame with the two new prediction columns.
test.head()

Unnamed: 0,customer_id,customer_age,job_type,marital,education,default,balance,housing_loan,personal_loan,communication_type,last_contact_duration,day_of_month,month,num_contacts_in_campaign,num_contacts_prev_campaign,prev_campaign_outcome,term_deposit,proba_yes
0,id_43823,28.0,management,single,tertiary,no,285.0,yes,no,unknown,849,26,jun,4.0,0,unknown,0,0.311476
1,id_10523,46.0,technician,married,secondary,no,656.0,no,no,cellular,990,5,feb,4.0,0,unknown,0,0.374539
2,id_43951,34.0,services,single,secondary,no,2.0,yes,no,unknown,886,20,may,3.0,0,unknown,0,0.41757
3,id_12681,65.0,retired,married,primary,no,2880.0,no,no,cellular,913,23,apr,1.0,0,unknown,1,0.572879
4,id_2153,52.0,blue-collar,married,tertiary,no,1779.0,no,no,cellular,703,19,nov,3.0,1,failure,1,0.774162


In [None]:
# Save the scoring frame, including the prediction columns.
# NOTE(review): the index is written as well (no index=False); presumably that is where
# 'Unnamed: 0' columns come from on reload — confirm.
test.to_csv('predicted.csv')

In [None]:
# Load a spreadsheet of predictions from an absolute '/content/...' path.
adarsh = pd.read_excel('/content/test_predicted.xlsx')

In [None]:
# Preview the loaded spreadsheet.
adarsh.head()

Unnamed: 0,customer_id,customer_age,job_type,marital,education,default,balance,housing_loan,personal_loan,communication_type,...,prev_campaign_outcome,rf_pred,svc_pred,xgb_pred,knn_pred,cnb_pred,xgb_proba,rf_proba,knn_proba,cnb_proba
0,id_41602,30.0,services,married,secondary,no,412,yes,no,unknown,...,unknown,0,0,0,0,0,[0.99881214 0.00118787],[0.88 0.12],[1. 0.],[0.99121983 0.00878017]
1,id_20926,34.0,entrepreneur,married,secondary,no,3115,yes,no,cellular,...,failure,0,0,0,1,0,[0.98641914 0.01358083],[0.82 0.18],[0.33333333 0.66666667],[0.6676755 0.3323245]
2,id_10463,40.0,blue-collar,single,primary,no,3380,yes,no,telephone,...,unknown,0,0,0,0,0,[0.97772604 0.02227394],[0.73 0.27],[1. 0.],[0.89274192 0.10725808]
3,id_25779,51.0,management,married,tertiary,no,1886,no,no,cellular,...,unknown,0,0,0,0,1,[0.9554882 0.0445118],[0.54 0.46],[0.66666667 0.33333333],[0.45029139 0.54970861]
4,id_13459,30.0,self-employed,married,secondary,no,580,yes,no,cellular,...,failure,0,0,0,1,1,[0.9319287 0.0680713],[0.54 0.46],[0.33333333 0.66666667],[0.06033406 0.93966594]


In [None]:
# Try splitting the first stringified probability pair on single spaces.
a = adarsh.loc[0, 'xgb_proba'].split(' ')

In [None]:
# Pull the second number out of each stringified probability pair, e.g.
# "[0.98641914 0.01358083]" -> "0.01358083". Values stay strings.
# Splitting on any run of whitespace (rather than a single space) avoids the empty
# tokens that appear when the two numbers are separated by several padding spaces,
# which made the later float() conversion fail.
xgb_proba_yes = [
    pair.strip().strip('[]').split()[1]
    for pair in adarsh['xgb_proba']
]

In [None]:
# Convert every extracted probability string to a float, in order.
proba_yes = []
for token in xgb_proba_yes:
  proba_yes.append(float(token))

ValueError: ignored

In [None]:
# Spot-check that one extracted string (position 34) parses as a float.
float(xgb_proba_yes[34])

0.65395033

In [None]:
# Store the extracted values as a new column.
# NOTE(review): this assigns the string list `xgb_proba_yes`, not the float list `proba_yes` — confirm that is intended.
adarsh['proba_yes']=xgb_proba_yes

In [None]:
# Write the augmented frame to CSV (the index is included by default).
adarsh.to_csv('wejfnf.csv')