In [1]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
bank_df = pd.read_csv(r'C:\Users\Ashish\Desktop\machine learning\bank.csv')
bank_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                4521 non-null   int64 
 1   job                4521 non-null   object
 2   marital            4521 non-null   object
 3   education          4521 non-null   object
 4   default            4521 non-null   object
 5   balance            4521 non-null   int64 
 6   housing-loan       4521 non-null   object
 7   personal-loan      4521 non-null   object
 8   current-campaign   4521 non-null   int64 
 9   previous-campaign  4521 non-null   int64 
 10  subscribed         4521 non-null   object
dtypes: int64(4), object(7)
memory usage: 388.7+ KB


In [3]:
bank_df.subscribed.value_counts()

subscribed
no     4000
yes     521
Name: count, dtype: int64

In [4]:
# separate the case of yes-subscribe and no-subscribe
bank_subscribed_no= bank_df[bank_df.subscribed == 'no']
bank_subscribed_yes= bank_df[bank_df.subscribed == 'yes']

# upsample the yes-subscribe cases
df_minority_upsampled = resample(bank_subscribed_yes, replace = True, n_samples = 2000)

In [5]:
from sklearn.utils import shuffle
new_bank_df = shuffle(new_bank_df)

NameError: name 'new_bank_df' is not defined

In [None]:
#assigning list of all column names in the DataFrame
X_features = list(new_bank_df.columns)
#remove the responce variable from the list
X_features.remove('subscribed')
X_features

In [None]:
encoded_bank_df = pd.get_dummies(new_bank_df[X_features], drop_first = True)
X = encoded_bank_df

In [None]:
# Encoded the subcribed columns and assigning to Y
Y = new_bank_df.subscribed.map(lambda x: int(x == 'yes'))

In [None]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.3, random_state = 42)

In [None]:
logit = LogisticRegression()
logit.fit(train_X, train_Y)

In [None]:
pred_y = logit.predict(test_X)

In [None]:
# defining the matrix to draw the confusion matrix from actual and predicting class labels
def draw_cm(actual, predicted):
    # Invoking confusion_matrix from metric packege. the matrix 
    #will be oriented as [1,0] i.e. the classes with label 1 will be
    #represented by the first row and 0 as second row
    cm = metrics.confusion_matrix(actual, predicted, labels=[1,0])
    sns.heatmap(cm, annot = True, fmt = '.2f',
                xticklabels = ["Subscribed", "Not subscribed"],
                yticklabels = ["Subscribed", "Not subscribed "])
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [None]:
cm= draw_cm(test_Y,pred_y)

In [None]:
print(metrics.classification_report(test_Y, pred_y))

In [None]:
predict_proba_df = pd.DataFrame(logit.predict_proba(test_X))
predict_proba_df.head()

In [None]:
# inititalize the dataframe with actual class labels
test_result_df = pd.DataFrame({'actual' : test_Y})
test_result_df = test_result_df.reset_index()
#assigning the probability values for class label 1
test_result_df['chd_1']= predict_proba_df.iloc[:, 1:2]

In [None]:
test_result_df.head()

In [None]:
#passing actual class labels and predicted probability values
#to compute ROC AUC score
auc_score = metrics.roc_auc_score(test_result_df.actual, test_result_df.chd_1)
round( float(auc_score), 2)

In [None]:
#the maethod takes the following three parameters
## model : the classification model
## test_x : X features of the test set
## test_y: actual labels of the test set
## returns
## - ROC Auc score
## FPR and TPR for different threshold values
def draw_roc_curve(model, test_X, test_Y):
    ## creating and initialising a result dataframe with actual labels
    test_result_df = pd.DataFrame({'actual' : test_Y})
    test_result_df = test_result_df.reset_index()
    # predict the probabilities on the test set
    predict_proba_df = pd.DataFrame(model.predict_proba(test_X))
    ## selecting the probabilities that test example belong to class 1
    test_result_df['chd_1'] = predict_proba_df.iloc[:, 1:2]
    ## invoke the roc curve to return fpr, tpr and threshold values
    ## threshold vlaues contain values 0.0 to 1.0 
    fpr, tpr, thresholds = metrics.roc_curve(test_result_df.actual, test_result_df.chd_1, drop_intermediate=False)
    ## getting roc auc score by invoking metrics.roce_auc_score method
    auc_score = metrics.roc_auc_score(test_result_df.actual, test_result_df.chd_1)
    ## setting the size of the plot 
    plt.figure(figsize = (8,6))
    plt.plot(fpr, tpr, label = 'ROC curve(area = %a.2f)' % auc_score)
    plt.plot ([0,1],[0,1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('FALSE Positive Rate or [1- true negative rate]')
    plt.ylabel('True Positive Rate')
    plt.legend(loc = "lower right")
    plt.show()
    return auc_score, fpr, tpr, thresholds

In [None]:
_, _, _, _ = draw_roc_curve(logit, test_X, test_Y)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# initializing the classifire
knn_cif = KNeighborsClassifier()
## fititing the model with the training set
knn_cif.fit(train_X, train_Y)

In [None]:
_, _, _, _ = draw_roc_curve(knn_cif, test_X, test_Y)

In [None]:
#predicting on test set
pred_y = knn_cif.predict(test_X)
## drawing the confusion matrix
draw_cm(test_Y, pred_y)

In [None]:
print(metrics.classification_report(test_Y, pred_y))

In [None]:
from sklearn.model_selection import GridSearchCV
tuned_parameters = [{'n_neighbors' : range(5,10), 'metric': ['canberra', 'euclidean', 'minkowski']}]
## congifuring grid search 
cif = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=  10, scoring = 'roc_auc')
## fit the search with training set
cif.fit(train_X, train_Y)

In [None]:
cif.best_score_

In [None]:
cif.best_params_

In [None]:
cif.cv_results_

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
radm_cif = RandomForestClassifier(max_depth = 10, n_estimators = 10)
radm_cif.fit(train_X, train_Y)

In [None]:
tuned_parameters = [{'max_depth' : [10,15], 'n_estimators' : [10,20], 'max_features' : ['sqrt', 0.2]}]
radm_cif = RandomForestClassifier()
cif = GridSearchCV(radm_cif,tuned_parameters, cv=5, scoring= 'roc_auc')
cif.fit(train_X, train_Y)

In [None]:
cif.best_score_

In [None]:
cif.best_params_

In [None]:
radm_cif = RandomForestClassifier(max_depth=15, n_estimators=20, max_features='sqrt')
radm_cif.fit(train_X, train_Y)

In [None]:
_, _, _, _ = draw_roc_curve(cif, test_X, test_Y)

In [None]:
pred_y = radm_cif.predict(test_X)
draw_cm(test_Y, pred_y)

In [None]:
print(metrics.classification_report(test_Y, pred_y))

In [None]:
import numpy as np
#create a dataframe to store the features and their corresponding importance
feature_rank = pd.DataFrame({'feature' : train_X.columns, 'importance' : radm_cif.feature_importances_})
# sorting the feature based on their importance with most
# important features at top
feature_rank = feature_rank.sort_values('importance', ascending = False)
plt.figure(figsize = (8,6))
#plot the values
sns.barplot(y = 'feature', x = 'importance', data = feature_rank);

In [None]:
feature_rank['cumsum'] = feature_rank.importance.cumsum()*100
feature_rank.head(10)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# initialise the logistic regression to use as base classifier
log_reg = LogisticRegression()
# inititaliseing adaboost classifier
ada_clf = AdaBoostClassifier(log_reg, n_estimators=50)
#fitting adaboost model to training set
ada_clf.fit(train_X, train_Y)

In [None]:
_, _, _, _ = draw_roc_curve(ada_clf, test_X, test_Y)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gboost_clf = GradientBoostingClassifier(n_estimators=500, max_depth=10)
gboost_clf.fit(train_X, train_Y)

In [None]:
_, _, _, _ = draw_roc_curve(gboost_clf, test_X, test_Y)

In [None]:
from sklearn.model_selection import cross_val_score
gboost_clf = GradientBoostingClassifier(n_estimators=500, max_depth=10)
cv_scores = cross_val_score(gboost_clf, train_X, train_Y, cv= 10, scoring = 'roc_auc')

In [None]:
print(cv_scores)
print("Mean Accuracy :" , np.mean(cv_scores), "with standard deviation of:", np.std(cv_scores))

In [None]:
gboost_clf.fit(train_X, train_Y)
pred_y = gboost_clf.predict(test_X)
draw_cm(test_Y, pred_y)

In [None]:
print(metrics.classification_report(test_Y, pred_y))

In [None]:
feature_rank = pd.DataFrame({'feature': train_X.columns, 'importance': gboost_clf.feature_importances_})
feature_rank = feature_rank.sort_values('importance', ascending = False)
plt.figure(figsize = (8,6))
sns.barplot(y = 'feature', x = 'importance', data = feature_rank);