
# Bank Marketing Campaign Analysis

This notebook explores and analyzes a dataset related to direct marketing campaigns by a banking institution. The target variable indicates whether a customer subscribed to a term deposit product. The project includes:

- Data loading and inspection
- Data cleaning and preprocessing
- Exploratory data analysis
- Classification using machine learning models (random forest, decision tree, logistic regression)
- Insights and recommendations

_Originally developed as part of a university assignment._

## Dataset Description
- The dataset contains 17 input features including customer information, campaign data, and historical interactions.
- The output variable (`y`) indicates whether the client subscribed (`yes`) or not (`no`) to a term deposit.



In [None]:
!pip install bnlearn

In [None]:
!pip install pymc3==3.10  #issue fixed for version 3
!pip install --upgrade numpy==1.20.3
!pip install --upgrade arviz==0.11.0

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
#from sklearn.naive_bayes import CategoricalNB
from sklearn import datasets
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from datetime import datetime
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
from google.colab import drive

# Mount Google Drive so the CSV files stored there can be read from Colab.
drive.mount('/content/drive')

# Single place to change when the CSV files live somewhere else.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks'

# The four input files; each one is preprocessed and modelled separately below.
df1=pd.read_csv(f'{DATA_DIR}/dataset_039.csv')
df2=pd.read_csv(f'{DATA_DIR}/dataset_110.csv')
df3=pd.read_csv(f'{DATA_DIR}/dataset_173.csv')
df4=pd.read_csv(f'{DATA_DIR}/dataset_196.csv')

In [None]:
#from google.colab import files
#uploaded = files.upload()

In [None]:
#!wget --no-check-certificate --output-document=dataset_110.csv 'https://docs.google.com/spreadsheets/d/1M6uJSwQUwScg0sNttlSM06IN1a5QA3yqgRKP5rsKYNc/edit#gid=900037210&single=true&output=csv'
#!wget --no-check-certificate --output-document=dataset_039.csv 'https://docs.google.com/spreadsheets/d/1_h6kL4HdEcVHuhiMFB4lSC1p80ZCa88ZCkVRb3bSktg/edit?usp=sharing'
#df1=pd.read_csv('dataset_039.csv')
#df2=pd.read_csv('dataset_110.csv')

# Dataset1

In [None]:
# Remove rows whose education contains "unknown".
# na=True counts a missing education value as "unknown", so such rows are
# dropped exactly as the previous `== False` comparison dropped them.
# .copy() detaches the result from the loaded frame so later column
# assignments do not trigger SettingWithCopyWarning.
df1=df1[~df1["education"].str.contains("unknown", na=True)].copy()
df1


In [None]:
# Encode every 'yes'/'no' value in the frame (including the target y) as 1/0.
# Assigning the result instead of using inplace=True avoids mutating a
# filtered slice of the loaded data.
df1=df1.replace({'yes':1,'no':0})

# Target vector.
y=df1['y']
print(y)

# poutcome is not used as a feature; errors='ignore' keeps the cell
# re-runnable once the column has already been removed.
df1=df1.drop(columns=['poutcome'], errors='ignore')

# Feature frame: everything except the target.
df1x=df1.drop(columns=['y'])
print(df1x)

In [None]:
# Collapse `job` into a binary flag: 'unknown', 'student', 'retired' and
# 'unemployed' become 0, every other job becomes 1.
zero_jobs = dict.fromkeys(['unknown', 'student', 'retired', 'unemployed'], 0)
df1x = df1x.replace({'job': zero_jobs})

# Anything that was not mapped to 0 above is flagged as 1.
df1x['job'] = df1x['job'].ne(0).astype('int64')
print(df1x)

In [None]:
print(pd.get_dummies(df1x['education']).head())

In [None]:
# Convert abbreviated month names (parsed with format '%b') to month numbers.
mon_num = pd.to_datetime(df1x['month'], format='%b').dt.month
df1x['month'] = mon_num

In [None]:
df1x

In [None]:
# Contact: collapse the three contact types into two (0/1):
# 'unknown' and 'cellular' -> 0, 'telephone' -> 1.
# The result is stored in a new frame `rp`; df1x itself is left unchanged.
rp = df1x.replace({'contact' : {'unknown' : 0, 'cellular' : 0, 'telephone' : 1}})
print(rp)

In [None]:
#Dummy
def dummy_df(df, dummyList):
  """One-hot encode the given columns of a DataFrame.

  Each column named in ``dummyList`` is removed and its indicator columns
  (named ``<column>_<value>``) are appended at the right-hand side, in the
  order the columns are listed. The input frame is not modified.

  Args:
    df: source DataFrame.
    dummyList: iterable of column names to encode.

  Returns:
    A DataFrame with the listed columns replaced by indicator columns.
    When ``dummyList`` is empty, ``df`` itself is returned.
  """
  encoded = df
  for column in dummyList:
    indicators = pd.get_dummies(encoded[column], prefix=column)
    encoded = pd.concat([encoded.drop(columns=[column]), indicators], axis=1)
  return encoded

dummyList=['marital','education']
X1=dummy_df(rp, dummyList)
#dummies=pd.get_dummies(rp['education'])


In [None]:
print(X1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.3, random_state=33151938)

In [None]:
X_train

## **Random Forest**

In [None]:
rf=RandomForestClassifier()
scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')
scoresROC = cross_val_score(rf, X_train, y_train, cv=5, scoring='roc_auc')
print(scores)
print(scoresROC)

In [None]:
print(scores.mean(),scoresROC.mean())

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
scores = cross_validate(rf, X_train, y_train, cv=5, return_train_score=True)
score_df=pd.DataFrame(scores)
score_df

In [None]:
#print(score_df.mean(),score_df.std())
rf.fit(X_train, y_train)
print(rf.get_params())

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [3, 6, 9]
}
grid_search = GridSearchCV(RandomForestClassifier(),
                           param_grid=param_grid,cv=5,return_train_score=True)
grid_search.fit(X_train, y_train)


In [None]:
train_scores = pd.DataFrame(grid_search.cv_results_)
score_result=train_scores.loc[:,['params','param_max_depth','mean_train_score','mean_test_score']]
score_result
#train_scores

In [None]:
print(train_scores)

In [None]:
import matplotlib.pyplot as plt
import altair as alt

# Long format: one row per (grid candidate, score type).
source = score_result.melt(id_vars=['param_max_depth'] ,
                              value_vars=['mean_train_score', 'mean_test_score'],
                              var_name='score_type', value_name='accuracy')

# Several grid candidates share each max_depth (they differ in n_estimators
# and max_features). Averaging per depth gives one point per depth and score
# type; plotting the raw rows makes the line jump between several y-values
# at the same x.
chart1 = alt.Chart(source).mark_line().encode(
    alt.X('param_max_depth:Q', axis=alt.Axis(title="Tree Depth")),
    alt.Y('mean(accuracy):Q', axis=alt.Axis(title="Mean accuracy")),
    alt.Color('score_type:N', scale=alt.Scale(domain=['mean_train_score', 'mean_test_score'],
                                           range=['teal', 'gold'])))
chart1

In [None]:
print(grid_search.best_params_)

In [None]:


# Final random forest used for the test-set evaluation.
# NOTE(review): the hyperparameters are hard-coded rather than read from
# grid_search.best_params_ - confirm they match the printed best parameters.
# random_state pins the forest's internal randomness so the reported scores
# are reproducible; it reuses the seed of the train/test split.
rf_grid=RandomForestClassifier(max_depth=6, max_features='sqrt', n_estimators= 100,
                               random_state=33151938)
rf_grid.fit(X_train, y_train)


In [None]:
# Evaluate the fitted random forest on the held-out test set.
y_pred_grid = rf_grid.predict(X_test)

# classification_report takes (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, y_pred_grid))

# AUC is computed from the predicted probability of the positive class.
y_pred_prob = rf_grid.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print('RandomForest AUC:',auc)

In [None]:
print(f'Train Acc:{rf_grid.score(X_train,y_train)}')
print(f'Test Acc:{rf_grid.score(X_test,y_test)}')

In [None]:
rf_grid.feature_importances_

In [None]:
X1.columns

In [None]:
# Order features by ascending importance so the most important bar is drawn
# at the top of the horizontal bar chart.
importances = np.asarray(rf_grid.feature_importances_)
sorted_indices = importances.argsort()
sorted_feature_importance = importances[sorted_indices]
sorted_column_headings = np.array(X1.columns)[sorted_indices]

# Horizontal bar chart: one bar per feature.
fig, ax = plt.subplots()
ax.barh(sorted_column_headings, sorted_feature_importance)
ax.set(xlabel='Importance',
       ylabel='Features',
       title='Feature Importance_RandomForest ')

# Rotate the x-axis tick labels.
plt.xticks(rotation=88)

plt.show()

In [None]:
#Dafault param
rf=RandomForestClassifier().fit(X_train,y_train)
rf.score(X_test,y_test)


## **Decision Tree**

In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
dtree = dtree.fit(X_train, y_train)

cv_scores = cross_val_score(dtree, X_train, y_train, cv=5)
# Print the accuracy scores for each fold
print("Cross-validation scores:", cv_scores)
# Calculate the mean accuracy across all folds
mean_accuracy = cv_scores.mean()
print("Mean accuracy:", mean_accuracy)

In [None]:
#DT GRid search
dt_param_grid={
    'criterion': ['gini', 'entropy'],
    'max_depth': range(3,10),
}

dt_grid=GridSearchCV(DecisionTreeClassifier(),dt_param_grid,cv=5,return_train_score=True)
dt_grid.fit(X_train, y_train)


In [None]:
bestParams=dt_grid.best_params_
bestScore=dt_grid.best_score_
print(bestParams,'\n', bestScore)

In [None]:
train_scores = pd.DataFrame(dt_grid.cv_results_)
score_result=train_scores.loc[:,['params','param_max_depth','mean_train_score','mean_test_score']]
score_result

In [None]:
# Long format: one row per (grid candidate, score type).
source = score_result.melt(id_vars=['param_max_depth'] ,
                              value_vars=['mean_train_score', 'mean_test_score'],
                              var_name='score_type', value_name='accuracy')

# More than one grid candidate can share a max_depth; averaging per depth
# gives one point per depth and score type instead of a line that jumps
# between several y-values at the same x.
chart2 = alt.Chart(source).mark_line().encode(
    alt.X('param_max_depth:Q', axis=alt.Axis(title="Tree Depth")),
    alt.Y('mean(accuracy):Q', axis=alt.Axis(title="Mean accuracy")),
    alt.Color('score_type:N', scale=alt.Scale(domain=['mean_train_score', 'mean_test_score'],
                                           range=['teal', 'gold'])))
chart2

In [None]:
# Decision tree with fixed hyperparameters.
# NOTE(review): criterion/max_depth are hard-coded - confirm they match the
# grid-search result.
best_dt=DecisionTreeClassifier(criterion= 'gini', max_depth=5)
best_dt.fit(X_train, y_train)

# Score best_dt (not the untuned `dtree` fitted earlier) so the reported
# accuracy belongs to the model that was just trained.
dt_acc=best_dt.score(X_test,y_test)
print(dt_acc)

In [None]:
from sklearn.metrics import roc_auc_score

# Predict with best_dt so the report and the AUC below describe the same
# model (the untuned `dtree` was used here before).
y_pred = best_dt.predict(X_test)

# classification_report takes (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, y_pred))

# AUC is computed from the predicted probability of the positive class.
y_pred_prob = best_dt.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print('Decisiontree AUC:',auc)

## **Logistic** **Regression**

In [None]:
#Logistic Regression
from sklearn.pipeline import make_pipeline

# Standardised copies of the train/test features for the final model fit;
# the scaler is fitted on the training split only.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
lr=LogisticRegression(C=5,max_iter=100)

# Cross-validate a scaler+model pipeline on the unscaled training data, so
# the scaler is re-fitted inside each fold. Scaling the whole training set
# first would let each validation fold influence the scaling statistics.
lr_cv = make_pipeline(StandardScaler(), LogisticRegression(C=5,max_iter=100))
lr_scores = cross_val_score(lr_cv, X_train, y_train, cv=5)
lr_auc_scores = cross_val_score(lr_cv, X_train, y_train, scoring='roc_auc',cv=5)
print('Acc',lr_scores)
print('AUC',lr_auc_scores)

In [None]:
X_train_std

In [None]:
lr.fit(X_train_std,y_train)
lr_acc=lr.score(X_test_std,y_test)
print(lr_acc)

y_pred_prob = lr.predict_proba(X_test_std)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print('Logistic Regression AUC:',auc)

In [None]:
from sklearn.metrics import roc_auc_score

# Class predictions of the fitted logistic regression on the scaled test set.
y_pred = lr.predict(X_test_std)

# classification_report takes (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, y_pred))


# Dataset2

In [None]:
df2

In [None]:
# Preprocessing for df2 (same steps as Dataset1).

# Remove rows whose education contains "unknown"; na=True drops rows with a
# missing education as well, matching the previous `== False` filter.
# .copy() detaches the result so later assignments do not warn.
df2=df2[~df2["education"].str.contains("unknown", na=True)].copy()

# Encode every 'yes'/'no' value in the frame (including the target y) as 1/0.
df2=df2.replace({'yes':1,'no':0})
y=df2['y']
print(y)

# errors='ignore' keeps the cell re-runnable after poutcome has been removed.
df2=df2.drop(columns=['poutcome'], errors='ignore')
df2x=df2.drop(columns=['y'])
print(df2x)

#classify job: unknown/student/retired/unemployed -> 0, every other job -> 1
df2x=df2x.replace({'job':{'unknown':0, "student":0,'retired':0,"unemployed":0}})
df2x['job']=[1 if a!=0 else 0 for a in df2x['job'] ]
print(df2x)

# Abbreviated month names (format '%b') -> month numbers.
mon_num=pd.to_datetime(df2x.month, format='%b').dt.month
df2x['month']=mon_num

# Contact: 'unknown' and 'cellular' -> 0, 'telephone' -> 1.
rp = df2x.replace({'contact' : {'unknown' : 0, 'cellular' : 0, 'telephone' : 1}})
print(rp)

# One-hot encode the remaining categorical columns.
dummyList=['marital','education']
X2=dummy_df(rp, dummyList)
print(X2)

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3, random_state=33151938)
print(X_train)

## RF

In [None]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [3, 6, 9]
}
grid_search = GridSearchCV(RandomForestClassifier(),
                           param_grid=param_grid,cv=5,return_train_score=True)
grid_search.fit(X_train, y_train)

train_scores = pd.DataFrame(grid_search.cv_results_)
score_result=train_scores.loc[:,['params','param_max_depth','mean_train_score','mean_test_score']]
score_result

In [None]:
# Long format: one row per (grid candidate, score type).
source = score_result.melt(id_vars=['param_max_depth'] ,
                              value_vars=['mean_train_score', 'mean_test_score'],
                              var_name='score_type', value_name='accuracy')

# Several grid candidates share each max_depth; averaging per depth gives
# one point per depth and score type instead of a line that jumps between
# several y-values at the same x.
chart2rf = alt.Chart(source).mark_line().encode(
    alt.X('param_max_depth:Q', axis=alt.Axis(title="Tree Depth")),
    alt.Y('mean(accuracy):Q', axis=alt.Axis(title="Mean accuracy")),
    alt.Color('score_type:N', scale=alt.Scale(domain=['mean_train_score', 'mean_test_score'],
                                           range=['teal', 'gold'])))
chart2rf

In [None]:
print(grid_search.best_params_)

In [None]:
# Final random forest used for the test-set evaluation.
# NOTE(review): the hyperparameters are hard-coded rather than read from
# grid_search.best_params_ - confirm they match the printed best parameters.
# random_state pins the forest's internal randomness so the reported scores
# are reproducible; it reuses the seed of the train/test split.
rf_grid=RandomForestClassifier(max_depth=6, max_features='sqrt', n_estimators= 100,
                               random_state=33151938)
rf_grid.fit(X_train, y_train)


In [None]:
# Evaluate the fitted random forest on the held-out test set.
y_pred_grid = rf_grid.predict(X_test)

# classification_report takes (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, y_pred_grid))

# AUC is computed from the predicted probability of the positive class.
y_pred_prob = rf_grid.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print('RandomForest AUC:',auc)

## DT

In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
dtree = dtree.fit(X_train, y_train)

cv_scores = cross_val_score(dtree, X_train, y_train, cv=5)
# Print the accuracy scores for each fold
print("Cross-validation scores:", cv_scores)
# Calculate the mean accuracy across all folds
mean_accuracy = cv_scores.mean()
print("Mean accuracy:", mean_accuracy)

In [None]:
#DT GRid search
dt_param_grid={
    'criterion': ['gini', 'entropy'],
    'max_depth': range(3,10),
}

dt_grid=GridSearchCV(DecisionTreeClassifier(),dt_param_grid,cv=5,return_train_score=True)
dt_grid.fit(X_train, y_train)


In [None]:
bestParams=dt_grid.best_params_
bestScore=dt_grid.best_score_
print(bestParams,'\n', bestScore)

In [None]:
train_scores = pd.DataFrame(dt_grid.cv_results_)
score_result=train_scores.loc[:,['params','param_max_depth','mean_train_score','mean_test_score']]
score_result

In [None]:
# Long format: one row per (grid candidate, score type).
source = score_result.melt(id_vars=['param_max_depth'] ,
                              value_vars=['mean_train_score', 'mean_test_score'],
                              var_name='score_type', value_name='accuracy')

# More than one grid candidate can share a max_depth; averaging per depth
# gives one point per depth and score type instead of a line that jumps
# between several y-values at the same x.
chart2dt = alt.Chart(source).mark_line().encode(
    alt.X('param_max_depth:Q', axis=alt.Axis(title="Tree Depth")),
    alt.Y('mean(accuracy):Q', axis=alt.Axis(title="Mean accuracy")),
    alt.Color('score_type:N', scale=alt.Scale(domain=['mean_train_score', 'mean_test_score'],
                                           range=['teal', 'gold'])))
chart2dt

In [None]:
# Decision tree with fixed hyperparameters.
# NOTE(review): criterion/max_depth are hard-coded - confirm they match the
# grid-search result.
best_dt=DecisionTreeClassifier(criterion= 'gini', max_depth=5)
best_dt.fit(X_train, y_train)

# Score best_dt (not the untuned `dtree` fitted earlier) so the reported
# accuracy belongs to the model that was just trained.
dt_acc=best_dt.score(X_test,y_test)
print(dt_acc)

In [None]:
from sklearn.metrics import roc_auc_score

# Predict with best_dt so the report and the AUC below describe the same
# model (the untuned `dtree` was used here before).
y_pred = best_dt.predict(X_test)

# classification_report takes (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, y_pred))

# AUC is computed from the predicted probability of the positive class.
y_pred_prob = best_dt.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print('Decisiontree AUC:',auc)

## LR

In [None]:
#Logistic Regression
from sklearn.pipeline import make_pipeline

# Standardised copies of the train/test features for the final model fit;
# the scaler is fitted on the training split only.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
lr=LogisticRegression(C=5,max_iter=100)

# Cross-validate a scaler+model pipeline on the unscaled training data, so
# the scaler is re-fitted inside each fold. Scaling the whole training set
# first would let each validation fold influence the scaling statistics.
lr_cv = make_pipeline(StandardScaler(), LogisticRegression(C=5,max_iter=100))
lr_scores = cross_val_score(lr_cv, X_train, y_train, cv=5)
lr_auc_scores = cross_val_score(lr_cv, X_train, y_train, scoring='roc_auc',cv=5)
print('Acc',lr_scores)
print('AUC',lr_auc_scores)

In [None]:
lr.fit(X_train_std,y_train)
lr_acc=lr.score(X_test_std,y_test)
print(lr_acc)

y_pred_prob = lr.predict_proba(X_test_std)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print('Logistic Regression AUC:',auc)

In [None]:
from sklearn.metrics import roc_auc_score

# Class predictions of the fitted logistic regression on the scaled test set.
y_pred = lr.predict(X_test_std)

# classification_report takes (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, y_pred))


# Dataset3

In [None]:
# Preprocessing for df3 (same steps as Dataset1).

# Remove rows whose education contains "unknown"; na=True drops rows with a
# missing education as well, matching the previous `== False` filter.
# .copy() detaches the result so later assignments do not warn.
df3=df3[~df3["education"].str.contains("unknown", na=True)].copy()

# Encode every 'yes'/'no' value in the frame (including the target y) as 1/0.
df3=df3.replace({'yes':1,'no':0})
y=df3['y']
print(y)

# errors='ignore' keeps the cell re-runnable after poutcome has been removed.
df3=df3.drop(columns=['poutcome'], errors='ignore')
df3x=df3.drop(columns=['y'])
print(df3x)

#classify job: unknown/student/retired/unemployed -> 0, every other job -> 1
df3x=df3x.replace({'job':{'unknown':0, "student":0,'retired':0,"unemployed":0}})
df3x['job']=[1 if a!=0 else 0 for a in df3x['job'] ]
print(df3x)

# Abbreviated month names (format '%b') -> month numbers.
mon_num=pd.to_datetime(df3x.month, format='%b').dt.month
df3x['month']=mon_num

# Contact: 'unknown' and 'cellular' -> 0, 'telephone' -> 1.
rp = df3x.replace({'contact' : {'unknown' : 0, 'cellular' : 0, 'telephone' : 1}})
print(rp)

# One-hot encode the remaining categorical columns.
dummyList=['marital','education']
X3=dummy_df(rp, dummyList)
print(X3)

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X3, y, test_size=0.3, random_state=33151938)
print(X_train)


## RF

In [None]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [3, 6, 9]
}
grid_search = GridSearchCV(RandomForestClassifier(),
                           param_grid=param_grid,cv=5,return_train_score=True)
grid_search.fit(X_train, y_train)

train_scores = pd.DataFrame(grid_search.cv_results_)
score_result=train_scores.loc[:,['params','param_max_depth','mean_train_score','mean_test_score']]
score_result

In [None]:
# Long format: one row per (grid candidate, score type).
source = score_result.melt(id_vars=['param_max_depth'] ,
                              value_vars=['mean_train_score', 'mean_test_score'],
                              var_name='score_type', value_name='accuracy')

# Several grid candidates share each max_depth; averaging per depth gives
# one point per depth and score type instead of a line that jumps between
# several y-values at the same x.
chart3rf = alt.Chart(source).mark_line().encode(
    alt.X('param_max_depth:Q', axis=alt.Axis(title="Tree Depth")),
    alt.Y('mean(accuracy):Q', axis=alt.Axis(title="Mean accuracy")),
    alt.Color('score_type:N', scale=alt.Scale(domain=['mean_train_score', 'mean_test_score'],
                                           range=['teal', 'gold'])))
chart3rf

In [None]:
print(grid_search.best_params_)

In [None]:
# Final random forest used for the test-set evaluation.
# NOTE(review): the hyperparameters are hard-coded rather than read from
# grid_search.best_params_ - confirm they match the printed best parameters.
# random_state pins the forest's internal randomness so the reported scores
# are reproducible; it reuses the seed of the train/test split.
rf_grid=RandomForestClassifier(max_depth=6, max_features='sqrt', n_estimators= 100,
                               random_state=33151938)
rf_grid.fit(X_train, y_train)


In [None]:
# Evaluate the fitted random forest on the held-out test set.
y_pred_grid = rf_grid.predict(X_test)

# classification_report takes (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, y_pred_grid))

# AUC is computed from the predicted probability of the positive class.
y_pred_prob = rf_grid.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print('RandomForest AUC:',auc)

## DT

In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
dtree = dtree.fit(X_train, y_train)

cv_scores = cross_val_score(dtree, X_train, y_train, cv=5)
# Print the accuracy scores for each fold
print("Cross-validation scores:", cv_scores)
# Calculate the mean accuracy across all folds
mean_accuracy = cv_scores.mean()
print("Mean accuracy:", mean_accuracy)

In [None]:
#DT GRid search
dt_param_grid={
    'criterion': ['gini', 'entropy'],
    'max_depth': range(3,10),
}

dt_grid=GridSearchCV(DecisionTreeClassifier(),dt_param_grid,cv=5,return_train_score=True)
dt_grid.fit(X_train, y_train)


In [None]:
bestParams=dt_grid.best_params_
bestScore=dt_grid.best_score_
print(bestParams,'\n', bestScore)

In [None]:
train_scores = pd.DataFrame(dt_grid.cv_results_)
score_result=train_scores.loc[:,['params','param_max_depth','mean_train_score','mean_test_score']]
score_result

In [None]:
# Long format: one row per (grid candidate, score type).
source = score_result.melt(id_vars=['param_max_depth'] ,
                              value_vars=['mean_train_score', 'mean_test_score'],
                              var_name='score_type', value_name='accuracy')

# More than one grid candidate can share a max_depth; averaging per depth
# gives one point per depth and score type instead of a line that jumps
# between several y-values at the same x.
chart3dt = alt.Chart(source).mark_line().encode(
    alt.X('param_max_depth:Q', axis=alt.Axis(title="Tree Depth")),
    alt.Y('mean(accuracy):Q', axis=alt.Axis(title="Mean accuracy")),
    alt.Color('score_type:N', scale=alt.Scale(domain=['mean_train_score', 'mean_test_score'],
                                           range=['teal', 'gold'])))
chart3dt

In [None]:
# Decision tree with fixed hyperparameters.
# NOTE(review): criterion/max_depth are hard-coded - confirm they match the
# grid-search result.
best_dt=DecisionTreeClassifier(criterion= 'gini', max_depth=5)
best_dt.fit(X_train, y_train)

# Score best_dt (not the untuned `dtree` fitted earlier) so the reported
# accuracy belongs to the model that was just trained.
dt_acc=best_dt.score(X_test,y_test)
print(dt_acc)

In [None]:
from sklearn.metrics import roc_auc_score

# Predict with best_dt so the report and the AUC below describe the same
# model (the untuned `dtree` was used here before).
y_pred = best_dt.predict(X_test)

# classification_report takes (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, y_pred))

# AUC is computed from the predicted probability of the positive class.
y_pred_prob = best_dt.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print('Decisiontree AUC:',auc)

## LR

In [None]:
#Logistic Regression
from sklearn.pipeline import make_pipeline

# Standardised copies of the train/test features for the final model fit;
# the scaler is fitted on the training split only.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
lr=LogisticRegression(C=5,max_iter=100)

# Cross-validate a scaler+model pipeline on the unscaled training data, so
# the scaler is re-fitted inside each fold. Scaling the whole training set
# first would let each validation fold influence the scaling statistics.
lr_cv = make_pipeline(StandardScaler(), LogisticRegression(C=5,max_iter=100))
lr_scores = cross_val_score(lr_cv, X_train, y_train, cv=5)
lr_auc_scores = cross_val_score(lr_cv, X_train, y_train, scoring='roc_auc',cv=5)
print('Acc',lr_scores)
print('AUC',lr_auc_scores)

In [None]:
lr.fit(X_train_std,y_train)
lr_acc=lr.score(X_test_std,y_test)
print(lr_acc)


In [None]:
from sklearn.metrics import roc_auc_score

# Class predictions of the fitted logistic regression on the scaled test set.
y_pred = lr.predict(X_test_std)

# classification_report takes (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, y_pred))

# AUC is computed from the predicted probability of the positive class.
y_pred_prob = lr.predict_proba(X_test_std)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print('Logistic AUC:',auc)

# Dataset4

In [None]:
# Preprocessing for df4 (same steps as Dataset1).

# Remove rows whose education contains "unknown"; na=True drops rows with a
# missing education as well, matching the previous `== False` filter.
# .copy() detaches the result so later assignments do not warn.
df4=df4[~df4["education"].str.contains("unknown", na=True)].copy()

# Encode every 'yes'/'no' value in the frame (including the target y) as 1/0.
df4=df4.replace({'yes':1,'no':0})
y=df4['y']
print(y)

# errors='ignore' keeps the cell re-runnable after poutcome has been removed.
df4=df4.drop(columns=['poutcome'], errors='ignore')
df4x=df4.drop(columns=['y'])
print(df4x)

#classify job: unknown/student/retired/unemployed -> 0, every other job -> 1
df4x=df4x.replace({'job':{'unknown':0, "student":0,'retired':0,"unemployed":0}})
df4x['job']=[1 if a!=0 else 0 for a in df4x['job'] ]
print(df4x)

# Abbreviated month names (format '%b') -> month numbers.
mon_num=pd.to_datetime(df4x.month, format='%b').dt.month
df4x['month']=mon_num

# Contact: 'unknown' and 'cellular' -> 0, 'telephone' -> 1.
rp = df4x.replace({'contact' : {'unknown' : 0, 'cellular' : 0, 'telephone' : 1}})
print(rp)

# One-hot encode the remaining categorical columns.
dummyList=['marital','education']
X4=dummy_df(rp, dummyList)
print(X4)

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X4, y, test_size=0.3, random_state=33151938)
print(X_train)


## RF

In [None]:
# Final random forest used for the test-set evaluation.
# NOTE(review): the hyperparameters are hard-coded - confirm they are the
# intended values for this dataset.
# random_state pins the forest's internal randomness so the reported scores
# are reproducible; it reuses the seed of the train/test split.
rf_grid=RandomForestClassifier(max_depth=6, max_features='sqrt', n_estimators= 100,
                               random_state=33151938)
rf_grid.fit(X_train, y_train)


In [None]:
# Evaluate the fitted random forest on the held-out test set.
y_pred_grid = rf_grid.predict(X_test)

# classification_report takes (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, y_pred_grid))

# AUC is computed from the predicted probability of the positive class.
y_pred_prob = rf_grid.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print('RandomForest AUC:',auc)

## DT

In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
dtree = dtree.fit(X_train, y_train)

cv_scores = cross_val_score(dtree, X_train, y_train, cv=5)
# Print the accuracy scores for each fold
print("Cross-validation scores:", cv_scores)
# Calculate the mean accuracy across all folds
mean_accuracy = cv_scores.mean()
print("Mean accuracy:", mean_accuracy)

In [None]:
# Decision tree with fixed hyperparameters.
# NOTE(review): criterion/max_depth are hard-coded - confirm they are the
# intended values for this dataset.
best_dt=DecisionTreeClassifier(criterion= 'gini', max_depth=5)
best_dt.fit(X_train, y_train)

# Score best_dt (not the untuned `dtree` fitted earlier) so the reported
# accuracy belongs to the model that was just trained.
dt_acc=best_dt.score(X_test,y_test)
print(dt_acc)

In [None]:
from sklearn.metrics import roc_auc_score

# Predict with best_dt so the report and the AUC below describe the same
# model (the untuned `dtree` was used here before).
y_pred = best_dt.predict(X_test)

# classification_report takes (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, y_pred))

# AUC is computed from the predicted probability of the positive class.
y_pred_prob = best_dt.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print('Decisiontree AUC:',auc)

## LR

In [None]:
#Logistic Regression
from sklearn.pipeline import make_pipeline

# Standardised copies of the train/test features for the final model fit;
# the scaler is fitted on the training split only.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
lr=LogisticRegression(C=5,max_iter=100)

# Cross-validate a scaler+model pipeline on the unscaled training data, so
# the scaler is re-fitted inside each fold. Scaling the whole training set
# first would let each validation fold influence the scaling statistics.
lr_cv = make_pipeline(StandardScaler(), LogisticRegression(C=5,max_iter=100))
lr_scores = cross_val_score(lr_cv, X_train, y_train, cv=5)
lr_auc_scores = cross_val_score(lr_cv, X_train, y_train, scoring='roc_auc',cv=5)
print('Acc',lr_scores)
print('AUC',lr_auc_scores)

In [None]:
lr.fit(X_train_std,y_train)
lr_acc=lr.score(X_test_std,y_test)
print(lr_acc)


In [None]:
from sklearn.metrics import roc_auc_score

# Class predictions of the fitted logistic regression on the scaled test set.
y_pred = lr.predict(X_test_std)

# classification_report takes (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, y_pred))

# AUC is computed from the predicted probability of the positive class.
y_pred_prob = lr.predict_proba(X_test_std)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print('Logistic AUC:',auc)

In [None]:
# Hard-coded AUC values for the three model families (random forest,
# decision tree, logistic regression), four values each.
# NOTE(review): presumably one test-set AUC per dataset, in the order
# Dataset1..Dataset4, copied from the "... AUC:" printouts above - confirm.
# These literals are not updated automatically when the cells above are re-run.
rf_auc=[0.899985253373849,0.888862995403576,0.888364878244099,0.887140605579752]
dt_auc=[0.847997587,0.837933711,0.832224335,0.849518883]
lr_auc=[0.858133881,0.8603328,0.85890189,0.842868112]

In [None]:
# Pairwise two-sample t-tests on the AUC lists of the three models.
# ttest_ind treats the two lists as independent samples.
# NOTE(review): if each list position corresponds to the same dataset for all
# models, the samples are paired and stats.ttest_rel may be the more
# appropriate test - confirm.
from scipy import stats
print(stats.ttest_ind(rf_auc, dt_auc))
print(stats.ttest_ind(rf_auc, lr_auc))
print(stats.ttest_ind(dt_auc, lr_auc))

In [None]:
# Pairwise Wilcoxon rank-sum tests on the same AUC lists, as a
# non-parametric counterpart to the t-tests.
# NOTE(review): ranksums also assumes independent samples, and each list has
# only four values - interpret the p-values with care.
from scipy.stats import ranksums
print(ranksums(rf_auc, dt_auc))
print(ranksums(rf_auc, lr_auc))
print(ranksums(dt_auc,lr_auc))