In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,classification_report,confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
data = pd.read_csv("bank.csv")
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoder = LabelEncoder()

for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

x = data.drop('y',axis = 1)
y = data['y']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state =42)

In [4]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [5]:
DeciT= DecisionTreeClassifier()
DeciT.fit(x_train,y_train)
y_pred = DeciT.predict(x_test)
accuracy = accuracy_score(y_pred,y_test)
print("Accuracy for decision tree:",accuracy)
precision = precision_score(y_pred,y_test)
print("Precision for decision tree:",precision)

Logs= LogisticRegression()
Logs.fit(x_train,y_train)
y_pred = Logs.predict(x_test)
accuracy = accuracy_score(y_pred,y_test)
print("Accuracy for Logistic Regression:",accuracy)
precision = precision_score(y_pred,y_test)
print("Precision for Logistic Regression:",precision)

RFC = RandomForestClassifier()
RFC.fit(x_train,y_train)
y_pred = RFC.predict(x_test)
accuracy = accuracy_score(y_pred,y_test)
print("Accuracy for Random Forest Classifier:",accuracy)
precision = precision_score(y_pred,y_test)
print("Precision for Random Forest Classifier:",precision)

Accuracy for decision tree: 0.8721663164878912
Precision for decision tree: 0.4747937671860678
Accuracy for Logistic Regression: 0.8878690699988941
Precision for Logistic Regression: 0.21723189734188816
Accuracy for Random Forest Classifier: 0.9023554130266505
Precision for Random Forest Classifier: 0.4271310724106325


In [6]:
voting_model = VotingClassifier(
       estimators=[
           ('logistic', Logs),
           ('decision_tree', DeciT),
           ('random_forest', RFC)
       ],
       voting='hard'  # 'hard' for majority voting, 'soft' for averaging probabilities
   )

# Train the voting classifier
voting_model.fit(x_train, y_train)

In [7]:
y_pred_voting = voting_model.predict(x_test)
# Evaluate the model
print("Voting Classifier Accuracy:", accuracy_score(y_test, y_pred_voting))
print("Voting Classifier Precision:", precision_score(y_test, y_pred_voting))
print("Classification Report:\n", classification_report(y_test, y_pred_voting))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_voting))

Voting Classifier Accuracy: 0.8992590954329316
Voting Classifier Precision: 0.6428571428571429
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94      7952
           1       0.64      0.37      0.47      1091

    accuracy                           0.90      9043
   macro avg       0.78      0.67      0.71      9043
weighted avg       0.89      0.90      0.89      9043

Confusion Matrix:
 [[7727  225]
 [ 686  405]]


In [8]:
import joblib
joblib.dump(voting_model, 'voting_model.pkl')


['voting_model.pkl']

In [9]:
from collections import Counter
print(Counter(y_train))  # Replace y_train with your actual target variable


Counter({0: 31970, 1: 4198})


In [10]:
not_subscribed = data[data['y'] == 0]

# Display the first 10 entries
print(not_subscribed.head(10))

   age  job  marital  education  default  balance  housing  loan  contact  \
0   58    4        1          2        0     2143        1     0        2   
1   44    9        2          1        0       29        1     0        2   
2   33    2        1          1        0        2        1     1        2   
3   47    1        1          3        0     1506        1     0        2   
4   33   11        2          3        0        1        0     0        2   
5   35    4        1          2        0      231        1     0        2   
6   28    4        2          2        0      447        1     1        2   
7   42    2        0          2        1        2        1     0        2   
8   58    5        1          0        0      121        1     0        2   
9   43    9        2          1        0      593        1     0        2   

   day  month  duration  campaign  pdays  previous  poutcome  y  
0    5      8       261         1     -1         0         3  0  
1    5      8       

In [11]:
#Assigning weights(WEIGHTED VOTING SYSTEM)
from sklearn.model_selection import cross_val_score

# Assuming X_train and y_train are your training data and labels
lr_score = cross_val_score(voting_model.estimators_[0], x_train, y_train, cv=5, scoring='accuracy').mean()
dt_score = cross_val_score(voting_model.estimators_[1], x_train, y_train, cv=5, scoring='accuracy').mean()
rf_score = cross_val_score(voting_model.estimators_[2], x_train, y_train, cv=5, scoring='accuracy').mean()

print(f"Logistic Regression Accuracy: {lr_score}")
print(f"Decision Tree Accuracy: {dt_score}")
print(f"Random Forest Accuracy: {rf_score}")


Logistic Regression Accuracy: 0.8916723419344745
Decision Tree Accuracy: 0.8727604651689923
Random Forest Accuracy: 0.9042524507428992


In [12]:
total_score = lr_score + dt_score + rf_score
lr_weight = lr_score / total_score
dt_weight = dt_score / total_score
rf_weight = rf_score / total_score

model_weights = [lr_weight, dt_weight, rf_weight]
print(f"Assigned Weights: Logistic Regression: {lr_weight}, Decision Tree: {dt_weight}, Random Forest: {rf_weight}")


Assigned Weights: Logistic Regression: 0.33412420566000195, Decision Tree: 0.32703761622054733, Random Forest: 0.3388381781194507


In [13]:
z = accuracy_score(y_pred,y_test)
print(z)

0.9023554130266505
