In [2]:
import io
import time
import zipfile

import pandas as pd
import requests

# Location of the dataset: a zip archive that contains bank.zip, which in turn
# holds the semicolon-separated bank-full.csv.
url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"
inner_zip_file = "bank.zip"
csv_file = "bank-full.csv"

# Columns kept for the analysis ("y" is the target used by the later cells).
columns = ["age", "job", "marital", "education", "balance", "housing", "contact", "day", "month", "duration", "campaign", "pdays", "previous", "poutcome", "y"]

# Download the outer archive fully into memory.
# - timeout: without it a stalled connection blocks the cell forever.
# - bounded retry: rides out transient gateway errors (e.g. HTTP 502); the last
#   failure is re-raised unchanged so the error stays visible.
max_attempts = 3
for attempt in range(1, max_attempts + 1):
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Ensure the request was successful
        break
    except requests.RequestException:
        if attempt == max_attempts:
            raise
        time.sleep(2 * attempt)  # short linear back-off before the next try

with zipfile.ZipFile(io.BytesIO(response.content)) as outer_zip:
    # bank.zip is nested inside the download: read its bytes and open them as
    # a second in-memory archive.
    with zipfile.ZipFile(io.BytesIO(outer_zip.read(inner_zip_file))) as inner_zip:
        # Extract the CSV file from the inner zip file
        with inner_zip.open(csv_file) as file:
            df_raw = pd.read_csv(file, sep=';')

# Explicit copy so that later column assignments on df are not flagged as
# writes to a slice of df_raw.
df = df_raw[columns].copy()

# Display the first few rows of the DataFrame
df.head()

HTTPError: 502 Server Error: Bad Gateway for url: https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

In [None]:
# Data Splitting
# Target Encoding

# Encode the target explicitly as int64 (yes -> 1, no -> 0).
# Series.replace() used to rely on silent object->int downcasting, which
# pandas has deprecated; without it y would stay object-typed.
# The dtype guard makes the cell safe to re-run, and .astype fails loudly if a
# label other than 'yes'/'no' shows up (map() turns it into NaN).
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = df['y'].map({'yes': 1, 'no': 0}).astype('int64')


#Train_Test_Split
from sklearn.model_selection import train_test_split
X = df.copy()

# 20% held out for test; 0.25 of the remaining 80% is another 20% for
# validation, i.e. a 60/20/20 train/val/test split.
X_train_full, X_test = train_test_split(X, test_size=0.2, random_state=42, shuffle=True)
X_train, X_val = train_test_split(X_train_full, test_size=0.25, random_state=42, shuffle=True)

# Sanity check: the three parts partition the full data set.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

df_train_full = X_train_full.reset_index(drop=True)
df_train = X_train.reset_index(drop=True)
df_val = X_val.reset_index(drop=True)
df_test = X_test.reset_index(drop=True)

# Pull the target out as plain arrays ...
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

# ... and drop it from the feature frames so it cannot leak into the model.
# (df_train_full keeps its 'y' column.)
del df_train['y']
del df_val['y']
del df_test['y']

In [3]:

from sklearn.metrics import *
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression



def _fit_and_score(train_dict, val_dict, y_train, y_val):
    """Fit a logistic regression on one-hot encoded records and return the validation ROC AUC.

    Parameters
    ----------
    train_dict, val_dict : list of dict
        Row records (as produced by ``DataFrame.to_dict(orient='records')``).
    y_train, y_val : array-like
        Binary targets aligned with the records.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the validation rows.
    """
    dv = DictVectorizer(sparse=False)
    x_train = dv.fit_transform(train_dict)
    x_val = dv.transform(val_dict)

    # Train logistic regression model
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    model.fit(x_train, y_train)

    # ROC AUC ranks examples by score, so it needs the continuous
    # positive-class probability.  Hard 0/1 labels from predict() collapse the
    # curve to a single operating point and understate the AUC.
    y_score = model.predict_proba(x_val)[:, 1]

    return roc_auc_score(y_val, y_score)


def train_and_evaluate(train, val, y_train, y_val, features):
    """Train on the selected ``features`` and return the validation ROC AUC.

    Parameters
    ----------
    train, val : pandas.DataFrame
        Feature frames; both must contain every column named in ``features``.
    y_train, y_val : array-like
        Binary targets aligned with ``train`` and ``val``.
    features : str or list of str
        Column name(s) to use.  A single name is accepted and treated as a
        one-element list.

    Returns
    -------
    float
        Validation ROC AUC.
    """
    if isinstance(features, str):
        features = [features]
    features = list(features)

    train_dict = train[features].to_dict(orient='records')
    val_dict = val[features].to_dict(orient='records')
    return _fit_and_score(train_dict, val_dict, y_train, y_val)


def train_and_evaluate_allfeat(train, val, y_train, y_val):
    """Train on every column of ``train`` and return the validation ROC AUC.

    Same procedure as ``train_and_evaluate`` but without column selection.
    """
    train_dict = train.to_dict(orient='records')
    val_dict = val.to_dict(orient='records')
    return _fit_and_score(train_dict, val_dict, y_train, y_val)


In [None]:
# ROC AUC feature importance: rank numerical columns by how well each one,
# used directly as a score, separates the training target.

feat = ['balance', 'day', 'duration', 'previous']
score = {}

for n in feat:
    column = df_train[n]
    aucscore = roc_auc_score(y_train, column)
    # An AUC below 0.5 means the column is negatively related to the target;
    # score the negated column instead so every feature is compared on the
    # same "higher is better" footing.
    if aucscore < 0.5:
        aucscore = roc_auc_score(y_train, -column)
    score[n] = aucscore

# Name of the feature whose (orientation-corrected) AUC is largest.
max_score = max(score, key=lambda name: score[name])
print(f"The numerical variable with the highest AUC is: {max_score} with an AUC of {score[max_score]}")


In [None]:
# Validation ROC AUC of the logistic regression trained on every feature,
# rounded to three decimals for display.

scores = train_and_evaluate_allfeat(
    train=df_train,
    val=df_val,
    y_train=y_train,
    y_val=y_val,
)
auc_scores = round(scores, ndigits=3)
auc_scores

In [None]:
# Precision and Recall
# Define the start, stop, and step size
import numpy as np
import matplotlib.pyplot as plt
# Threshold grid: from `start` to `stop` (both included) in increments of `step`.
start = 0
stop = 1
step = 0.01

# Calculate the number of samples.
# Round before converting: a float quotient can land just below the intended
# integer (e.g. 0.3 / 0.1 == 2.9999999999999996), and a bare int() would then
# truncate and drop the last grid point.
num_samples = int(round((stop - start) / step)) + 1

def train_and_evaluate_allfeat1(train, val, y_train, y_val):
    """Fit a logistic regression on all columns and return validation probabilities.

    Both frames are converted to row records and one-hot encoded with a
    ``DictVectorizer`` fitted on ``train`` only.

    Parameters
    ----------
    train, val : pandas.DataFrame
        Feature frames for fitting and for prediction.
    y_train : array-like
        Targets aligned with ``train``.
    y_val : array-like
        Accepted for signature symmetry with the scoring helpers; not used here.

    Returns
    -------
    numpy.ndarray
        Second column of ``predict_proba`` for every row of ``val`` (the
        probability of label 1 when the targets are encoded as 0/1).
    """
    vectorizer = DictVectorizer(sparse=False)
    features_train = vectorizer.fit_transform(train.to_dict(orient='records'))
    features_val = vectorizer.transform(val.to_dict(orient='records'))

    classifier = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    classifier.fit(features_train, y_train)

    # Keep only the positive-class column of the (n_rows, 2) probability matrix.
    return classifier.predict_proba(features_val)[:, 1]

thresh = np.linspace(start, stop, num=num_samples)
y_pred = train_and_evaluate_allfeat1(df_train, df_val, y_train, y_val)

# Initialize lists to store precision and recall scores
pre_scores = []
rec_scores = []
# True where the threshold still yields at least one positive prediction,
# i.e. where precision is actually defined.
has_positive_pred = []

for i in thresh:
    # Binarize predictions based on the threshold
    y_pred_binary = (y_pred >= i).astype(int)

    # Compute precision and recall scores.
    # zero_division=0 keeps the 0.0 fallback for "no positive predictions"
    # but without emitting an UndefinedMetricWarning at every such threshold.
    pre_score = precision_score(y_val, y_pred_binary, zero_division=0)
    rec_score = recall_score(y_val, y_pred_binary)

    # Append the scores to the lists
    pre_scores.append(pre_score)
    rec_scores.append(rec_score)
    has_positive_pred.append(bool(y_pred_binary.any()))

# Plot precision and recall curves
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(thresh, pre_scores, label='Precision')
ax.plot(thresh, rec_scores, label='Recall')
ax.set_xlabel('Threshold')
ax.set_ylabel('Score')
ax.set_title('Precision and Recall vs Threshold')
ax.legend()
ax.grid(True)
plt.show()

# Find the threshold where precision and recall intersect.
# When nothing is predicted positive, precision falls back to 0 and recall is
# 0, so their gap is exactly 0 and would win the argmin even though the curves
# do not really cross there.  Exclude those degenerate thresholds first.
gap = np.abs(np.array(pre_scores, dtype=float) - np.array(rec_scores, dtype=float))
gap[~np.array(has_positive_pred, dtype=bool)] = np.inf
intersection_threshold = thresh[np.argmin(gap)]
print(f"The threshold at which precision and recall curves intersect is: {intersection_threshold}")
