In [1]:
import requests
import zipfile
import io
import pandas as pd

# URL of the dataset
url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"
inner_zip_file = "bank.zip"
csv_file = "bank-full.csv"

# Stream the outer zip file from the URL
response = requests.get(url)
response.raise_for_status()  # Ensure the request was successful
with zipfile.ZipFile(io.BytesIO(response.content)) as outer_zip:
    # Extract the inner zip file (bank.zip)
    with outer_zip.open(inner_zip_file) as inner_zip_stream:
        with zipfile.ZipFile(io.BytesIO(inner_zip_stream.read())) as inner_zip:
            # Extract the CSV file from the inner zip file
            with inner_zip.open(csv_file) as file:
                df = pd.read_csv(file, sep=';')

# Display the first few rows of the DataFrame
print(df)

       age           job   marital  education default  balance housing loan  \
0       58    management   married   tertiary      no     2143     yes   no   
1       44    technician    single  secondary      no       29     yes   no   
2       33  entrepreneur   married  secondary      no        2     yes  yes   
3       47   blue-collar   married    unknown      no     1506     yes   no   
4       33       unknown    single    unknown      no        1      no   no   
...    ...           ...       ...        ...     ...      ...     ...  ...   
45206   51    technician   married   tertiary      no      825      no   no   
45207   71       retired  divorced    primary      no     1729      no   no   
45208   72       retired   married  secondary      no     5715      no   no   
45209   57   blue-collar   married  secondary      no      668      no   no   
45210   37  entrepreneur   married  secondary      no     2971      no   no   

         contact  day month  duration  campaign  pd

In [2]:
columns = ["age", "job", "marital", "education", "balance", "housing", "contact", "day", "month", "duration", "campaign", "pdays", "previous", "poutcome", "y"]

df = df[columns]
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [3]:
# Most Frequent Observation in Education

df['education'].mode()

0    secondary
Name: education, dtype: object

In [4]:
df.corr(numeric_only=True)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [5]:
# Target Encoding

df['y'] = df['y'].replace({'yes': 1, 'no': 0})
df.head()

  df['y'] = df['y'].replace({'yes': 1, 'no': 0})


Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0


In [6]:
#Train_Test_Split
from sklearn.model_selection import train_test_split
X = df.copy()

X_train_full, X_test = train_test_split(X, test_size=0.2, random_state=42, shuffle=True)
X_train, X_val = train_test_split(X_train_full, test_size=0.25, random_state=42, shuffle=True)

df_train_full = X_train_full.reset_index(drop=True)
df_train = X_train.reset_index(drop=True)
df_val = X_val.reset_index(drop =True)
df_test = X_test.reset_index(drop=True)

y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']                                                    

In [7]:
#Mutual Info Score
from sklearn.metrics import mutual_info_score

cat = ["job",	"marital",	"education", "housing", "contact", "month",	"poutcome"]

def mutual_info_y_score(series):
    return mutual_info_score(series, df_train_full.y)

mi = round(df_train_full[cat].apply(mutual_info_y_score), 2)
mi.sort_values(ascending = False)

poutcome     0.03
month        0.02
job          0.01
contact      0.01
housing      0.01
education    0.00
marital      0.00
dtype: float64

In [8]:
# Train Logistic Regression
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

all_features = df_train.columns
# Function to train the model and calculate accuracy
def train_and_evaluate(train, val, y_train, y_val, features):
    train_dict = train[features].to_dict(orient='records')
    val_dict = val[features].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    x_train = dv.fit_transform(train_dict)
    x_val = dv.transform(val_dict)

    # Train logistic regression model
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(x_train, y_train)

    # Predict on validation set
    y_pred = model.predict(x_val)

    # Calculate accuracy
    return accuracy_score(y_val, y_pred)


# Base model with all features
base_accuracy = train_and_evaluate(df_train, df_val, y_train, y_val, all_features)
print(f"Base model accuracy with all features: {base_accuracy:.4f}")


Base model accuracy with all features: 0.9010


In [9]:
# Feature Importance

# Features to evaluate
features_to_evaluate = ['age', 'balance', 'marital', 'previous']

# Store results
results = {}

# Iterate through each feature and exclude it
for feature in features_to_evaluate:
    features_subset = [f for f in all_features if f != feature]
    accuracy = train_and_evaluate(df_train, df_val, y_train, y_val, features_subset)
    accuracy_diff = base_accuracy - accuracy
    results[feature] = accuracy_diff
    print(f"Accuracy without {feature}: {accuracy:.4f} (Difference: {accuracy_diff:.4f})")

# Find the feature with the smallest difference
least_useful_feature = min(results, key=results.get)
print(f"\nFeature with the smallest difference in accuracy: {least_useful_feature}")

Accuracy without age: 0.9010 (Difference: 0.0000)
Accuracy without balance: 0.9008 (Difference: 0.0002)
Accuracy without marital: 0.9002 (Difference: 0.0008)
Accuracy without previous: 0.9012 (Difference: -0.0002)

Feature with the smallest difference in accuracy: previous


In [10]:
#Regularized Logistic Regression

# Function to train and evaluate the model for a given value of C
def train_and_evaluate_regularized(train, val, y_train, y_val, features, C_value):
    train_dict = train[features].to_dict(orient='records')
    val_dict = val[features].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    x_train = dv.fit_transform(train_dict)
    x_val = dv.transform(val_dict)

    # Train logistic regression model with regularization parameter C
    model = LogisticRegression(solver='liblinear', C=C_value, max_iter=1000, random_state=42)
    model.fit(x_train, y_train)

    # Predict on validation set
    y_pred = model.predict(x_val)

    # Calculate accuracy
    return accuracy_score(y_val, y_pred)

# List of C values to try
C_values = [0.01, 0.1, 1, 10, 100]

# Store results
best_accuracy = 0
best_C = None
results = {}

# Iterate over each C value
for C in C_values:
    accuracy = train_and_evaluate_regularized(df_train, df_val, y_train, y_val, all_features, C)
    accuracy_rounded = round(accuracy, 3)
    results[C] = accuracy_rounded
    print(f"Accuracy with C={C}: {accuracy_rounded}")

    # Track the best accuracy and corresponding C
    if accuracy_rounded > best_accuracy:
        best_accuracy = accuracy_rounded
        best_C = C

print(f"\nBest C value: {best_C} with accuracy: {best_accuracy:.3f}")

Accuracy with C=0.01: 0.899
Accuracy with C=0.1: 0.9
Accuracy with C=1: 0.901
Accuracy with C=10: 0.901
Accuracy with C=100: 0.901

Best C value: 1 with accuracy: 0.901
