In [2]:
import requests
import zipfile
import io
import os

# URL of the main zip file
url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"

# Function to unzip a file from memory or disk
def unzip_file(file_content, extract_to='.'):
    with zipfile.ZipFile(io.BytesIO(file_content)) as zip_ref:
        zip_ref.extractall(extract_to)

# Download the main zip file
response = requests.get(url)

if response.status_code == 200:
    print("Main zip file downloaded.")
    # Extract the first zip file from memory
    with zipfile.ZipFile(io.BytesIO(response.content)) as main_zip:
        main_zip.extractall()  # Extracts to the current directory
        print("Main zip file extracted.")
        
        # Check if there are any zip files in the extracted contents
        for file_name in main_zip.namelist():
            if file_name.endswith('.zip'):
                print(f"Found a nested zip file: {file_name}")
                # Open and extract the nested zip file
                with open(file_name, 'rb') as nested_zip_file:
                    unzip_file(nested_zip_file.read())
                print(f"Extracted nested zip file: {file_name}")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")


Main zip file downloaded.
Main zip file extracted.
Found a nested zip file: bank.zip
Extracted nested zip file: bank.zip
Found a nested zip file: bank-additional.zip
Extracted nested zip file: bank-additional.zip


In [28]:
import pandas as pd
import numpy as np
# Load the data
df = pd.read_csv('bank.csv', sep=';')  


# Select only required columns
columns_to_use  = [
    'age',
    'job',
    'marital',
    'education',
    'balance',
    'housing',
    'contact',
    'day',
    'month',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
    'y'
]

df = df[columns_to_use]

print(df)


      age            job  marital  education  balance housing   contact  day  \
0      30     unemployed  married    primary     1787      no  cellular   19   
1      33       services  married  secondary     4789     yes  cellular   11   
2      35     management   single   tertiary     1350     yes  cellular   16   
3      30     management  married   tertiary     1476     yes   unknown    3   
4      59    blue-collar  married  secondary        0     yes   unknown    5   
...   ...            ...      ...        ...      ...     ...       ...  ...   
4516   33       services  married  secondary     -333     yes  cellular   30   
4517   57  self-employed  married   tertiary    -3313     yes   unknown    9   
4518   57     technician  married  secondary      295      no  cellular   19   
4519   28    blue-collar  married  secondary     1137      no  cellular    6   
4520   44   entrepreneur   single   tertiary     1136     yes  cellular    3   

     month  duration  campaign  pdays  

In [16]:
# Question 1
# What is the most frequent observation (mode) for the column education?
# Get the mode for the 'education' column
education_mode = df['education'].mode()[0]

print(f"The most frequent observation (mode) for the 'education' column is: {education_mode}")

The most frequent observation (mode) for the 'education' column is: secondary


In [17]:
# Question 2
# Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

# What are the two features that have the biggest correlation?
# Select only the numerical features (automatically detects numerical columns)
numerical_df = df.select_dtypes(include=['float64', 'int64'])

# Compute the correlation matrix
correlation_matrix = numerical_df.corr()

# Display the correlation matrix
print(correlation_matrix)

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.083820 -0.017853 -0.002367 -0.005148 -0.008894 -0.003511
balance   0.083820  1.000000 -0.008677 -0.015950 -0.009976  0.009437  0.026196
day      -0.017853 -0.008677  1.000000 -0.024629  0.160706 -0.094352 -0.059114
duration -0.002367 -0.015950 -0.024629  1.000000 -0.068382  0.010380  0.018080
campaign -0.005148 -0.009976  0.160706 -0.068382  1.000000 -0.093137 -0.067833
pdays    -0.008894  0.009437 -0.094352  0.010380 -0.093137  1.000000  0.577562
previous -0.003511  0.026196 -0.059114  0.018080 -0.067833  0.577562  1.000000


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif


# Split the data into features and target variable
X = df.drop(columns=['y']).copy()
y = df['y']

# Split the data into train/val/test sets (60%/20%/20%)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)


# Reset_index for y_train, y_val and y_test.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Define y_train, y_val and y_test.
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

# Drop y from y_train, y_val and y_test.
df_train = df_train.drop('y', axis=1)
df_val = df_val.drop('y', axis=1)
df_test = df_test.drop('y', axis=1)


In [32]:
# Question 3
# Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
# Round the scores to 2 decimals using round(score, 2).
# Which of these variables has the biggest mutual information score?

# contact
# education
# housing
# poutcome

# Define cal_mi function.
def cal_mi(series):
    return mutual_info_score(series, y_train)


# List the categorical columns.
df_cat = df.copy().select_dtypes(exclude='number').columns

# List the categorical columns.
cat_list = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

# Calculate MI.
df_mi = df_train[cat_list].apply(cal_mi).round(2)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
poutcome,0.03
month,0.02
job,0.01
housing,0.01
contact,0.01
marital,0.0
education,0.0


In [35]:
# Question 4
# Now let's train a logistic regression.
# Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
# Fit the model on the training dataset.
# To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
# model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
# Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

# Derive X_train.
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

#Fit X_train and y_train to model.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Derive X_val.
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Obtain y_pred.
y_pred = model.predict(X_val)
# Calculate accuracy.
accuracy = np.round(accuracy_score(y_val, y_pred), 2)
print(f'Accuracy = {accuracy}')

Accuracy = 0.89


In [36]:
# Question 5
# Let's find the least useful feature using the feature elimination technique.
# Train a model with all these features (using the same parameters as in Q4).
# Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
# For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
# Which of following feature has the smallest difference?

# age
# balance
# marital
# previous

# List the features.
features = df_train.columns.to_list()

# Apply the feature elimination technique.
original_score = accuracy
scores = pd.DataFrame(columns=['eliminated_feature', 'accuracy', 'difference'])
for feature in features:
    subset = features.copy()
    subset.remove(feature)
    
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[subset].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    
    scores.loc[len(scores)] = [feature, score, original_score - score]
    
    

# Print the scores.
scores

Unnamed: 0,eliminated_feature,accuracy,difference
0,age,0.888274,0.001726
1,job,0.892699,-0.002699
2,marital,0.890487,-0.000487
3,education,0.889381,0.000619
4,balance,0.886062,0.003938
5,housing,0.888274,0.001726
6,contact,0.884956,0.005044
7,day,0.887168,0.002832
8,month,0.886062,0.003938
9,duration,0.875,0.015
