# 2. Classification - Homework

In [1]:
import pandas as pd

In [4]:
df = pd.read_csv("./bank-full.csv", sep=";")

## Question 1

In [5]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [6]:
# Select relevant columns
columns = [
    "age", "job", "marital", "education", "balance", "housing", "contact",
    "day", "month", "duration", "campaign", "pdays", "previous", "poutcome", "y"
]
df = df[columns]

# Check for missing values
print(df.isnull().sum())

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [7]:
# Find the most frequent value (mode) in the 'education' column
most_frequent_education = df['education'].mode()[0]
print("Most frequent observation for education:", most_frequent_education)

Most frequent observation for education: secondary


## Question 2

In [8]:
# Select only numerical columns
numerical_columns = ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]

# Create correlation matrix
correlation_matrix = df[numerical_columns].corr()

# Find the two features with the highest correlation
correlations = correlation_matrix.unstack().sort_values(ascending=False)
# Exclude self-correlation
correlations = correlations[correlations < 1]

# Display the two features with the highest correlation
print("Two features with the highest correlation:", correlations.idxmax())

Two features with the highest correlation: ('previous', 'pdays')


## Question 3

In [17]:
# Encode the 'y' variable (yes/no -> 1/0)
df['y'] = df['y'].map({'yes': 1, 'no': 0})

In [10]:
from sklearn.model_selection import train_test_split

# Split the data into train/validation/test sets (60%/20%/20%)
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Make sure to remove 'y' from the features for later models
X_train = train_df.drop('y', axis=1)
y_train = train_df['y']
X_val = val_df.drop('y', axis=1)
y_val = val_df['y']
X_test = test_df.drop('y', axis=1)
y_test = test_df['y']

In [16]:
from sklearn.feature_selection import mutual_info_classif

# Select the categorical variables
categorical_columns = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

# Apply one-hot encoding to the categorical columns
X_train_encoded = pd.get_dummies(X_train[categorical_columns])

# Calculate mutual information score
mi_scores = mutual_info_classif(X_train_encoded, y_train)

# Display the mutual information scores
mi_scores_rounded = [round(score, 2) for score in mi_scores]
mi_df = pd.DataFrame({'Feature': X_train_encoded.columns, 'MI Score': mi_scores_rounded})
print(mi_df.sort_values(by='MI Score', ascending=False))

                Feature  MI Score
38     poutcome_success      0.03
39     poutcome_unknown      0.02
23      contact_unknown      0.02
13      marital_married      0.01
32            month_may      0.01
21     contact_cellular      0.01
19           housing_no      0.01
17   education_tertiary      0.01
16  education_secondary      0.01
20          housing_yes      0.01
10       job_unemployed      0.00
11          job_unknown      0.00
2      job_entrepreneur      0.00
37       poutcome_other      0.00
36     poutcome_failure      0.00
35            month_sep      0.00
34            month_oct      0.00
33            month_nov      0.00
3         job_housemaid      0.00
31            month_mar      0.00
30            month_jun      0.00
29            month_jul      0.00
28            month_jan      0.00
27            month_feb      0.00
26            month_dec      0.00
25            month_aug      0.00
24            month_apr      0.00
4        job_management      0.00
22    contact_

In [19]:
from sklearn.metrics import mutual_info_score, accuracy_score


# List the categorical columns.
cat_list = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

# Define cal_mi function.
def cal_mi(series):
    return mutual_info_score(series, y_train)

# Calculate MI.
df_mi = X_train[cat_list].apply(cal_mi).round(2)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
poutcome,0.03
month,0.02
job,0.01
housing,0.01
contact,0.01
marital,0.0
education,0.0


## Question 4

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Apply one-hot encoding to the categorical columns for training and validation
X_train_encoded = pd.get_dummies(X_train)
X_val_encoded = pd.get_dummies(X_val)

# Ensure columns in the train and validation sets match
X_val_encoded = X_val_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Train logistic regression model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Predict on the validation set
y_val_pred = model.predict(X_val_encoded)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", round(accuracy, 2))

Validation Accuracy: 0.9


## Question 5

In [20]:
# List the features
features = X_train.columns.to_list()
features

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [None]:
from sklearn.feature_extraction import DictVectorizer

# Apply the feature elimination technique.
original_score = accuracy
scores = pd.DataFrame(columns=['eliminated_feature', 'accuracy', 'difference'])
for feature in features:
    subset = features.copy()
    subset.remove(feature)
    
    dv = DictVectorizer(sparse=False)
    train_dict = X_train[subset].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    val_dict = X_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    
    scores.loc[len(scores)] = [feature, score, original_score - score]

## Question 6