# Homework 3

In this dataset our desired target for classification task will be `y` variable - has the client subscribed a term deposit or not.

In [110]:
import shutil
import os
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score


In [111]:
dv = DictVectorizer(sparse=False)

In [112]:
data = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"

!wget -P ../homework/data $data 

--2024-10-15 07:10:26--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘../homework/data/bank+marketing.zip’

bank+marketing.zip      [     <=>            ] 999,85K   861KB/s    in 1,2s    

2024-10-15 07:10:28 (861 KB/s) - ‘../homework/data/bank+marketing.zip’ saved [1023843]



In [113]:
shutil.unpack_archive("../homework/data/bank+marketing.zip", "../homework/data")
shutil.unpack_archive("../homework/data/bank.zip", "../homework/data")

In [114]:
files_paths = [
    "../homework/data/bank-additional.zip",
    "../homework/data/bank-names.txt", 
    "../homework/data/bank.csv", 
    "../homework/data/bank.zip", 
    "../homework/data/bank+marketing.zip"
]

for file in files_paths:
    os.remove(file)

### Data preparation

In [115]:
df = pd.read_csv("../homework/data/bank-full.csv", sep=";")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [116]:
columns = ["age", "job", "marital", "education", "balance", "housing", "contact", "day", "month", "duration", "campaign", "pdays", "previous", "poutcome", "y"]

In [117]:
df = df[columns]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [118]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [119]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

### Question 1

What is the most frequent observation (mode) for the column `education`?

- `unknown`
- `primary`
- `secondary`
- `tertiary`

In [120]:
df.education.mode()

0    secondary
Name: education, dtype: object

### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

- `age` and `balance`
- `day` and `campaign`
- `day` and `pdays`
- `pdays` and `previous`

In [121]:
age_balance_correlation = df[["age"]].corrwith(df["balance"])
day_campaign_correlation = df[["day"]].corrwith(df["campaign"])
day_pdays_correlation = df[["day"]].corrwith(df["pdays"])
pdays_previous_correlation = df[["pdays"]].corrwith(df["previous"])

print("Correlation between age and balance:", age_balance_correlation)
print("Correlation between day and campaign:", day_campaign_correlation)
print("Correlation between day and pdays:", day_pdays_correlation)
print("Correlation between pdays and previous:", pdays_previous_correlation)

Correlation between age and balance: age    0.097783
dtype: float64
Correlation between day and campaign: day    0.16249
dtype: float64
Correlation between day and pdays: day   -0.093044
dtype: float64
Correlation between pdays and previous: pdays    0.45482
dtype: float64


Answer: The highest correlation is between `pdays` and `previous` with `0.45482`

### Target encoding

* Now we want to encode the `y` variable.
* Let's replace the values `yes`/`no` with `1`/`0`.

In [122]:
df.y.unique()

array(['no', 'yes'], dtype=object)

In [123]:
df.y = (df.y == 'yes').astype(int)

In [124]:
df.y.unique()

array([0, 1])

### Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
* Make sure that the target value `y` is not in your dataframe.

In [125]:
df.shape

(45211, 15)

In [126]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

len(df_full_train), len(df_train), len(df_val), len(df_test)

(36168, 27126, 9042, 9043)

In [127]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [128]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [129]:
del df_train['y']
del df_val['y']
del df_test['y']

### Question 3

* Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the biggest mutual information score?
  
- `contact`
- `education`
- `housing`
- `poutcome`

In [130]:
df_full_train.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
3344,41,blue-collar,married,primary,849,yes,unknown,15,may,72,1,-1,0,unknown,0
17965,49,technician,married,primary,1415,yes,cellular,30,jul,269,2,-1,0,unknown,0
18299,42,admin.,married,secondary,3842,no,cellular,31,jul,130,4,-1,0,unknown,0
10221,37,management,single,tertiary,-119,yes,unknown,11,jun,375,11,-1,0,unknown,0
32192,56,blue-collar,married,primary,3498,no,cellular,15,apr,264,2,-1,0,unknown,1


In [131]:
contact = mutual_info_score(df_full_train.y, df_full_train.contact).round(2)
education = mutual_info_score(df_full_train.y, df_full_train.education).round(2)
housing = mutual_info_score(df_full_train.y, df_full_train.housing).round(2)
poutcome = mutual_info_score(df_full_train.y, df_full_train.poutcome).round(2)

mi = contact, education, housing, poutcome
print(mi)


(0.01, 0.0, 0.01, 0.03)


Answer: The highest mutual information score is `poutcome`

### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.6
- 0.7
- 0.8
- 0.9

In [132]:
train_dict = df_train.to_dict(orient="records")
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient="records")
X_val = dv.transform(val_dict)

In [133]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [134]:
model.fit(X_train, y_train)

In [135]:
y_pred = model.predict_proba(X_val)[:, 1]

In [136]:
term_deposit = (y_pred >= 0.5)

In [89]:
accuracy = (y_val == term_deposit).mean()
accuracy.round(2)

0.9

Answer: The accuracy is `0.9`

### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of following feature has the smallest difference?

- `age`
- `balance`
- `marital`
- `previous`

> **Note**: The difference doesn't have to be positive.

In [141]:
features = ["age", "balance", "marital", "previous"]

df_train_e = df_train[features]
df_val_e = df_val[features]

In [142]:
train_dict_e = df_train_e.to_dict(orient="records")
X_train_e = dv.fit_transform(train_dict_e)

train_dict_e = df_val_e.to_dict(orient="records")
X_val_e = dv.transform(train_dict_e)

In [143]:
model_e = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [144]:
model_e.fit(X_train_e, y_train)

In [145]:
y_pred_e = model_e.predict_proba(X_val_e)[:, 1]
term_deposit_e = (y_pred_e >= 0.5)
accuracy_e = (term_deposit_e == y_val).mean()
accuracy_e

0.880336208803362

In [159]:
records = {}

for f in features:
    train_dict_e = df_train_e.drop(columns=f).to_dict(orient="records")
    X_train = dv.fit_transform(train_dict_e)
    
    val_dict = df_val_e.drop(columns=f).to_dict(orient="records")
    X_val = dv.transform(val_dict)
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict_proba(X_val)[:, 1]
    term_deposit = (y_pred >= 0.5)
    accuracy = (term_deposit == y_val).mean()
    diff = abs(accuracy_e - accuracy)
    
    records[f] = diff    
    
records

{'age': 0.0001105950011059953,
 'balance': 0.0,
 'marital': 0.00011059500110588427,
 'previous': 0.0013271400132713884}

In [163]:
smallest_diff = min(records, key=records.get)
print(f"The least important useful feature is: {smallest_diff}")

The least important useful feature is: balance


### Question 6

* Now let's train a regularized logistic regression.
* Let's try the following values of the parameter `C`: `[0.01, 0.1, 1, 10, 100]`.
* Train models using all the features as in Q4.
* Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these `C` leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100

> **Note**: If there are multiple options, select the smallest `C`.

In [166]:
values = [0.01, 0.1, 1, 10, 100]
records = {}

for n in values:
    train_dict = df_train.drop(columns=f).to_dict(orient="records")
    X_train = dv.fit_transform(train_dict)
    
    val_dict = df_val.drop(columns=f).to_dict(orient="records")
    X_val = dv.transform(val_dict)
    
    model = LogisticRegression(solver='liblinear', C=n, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict_proba(X_val)[:, 1]
    term_deposit = (y_pred >= 0.5)
    accuracy = (term_deposit == y_val).mean()
    
    records[n] = accuracy.round(3)

records


{0.01: 0.899, 0.1: 0.901, 1: 0.901, 10: 0.901, 100: 0.901}

Answer: Since there are multiple options, the smallest `C` was `0.01`.