<a href="https://colab.research.google.com/github/bsalami-092/Blessing-ML-Zoomcamp-2025/blob/main/ML_Zoomcamp_Assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, precision_recall_curve
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# URL of the course lead-scoring CSV; used by the download and read_csv cells below.
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

In [None]:
!wget $data

In [None]:
# Load the dataset directly from the URL stored in `data`.
df = pd.read_csv(data)

# Show the loaded frame as the cell output.
df

In [None]:
# Column dtypes and non-null counts; used below to split categorical vs numerical columns.
df.info()

In [None]:
# Categorical columns are the ones pandas stored with dtype 'object'.
cat_var = df.columns[(df.dtypes == 'object').to_numpy()].tolist()

cat_var

In [None]:
# Everything that is not dtype 'object' is treated as numerical
# (this still includes the target column at this point).
num_var = df.columns[(df.dtypes != 'object').to_numpy()].tolist()

num_var

In [None]:
 df.isna().sum()

In [None]:
# Replace missing categorical values with the literal string 'NA' in every categorical column.
df[cat_var] = df[cat_var].fillna('NA')

In [None]:
# Replace missing numerical values with 0 in every numerical column.
df[num_var] = df[num_var].fillna(0)

In [None]:
# Re-check missing values per column after the two fill steps above.
df.isna().sum()

In [None]:
# 60/20/20 split: hold out 20% for test, then 25% of the remaining 80% for validation.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1)

# Give each part a clean 0..n-1 index (df_train_full keeps its original index and target).
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Pull the target out of each feature frame: pop() returns the column and removes it.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

**Question 1: ROC AUC feature importance**
* ROC AUC could also be used to evaluate feature importance of numerical variables.

* For each numerical variable, use it as score (aka prediction) and compute the AUC with the y variable as ground truth.
* Use the training dataset for that
* If your AUC is < 0.5, invert this variable by putting "-" in front

* (e.g. -df_train['balance'])

* AUC can go below 0.5 if the variable is negatively correlated with the target variable. You can change the direction of the correlation by negating this variable - then negative correlation becomes positive.

* Which numerical variable (among the following 4) has the highest AUC?

* lead_score
* number_of_courses_viewed
* interaction_count
* annual_income

In [None]:
# Q1: treat each numerical feature's raw values as a score and measure how well
# it ranks converted vs non-converted leads.
# The question asks for the *training* dataset, so use df_train / y_train (row-aligned,
# both come from the same split) rather than df_train_full, which also contains the
# validation rows.
auc_score = {}

for i in ['lead_score', 'number_of_courses_viewed', 'interaction_count', 'annual_income']:
    auc = roc_auc_score(y_train, df_train[i])

    # AUC below 0.5 means the feature is negatively related to the target;
    # negating the feature flips the ranking so the AUC is reported above 0.5.
    if auc < 0.5:
        auc = roc_auc_score(y_train, -df_train[i])

    # Store plain Python floats so the displayed result is not cluttered with numpy reprs.
    auc_score[i] = float(auc)

# Final value: list of (feature, auc) pairs, highest AUC first.
auc_score = sorted(auc_score.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Display the (feature, AUC) pairs sorted by AUC in descending order.
auc_score

**Question 2: Training the model**
* Apply one-hot-encoding using DictVectorizer and train the logistic regression with these parameters:

* LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
* What's the AUC of this model on the validation dataset? (round to 3 digits)

In [None]:
# Drop the target from the numerical feature list so it is never used as a model input.
# Rebuilding the list (instead of list.remove) keeps the cell idempotent:
# re-running it no longer raises ValueError once 'converted' is already gone.
num_var = [col for col in num_var if col != 'converted']

In [None]:
# One-hot encode the training features: each row becomes a dict, and the
# DictVectorizer turns those dicts into a dense feature matrix.
train_dict = df_train[[*cat_var, *num_var]].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit(train_dict).transform(train_dict)

In [None]:
# Q2 model: logistic regression with the parameters given in the question,
# fitted on the one-hot encoded training matrix.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, so the feature columns match X_train).
val_dict = df_val[[*cat_var, *num_var]].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [None]:
# Score the validation set with the predicted probability of the positive class.
# ROC AUC ranks examples by score, so the hard 0/1 labels returned by `predict`
# would reduce the curve to a single operating point and distort the AUC.
yval_preds = model.predict_proba(X_val)[:, 1]

In [None]:
# Validation ROC AUC computed from `yval_preds`.
auc = roc_auc_score(y_val, yval_preds)

# Show as a plain float rounded to 3 digits, as the question asks.
round(float(auc), 3)

In [None]:
# False-positive rate, true-positive rate and the matching score thresholds for the ROC plot.
# Note: `thresholds` is reassigned later by the precision/recall cells.
fpr, tpr, thresholds = roc_curve(y_val, yval_preds)

In [None]:
# ROC curve of the model, with the diagonal as the random-classifier reference.
fig, ax = plt.subplots(figsize=(5, 5))

ax.plot(fpr, tpr, label='model')
ax.plot([0, 1], [0, 1], linestyle='--', label='random')

ax.set_xlabel('FPR')
ax.set_ylabel('TPR')

ax.legend()

**Question 3: Precision and Recall**
* Now let's compute precision and recall for our model.

* Evaluate the model on all thresholds from 0.0 to 1.0 with step 0.01
* For each threshold, compute precision and recall

* Plot them. At which threshold do the precision and recall curves intersect?

In [None]:
# Preview of the threshold grid used below: 0.00 to 1.00 in steps of 0.01.
# (The stop value was 11, which produced a grid running up to ~11 instead of 1.)
np.arange(0.0, 1.01, 0.01)

In [None]:
# Q3: sweep the decision threshold and record precision and recall at each step.
# The model is refit with the same parameters as in Q2 so the cell is self-contained.
model = LogisticRegression(solver='liblinear', C=1, max_iter=1000)
model.fit(X_train, y_train)
yval_probs = model.predict_proba(X_val)[:, 1]  # probability of the positive class

# Thresholds from 0.00 to 1.00 with step 0.01 (this rebinds the ROC `thresholds`).
thresholds = np.arange(0.0, 1.01, 0.01)
precisions, recalls = [], []

for t in thresholds:
    # Use a separate name for the thresholded labels so `yval_preds` (the scores
    # behind the ROC/AUC cells) is not overwritten with 0/1 labels.
    yval_labels = (yval_probs >= t).astype(int)
    # zero_division=0 makes the "no positive predictions" case explicit instead of
    # relying on the notebook-wide warning filter to hide the undefined-metric warning.
    precisions.append(precision_score(y_val, yval_labels, zero_division=0))
    recalls.append(recall_score(y_val, yval_labels, zero_division=0))

In [None]:
# Precision and recall as functions of the threshold.
# The curve labels were set but never shown; add the legend, axis labels and a title
# so the figure can be read on its own.
plt.figure(figsize=(8,5))
plt.plot(thresholds, precisions, label='Precision')
plt.plot(thresholds, recalls, label='Recall')

plt.xlabel('Probability Threshold')
plt.ylabel('Score')
plt.title('Precision and Recall vs Threshold')
plt.legend()
plt.grid(True)
plt.show()



In [None]:
# Absolute gap between precision and recall at every threshold.
diff = np.abs(np.subtract(precisions, recalls))

# Indices of the thresholds where the two curves are within 0.01 of each other.
np.nonzero(diff < 0.01)

In [None]:
# The curves come within 0.01 of each other at more than one threshold;
# report the first (lowest-threshold) such point.
diff = np.abs(np.asarray(precisions) - np.asarray(recalls))
close_points = np.flatnonzero(diff < 0.01)

# Fall back to the single closest point if nothing is within the 0.01 tolerance.
idx = close_points[0] if len(close_points) > 0 else np.argmin(diff)
best_threshold = thresholds[idx]

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(thresholds, precisions, label='Precision')
ax.plot(thresholds, recalls, label='Recall')
# Mark the selected threshold with a vertical line and a dot on the precision curve.
ax.axvline(best_threshold, color='red', linestyle='--', label=f'First Intersection at {best_threshold:.2f}')
ax.scatter(best_threshold, precisions[idx], color='red')

ax.set_xlabel('Probability Threshold')
ax.set_ylabel('Score')
ax.set_title('Precision and Recall vs Threshold')
ax.legend()
ax.grid(True)
plt.show()

print(f"Lowest intersection threshold = {best_threshold:.2f}")
print(f"Precision = {precisions[idx]:.3f}, Recall = {recalls[idx]:.3f}")



### Question 4: F₁ score

Precision and recall are often in conflict — when one grows, the other tends to go down.  
To balance both we use the **F₁ score**, the harmonic mean of precision (**P**) and recall (**R**):

$$
F_1 = \frac{2 \cdot P \cdot R}{P + R}
$$

Where:  
- **P** = Precision  
- **R** = Recall  

Compute **F₁** for all thresholds from **0.00** to **1.00** (step **0.01**).  
Which threshold gives the **maximum F₁**?


In [None]:
# Q4: F1 score at every threshold from 0.00 to 1.00 (step 0.01).
# The model is refit with the same parameters as in Q2 so the cell is self-contained.
model = LogisticRegression(solver='liblinear', C=1, max_iter=1000)
model.fit(X_train, y_train)
yval_probs = model.predict_proba(X_val)[:, 1]  # probability of the positive class

thresholds = np.arange(0.0, 1.01, 0.01)
precisions, recalls, f1 = [], [], []

for t in thresholds:
    # Separate name for the thresholded labels so `yval_preds` (the scores behind
    # the ROC/AUC cells) is not overwritten with 0/1 labels.
    yval_labels = (yval_probs >= t).astype(int)
    # zero_division=0 makes the "no positive predictions" case explicit instead of
    # relying on the notebook-wide warning filter.
    precision = precision_score(y_val, yval_labels, zero_division=0)
    recall = recall_score(y_val, yval_labels, zero_division=0)
    precisions.append(precision)
    recalls.append(recall)

    # Harmonic mean of precision and recall; defined as 0 when both are 0
    # to avoid dividing by zero.
    denom = precision + recall
    f1.append(2 * (precision * recall) / denom if denom > 0 else 0.0)

In [None]:
# Locate the threshold with the highest F1 (argmax returns the first one on ties).
f1_scores = np.asarray(f1)
idx = f1_scores.argmax()
best_threshold, best_f1 = thresholds[idx], f1_scores[idx]

print(f"Best threshold: {best_threshold:.2f}, Best F1: {best_f1:.3f}")


### 🧠 Question 5: 5-Fold Cross-Validation

Use the **KFold** class from **Scikit-Learn** to evaluate our model on 5 different folds.

- Use the following parameters for KFold:  
  `KFold(n_splits=5, shuffle=True, random_state=1)`

- Iterate over different folds of `df_full_train`
- Split the data into **train** and **validation** sets
- Train the model on the train set using:  
  `LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)`
- Use **AUC (Area Under the Curve)** to evaluate the model on the validation set

**Question:**  
👉 How large is the **standard deviation** of the AUC scores across the different folds?


In [None]:
# Q5: 5-fold cross-validation on df_train_full; collect one validation AUC per fold.
n_splits = 5

scores = []

kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

for train_idx, val_idx in tqdm(kfold.split(df_train_full), total=n_splits):
    # Fold-local names keep the Q2 split (df_train, df_val, X_train, X_val, model, dv, ...)
    # intact, so earlier cells still give the same results if re-run after this one.
    df_fold_train = df_train_full.iloc[train_idx]
    df_fold_val = df_train_full.iloc[val_idx]

    # df_train_full still contains the target column.
    y_fold_train = df_fold_train.converted.values
    y_fold_val = df_fold_val.converted.values

    # Fit the encoder on the fold's training part only; `cat_var + num_var` excludes
    # the target because 'converted' was removed from num_var earlier.
    fold_dv = DictVectorizer(sparse=False)
    X_fold_train = fold_dv.fit_transform(
        df_fold_train[cat_var + num_var].to_dict(orient='records')
    )

    fold_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    fold_model.fit(X_fold_train, y_fold_train)

    X_fold_val = fold_dv.transform(
        df_fold_val[cat_var + num_var].to_dict(orient='records')
    )
    fold_probs = fold_model.predict_proba(X_fold_val)[:, 1]

    scores.append(roc_auc_score(y_fold_val, fold_probs))

# Report once, after all folds: the print used to sit inside the loop, so it emitted
# partial running statistics (including a std of 0.000 after the first fold).
print("Mean AUC: %.3f ± %.3f" % (np.mean(scores), np.std(scores)))

### ⚙️ Question 6: Hyperparameter Tuning

Now let's use **5-Fold Cross-Validation** to find the best parameter **C**.

- Iterate over the following values of **C**:  
  `[0.000001, 0.001, 1]`
- Initialize **KFold** with the same parameters as before:  
  `KFold(n_splits=5, shuffle=True, random_state=1)`
- Use these parameters for the model:  
  `LogisticRegression(solver='liblinear', C=C, max_iter=1000)`
- For each value of **C**, compute:
  - The **mean AUC score** across all folds  
  - The **standard deviation** of the AUC scores  

(👉 Round both the mean and standard deviation to **3 decimal digits**.)

**Question:**  
Which value of **C** gives the **best mean AUC score**?


In [None]:
# Q6: for each regularisation strength C, run the same 5-fold CV and record
# the mean and standard deviation of the validation AUC.
n_splits = 5

results = []

for C in [0.000001, 0.001, 1]:
    # Same KFold parameters for every C, so all values are scored on identical folds.
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    scores = []

    for train_idx, val_idx in tqdm(kfold.split(df_train_full), total=n_splits):
        # Fold-local names keep the Q2 split (df_train, df_val, X_train, X_val, model, dv, ...)
        # intact, so earlier cells still give the same results if re-run after this one.
        df_fold_train = df_train_full.iloc[train_idx]
        df_fold_val = df_train_full.iloc[val_idx]

        # df_train_full still contains the target column.
        y_fold_train = df_fold_train.converted.values
        y_fold_val = df_fold_val.converted.values

        # Fit the encoder on the fold's training part only.
        fold_dv = DictVectorizer(sparse=False)
        X_fold_train = fold_dv.fit_transform(
            df_fold_train[cat_var + num_var].to_dict(orient='records')
        )

        fold_model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
        fold_model.fit(X_fold_train, y_fold_train)

        X_fold_val = fold_dv.transform(
            df_fold_val[cat_var + num_var].to_dict(orient='records')
        )
        fold_probs = fold_model.predict_proba(X_fold_val)[:, 1]

        scores.append(roc_auc_score(y_fold_val, fold_probs))

    mean_auc = np.mean(scores)
    std_auc = np.std(scores)

    print(f'C={C}  {mean_auc:.3f} ± {std_auc:.3f}')
    # Cast to plain floats (as done for the other AUC values in this notebook) so the
    # displayed list is not cluttered with np.float64(...) reprs.
    results.append({
        'C': C,
        'mean_auc': round(float(mean_auc), 3),
        'std_auc': round(float(std_auc), 3),
    })

results
