<a href="https://colab.research.google.com/github/ahmed123234/zoomcamp-ML/blob/main/homework-03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
import pandas as pd

!wget -O course_lead_scoring.csv https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-15 09:51:33--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-15 09:51:33 (13.2 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [66]:
# --- 1. Load the Dataset ---
df = pd.read_csv('course_lead_scoring.csv')

# Clean up column names by lowercasing and replacing spaces
df.columns = df.columns.str.lower().str.replace(' ', '_')

print("--- Initial Dataset Load and Inspection ---")
print(f"Dataset size: {len(df)} rows, {len(df.columns)} columns")
print("\nFirst 5 rows:")
print(df.head())
print("-" * 40)

# --- 2. Identify Feature Types and Check Missing Values ---
numerical_features = ['annual_income', 'number_of_courses_viewed', 'interaction_count', 'lead_score']
categorical_features = ['industry', 'location', 'lead_source', 'employment_status']
target = 'converted'

print("Missing values BEFORE imputation:")
print(df.isnull().sum()[df.isnull().sum() > 0])
print("-" * 40)

# --- 3. Data Preparation: Handle Missing Values ---

# Impute numerical features with 0.0
for col in numerical_features:
    df[col] = df[col].fillna(0.0)

# Impute categorical features with 'NA'
for col in categorical_features:
    df[col] = df[col].fillna('NA')

print("Missing values AFTER imputation:")
print(df.isnull().sum()[df.isnull().sum() > 0])

# Verify data types and show clean data
print("-" * 40)
print("Data types check (Target is not yet separated):")
print(df.dtypes)

print("-" * 40)
print("Step 1 Complete: Data is loaded and missing values have been handled.")


--- Initial Dataset Load and Inspection ---
Dataset size: 1462 rows, 9 columns

First 5 rows:
    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3               NaN      australia                  1        0.87          0  
4     self_employed         europe                  3        0.62          1  

In [67]:
# --- Question 1 Calculation: 1: The Mode of 'industry' ---
industry_mode = df['industry'].mode().iloc[0]

print(f"The total number of observations in 'industry' is: {len(df)}")
print(f"The most frequent observation (mode) for the column 'industry' is: '{industry_mode}'")


The total number of observations in 'industry' is: 1462
The most frequent observation (mode) for the column 'industry' is: 'retail'


In [68]:
# --- Question 2: Correlation Matrix ---
print("--- Question 2 ---")
# Calculate correlation matrix for numerical features
corr_matrix = df[numerical_features].corr()

# Extract the required correlation pairs
pairs = {
    'interaction_count vs lead_score': corr_matrix.loc['interaction_count', 'lead_score'],
    'number_of_courses_viewed vs lead_score': corr_matrix.loc['number_of_courses_viewed', 'lead_score'],
    'number_of_courses_viewed vs interaction_count': corr_matrix.loc['number_of_courses_viewed', 'interaction_count'],
    'annual_income vs interaction_count': corr_matrix.loc['annual_income', 'interaction_count']
}

# Find the pair with the biggest absolute correlation
max_corr_pair = max(pairs.items(), key=lambda item: abs(item[1]))

print("Correlation Pairs:")
for pair, corr in pairs.items():
    print(f"  {pair}: {corr:.4f}")
print(f"The two features with the biggest correlation (absolute value) are: {max_corr_pair[0]} ({max_corr_pair[1]:.4f})")
# The biggest correlation is between 'number_of_courses_viewed' and 'interaction_count'
print("-" * 20)


--- Question 2 ---
Correlation Pairs:
  interaction_count vs lead_score: 0.0099
  number_of_courses_viewed vs lead_score: -0.0049
  number_of_courses_viewed vs interaction_count: -0.0236
  annual_income vs interaction_count: 0.0270
The two features with the biggest correlation (absolute value) are: annual_income vs interaction_count (0.0270)
--------------------


In [69]:
from sklearn.model_selection import train_test_split

# --- Split Data into Train (60%), Validation (20%), and Test (20%) ---

print("--- Data Splitting ---")

# Split into Train (80%) and test (20%)
df_full_train, df_test =  train_test_split(df, test_size=0.2, random_state=42)

# Split Train (80%) into Validation (20%) and Train (60%)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)


df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# determine the y value for the target coulmn
TARGET = 'converted'
y_train = df_train[TARGET].values
y_val = df_val[TARGET].values
y_test = df_test[TARGET].values

# del the y value from the trained data
del df_train[TARGET]
del df_val[TARGET]
del df_test[TARGET]



# --- 4. Verification and Output ---
print("--- Data Split Verification ---")
print(f"Total dataset size: {len(df)} rows")
print("-" * 35)
print(f"Training set size (60%): {len(df_train)} rows")
print(f"Validation set size (20%): {len(df_val)} rows")
print(f"Test set size (20%): {len(df_test)} rows")
print("-" * 35)
print("Data split successfully into 60/20/20 distribution.")


--- Data Splitting ---
--- Data Split Verification ---
Total dataset size: 1462 rows
-----------------------------------
Training set size (60%): 876 rows
Validation set size (20%): 293 rows
Test set size (20%): 293 rows
-----------------------------------
Data split successfully into 60/20/20 distribution.


In [70]:
# --- Question 3: Mutual Information ---
from sklearn.metrics import mutual_info_score

def mutual_info_score_binary(series):
    return mutual_info_score(series, y_train)

# apply for every categorical_features
mi = df_train[categorical_features].apply(mutual_info_score_binary)
mi = round(mi.sort_values(ascending=False), 2)

print("--- Mutual Information Scores ---")
print(mi)
print("-" * 35)

# find the biggest matual information feature
print(f"The feature with the biggest mutual information is: {mi.index[0]}")


--- Mutual Information Scores ---
lead_source          0.04
employment_status    0.01
industry             0.01
location             0.00
dtype: float64
-----------------------------------
The feature with the biggest mutual information is: lead_source


In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score

# --- Question 4: Logistic Regression Baseline ---

print("--- Logistic Regression Baseline ---")

# --- Feature Preparation for Model Training ---
dv = DictVectorizer(sparse=False)
train_dicts = df_train[categorical_features + numerical_features].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical_features + numerical_features].to_dict(orient='records')
X_val = dv.transform(val_dicts)


# --- Model Training ---
print("--- Logistic Regression Training ---")

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
# y_pred_val = model.predict_proba(X_val)[:, 1]
# accuracy_baseline = accuracy_score(y_val, (y_pred_val >= 0.5))
# accuracy_q4 = round(accuracy_baseline, 2)
# print(f"The accuracy of the model on the validation dataset is: {accuracy_q4}")

# y_pred = y_pred_val >= 0.5
# print(round((y_pred == y_val).mean(), 2))

accuracy = round(model.score(X_val, y_val), 2)
print(f"The accuracy of the model on the validation dataset is: {accuracy}")

--- Logistic Regression Baseline ---
--- Logistic Regression Training ---
The accuracy of the model on the validation dataset is: 0.7


In [72]:
# print("Processed training data shape:", X_train_processed.shape)
print("Processed training data shape:", X_train.shape)
df_train.lead_score

Processed training data shape: (876, 31)


Unnamed: 0,lead_score
0,0.03
1,0.77
2,0.59
3,0.34
4,0.98
...,...
871,0.33
872,0.18
873,0.75
874,0.65


In [73]:
from sklearn.metrics import accuracy_score

# --- Question 5: Feature Elimination ---

print("--- Question 5: Feature Elimination ---")

features_to_eliminate = ['industry', 'employment_status', 'lead_score']
baseline_accuracy = model.score(X_val, y_val)
print(f"Baseline accuracy: {baseline_accuracy}")

accuracy_differences = {}

for feature in features_to_eliminate:
    # Create modified datasets by excluding the current feature
    train_dicts_modified = df_train.drop(columns=[feature]).to_dict(orient='records')
    X_train_modified = dv.fit_transform(train_dicts_modified)
    val_dicts_modified = df_val.drop(columns=[feature]).to_dict(orient='records')
    X_val_modified = dv.transform(val_dicts_modified)

    # Train a new model without the current feature
    model_modified = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_modified.fit(X_train_modified, y_train)

    # Calculate accuracy on the modified validation data
    # y_pred_val_modified = model_modified.predict(X_val_modified)
    # accuracy_modified = accuracy_score(y_val, y_pred_val_modified)
    accuracy_modified = model_modified.score(X_val_modified, y_val)

    # Calculate the accuracy difference
    accuracy_difference = baseline_accuracy - accuracy_modified
    accuracy_differences[feature] = accuracy_difference

    print(f"Accuracy without '{feature}': {accuracy_modified}, Difference: {accuracy_difference}")

# Find the feature with the smallest accuracy difference
feature_with_smallest_difference = min(accuracy_differences, key=accuracy_differences.get)

print(f"\nThe feature with the smallest accuracy difference is: '{feature_with_smallest_difference}'")



--- Question 5: Feature Elimination ---
Baseline accuracy: 0.6996587030716723
Accuracy without 'industry': 0.6996587030716723, Difference: 0.0
Accuracy without 'employment_status': 0.6962457337883959, Difference: 0.0034129692832763903
Accuracy without 'lead_score': 0.7064846416382252, Difference: -0.0068259385665528916

The feature with the smallest accuracy difference is: 'lead_score'


In [74]:
# --- Question 6: Regularized Logistic Regression (Tuning C) ---

print("--- Question 6: Regularized Logistic Regression (Tuning C) ---")

c_parameters = [0.01, 0.1, 1, 10, 100]

# initilize the accuracy_dict
accuracy_dict = {}

# train the model with different parameter as c value
for c in c_parameters:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    accuracy = round(model.score(X_val, y_val), 3)
    # print(f"Accuracy with C={parameter}: {accuracy}")
    print(f"C={c}: Validation Accuracy (rounded to 3 decimals): {accuracy}")


    accuracy_dict[c] = accuracy

# find the max one
max_accuracy = max(accuracy_dict, key=accuracy_dict.get)

print(f"The parameter with the biggest accuracy is: {max_accuracy}")

--- Question 6: Regularized Logistic Regression (Tuning C) ---
C=0.01: Validation Accuracy (rounded to 3 decimals): 0.7
C=0.1: Validation Accuracy (rounded to 3 decimals): 0.7
C=1: Validation Accuracy (rounded to 3 decimals): 0.7
C=10: Validation Accuracy (rounded to 3 decimals): 0.7
C=100: Validation Accuracy (rounded to 3 decimals): 0.7
The parameter with the biggest accuracy is: 0.01
