<a href="https://colab.research.google.com/github/bhung-chung/ML-Zoomcamp/blob/main/Homework3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
# Install additional libraries if needed
!pip install scikit-learn pandas numpy matplotlib seaborn

# Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation or convergence
# warnings raised by later cells will not be shown — consider narrowing it.
warnings.filterwarnings('ignore')

# Confirmation message so the reader can see the setup cell ran to the end.
print("✅ All libraries imported successfully!")


✅ All libraries imported successfully!


In [20]:
# Read the lead-scoring CSV straight from its raw GitHub URL into a DataFrame
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)

# Sanity report: dimensions, column names and a preview of the first rows
print("Dataset loaded successfully!")
print("Dataset shape:", df.shape)
print()
print("Column names:", list(df.columns))
print()
print("First 5 rows:")
print(df.head())



Dataset loaded successfully!
Dataset shape: (1462, 9)

Column names: ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count', 'lead_score', 'converted']

First 5 rows:
    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3            

In [21]:
print("=== DATA PREPARATION ===")
print("=" * 30)

# Report the per-column count of missing values before anything is changed
print("Missing values before cleaning:")
print(df.isna().sum())

# Missing values are replaced with fixed placeholders, per the homework instructions
print("\nHandling missing values...")

# Object-dtype (text) columns: a missing entry becomes the literal string 'NA'
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].fillna('NA')
    print(f"   • {col}: Filled missing values with 'NA'")

# float64 / int64 columns: a missing entry becomes 0.0
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_cols:
    if col == 'converted':
        # The target variable is deliberately left untouched
        continue
    df[col] = df[col].fillna(0.0)

# Re-run the same report to confirm nothing is missing any more
print("\nMissing values after cleaning:")
print(df.isna().sum())
print("Data preparation completed!")


=== DATA PREPARATION ===
Missing values before cleaning:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

Handling missing values...
   • lead_source: Filled missing values with 'NA'
   • industry: Filled missing values with 'NA'
   • employment_status: Filled missing values with 'NA'
   • location: Filled missing values with 'NA'

Missing values after cleaning:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64
Data preparation completed!


In [22]:
# Homework banner
print("", "=" * 50, sep="\n")
print("MACHINE LEARNING ZOOMCAMP 2025 - HOMEWORK 3")
print("=" * 50)

# QUESTION 1: which value of `industry` occurs most often?
print("\nQUESTION 1: Most frequent industry", "-" * 40, sep="\n")
industry_counts = df['industry'].value_counts()
industry_mode = df['industry'].mode()
print("Industry value counts:")
print(industry_counts)
# mode() returns a Series (there may be several modes); report its first entry
print(f"\nAnswer: {industry_mode.iloc[0]}")

# QUESTION 2: which of the listed numerical pairs is most strongly correlated?
print("\nQUESTION 2: Feature correlation", "-" * 40, sep="\n")
numerical_features = ['interaction_count', 'lead_score', 'number_of_courses_viewed', 'annual_income']
correlation_matrix = df[numerical_features].corr()

print("Correlation matrix:")
print(correlation_matrix.round(4))

# Candidate pairs to compare
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

print("\nSpecific pair correlations:")
# Track the running maximum of the absolute correlation and the pair that produced it
max_corr, max_pair = 0, None

for pair in pairs:
    first, second = pair
    # Strength is compared by magnitude, so the sign of the correlation is dropped
    corr_val = abs(correlation_matrix.loc[first, second])
    print(f"{first} and {second}: {corr_val:.4f}")
    # Strict '>' keeps the earliest pair when two pairs tie
    if corr_val > max_corr:
        max_corr, max_pair = corr_val, pair

print(f"\nAnswer: {max_pair[0]} and {max_pair[1]} (correlation: {max_corr:.4f})")


# Split the data into features (everything except the target) and target
print("\nDATA SPLITTING")
print("-" * 40)
X = df.drop('converted', axis=1)
y = df['converted']

# 60%/20%/20% split done in two steps:
#   1) hold out 20% of all rows as the test set;
#   2) take 25% of the remaining 80% (= 20% of the total) as the validation set.
# Both steps use the same seed and stratify on the target.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp)

# The three parts must partition the dataset exactly
assert len(X_train) + len(X_val) + len(X_test) == len(df)

# Report row counts (not the full shape tuple) next to the "samples" label
print(f"Training set: {X_train.shape[0]} samples ({X_train.shape[0]/len(df)*100:.1f}%)")
print(f"Validation set: {X_val.shape[0]} samples ({X_val.shape[0]/len(df)*100:.1f}%)")
print(f"Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(df)*100:.1f}%)")

# QUESTION 3: which categorical feature shares the most information with the target?
print("\nQUESTION 3: Mutual information analysis")
print("-" * 40)
categorical_features = ['industry', 'location', 'lead_source', 'employment_status']
mutual_info_scores = {}   # feature -> score rounded to 2 decimals (for reporting)
mutual_info_raw = {}      # feature -> unrounded score (for picking the winner)

for col in categorical_features:
    # mutual_info_classif needs numeric input, so map each category to an integer code
    le = LabelEncoder()
    X_train_encoded = le.fit_transform(X_train[col]).reshape(-1, 1)
    # discrete_features=True is required here: with the default 'auto', a dense
    # array is treated as continuous, which is wrong for arbitrary integer
    # category codes.
    mi_score = mutual_info_classif(
        X_train_encoded, y_train, discrete_features=True, random_state=42
    )[0]  # single-column input -> take the one score
    mutual_info_raw[col] = mi_score
    mutual_info_scores[col] = round(mi_score, 2)
    print(f"   • {col}: {mutual_info_scores[col]}")

# Choose the winner on the raw scores so that rounding cannot create artificial ties
max_mi_var = max(mutual_info_raw, key=mutual_info_raw.get)
print(f"\nAnswer: {max_mi_var} (MI score: {mutual_info_scores[max_mi_var]})")

# QUESTION 4: validation accuracy of a logistic regression on all features
print("\nQUESTION 4: Logistic regression model", "-" * 40, sep="\n")

# Preprocessing: numerical columns go through unchanged, categorical columns are
# one-hot encoded with the first level of each dropped
preprocessor = ColumnTransformer(transformers=[
    ('num', 'passthrough', numerical_features),
    ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features),
])

# Full pipeline: preprocessing followed by the classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)),
])

# Fit on the training split only
print("Training logistic regression model...")
model.fit(X_train, y_train)

# Score on the validation split; keep both the exact and the 2-decimal value
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_accuracy_rounded = round(val_accuracy, 2)

print(f"Validation accuracy: {val_accuracy:.4f}")
print(f"Answer: {val_accuracy_rounded}")

# Baseline that Question 5 compares the reduced models against
original_accuracy = val_accuracy

# QUESTION 5: how much does validation accuracy change when one feature is removed?
print("\nQUESTION 5: Feature importance analysis")
print("-" * 40)


def _make_preprocessor(numerical, categorical):
    """Build the ColumnTransformer for the given column lists.

    Numerical columns are passed through unchanged; categorical columns are
    one-hot encoded with the first level dropped. A transformer is only added
    for a non-empty column list.
    """
    transformers = []
    if numerical:
        transformers.append(('num', 'passthrough', numerical))
    if categorical:
        transformers.append(('cat', OneHotEncoder(drop='first', sparse_output=False), categorical))
    return ColumnTransformer(transformers=transformers)


features_to_test = ['industry', 'employment_status', 'lead_score']
feature_differences = {}

for feature in features_to_test:
    print(f"Testing removal of: {feature}")

    # Drop the feature from whichever list contains it; the other list is unchanged
    remaining_categorical = [name for name in categorical_features if name != feature]
    remaining_numerical = [name for name in numerical_features if name != feature]

    # Same model settings as the full model, minus the removed feature
    preprocessor_reduced = _make_preprocessor(remaining_numerical, remaining_categorical)
    model_reduced = Pipeline([
        ('preprocessor', preprocessor_reduced),
        ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
    ])

    # Train and predict using only the columns that are still in play
    kept_columns = remaining_numerical + remaining_categorical
    X_train_reduced = X_train[kept_columns]
    model_reduced.fit(X_train_reduced, y_train)

    X_val_reduced = X_val[kept_columns]
    y_val_pred_reduced = model_reduced.predict(X_val_reduced)
    accuracy_reduced = accuracy_score(y_val, y_val_pred_reduced)

    # Signed difference: positive means accuracy dropped without the feature
    difference = original_accuracy - accuracy_reduced
    feature_differences[feature] = difference

    print(f"   Accuracy without {feature}: {accuracy_reduced:.4f}")
    print(f"   Difference: {difference:.4f}")

# Pick the smallest *signed* difference, i.e. the feature whose removal leaves
# the highest validation accuracy
min_diff_feature = min(feature_differences, key=feature_differences.get)
print(f"\nAnswer: {min_diff_feature} (difference: {feature_differences[min_diff_feature]:.4f})")

# QUESTION 6: which regularization strength C gives the best validation accuracy?
print("\nQUESTION 6: Regularization analysis", "-" * 40, sep="\n")

C_values = [0.01, 0.1, 1, 10, 100]
C_results = {}  # C -> validation accuracy rounded to 3 decimals

for C in C_values:
    print(f"Testing C = {C}")

    # Same pipeline as the full model; only the classifier's C changes
    model_reg = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)),
    ])

    # fit() returns the pipeline itself, so training and prediction can be chained
    y_val_pred_reg = model_reg.fit(X_train, y_train).predict(X_val)
    accuracy_reg = accuracy_score(y_val, y_val_pred_reg)
    accuracy_reg_rounded = round(accuracy_reg, 3)

    C_results[C] = accuracy_reg_rounded
    print(f"   Accuracy: {accuracy_reg_rounded}")

# Best rounded accuracy; when several C values reach it, the smallest C wins
best_accuracy = max(C_results.values())
best_C_values = [candidate for candidate in C_values if C_results[candidate] == best_accuracy]
best_C = min(best_C_values)

print(f"\nBest accuracy: {best_accuracy}")
print(f"Answer: {best_C}")


MACHINE LEARNING ZOOMCAMP 2025 - HOMEWORK 3

QUESTION 1: Most frequent industry
----------------------------------------
Industry value counts:
industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

Answer: retail

QUESTION 2: Feature correlation
----------------------------------------
Correlation matrix:
                          interaction_count  lead_score  \
interaction_count                    1.0000      0.0099   
lead_score                           0.0099      1.0000   
number_of_courses_viewed            -0.0236     -0.0049   
annual_income                        0.0270      0.0156   

                          number_of_courses_viewed  annual_income  
interaction_count                          -0.0236         0.0270  
lead_score                                 -0.0049         0.0156  
number_of_courses_viewed                  