In [59]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [60]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

In [61]:
import random

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

from collections import Counter
from tqdm.auto import tqdm

In [62]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

!wget $data

--2025-10-18 19:36:30--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.1’


2025-10-18 19:36:30 (8.28 MB/s) - ‘course_lead_scoring.csv.1’ saved [80876/80876]



# Data Preparation

In [63]:
# Load the downloaded CSV into a DataFrame and preview its first rows.
df = pd.read_csv(
    'course_lead_scoring.csv',
)

df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [64]:
# The column names already look lower-case and snake_case, but normalize them
# anyway: lower-case everything and replace spaces with underscores.

df.columns = df.columns.str.lower().str.replace(' ', '_')

In [65]:
# Preview the frame again after normalizing the column names.
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [66]:
# Are there any null or missing values? Count them per column.
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [67]:
# Inspect the dtypes to tell categorical (object) columns from numerical ones.
# Expected grouping, kept here for reference:
# categorical = ['lead_source', 'industry', 'employment_status', 'location']
# numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [93]:
# Replace missing categorical values with 'na' and missing numeric values with 0.

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].fillna('na').str.lower()

# Coerce any non-numeric income entries to NaN so they are zero-filled below.
df['annual_income'] = pd.to_numeric(df['annual_income'], errors='coerce')

# Zero-fill every numeric feature, not only annual_income, so the cell does what
# its header comment says. The target column `converted` is deliberately left
# untouched: a missing label must not silently become a negative example.
numeric_columns = [
    col for col in df.columns
    if col not in categorical_columns and col != 'converted'
]
df[numeric_columns] = df[numeric_columns].fillna(0)

In [94]:
# Confirm that no missing values remain after the fill step above
# (every per-column count should be 0).

df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [95]:
# We need to split df into train, validation, and test with a 60%/20%/20% distribution.
# Two-step split: test_size=0.2 of the full frame, then test_size=0.25 of the
# remaining 80% (0.25 * 0.8 = 0.2 of the total) for validation.
# With 1462 rows this gives 876 / 293 / 293.
len(df)

1462

In [120]:
# Split the data into 3 parts: train/validation/test with 60%/20%/20% distribution.
# 20% is held out for test first; 25% of the remaining 80% (= 20% overall) becomes validation.

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Give each part a fresh 0..n-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# pop() removes the target column from the frame and returns it, so the label
# vector is extracted and the feature frame stripped in a single step.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

In [97]:
# Row counts of the train / validation / test parts, in that order.
tuple(map(len, (df_train, df_val, df_test)))

(876, 293, 293)

In [98]:
# Double-check everything looks right: the target column should be gone
# and the index should restart at 0.
df_train.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
0,events,manufacturing,2,95543.0,unemployed,europe,3,0.78
1,referral,na,1,54924.0,student,south_america,6,0.39
2,organic_search,healthcare,2,77352.0,unemployed,europe,2,0.22
3,paid_ads,other,2,34600.0,employed,south_america,2,0.31
4,paid_ads,education,0,43615.0,unemployed,south_america,2,0.01


# Question 1

In [113]:
# Numeric columns evaluated one at a time as stand-alone predictors.
numeric_features = [
    'lead_score',
    'number_of_courses_viewed',
    'interaction_count',
    'annual_income',
]

['lead_score',
 'number_of_courses_viewed',
 'interaction_count',
 'annual_income']

In [100]:
# compute AUC for a single feature, with inversion if needed

def auc_with_possible_inversion(scores, y_true):
    """Compute ROC AUC for a score vector, negating the scores if they rank backwards.

    Args:
        scores: array-like of numeric scores, one per sample.
        y_true: array-like of binary labels aligned with ``scores``.

    Returns:
        (final_auc, inverted_flag): ``final_auc`` is the AUC of ``scores``, or of
        ``-scores`` when the raw AUC is below 0.5; ``inverted_flag`` is True only
        in that second case. A constant score vector yields ``(0.5, False)``.

    Raises:
        ValueError: if ``scores`` is empty.
    """
    s = np.asarray(scores)
    y = np.asarray(y_true)

    # Guard explicitly: indexing s[0] on an empty array would otherwise raise
    # an opaque IndexError.
    if s.size == 0:
        raise ValueError("scores must contain at least one value")

    # A constant score cannot rank samples, so report chance level.
    if np.all(s == s[0]):
        return 0.5, False

    # Named `score_auc` so the `auc` function imported from sklearn.metrics
    # is not shadowed inside this function.
    score_auc = roc_auc_score(y, s)
    if score_auc < 0.5:
        return roc_auc_score(y, -s), True
    return score_auc, False


In [101]:
# Compute per-feature AUCs on the training set only.
# The per-feature value is bound to `feature_auc` rather than `auc`, so this
# cell no longer overwrites the `auc` function imported from sklearn.metrics.

records = []
for col in numeric_features:
    scores = df_train[col].values
    feature_auc, inverted = auc_with_possible_inversion(scores, y_train)
    records.append({
        'feature': col,
        'auc': feature_auc,
        'inverted': inverted,
    })

# Rank features from most to least predictive.
results_df = pd.DataFrame(records).sort_values('auc', ascending=False).reset_index(drop=True)
results_df

Unnamed: 0,feature,auc,inverted
0,number_of_courses_viewed,0.763568,False
1,interaction_count,0.73827,False
2,lead_score,0.614499,False
3,annual_income,0.551958,False


# Question 2

In [102]:
# Categorical feature names (the object-dtype columns collected during data preparation).
categorical_columns

['lead_source', 'industry', 'employment_status', 'location']

In [103]:
# Numeric feature names (the same list used for the per-feature AUC ranking).
numeric_features

['lead_score',
 'number_of_courses_viewed',
 'interaction_count',
 'annual_income']

In [121]:
# Turn the train/validation frames into feature matrices via DictVectorizer.
feature_columns = categorical_columns + numeric_features

train_dict = df_train[feature_columns].to_dict(orient='records')
val_dict = df_val[feature_columns].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
# Fit on the training records only; validation is transformed with the fitted vectorizer.
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [116]:
# Fit a logistic regression on the training matrix and score it on validation.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000).fit(X_train, y_train)

# Keep the second predict_proba column as the predicted score.
val_proba = model.predict_proba(X_val)
y_pred = val_proba[:, 1]

# NOTE(review): assigning to `auc` rebinds the name imported from sklearn.metrics;
# a later call to that function would need a re-import or a different variable name.
auc = roc_auc_score(y_val, y_pred)

print('AUC:', auc)
print('AUC (rounded to 3 d.p.):', round(auc, 3))

AUC: 0.8171316268814112
AUC (rounded to 3 d.p.): 0.817
