In [15]:
import wget
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import joblib

CSV_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

def download_if_needed(path="course_lead_scoring.csv"):
    if not pd.io.common.file_exists(path):
        wget.download(CSV_URL, path)
    return path

csv_path = download_if_needed()
df = pd.read_csv(csv_path)
df.head()
df.isnull().sum()

# For caterogiral features, replace them with 'NA'
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna('NA')
# For numerical features, replace with with 0.0
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(0.0)
# verify if all missing values are replaced
df.isnull().sum().sum()

# 5. Define target and features
target = 'converted'   # converted = 1 if signed up, else 0
y = df[target].values
X = df.drop(columns=[target])

# 6. Shuffle and split: 60% train, 20% val, 20% test (seed=42)
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

n = len(df_shuffled)
train_end = int(0.6 * n)
val_end = train_end + int(0.2 * n)

df_train = df_shuffled.iloc[:train_end].reset_index(drop=True)
df_val   = df_shuffled.iloc[train_end:val_end].reset_index(drop=True)
df_test  = df_shuffled.iloc[val_end:].reset_index(drop=True)

X_train = df_train.drop(columns=[target])
y_train = df_train[target].values
X_val = df_val.drop(columns=[target])
y_val = df_val[target].values
X_test = df_test.drop(columns=[target])
y_test = df_test[target].values

print(len(X_train), len(X_val), len(X_test))

# 7. Vectorize features (DictVectorizer)
dv = DictVectorizer(sparse=False)

# convert DF -> list of dicts
train_dicts = X_train.to_dict(orient='records')
X_train_vec = dv.fit_transform(train_dicts)

val_dicts = X_val.to_dict(orient='records')
X_val_vec = dv.transform(val_dicts)

test_dicts = X_test.to_dict(orient='records')
X_test_vec = dv.transform(test_dicts)

# 8. Train Logistic Regression
model = LogisticRegression(C=1.0, solver='liblinear', max_iter=1000, random_state=42)
model.fit(X_train_vec, y_train)

# 9. Predict on validation set and compute accuracy
y_val_pred = model.predict(X_val_vec)
acc_val = accuracy_score(y_val, y_val_pred)
print("Validation accuracy:", round(acc_val, 2))   # expect ~0.84
print("\nConfusion matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\nClassification report:\n", classification_report(y_val, y_val_pred, digits=3))

# 10. Final evaluation on test set (optional)
y_test_pred = model.predict(X_test_vec)
acc_test = accuracy_score(y_test, y_test_pred)
print("Test accuracy:", round(acc_test, 2))

# 11. Save model + vectorizer for serving
joblib.dump(dv, "dict_vectorizer.joblib")
joblib.dump(model, "logreg_model.joblib")
print("Saved dv and model.")


877 292 293
Validation accuracy: 0.72

Confusion matrix:
 [[ 38  78]
 [  4 172]]

Classification report:
               precision    recall  f1-score   support

           0      0.905     0.328     0.481       116
           1      0.688     0.977     0.808       176

    accuracy                          0.719       292
   macro avg      0.796     0.652     0.644       292
weighted avg      0.774     0.719     0.678       292

Test accuracy: 0.75
Saved dv and model.


Q1: What is the most frequent observation (mode) for the column industry?

In [8]:
df['industry'].mode()[0]

'retail'

In [27]:
corr=df.corr(numeric_only=True)
corr_pairs = {
    "interaction_count and lead_score": corr.loc["interaction_count", "lead_score"],
    "number_of_courses_viewed and lead_score": corr.loc["number_of_courses_viewed", "lead_score"],
    "number_of_courses_viewed and interaction_count": corr.loc["number_of_courses_viewed", "interaction_count"],
    "annual_income and interaction_count": corr.loc["annual_income", "interaction_count"],
}

corr_pairs

{'interaction_count and lead_score': np.float64(0.009888182496913105),
 'number_of_courses_viewed and lead_score': np.float64(-0.004878998354681265),
 'number_of_courses_viewed and interaction_count': np.float64(-0.023565222882888055),
 'annual_income and interaction_count': np.float64(0.027036472404814396)}

Ans: annual_income and interaction_count

Q2: Question 3. Biggest MI (1 point)

industry

location

lead_source

employment_status

In [11]:
from sklearn.metrics import mutual_info_score
# categorical features
cat_features = ["industry", "location", "lead_source", "employment_status"]

# target variable
target = df["converted"]

# compute mutual information for each categorical feature
mi_scores = {col: mutual_info_score(df[col], target) for col in cat_features}

mi_scores

{'industry': 0.007267435279688886,
 'location': 0.0014269064526338665,
 'lead_source': 0.026573987738060995,
 'employment_status': 0.011069560968622674}

In [17]:
df.info()
df.describe()
df['industry'].value_counts()
df['employment_status'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1462 non-null   object 
 1   industry                  1462 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1462 non-null   float64
 4   employment_status         1462 non-null   object 
 5   location                  1462 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


employment_status
self_employed    352
student          348
unemployed       334
employed         328
NA               100
Name: count, dtype: int64

In [20]:
df_encoded = pd.get_dummies(df, columns=['industry', 'employment_status'], drop_first=True)


In [28]:
from sklearn.feature_selection import mutual_info_classif

X = df_encoded.drop('target', axis=1)
y = df_encoded['target']
print(df_encoded.columns)

mi_scores = mutual_info_classif(X, y)
mi_scores = pd.Series(mi_scores, index=
X = df_encoded.drop('converted', axis=1)  # features
y = df_encoded['converted']               # target.columns)
mi_scores.sort_values(ascending=False)

SyntaxError: '(' was never closed (1072800364.py, line 8)

Ans: lead_time