In [49]:
import wget
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import joblib

CSV_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

def download_if_needed(path="course_lead_scoring.csv"):
    if not pd.io.common.file_exists(path):
        wget.download(CSV_URL, path)
    return path

csv_path = download_if_needed()
df = pd.read_csv(csv_path)
df.head()
df.isnull().sum()

# For caterogiral features, replace them with 'NA'
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna('NA')
# For numerical features, replace with with 0.0
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(0.0)
# verify if all missing values are replaced
df.isnull().sum().sum()

# 5. Define target and features
target = 'converted'   # converted = 1 if signed up, else 0
y = df[target].values
X = df.drop(columns=[target])

# 6. Shuffle and split: 60% train, 20% val, 20% test (seed=42)
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

n = len(df_shuffled)
train_end = int(0.6 * n)
val_end = train_end + int(0.2 * n)

df_train = df_shuffled.iloc[:train_end].reset_index(drop=True)
df_val   = df_shuffled.iloc[train_end:val_end].reset_index(drop=True)
df_test  = df_shuffled.iloc[val_end:].reset_index(drop=True)

X_train = df_train.drop(columns=[target])
y_train = df_train[target].values
X_val = df_val.drop(columns=[target])
y_val = df_val[target].values
X_test = df_test.drop(columns=[target])
y_test = df_test[target].values

print(len(X_train), len(X_val), len(X_test))

# 7. Vectorize features (DictVectorizer)
dv = DictVectorizer(sparse=False)

# convert DF -> list of dicts
train_dicts = X_train.to_dict(orient='records')
X_train_vec = dv.fit_transform(train_dicts)

val_dicts = X_val.to_dict(orient='records')
X_val_vec = dv.transform(val_dicts)

test_dicts = X_test.to_dict(orient='records')
X_test_vec = dv.transform(test_dicts)

# 8. Train Logistic Regression
model = LogisticRegression(C=1.0, solver='liblinear', max_iter=1000, random_state=42)
model.fit(X_train_vec, y_train)

# 9. Predict on validation set and compute accuracy
y_val_pred = model.predict(X_val_vec)
acc_val = accuracy_score(y_val, y_val_pred)
print("Validation accuracy:", round(acc_val, 2))   # expect ~0.84
print("\nConfusion matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\nClassification report:\n", classification_report(y_val, y_val_pred, digits=3))

# 10. Final evaluation on test set (optional)
y_test_pred = model.predict(X_test_vec)
acc_test = accuracy_score(y_test, y_test_pred)
print("Test accuracy:", round(acc_test, 2))

# 11. Save model + vectorizer for serving
joblib.dump(dv, "dict_vectorizer.joblib")
joblib.dump(model, "logreg_model.joblib")
print("Saved dv and model.")


877 292 293
Validation accuracy: 0.72

Confusion matrix:
 [[ 38  78]
 [  4 172]]

Classification report:
               precision    recall  f1-score   support

           0      0.905     0.328     0.481       116
           1      0.688     0.977     0.808       176

    accuracy                          0.719       292
   macro avg      0.796     0.652     0.644       292
weighted avg      0.774     0.719     0.678       292

Test accuracy: 0.75
Saved dv and model.


Q1: What is the most frequent observation (mode) for the column industry?

In [50]:
df['industry'].mode()[0]

'retail'

In [51]:
corr=df.corr(numeric_only=True)
corr_pairs = {
    "interaction_count and lead_score": corr.loc["interaction_count", "lead_score"],
    "number_of_courses_viewed and lead_score": corr.loc["number_of_courses_viewed", "lead_score"],
    "number_of_courses_viewed and interaction_count": corr.loc["number_of_courses_viewed", "interaction_count"],
    "annual_income and interaction_count": corr.loc["annual_income", "interaction_count"],
}

corr_pairs

{'interaction_count and lead_score': np.float64(0.009888182496913105),
 'number_of_courses_viewed and lead_score': np.float64(-0.004878998354681265),
 'number_of_courses_viewed and interaction_count': np.float64(-0.023565222882888055),
 'annual_income and interaction_count': np.float64(0.027036472404814396)}

Ans: annual_income and interaction_count

Q2: Question 3. Biggest MI (1 point)

industry

location

lead_source

employment_status

In [52]:
from sklearn.metrics import mutual_info_score
# categorical features
cat_features = ["industry", "location", "lead_source", "employment_status"]

# target variable
target = df["converted"]

# compute mutual information for each categorical feature
mi_scores = {col: mutual_info_score(df[col], target) for col in cat_features}

mi_scores

{'industry': 0.007267435279688886,
 'location': 0.0014269064526338665,
 'lead_source': 0.026573987738060995,
 'employment_status': 0.011069560968622674}

Ans: lead_source

In [17]:
df.info()
df.describe()
df['industry'].value_counts()
df['employment_status'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1462 non-null   object 
 1   industry                  1462 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1462 non-null   float64
 4   employment_status         1462 non-null   object 
 5   location                  1462 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


employment_status
self_employed    352
student          348
unemployed       334
employed         328
NA               100
Name: count, dtype: int64

In [54]:
df_encoded = pd.get_dummies(df, columns=['industry', 'employment_status'], drop_first=True)


In [55]:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

print(df_encoded.columns.tolist())

target_column = 'converted'

categorical_features = ['industry', 'employment_status']

df_encoded = pd.get_dummies(df[categorical_features + ['lead_score', target_column]], drop_first=True)

X = df_encoded.drop(target_column, axis=1)
y = df_encoded[target_column]

mi_scores = mutual_info_classif(X, y)
mi_scores = pd.Series(mi_scores, index=X.columns)
mi_scores.sort_values(ascending=False)

df_encoded.corr()

['lead_source', 'number_of_courses_viewed', 'annual_income', 'location', 'interaction_count', 'lead_score', 'converted', 'industry_education', 'industry_finance', 'industry_healthcare', 'industry_manufacturing', 'industry_other', 'industry_retail', 'industry_technology', 'employment_status_employed', 'employment_status_self_employed', 'employment_status_student', 'employment_status_unemployed']


Unnamed: 0,lead_score,converted,industry_education,industry_finance,industry_healthcare,industry_manufacturing,industry_other,industry_retail,industry_technology,employment_status_employed,employment_status_self_employed,employment_status_student,employment_status_unemployed
lead_score,1.0,0.193673,0.057794,0.0013,-0.015995,-0.034742,-0.011017,0.044587,-0.017102,-0.041528,-0.02552,-0.033226,0.089943
converted,0.193673,1.0,0.102241,-0.019686,-0.011622,0.036065,-0.006442,-0.027128,-0.037829,0.077532,0.039883,0.038307,-0.136712
industry_education,0.057794,0.102241,1.0,-0.152458,-0.146667,-0.140761,-0.151574,-0.15378,-0.143047,0.000228,-0.028852,-0.007269,0.040385
industry_finance,0.0013,-0.019686,-0.152458,1.0,-0.152458,-0.146319,-0.157559,-0.159853,-0.148696,0.024478,0.050498,-0.035549,-0.041204
industry_healthcare,-0.015995,-0.011622,-0.146667,-0.152458,1.0,-0.140761,-0.151574,-0.15378,-0.143047,-0.048866,-0.024062,0.03601,0.015995
industry_manufacturing,-0.034742,0.036065,-0.140761,-0.146319,-0.140761,1.0,-0.145471,-0.147588,-0.137287,-0.000187,0.005468,0.027691,-0.033967
industry_other,-0.011017,-0.006442,-0.151574,-0.157559,-0.151574,-0.145471,1.0,-0.158926,-0.147833,-0.049937,0.010886,0.018165,0.003647
industry_retail,0.044587,-0.027128,-0.15378,-0.159853,-0.15378,-0.147588,-0.158926,1.0,-0.149985,0.025876,0.023708,-0.015421,0.002939
industry_technology,-0.017102,-0.037829,-0.143047,-0.148696,-0.143047,-0.137287,-0.147833,-0.149985,1.0,0.019215,-0.029758,0.001924,-0.01438
employment_status_employed,-0.041528,0.077532,0.000228,0.024478,-0.048866,-0.000187,-0.049937,0.025876,0.019215,1.0,-0.302859,-0.300592,-0.29265


In [56]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X, y)
importances = pd.Series(model.feature_importances_, index=X.columns)
importances.sort_values(ascending=False)

lead_score                         0.834383
employment_status_unemployed       0.025211
industry_education                 0.017421
employment_status_employed         0.016654
employment_status_student          0.015215
employment_status_self_employed    0.014105
industry_technology                0.013476
industry_finance                   0.013376
industry_retail                    0.013371
industry_other                     0.012490
industry_healthcare                0.012380
industry_manufacturing             0.011918
dtype: float64

In [57]:
from sklearn.feature_selection import RFE

rfe = RFE(model, n_features_to_select=2)
rfe.fit(X, y)
selected_features = X.columns[rfe.support_]

Q-6: parameter tuning

Options: 0.01, 0.1, 1, 10, 100

In [58]:
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

import pandas as pd

categorical_features = ['industry', 'employment_status']
X = pd.get_dummies(df[categorical_features + ['lead_score']], drop_first=True)
y = df['converted']  # replace with your actual target

print(X.isnull().sum())
print(y.isnull().sum())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid = GridSearchCV(SVC(), param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)

lead_score                         0
industry_education                 0
industry_finance                   0
industry_healthcare                0
industry_manufacturing             0
industry_other                     0
industry_retail                    0
industry_technology                0
employment_status_employed         0
employment_status_self_employed    0
employment_status_student          0
employment_status_unemployed       0
dtype: int64
0
{'C': 10}


Ans: 10

In [59]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = grid.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Test Accuracy: 0.5802047781569966
              precision    recall  f1-score   support

           0       0.35      0.31      0.33        98
           1       0.67      0.72      0.69       195

    accuracy                           0.58       293
   macro avg       0.51      0.51      0.51       293
weighted avg       0.57      0.58      0.57       293

