In [124]:
#!curl -o course_lead_scoring.csv $url https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

In [147]:
import numpy
import pandas

In [148]:
df = pandas.read_csv("../../data/raw/course_lead_scoring.csv")
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [149]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [150]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [151]:
numerical = [
    "number_of_courses_viewed",
    "annual_income",
    "interaction_count",
    "lead_score",
]
categorical = ["lead_source", "industry", "employment_status", "location"]

df[numerical] = df[numerical].fillna(0.0)
df[categorical] = df[categorical].fillna("NA")

Question 1


In [152]:
df.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

Question 2


In [153]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
len(df_full_train), len(df_test)

df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_test), len(df_val)

(876, 293, 293)

In [154]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [155]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [156]:
del df_train["converted"]
del df_val["converted"]
del df_test["converted"]

In [157]:
print(df_full_train[["interaction_count"]].corrwith(df_full_train.lead_score))
print(df_full_train[["number_of_courses_viewed"]].corrwith(df_full_train.lead_score))
print(
    df_full_train[["number_of_courses_viewed"]].corrwith(
        df_full_train.interaction_count
    )
)
print(df_full_train[["annual_income"]].corrwith(df_full_train.interaction_count))

interaction_count    0.025393
dtype: float64
number_of_courses_viewed    0.009427
dtype: float64
number_of_courses_viewed   -0.044381
dtype: float64
annual_income    0.011959
dtype: float64


Question 3


In [159]:
from sklearn.metrics import mutual_info_score


def mutual_score(series):
    return round(mutual_info_score(series, y_train), 2)


mi = df_train[categorical].apply(mutual_score)
mi.sort_values(ascending=False)

lead_source          0.04
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

Question 4


In [160]:
from sklearn.feature_extraction import DictVectorizer

train_dicts = df_train[categorical + numerical].to_dict(orient="records")
train_dicts

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)

In [161]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_val)[:, 1]
y_pred

decision = y_pred > 0.5
global_acc = (y_val == decision).mean()
global_acc.round(2)

np.float64(0.7)

Question 5

In [162]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [163]:
train_dicts = df_train[ [x for x in categorical if x != 'industry']+ numerical].to_dict(orient="records")
train_dicts

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[[x for x in categorical if x != 'industry'] + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_val)[:, 1]
y_pred

decision = y_pred > 0.5
acc = (y_val == decision).mean()
acc-global_acc

np.float64(0.0)

In [141]:
train_dicts = df_train[ [x for x in categorical if x != 'employment_status']+ numerical].to_dict(orient="records")
train_dicts

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[[x for x in categorical if x != 'employment_status'] + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_val)[:, 1]
y_pred

decision = y_pred > 0.5
acc = (y_val == decision).mean()
acc-global_acc

np.float64(-0.0034129692832763903)

In [142]:
train_dicts = df_train[
    categorical + [x for x in numerical if x != "lead_score"]
].to_dict(orient="records")
train_dicts

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + [x for x in numerical if x != "lead_score"]].to_dict(
    orient="records"
)
X_val = dv.transform(val_dicts)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_val)[:, 1]
y_pred

decision = y_pred > 0.5
acc = (y_val == decision).mean()
acc-global_acc

np.float64(0.0068259385665528916)

Question 6

In [165]:
from sklearn.linear_model import Ridge

train_dicts = df_train[categorical + numerical].to_dict(orient="records")
train_dicts

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)

for c in [0.01, 0.1, 1, 10, 100]:

    model = Ridge(alpha=c)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    y_pred

    decision = y_pred > 0.5
    acc = (y_val == decision).mean().round(3)
    print("c:", c, "acc:", acc)

c: 0.01 acc: 0.85
c: 0.1 acc: 0.85
c: 1 acc: 0.846
c: 10 acc: 0.857
c: 100 acc: 0.836
