In [None]:
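# Download the dataset (saved to the current directory; the loading cell reads it from ../../data/raw/).
#!curl -o course_lead_scoring.csv https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv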

In [None]:
import numpy
import pandas

In [None]:
# Read the raw lead-scoring CSV into `df` and preview the first rows.
raw_csv_path = "../../data/raw/course_lead_scoring.csv"
df = pandas.read_csv(raw_csv_path)
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Inspect the dtype of every column.
df.dtypes

In [None]:
# Column groups used throughout the notebook.
categorical = ["lead_source", "industry", "employment_status", "location"]
numerical = [
    "number_of_courses_viewed",
    "annual_income",
    "interaction_count",
    "lead_score",
]

# Impute missing values: "NA" for categorical columns, 0.0 for numerical ones.
df[categorical] = df[categorical].fillna("NA")
df[numerical] = df[numerical].fillna(0.0)

Question 1


In [None]:
# Frequency of each value in the `industry` column (most common first).
df["industry"].value_counts()

Question 2


In [None]:
from sklearn.model_selection import train_test_split

# Two-step split: hold out 20% of `df` as the test set, then take 25% of the
# remaining rows as the validation set. Both splits use random_state=42.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# The three parts must partition the original frame.
assert len(df_train) + len(df_val) + len(df_test) == len(df)

len(df_train), len(df_test), len(df_val)

In [None]:
# Replace the shuffled row labels of each split with a fresh 0..n-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [None]:
# Pull the `converted` target out of each split as a NumPy array.
y_train, y_val, y_test = (
    part["converted"].values for part in (df_train, df_val, df_test)
)

In [None]:
# Remove the target column from the feature frames so it cannot leak into the
# model inputs. Not idempotent: re-running this cell raises KeyError.
for part in (df_train, df_val, df_test):
    del part["converted"]

In [None]:
# Correlation of selected numerical feature pairs, computed on the full
# training portion (train + validation) and printed one pair at a time.
correlation_pairs = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]
for feature, other in correlation_pairs:
    print(df_full_train[[feature]].corrwith(df_full_train[other]))

Question 3


In [None]:
from sklearn.metrics import mutual_info_score


def mutual_score(series):
    """Return the mutual information between `series` and `y_train`, rounded to 2 decimals.

    Reads the notebook-level `y_train`, so `series` must be aligned with the
    training rows.
    """
    score = mutual_info_score(series, y_train)
    return round(score, 2)


# Score every categorical column against the training target, highest first.
mi = df_train[categorical].apply(mutual_score)
mi.sort_values(ascending=False)

Question 4


In [None]:
from sklearn.feature_extraction import DictVectorizer

# Turn each row into a {column: value} dict so DictVectorizer can one-hot
# encode the categorical columns and pass the numerical ones through.
feature_columns = categorical + numerical

train_dicts = df_train[feature_columns].to_dict(orient="records")
val_dicts = df_val[feature_columns].to_dict(orient="records")

# Fit the vectorizer on the training rows only; reuse it for validation.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier trained on all features.
model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Probability of the positive class, thresholded at 0.5 to get hard decisions.
y_pred = model.predict_proba(X_val)[:, 1]
decision = y_pred >= 0.5

# Unrounded validation accuracy is kept in `global_acc` for later comparisons;
# only the displayed value is rounded.
global_acc = (decision == y_val).mean()
global_acc.round(2)

Question 5

In [None]:
# List the output feature names of the fitted DictVectorizer `dv`.
dv.get_feature_names_out()

In [None]:
def accuracy_without(feature):
    """Return validation accuracy of the baseline model retrained without `feature`.

    A new DictVectorizer and LogisticRegression (same hyper-parameters as the
    baseline model) are fitted on every column of `categorical + numerical`
    except `feature`. Validation probabilities are thresholded at 0.5 and
    compared with `y_val`.

    Only local objects are created, so the notebook-level `dv`, `X_train`,
    `X_val` and `model` from the baseline run are not overwritten.
    """
    kept = [col for col in categorical + numerical if col != feature]

    vectorizer = DictVectorizer(sparse=False)
    X_tr = vectorizer.fit_transform(df_train[kept].to_dict(orient="records"))
    X_va = vectorizer.transform(df_val[kept].to_dict(orient="records"))

    clf = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    clf.fit(X_tr, y_train)

    decision = clf.predict_proba(X_va)[:, 1] >= 0.5
    return (y_val == decision).mean()


# Accuracy change (accuracy without the feature minus baseline `global_acc`)
# for each candidate feature; values near zero mean the feature matters least.
accuracy_deltas = pandas.Series(
    {
        feature: accuracy_without(feature) - global_acc
        for feature in ["industry", "employment_status", "lead_score"]
    }
)
accuracy_deltas

Question 6

In [None]:
from sklearn.linear_model import Ridge

# Rebuild the full-feature design matrices so this cell does not depend on
# whatever an earlier experiment left in `dv` / `X_train` / `X_val`.
train_dicts = df_train[categorical + numerical].to_dict(orient="records")
val_dicts = df_val[categorical + numerical].to_dict(orient="records")

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Sweep the regularisation parameter C of the same logistic-regression
# classifier used for the baseline. C is the *inverse* regularisation
# strength, so it must not be passed as Ridge's `alpha` (which is the
# strength itself and belongs to a regressor, not a classifier).
for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver="liblinear", C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    decision = y_pred >= 0.5
    acc = (y_val == decision).mean().round(3)
    print("c:", c, "acc:", acc)