In [1021]:
import pandas as pd
import numpy as np

In [1022]:
# Load the lead-scoring dataset from a path relative to the notebook's working
# directory, then display the resulting frame.
df = pd.read_csv('course_lead_scoring.csv')
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [1023]:
# Count missing values per column before any imputation is applied.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [1024]:
# Show the dtype of every column (object columns hold the string categories).
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [1025]:
# Impute missing values in one pass: string columns get the placeholder
# category 'NA', the numeric annual_income column gets 0.0.
df = df.fillna({
    'lead_source': 'NA',
    'industry': 'NA',
    'annual_income': 0.0,
    'employment_status': 'NA',
    'location': 'NA',
})

In [1026]:
# Re-count missing values per column to confirm the imputation left none.
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [1027]:
# Most frequent value of `industry`. Missing entries were already replaced by
# the string 'NA', so that placeholder competes as an ordinary category here.
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [1028]:
# Pairwise Pearson correlation between the four numeric features and the
# binary target column.
new_df = df.loc[:, [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
    'converted',
]]
new_df.corr(method='pearson')

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [1029]:
from sklearn.model_selection import train_test_split

In [1030]:
# Two-stage split into 60% train / 20% validation / 20% test:
# first hold out 20% of all rows as the test set, then take 25% of the
# remaining 80% (i.e. 20% of all rows) as the validation set.
# A fixed random_state keeps both shuffles reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [1031]:
# Extract the target array for each split and remove the target column from
# the feature frames so it cannot leak into the design matrix.
#
# The labels are looked up in `df` through each split's row index rather than
# read from the split frame, and the column is removed with errors='ignore'.
# This keeps the cell re-runnable: reading `df_train.converted` and then
# `del df_train['converted']` would raise on a second execution because the
# column is already gone.  The lookup relies on `df` having unique row labels
# (it is loaded with the default index from read_csv).
y_train = df.loc[df_train.index, 'converted'].values
y_val = df.loc[df_val.index, 'converted'].values
y_test = df.loc[df_test.index, 'converted'].values

df_train = df_train.drop(columns='converted', errors='ignore')
df_val = df_val.drop(columns='converted', errors='ignore')
df_test = df_test.drop(columns='converted', errors='ignore')

In [1032]:
from sklearn.metrics import mutual_info_score

In [1033]:
# Sanity check: number of missing values in the target column.
df['converted'].isnull().sum()

0

In [1034]:
# Mutual information between the target and `industry`, computed on the
# combined train+validation rows and rounded to two decimals.
round(
    mutual_info_score(
        labels_true=df_full_train['converted'],
        labels_pred=df_full_train['industry'],
    ),
    2,
)

0.01

In [1035]:
# Mutual information between the target and `location`, computed on the
# combined train+validation rows and rounded to two decimals.
round(
    mutual_info_score(
        labels_true=df_full_train['converted'],
        labels_pred=df_full_train['location'],
    ),
    2,
)

0.0

In [1036]:
# Mutual information between the target and `lead_source`, computed on the
# combined train+validation rows and rounded to two decimals.
round(
    mutual_info_score(
        labels_true=df_full_train['converted'],
        labels_pred=df_full_train['lead_source'],
    ),
    2,
)

0.03

In [1037]:
# Mutual information between the target and `employment_status`, computed on
# the combined train+validation rows and rounded to two decimals.
round(
    mutual_info_score(
        labels_true=df_full_train['converted'],
        labels_pred=df_full_train['employment_status'],
    ),
    2,
)

0.01

In [1038]:
from sklearn.feature_extraction import DictVectorizer

# Small demo sample: the first ten training rows of two categorical columns,
# converted to one dict per row.
dicts = df_train[['industry', 'location']].head(10).to_dict(orient='records')

In [1039]:
# One-hot encoder for dict records; sparse=False requests dense array output.
dv = DictVectorizer(sparse=False)

In [1040]:
# Learn the feature vocabulary from the ten sample records.
dv.fit(dicts)

In [1041]:
# List the learned output columns (one `column=value` entry per category seen).
dv.get_feature_names_out()

array(['industry=NA', 'industry=education', 'industry=finance',
       'industry=manufacturing', 'industry=retail', 'industry=technology',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america'], dtype=object)

In [1042]:
# Encode the same sample records with the fitted vocabulary.
dv.transform(dicts)

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]])

In [1043]:
# Feature columns grouped by type; the model cells concatenate these lists
# to select their inputs.
categorical = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [1044]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [1045]:
from sklearn.linear_model import LogisticRegression

In [1046]:
# Logistic regression on the encoded training matrix, using the liblinear
# solver with C=1.0 and a fixed random_state.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [1047]:
# Hard decisions on the validation set: take column 1 of predict_proba and
# threshold it at 0.5, giving a boolean array.
converted_dec = model.predict_proba(X_val)[:, 1] >= 0.5

In [1048]:
# Validation accuracy: share of rows where the thresholded prediction equals
# the true label, rounded to two decimals.
round(np.mean(y_val == converted_dec.astype(int)), 2)

0.7

In [1049]:
# Pair every encoded feature name with its fitted coefficient.
{
    feature: weight
    for feature, weight in zip(dv.get_feature_names_out(), model.coef_[0])
}

{'annual_income': -1.778438787433868e-05,
 'employment_status=NA': -0.014715442322527165,
 'employment_status=employed': 0.033909522476138036,
 'employment_status=self_employed': 0.002662484320898002,
 'employment_status=student': 0.011523851798964565,
 'employment_status=unemployed': -0.10252769655174766,
 'industry=NA': -0.024851099469834454,
 'industry=education': 0.04936042217241218,
 'industry=finance': -0.020125834431371156,
 'industry=healthcare': -0.013421486506612804,
 'industry=manufacturing': -0.0030023220026027674,
 'industry=other': -0.009259918296687762,
 'industry=retail': -0.031795730351031705,
 'industry=technology': -0.016051311392545873,
 'interaction_count': 0.31133915476372276,
 'lead_score': 0.0512012527888777,
 'lead_source=NA': 0.020151169797463418,
 'lead_source=events': -0.012034628430066778,
 'lead_source=organic_search': -0.01160215208502322,
 'lead_source=paid_ads': -0.11525188018426467,
 'lead_source=referral': 0.07953034355373848,
 'lead_source=social_med

In [1050]:
# Reduced feature subset (one categorical column plus two numeric ones) used
# by the small DictVectorizer experiment in the following cells.
small_industry = ['industry', 'number_of_courses_viewed', 'annual_income']

In [1051]:
# Preview the first ten training rows of the reduced subset as record dicts.
df_train[small_industry].head(10).to_dict(orient='records')

[{'industry': 'retail',
  'number_of_courses_viewed': 0,
  'annual_income': 58472.0},
 {'industry': 'manufacturing',
  'number_of_courses_viewed': 3,
  'annual_income': 71738.0},
 {'industry': 'technology',
  'number_of_courses_viewed': 3,
  'annual_income': 81973.0},
 {'industry': 'technology',
  'number_of_courses_viewed': 1,
  'annual_income': 74956.0},
 {'industry': 'retail',
  'number_of_courses_viewed': 3,
  'annual_income': 59335.0},
 {'industry': 'retail',
  'number_of_courses_viewed': 4,
  'annual_income': 50961.0},
 {'industry': 'finance', 'number_of_courses_viewed': 0, 'annual_income': 0.0},
 {'industry': 'retail',
  'number_of_courses_viewed': 3,
  'annual_income': 45170.0},
 {'industry': 'NA', 'number_of_courses_viewed': 0, 'annual_income': 73717.0},
 {'industry': 'education',
  'number_of_courses_viewed': 2,
  'annual_income': 61872.0}]

In [1052]:
# Per-row dicts of the reduced feature subset for the train and validation splits.
dicts_train_small, dicts_val_small = (
    frame[small_industry].to_dict(orient='records')
    for frame in (df_train, df_val)
)

In [1053]:
# Separate encoder for the reduced subset, fitted on the training records only.
dv_small_industry = DictVectorizer(sparse=False)
dv_small_industry.fit(dicts_train_small)

In [1054]:
# Output columns learned by the reduced-subset encoder.
dv_small_industry.get_feature_names_out()

array(['annual_income', 'industry=NA', 'industry=education',
       'industry=finance', 'industry=healthcare',
       'industry=manufacturing', 'industry=other', 'industry=retail',
       'industry=technology', 'number_of_courses_viewed'], dtype=object)

In [1055]:
# Encode the reduced-subset training records with the fitted encoder.
X_train_small_industry = dv_small_industry.transform(dicts_train_small)

In [1056]:
# Baseline for the feature-elimination cells: train on every categorical and
# numerical feature and record the unrounded validation accuracy.
dv = DictVectorizer(sparse=False)

train_dict, val_dict = (
    frame[categorical + numerical].to_dict(orient='records')
    for frame in (df_train, df_val)
)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

converted_dec = model.predict_proba(X_val)[:, 1] >= 0.5

# Share of validation rows whose thresholded prediction matches the label.
orig = np.mean(y_val == converted_dec.astype(int))

orig

0.6996587030716723

In [1057]:
# Feature elimination: retrain with `industry` left out and show the
# validation accuracy.
dv = DictVectorizer(sparse=False)

train_dict, val_dict = (
    frame[[col for col in categorical if col != 'industry'] + numerical].to_dict(orient='records')
    for frame in (df_train, df_val)
)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

converted_dec = model.predict_proba(X_val)[:, 1] >= 0.5

np.mean(y_val == converted_dec.astype(int))



0.6996587030716723

In [1058]:
# Feature elimination: retrain with `employment_status` left out and show the
# validation accuracy.
dv = DictVectorizer(sparse=False)

train_dict, val_dict = (
    frame[[col for col in categorical if col != 'employment_status'] + numerical].to_dict(orient='records')
    for frame in (df_train, df_val)
)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

converted_dec = model.predict_proba(X_val)[:, 1] >= 0.5

np.mean(y_val == converted_dec.astype(int))

0.6962457337883959

In [1060]:
# Feature elimination: retrain with `lead_score` left out and show the
# validation accuracy.
dv = DictVectorizer(sparse=False)

train_dict, val_dict = (
    frame[categorical + [col for col in numerical if col != 'lead_score']].to_dict(orient='records')
    for frame in (df_train, df_val)
)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

converted_dec = model.predict_proba(X_val)[:, 1] >= 0.5

np.mean(y_val == converted_dec.astype(int))

0.7064846416382252

In [1064]:
# Sweep the regularisation parameter C and report validation accuracy for each
# value.  The feature matrices are built once, outside the loop, because they
# do not depend on C.
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

for c in [0.01, 0.1, 1, 10, 100]:

    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    converted_dec = model.predict_proba(X_val)[:, 1] >= 0.5
    accuracy = (y_val == converted_dec.astype(int)).mean()

    # Label every score with the C that produced it; bare numbers cannot be
    # attributed to a hyperparameter value, especially when several are tied.
    print(f'C={c}: accuracy={accuracy:.3f}')


0.6996587030716723
0.6996587030716723
0.6996587030716723
0.6996587030716723
0.6996587030716723
