In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("course_lead_scoring.csv")

df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [3]:
print(df.dtypes)
df.isnull().sum()

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object


lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [4]:
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']
numerical_nc = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
categorical = ['lead_source', 'industry', 'employment_status', 'location']

In [5]:
df[numerical] = df[numerical].fillna(0.0)
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income                 0
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [6]:
df[categorical] = df[categorical].fillna('NA')
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [7]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [8]:
df['industry'].value_counts()

# Q1: retail

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [9]:
df[numerical].corr()

# df[['interaction_count']].corrwith(df['lead_score'])

# interaction_count and lead_score - 0.009888
# number_of_courses_viewed and lead_score - -0.004879
# number_of_courses_viewed and interaction_count - -0.023565
# annual_income and lead_score - 0.015610

# annual_income and lead_score (< converted and number_of_courses_viewed)

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [10]:
df_train_full, df_val = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_test = train_test_split(df_train_full, test_size=0.25, random_state=42)

y_train = df_train['converted']
del df_train['converted']
y_val = df_val['converted']
del df_val['converted']
y_test = df_test['converted']
del df_test['converted']

In [11]:
for col in categorical:
    print(round(mutual_info_score(y_train, df_train[col]), 2), col)

# lead source, 0.04

0.04 lead_source
0.01 industry
0.01 employment_status
0.0 location


In [12]:
# trying something different here

# preprocessor = ColumnTransformer([
#     ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical),
#     ('num', StandardScaler(), numerical_nc)
# ])

# df_train2 = preprocessor.fit_transform(df_train)
# df_train2.shape

In [13]:
records = df_train[categorical + numerical_nc].to_dict(orient='records')
vec = DictVectorizer(sparse=False)

df_train2 = vec.fit_transform(records)

# feature_names = vec.get_feature_names_out()
df_train2 = pd.DataFrame(df_train2, columns=vec.get_feature_names_out())

# df_train2.shape

In [14]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

model.fit(df_train2, y_train)

# df_val2 = preprocessor.transform(df_val)
df_val2 = df_val[categorical + numerical_nc].to_dict(orient='records')
df_val2 = vec.transform(df_val2)
df_val2 = pd.DataFrame(df_val2, columns=vec.get_feature_names_out())

y_pred = model.predict(df_val2)

# y_pred = model.predict_proba(df_val2)[:, 1] > 0.5

avg_res = (y_val == y_pred).mean() # 0.72, i guess 0.74?
avg_res

np.float64(0.726962457337884)

In [15]:
base_train = df_train.copy()
base_val = df_val.copy()

cols = base_train.columns

for col in base_train.columns:
    dv = DictVectorizer(sparse=False)
    
    x_train = base_train.drop(columns=col)
    x_rec_train = x_train.to_dict(orient='records')
    x_train = dv.fit_transform(x_rec_train)

    x_val = base_val.drop(columns=col)
    x_rec_val = x_val.to_dict(orient='records')
    x_val = dv.transform(x_rec_val)

    model_s = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_s.fit(x_train, y_train)

    y_pred_s = model_s.predict(x_val)

    # y_pred = model.predict_proba(df_val2)[:, 1] > 0.5

    print(col, (y_val == y_pred_s).mean() - avg_res) # location has difference of 0?

lead_source 0.0034129692832763903
industry 0.010238907849829282
number_of_courses_viewed -0.058020477815699745
annual_income 0.09215017064846409
employment_status 0.013651877133105783
location 0.0
interaction_count -0.061433447098976135
lead_score 0.0068259385665528916


In [16]:
base_train = df_train.copy()
base_val = df_val.copy()

dv2 = DictVectorizer(sparse=False)

x_rec_train = base_train.to_dict(orient='records')
x_train2 = dv2.fit_transform(x_rec_train)

x_rec_val = base_val.to_dict(orient='records')
x_val2 = dv2.transform(x_rec_val)

for C in [0.01, 0.1, 1, 10, 100]:
    model_x = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_x.fit(x_train2, y_train)

    y_pred_x = model_x.predict(x_val2)

    # y_pred = model.predict_proba(df_val2)[:, 1] > 0.5

    print(C, round((y_val == y_pred_x).mean(), 3)) # 0.01

0.01 0.73
0.1 0.73
1 0.727
10 0.727
100 0.727
