In [430]:
import pandas as pd
import numpy as np

from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression


In [431]:
scoring_df = pd.read_csv('../dataset/course_lead_scoring.csv')

In [432]:
scoring_df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [433]:
scoring_df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [434]:
scoring_df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [435]:
scoring_df[['lead_source', 'industry', 'employment_status', 'location']] = scoring_df[['lead_source', 'industry', 'employment_status', 'location']].fillna('NA')

scoring_df['annual_income'].fillna(0, inplace=True) 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  scoring_df['annual_income'].fillna(0, inplace=True)


In [436]:
scoring_df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [437]:
scoring_df['industry'].value_counts().idxmax()

'retail'

In [438]:
cols = ['interaction_count', 'number_of_courses_viewed', 'annual_income', 'lead_score']

corr_matrix = scoring_df[cols].corr().abs() 
np.fill_diagonal(corr_matrix.values, 0)     

max_corr = corr_matrix.unstack().sort_values(ascending=False).head(1)
print(max_corr)

interaction_count  annual_income    0.027036
dtype: float64


In [439]:
df_full_train, df_test = train_test_split(scoring_df, test_size=0.2, random_state=42)

df_train, df_val = train_test_split(df_full_train, test_size=0.2, random_state=42)

In [440]:
y_train = df_train['converted']

y_val = df_val['converted']

y_test = df_test['converted']

In [441]:
del df_train['converted']

del df_val['converted']

del df_test['converted']

In [442]:
print('industry mi:', mutual_info_score(df_train['industry'], y_train))

print('location mi:', mutual_info_score(df_train['location'], y_train))

print('lead_source mi:', mutual_info_score(df_train['lead_source'], y_train))

print('employment_status mi:', mutual_info_score(df_train['employment_status'], y_train))

# lead source

industry mi: 0.011490855136790837
location mi: 0.0044337360380081944
lead_source mi: 0.031265493895643426
employment_status mi: 0.011631223134153823


In [443]:
def prepare_X(df):

    df = df.copy()

    return pd.get_dummies(df)

In [444]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

X_train = prepare_X(df_train)

model.fit(X_train, y_train)

In [445]:
X_val = prepare_X(df_val)

y_proba = model.predict_proba(X_val)[:, 1]

y_pred = (y_proba > 0.5).astype('int')

In [446]:
orig_acc = accuracy_score(y_val, y_pred)

In [447]:
column_test = ['industry', 'employment_status', 'lead_score']

for col in column_test:
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

    X_train = pd.get_dummies(df_train.drop(columns=[col], errors='ignore'))
    X_val = pd.get_dummies(df_val.drop(columns=[col], errors='ignore'))

    X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

    model.fit(X_train, y_train)
    y_pred = (model.predict_proba(X_val)[:, 1] > 0.5).astype(int)
    acc = accuracy_score(y_val, y_pred)
    print(col, orig_acc - acc)

industry 0.0
employment_status 0.012820512820512886
lead_score 0.0


In [448]:
C = [0.01, 0.1, 1, 10, 100]

X_train = prepare_X(df_train)

X_val = prepare_X(df_val)

for c in C:

    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)

    model.fit(X_train, y_train)

    y_prob = model.predict_proba(X_val)[:, 1]

    y_pred = (model.predict_proba(X_val)[:, 1] > 0.5).astype(int)

    acc = accuracy_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_prob)

    print(f"C={c:<5} | Accuracy={acc:.4f} | AUC={auc:.4f}")


C=0.01  | Accuracy=0.7051 | AUC=0.8605
C=0.1   | Accuracy=0.7051 | AUC=0.8530
C=1     | Accuracy=0.7051 | AUC=0.8523
C=10    | Accuracy=0.7051 | AUC=0.8522
C=100   | Accuracy=0.7051 | AUC=0.8522
