In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the lead-scoring dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('course_lead_scoring.csv')

In [3]:
# Preview the first rows of the raw data (before missing values are filled).
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


### Data preparation

In [4]:
# Count missing values per column to decide which columns need imputation.
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [5]:
# Impute missing values: text columns get the literal category 'NA',
# a missing annual_income becomes 0.0. Columns without gaps are untouched.
missing_fill = {
    column: 'NA'
    for column in ('lead_source', 'industry', 'employment_status', 'location')
}
missing_fill['annual_income'] = 0.0
df = df.fillna(missing_fill)

In [6]:
# Preview the first rows again after the fillna step.
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [9]:
# Numeric feature columns (the target `converted` is deliberately not listed).
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

In [10]:
# String-valued feature columns; used for the mutual-information scores and the ablation loop.
categorical = ['lead_source', 'industry', 'employment_status', 'location']

### Q1 industry mode

In [7]:
# Most frequent value of `industry` (computed after missing values were replaced with 'NA').
df.industry.mode()

0    retail
Name: industry, dtype: object

In [8]:
# Full frequency table for `industry`, to cross-check the mode above.
df.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

### Q2 correlation matrix

In [11]:
# Pairwise Pearson correlation over the numeric columns only.
df.corr(method='pearson', numeric_only=True)
# Q2 answer: among feature-feature pairs (target `converted` excluded),
# annual_income vs interaction_count has the largest coefficient (~0.027).

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


### Split the data

In [13]:
# First cut: hold out 20% of all rows as the test set (fixed seed for reproducibility).
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Second cut: 25% of the remaining 80% is 20% of the full data for validation,
# which leaves 60% for training.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.25,
    random_state=42,
)

In [15]:
# Sanity check of the partition sizes (train, validation, test).
len(train_df), len(val_df), len(test_df)

(876, 293, 293)

In [16]:
# Renumber every partition from zero; the shuffled original index is discarded.
train_df, val_df, test_df = (
    part.reset_index(drop=True) for part in (train_df, val_df, test_df)
)

# Detach the target from each partition: pop() removes the `converted` column
# in place and hands it back, so the feature frames can no longer leak the label.
y_train = train_df.pop('converted').values
y_val = val_df.pop('converted').values
y_test = test_df.pop('converted').values

### Q3 mutual information

In [18]:
# Mutual information between each categorical feature and the training target.
mi_scores = {c: mutual_info_score(train_df[c], y_train) for c in categorical}

# Report the raw score alongside its two-decimal rounding.
for c, score in mi_scores.items():
    print(f'{c}: {score} ({round(score, 2)})')

lead_source: 0.03539624379726594 (0.04)
industry: 0.011574521435657112 (0.01)
employment_status: 0.012937677269442782 (0.01)
location: 0.004464157884038034 (0.0)


### Q4 Logistic Regression

In [23]:
# One-hot encoder for record dicts; sparse=False makes it return dense NumPy arrays.
dv = DictVectorizer(sparse=False)

In [25]:
# Learn the feature mapping (one-hot columns + numeric passthrough) from the
# training records only.
X_train = dv.fit_transform(train_df.to_dict(orient='records'))
# Reuse the fitted mapping for validation. Re-fitting here (fit_transform) would
# rebuild the vocabulary from validation data, so the columns could differ in
# number or order from what the model is trained on.
X_val = dv.transform(val_df.to_dict(orient='records'))

In [27]:
# Baseline classifier on all features; the seed is fixed so reruns are comparable.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
# fit() returns the estimator, so leaving it as the last expression displays its parameters.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [28]:
# Bias (intercept) term of the fitted model.
model.intercept_[0]

array([-0.06914728])

In [29]:
# Learned weights, one per column of X_train, rounded to three decimals.
model.coef_[0].round(3)

array([-0.   , -0.015,  0.034,  0.003,  0.012, -0.103, -0.025,  0.049,
       -0.02 , -0.013, -0.003, -0.009, -0.032, -0.016,  0.311,  0.051,
        0.02 , -0.012, -0.012, -0.115,  0.08 , -0.03 ,  0.004, -0.011,
       -0.011, -0.006,  0.008,  0.006, -0.033, -0.025,  0.454])

In [31]:
# Hard class predictions on the validation matrix.
y_pred = model.predict(X_val)

In [38]:
# Validation accuracy (share of predictions equal to the true label), rounded to two decimals.
round((y_pred == y_val).mean(), 2)

np.float64(0.7)

In [43]:
# Keep the unrounded baseline accuracy as a plain float; the Q5 ablation loop
# subtracts it from each reduced-model accuracy.
origin_accuracy = float((y_pred == y_val).mean())

In [44]:
# Display the stored baseline accuracy.
origin_accuracy

0.6996587030716723

### Q5 Feature importance

In [45]:
features = numerical + categorical

# Leave-one-feature-out ablation: retrain without each feature in turn and
# compare validation accuracy with the full-feature baseline (origin_accuracy).
# NOTE: this loop reassigns X_train, X_val, model and y_pred on every pass.
for f in features:
    # drop() returns a new frame, so train_df / val_df themselves are not modified.
    tmp_train = train_df.drop(columns=[f])
    tmp_val = val_df.drop(columns=[f])

    # Fit the vectorizer on the reduced training set only, then apply that same
    # mapping to validation. Re-fitting on validation data would rebuild the
    # vocabulary and could misalign columns with the trained model.
    X_train = dv.fit_transform(tmp_train.to_dict(orient='records'))
    X_val = dv.transform(tmp_val.to_dict(orient='records'))

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    new_accuracy = float((y_pred == y_val).mean())
    # Printed delta is (accuracy without f) - (baseline accuracy).
    print(f'{f}: {new_accuracy=} ({new_accuracy - origin_accuracy})')

number_of_courses_viewed: new_accuracy=0.5563139931740614 (-0.14334470989761094)
annual_income: new_accuracy=0.8532423208191127 (0.15358361774744034)
interaction_count: new_accuracy=0.5563139931740614 (-0.14334470989761094)
lead_score: new_accuracy=0.7064846416382252 (0.0068259385665528916)
lead_source: new_accuracy=0.7030716723549488 (0.0034129692832765013)
industry: new_accuracy=0.6996587030716723 (0.0)
employment_status: new_accuracy=0.6962457337883959 (-0.0034129692832763903)
location: new_accuracy=0.7098976109215017 (0.010238907849829393)


### Q6 Regularized logistic regression

In [46]:
# Fresh vectorizer over the full feature set (the Q5 loop left `dv` fitted on a
# reduced set). Fit on training records only and reuse the mapping for
# validation so both matrices share the same columns.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_df.to_dict(orient='records'))
X_val = dv.transform(val_df.to_dict(orient='records'))

# Sweep the inverse regularization strength C (smaller C = stronger penalty)
# and report validation accuracy for each setting.
for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = float((y_pred == y_val).mean())
    print(f'{c=}: {score=} ({round(score, 3)})')

c=0.01: score=0.6996587030716723 (0.7)
c=0.1: score=0.6996587030716723 (0.7)
c=1: score=0.6996587030716723 (0.7)
c=10: score=0.6996587030716723 (0.7)
c=100: score=0.6996587030716723 (0.7)
