In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')
len(df)


In [None]:
# eda
df.dtypes

In [17]:
df.dtypes[df.dtypes == 'object'].index

Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')

In [18]:
for col in df.columns:
    print(col)
    print(df[col].head())
    print(df[col].unique())
    print(df[col].unique()[:5])
    print(df[col].nunique())
    print()
    

lead_source
0        paid_ads
1    social_media
2          events
3        paid_ads
4        referral
Name: lead_source, dtype: object
['paid_ads' 'social_media' 'events' 'referral' 'organic_search' 'NA']
['paid_ads' 'social_media' 'events' 'referral' 'organic_search']
6

industry
0            NA
1        retail
2    healthcare
3        retail
4     education
Name: industry, dtype: object
['NA' 'retail' 'healthcare' 'education' 'manufacturing' 'technology'
 'other' 'finance']
['NA' 'retail' 'healthcare' 'education' 'manufacturing']
8

number_of_courses_viewed
0    1
1    1
2    5
3    2
4    3
Name: number_of_courses_viewed, dtype: int64
[1 5 2 3 0 4 6 8 7 9]
[1 5 2 3 0]
10

annual_income
0    79450.0
1    46992.0
2    78796.0
3    83843.0
4    85012.0
Name: annual_income, dtype: float64
[79450. 46992. 78796. ... 45688. 71016. 92855.]
[79450. 46992. 78796. 83843. 85012.]
1268

employment_status
0       unemployed
1         employed
2       unemployed
3               NA
4    self_employ

In [19]:
categorical_columns = list(df.dtypes[df.dtypes =='object'].index)

In [20]:
categorical_columns

['lead_source', 'industry', 'employment_status', 'location']

In [21]:
# DATA PREPARATION
# We have annual_income of type float with nulls, replacing them with 0.0
df.isnull().sum()
#df = df['annual_income'].fillna(0.0)

#df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [54]:
df_prepared = df
df_prepared.annual_income = df_prepared.annual_income.fillna(0.0)

categorical_columns = list(df_prepared.dtypes[df_prepared.dtypes == 'object'].index)

for c in categorical_columns:    
    df_prepared[c] = df_prepared[c].fillna('NA')
    df_prepared[c] = df_prepared[c].str.lower().replace(' ', '_')

#df_prepared


In [76]:
df_prepared

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,na,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,na,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,na,5,71016.0,self_employed,north_america,0,0.25,1


In [55]:
df_prepared.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [56]:
# Q1 - Most frequesnt observstion in column industry. Answer - retail
df_prepared.industry.mode()

0    retail
Name: industry, dtype: object

In [57]:
# Q2 - correlation matrix - Answer: `annual_income and interaction_count`
numerical1 = ['interaction_count', 'number_of_courses_viewed' ]
numerical1

['interaction_count', 'number_of_courses_viewed']

In [58]:
df_prepared[numerical1].corrwith(df_prepared.lead_score)

interaction_count           0.009888
number_of_courses_viewed   -0.004879
dtype: float64

In [59]:
numerical2 = ['number_of_courses_viewed', 'annual_income' ]
numerical2

['number_of_courses_viewed', 'annual_income']

In [60]:
df_prepared[numerical2].corrwith(df_prepared.interaction_count)

number_of_courses_viewed   -0.023565
annual_income               0.027036
dtype: float64

In [163]:
# Q3 - data split

from sklearn.model_selection import train_test_split

df_train_full, df_test = train_test_split(df_prepared, test_size=0.2, random_state=42)

len(df_train_full), len(df_test)
    

(1169, 293)

In [164]:
#train_test_split?
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [165]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

# removing the converted filed from training data
del df_train['converted']
del df_val['converted']

In [166]:
df_train.shape

(876, 8)

In [114]:
# we have not removed the `converted` column here it is OK
df_test.shape

(293, 9)

In [112]:
df_val.shape

(293, 8)

In [67]:
# Q3 - Calculate mutual information score between converted and other categorical vars in the training dataset
# round the scores to 2 decimals.
# Which of th vars has the biggest mutual information score? Answer: lead_source (0.04)

from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    score = mutual_info_score(series, y_train)
    return round(score, 2)

fields = ['industry', 'location', 'lead_source', 'employment_status']

df_mi = df_train[fields].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')


display(df_mi.head())
# display(df_mi.tail())    
#mutual_info_score(df_train.industry, y_train)

Unnamed: 0,MI
lead_source,0.04
industry,0.01
employment_status,0.01
location,0.0


In [68]:
df_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
1077,paid_ads,retail,0,58472.0,student,middle_east,5,0.03
463,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77
842,paid_ads,technology,3,81973.0,employed,north_america,2,0.59
835,na,technology,1,74956.0,employed,europe,3,0.34
837,organic_search,retail,3,59335.0,student,australia,1,0.98
...,...,...,...,...,...,...,...,...
725,organic_search,other,1,43907.0,employed,australia,4,0.33
401,social_media,retail,3,64969.0,employed,north_america,1,0.18
957,na,education,3,89042.0,employed,asia,4,0.75
992,social_media,manufacturing,1,0.0,self_employed,europe,1,0.65


In [94]:
df_train_full[categorical].nunique()

lead_source          6
industry             8
employment_status    5
location             8
dtype: int64

In [95]:
df_train[categorical].nunique()

lead_source          6
industry             8
employment_status    5
location             8
dtype: int64

In [103]:
df_val[categorical].nunique()

lead_source          6
industry             8
employment_status    5
location             8
dtype: int64

In [115]:
df_train[numerical].nunique()

number_of_courses_viewed      9
annual_income               762
interaction_count            11
lead_score                  101
dtype: int64

In [117]:
df_val[numerical].nunique()

number_of_courses_viewed      8
annual_income               255
interaction_count             9
lead_score                   98
dtype: int64

In [206]:
# Q4 - Train Logistic regression
# OHE

from sklearn.feature_extraction import DictVectorizer

categorical = ['lead_source', 'industry', 'employment_status', 'location']

numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

train_dict = df_train[categorical + numerical].to_dict(orient='records')

train_dict[10]


{'lead_source': 'events',
 'industry': 'finance',
 'employment_status': 'unemployed',
 'location': 'asia',
 'number_of_courses_viewed': 0,
 'annual_income': 42104.0,
 'interaction_count': 2,
 'lead_score': 0.97}

In [119]:

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)


0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [79]:
X_train = dv.transform(train_dict)
X_train.shape

(876, 31)

In [135]:
X_train2 = dv.fit_transform(train_dict)
X_train2

array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [8.1973e+04, 1.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [8.9042e+04, 1.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00]], shape=(876, 31))

In [80]:
X_train

array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [8.1973e+04, 1.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [8.9042e+04, 1.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00]], shape=(876, 31))

In [98]:
# Holy shit! The method chanegd from get_feature_names() to get_feature_names_out()
# BUT IT IS NOT MENTIONED ANYWHERE!
dv.get_feature_names_out()

array(['employment_status=employed', 'employment_status=na',
       'employment_status=self_employed', 'employment_status=student',
       'employment_status=unemployed', 'industry=education',
       'industry=finance', 'industry=healthcare',
       'industry=manufacturing', 'industry=na', 'industry=other',
       'industry=retail', 'industry=technology', 'lead_source=events',
       'lead_source=na', 'lead_source=organic_search',
       'lead_source=paid_ads', 'lead_source=referral',
       'lead_source=social_media', 'location=africa', 'location=asia',
       'location=australia', 'location=europe', 'location=middle_east',
       'location=na', 'location=north_america', 'location=south_america'],
      dtype=object)

In [122]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
model.coef_[0].round(3)


array([-0.   ,  0.034, -0.015,  0.003,  0.012, -0.103,  0.049, -0.02 ,
       -0.013, -0.003, -0.025, -0.009, -0.032, -0.016,  0.311,  0.051,
       -0.012,  0.02 , -0.012, -0.115,  0.08 , -0.03 , -0.011, -0.011,
       -0.006,  0.008,  0.006,  0.004, -0.033, -0.025,  0.454])

In [124]:
model.intercept_[0]

np.float64(-0.06914728027832559)

In [125]:
model.predict(X_train)

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,

In [127]:
model.predict_proba(X_train)

array([[0.42085656, 0.57914344],
       [0.12716508, 0.87283492],
       [0.41183893, 0.58816107],
       ...,
       [0.25265784, 0.74734216],
       [0.3302157 , 0.6697843 ],
       [0.14407823, 0.85592177]], shape=(876, 2))

In [157]:
# Taking the second collumn, the positive probability class
y_trained = model.predict_proba(X_train)[:,1]

In [219]:
y_trained_convert = y_trained > 0.5
#(y_train == y_trained_convert).mean().round(2)
accuracy_train = (y_train == y_trained_convert).mean()
accuracy_train

np.float64(0.7385844748858448)

In [146]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
#val_dict

In [147]:

X_val = dv.transform(val_dict)
X_val
# Finally this shit is of 293 x 31 size, not 293 x 27 size out of nowhere

array([[5.2220e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [5.9656e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [5.7134e+04, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [7.4166e+04, 1.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [3.9103e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00],
       [4.7129e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00]], shape=(293, 31))

In [151]:
y_pred = model.predict_proba(X_val)[:,1]


In [201]:
convert = y_pred > 0.5
(y_val == convert).mean().round(2)
accuracy_val = (y_val == convert).mean()
accuracy_val

np.float64(0.6996587030716723)

In [216]:
categorical = ['lead_source', 'industry', 'employment_status', 'location']

numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

test_dict1 = df_test[categorical + numerical].to_dict(orient='records')

dv_test = DictVectorizer(sparse=False)
dv_test.fit(test_dict1)

X_test1 = dv_test.transform(test_dict1)
X_test1

array([[5.6070e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00],
       [7.8409e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [6.6206e+04, 1.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00],
       ...,
       [6.6922e+04, 1.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        5.0000e+00],
       [8.2306e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00],
       [6.4070e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00]], shape=(293, 31))

In [218]:
y_for_test = model.predict_proba(X_test1)[:, 1]
test_convert = y_for_test > 0.5
(y_test == test_convert).mean().round(2)

accuracy_test = (y_test == test_convert).mean()
accuracy_test

np.float64(0.726962457337884)

In [149]:
(y_test == test_convert).mean().round(2)

In [222]:
# Q5 Finding the least useful feature
# 5.1 - no industry

categorical = ['lead_source', 'employment_status', 'location']  # 'industry'

numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

train_dict_no_param1 = df_train[categorical + numerical].to_dict(orient='records')
dv_no_param = DictVectorizer(sparse=False)
dv_no_param.fit(train_dict_no_param1)


X_train_no_param = dv_no_param.transform(train_dict_no_param1)


model_no_param = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_no_param.fit(X_train_no_param, y_train)


val_dict_no_param = df_val[categorical + numerical].to_dict(orient='records')
X_val_no_param = dv_no_param.transform(val_dict_no_param)

y_no_param = model_no_param.predict_proba(X_val_no_param)[:, 1]
convert_no_param1 = y_no_param > 0.5
accuracy_no_industry = (y_val == convert_no_param1).mean()

result_difference = accuracy_train - accuracy_no_industry

print(result_difference)

0.03892577181417245


In [177]:
model_no_param1 = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_no_param1.fit(X_train_no_param, y_train)
model_no_param1.coef_[0]

y_no_industry = model_no_param1.predict_proba(X_train_no_param)[:, 1]
convert_no_industry = y_no_industry > 0.5
(y_train == convert_no_industry).mean()

np.float64(0.7408675799086758)

In [223]:
# 5.2 - no employment_status

categorical = ['lead_source', 'industry', 'location']  # 'employment_status'

numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

train_dict_no_param = df_train[categorical + numerical].to_dict(orient='records')
dv_no_param2 = DictVectorizer(sparse=False)
dv_no_param2.fit(train_dict_no_param)


X_train_no_param2 = dv_no_param2.transform(train_dict_no_param)


model_no_param2 = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_no_param2.fit(X_train_no_param2, y_train)


val_dict_no_param2 = df_val[categorical + numerical].to_dict(orient='records')
X_val_no_param2 = dv_no_param2.transform(val_dict_no_param2)

y_no_industry = model_no_param2.predict_proba(X_val_no_param2)[:, 1]
convert_no_industry = y_no_industry > 0.5
accuracy_no_param = (y_val == convert_no_industry).mean()


result_difference = accuracy_train - accuracy_no_param

print(result_difference)

0.04233874109744884


In [224]:
# 5.3 - no lead_score
categorical = ['lead_source', 'industry', 'employment_status', 'location']  

numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count'] #, 'lead_score']

train_dict_no_param3 = df_train[categorical + numerical].to_dict(orient='records')
dv_no_param3 = DictVectorizer(sparse=False)
dv_no_param3.fit(train_dict_no_param3)


X_train_no_param3 = dv_no_param3.transform(train_dict_no_param3)


model_no_param3 = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_no_param3.fit(X_train_no_param3, y_train)


val_dict_no_param3 = df_val[categorical + numerical].to_dict(orient='records')
X_val_no_param3 = dv_no_param3.transform(val_dict_no_param3)

y_no_industry3 = model_no_param3.predict_proba(X_val_no_param3)[:, 1]
convert_no_industry3 = y_no_industry3 > 0.5
accuracy_no_param3 = (y_val == convert_no_industry3).mean()




result_difference3 = accuracy_train - accuracy_no_param3

print(result_difference3)

0.03209983324761956


In [239]:
# Q6 - Train model using different C parameter
categorical = ['lead_source', 'industry', 'employment_status', 'location']  

numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

dv_c = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
dv_c.fit(train_dict)

X_train_c = dv_c.transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val_c = dv_c.transform(val_dict)

test_dict = df_test[categorical + numerical].to_dict(orient='records')
X_test_c = dv_c.transform(test_dict)

train_full = df_train_full[categorical + numerical].to_dict(orient='records')
X_full_c = dv_c.transform(train_full)
y_full = np.concatenate([y_train, y_val])

for i in [0.01, 0.1, 1, 10, 100]:    
    model_c = LogisticRegression(solver='liblinear', C=i, max_iter=1000, random_state=42)
    model_c.fit(X_train_c, y_train)

    y_pred_c = model_c.predict_proba(X_val_c)[:, 1]
    convert_y_pred_c = y_pred_c > 0.5
    accuracy_c = (y_val == convert_y_pred_c).mean()

    # train on full model
    model_c.fit(X_full_c, y_full)
        
    y_test_c = model_c.predict_proba(X_test_c)[:, 1]
    convert_y_test_c = y_test_c > 0.5
    accuracy_test_c = (y_test == convert_y_test_c).mean()

    
    print('c=%.2f, accuracy_val=%.18f, accuracy_test=%.18f' % (i, accuracy_c, accuracy_test_c))

c=0.01, accuracy_val=0.556313993174061383, accuracy_test=0.665529010238907870
c=0.10, accuracy_val=0.556313993174061383, accuracy_test=0.665529010238907870
c=1.00, accuracy_val=0.556313993174061383, accuracy_test=0.665529010238907870
c=10.00, accuracy_val=0.556313993174061383, accuracy_test=0.665529010238907870
c=100.00, accuracy_val=0.556313993174061383, accuracy_test=0.665529010238907870
