In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')
len(df)


In [None]:
# eda
df.dtypes

In [17]:
df.dtypes[df.dtypes == 'object'].index

Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')

In [18]:
for col in df.columns:
    print(col)
    print(df[col].head())
    print(df[col].unique())
    print(df[col].unique()[:5])
    print(df[col].nunique())
    print()
    

lead_source
0        paid_ads
1    social_media
2          events
3        paid_ads
4        referral
Name: lead_source, dtype: object
['paid_ads' 'social_media' 'events' 'referral' 'organic_search' 'NA']
['paid_ads' 'social_media' 'events' 'referral' 'organic_search']
6

industry
0            NA
1        retail
2    healthcare
3        retail
4     education
Name: industry, dtype: object
['NA' 'retail' 'healthcare' 'education' 'manufacturing' 'technology'
 'other' 'finance']
['NA' 'retail' 'healthcare' 'education' 'manufacturing']
8

number_of_courses_viewed
0    1
1    1
2    5
3    2
4    3
Name: number_of_courses_viewed, dtype: int64
[1 5 2 3 0 4 6 8 7 9]
[1 5 2 3 0]
10

annual_income
0    79450.0
1    46992.0
2    78796.0
3    83843.0
4    85012.0
Name: annual_income, dtype: float64
[79450. 46992. 78796. ... 45688. 71016. 92855.]
[79450. 46992. 78796. 83843. 85012.]
1268

employment_status
0       unemployed
1         employed
2       unemployed
3               NA
4    self_employ

In [19]:
categorical_columns = list(df.dtypes[df.dtypes =='object'].index)

In [20]:
categorical_columns

['lead_source', 'industry', 'employment_status', 'location']

In [21]:
# DATA PREPARATION
# We have annual_income of type float with nulls, replacing them with 0.0
df.isnull().sum()
#df = df['annual_income'].fillna(0.0)

#df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [54]:
df_prepared = df
df_prepared.annual_income = df_prepared.annual_income.fillna(0.0)

categorical_columns = list(df_prepared.dtypes[df_prepared.dtypes == 'object'].index)

for c in categorical_columns:    
    df_prepared[c] = df_prepared[c].fillna('NA')
    df_prepared[c] = df_prepared[c].str.lower().replace(' ', '_')

#df_prepared


In [76]:
df_prepared

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,na,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,na,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,na,5,71016.0,self_employed,north_america,0,0.25,1


In [55]:
df_prepared.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [56]:
# Q1 - Most frequesnt observstion in column industry. Answer - retail
df_prepared.industry.mode()

0    retail
Name: industry, dtype: object

In [57]:
# Q2 - correlation matrix - Answer: `annual_income and interaction_count`
numerical1 = ['interaction_count', 'number_of_courses_viewed' ]
numerical1

['interaction_count', 'number_of_courses_viewed']

In [58]:
df_prepared[numerical1].corrwith(df_prepared.lead_score)

interaction_count           0.009888
number_of_courses_viewed   -0.004879
dtype: float64

In [59]:
numerical2 = ['number_of_courses_viewed', 'annual_income' ]
numerical2

['number_of_courses_viewed', 'annual_income']

In [60]:
df_prepared[numerical2].corrwith(df_prepared.interaction_count)

number_of_courses_viewed   -0.023565
annual_income               0.027036
dtype: float64

In [61]:
# Q3 - data split

from sklearn.model_selection import train_test_split

df_train_full, df_test = train_test_split(df_prepared, test_size=0.2, random_state=42)

len(df_train_full), len(df_test)
    

(1169, 293)

In [62]:
#train_test_split?
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [63]:
y_train = df_train.converted.values
y_val = df_val.converted.values

# removing the converted filed from training data
del df_train['converted']
del df_val['converted']

In [64]:
df_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
1077,paid_ads,retail,0,58472.0,student,middle_east,5,0.03
463,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77
842,paid_ads,technology,3,81973.0,employed,north_america,2,0.59
835,na,technology,1,74956.0,employed,europe,3,0.34
837,organic_search,retail,3,59335.0,student,australia,1,0.98
...,...,...,...,...,...,...,...,...
725,organic_search,other,1,43907.0,employed,australia,4,0.33
401,social_media,retail,3,64969.0,employed,north_america,1,0.18
957,na,education,3,89042.0,employed,asia,4,0.75
992,social_media,manufacturing,1,0.0,self_employed,europe,1,0.65


In [65]:
df_test

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
892,social_media,manufacturing,2,56070.0,self_employed,middle_east,2,0.23,1
1106,na,other,1,78409.0,na,australia,4,0.79,0
413,referral,manufacturing,2,66206.0,employed,australia,3,0.30,1
522,events,retail,0,0.0,self_employed,north_america,2,0.98,0
1036,organic_search,retail,6,62832.0,unemployed,na,4,1.00,1
...,...,...,...,...,...,...,...,...,...
1362,referral,other,2,58981.0,student,europe,3,0.20,1
802,organic_search,education,1,79448.0,unemployed,north_america,4,0.38,0
651,na,education,5,66922.0,employed,europe,5,0.53,1
722,referral,healthcare,4,82306.0,self_employed,middle_east,3,0.25,1


In [66]:
df_val

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
662,paid_ads,healthcare,3,52220.0,unemployed,europe,1,0.07
600,organic_search,technology,3,59656.0,unemployed,middle_east,4,0.65
477,events,manufacturing,0,57134.0,self_employed,north_america,4,0.13
1057,events,other,0,0.0,na,asia,0,0.03
891,referral,retail,1,54103.0,unemployed,south_america,3,0.16
...,...,...,...,...,...,...,...,...
1367,social_media,healthcare,1,55222.0,self_employed,africa,1,0.25
1390,paid_ads,na,1,20326.0,employed,middle_east,3,0.81
419,organic_search,technology,1,74166.0,employed,south_america,2,0.01
114,organic_search,technology,2,39103.0,self_employed,africa,3,0.60


In [67]:
# Q3 - Calculate mutual information score between converted and other categorical vars in the training dataset
# round the scores to 2 decimals.
# Which of th vars has the biggest mutual information score? Answer: lead_source (0.04)

from sklearn.metrics import mutual_info_score


def calculate_mi(series):
    score = mutual_info_score(series, y_train)
    return round(score, 2)

fields = ['industry', 'location', 'lead_source', 'employment_status']

df_mi = df_train[fields].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')


display(df_mi.head())
# display(df_mi.tail())    
#mutual_info_score(df_train.industry, y_train)

Unnamed: 0,MI
lead_source,0.04
industry,0.01
employment_status,0.01
location,0.0


In [68]:
df_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
1077,paid_ads,retail,0,58472.0,student,middle_east,5,0.03
463,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77
842,paid_ads,technology,3,81973.0,employed,north_america,2,0.59
835,na,technology,1,74956.0,employed,europe,3,0.34
837,organic_search,retail,3,59335.0,student,australia,1,0.98
...,...,...,...,...,...,...,...,...
725,organic_search,other,1,43907.0,employed,australia,4,0.33
401,social_media,retail,3,64969.0,employed,north_america,1,0.18
957,na,education,3,89042.0,employed,asia,4,0.75
992,social_media,manufacturing,1,0.0,self_employed,europe,1,0.65


In [69]:
# Q4 - Train Logistic regression

from sklearn.feature_extraction import DictVectorizer

categorical = ['lead_source', 'industry', 'employment_status', 'location']

numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


train_dict = df_train[categorical + numerical].to_dict(orient='records')


train_dict[10]


{'lead_source': 'events',
 'industry': 'finance',
 'employment_status': 'unemployed',
 'location': 'asia',
 'number_of_courses_viewed': 0,
 'annual_income': 42104.0,
 'interaction_count': 2,
 'lead_score': 0.97}

In [78]:

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)



0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [79]:
X_train = dv.transform(train_dict)
X_train.shape

(876, 31)

In [80]:
X_train

array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [8.1973e+04, 1.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [8.9042e+04, 1.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00]], shape=(876, 31))

In [93]:

dv.vocabulary_(train_dict)
#dv.get_feature_names()


#dv?
#dv.get_feature_names_()
#dv.get_feature_names_()



TypeError: 'dict' object is not callable

In [90]:
dv?

[31mType:[39m        DictVectorizer
[31mString form:[39m DictVectorizer(sparse=False)
[31mFile:[39m        ~/.local/lib/python3.12/site-packages/sklearn/feature_extraction/_dict_vectorizer.py
[31mDocstring:[39m  
Transforms lists of feature-value mappings to vectors.

This transformer turns lists of mappings (dict-like objects) of feature
names to feature values into Numpy arrays or scipy.sparse matrices for use
with scikit-learn estimators.

When feature values are strings, this transformer will do a binary one-hot
(aka one-of-K) coding: one boolean-valued feature is constructed for each
of the possible string values that the feature can take on. For instance,
a feature "f" that can take on the values "ham" and "spam" will become two
features in the output, one signifying "f=ham", the other "f=spam".

If a feature value is a sequence or set of strings, this transformer
will iterate over the values and will count the occurrences of each string
value.

However, note that this tr

In [88]:
#dv?


train_dict = df_train[categorical].to_dict(orient='records')

train_dict[10]
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)


dv.get_feature_names



AttributeError: 'DictVectorizer' object has no attribute 'get_feature_names'

In [43]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [None]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

X_val

In [None]:
model.predict_proba(X_val)