In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data_url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(data_url)

df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [2]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [3]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [4]:
df.nunique()

lead_source                    5
industry                       7
number_of_courses_viewed      10
annual_income               1267
employment_status              4
location                       7
interaction_count             12
lead_score                   101
converted                      2
dtype: int64

In [5]:
categorical_cols = df.select_dtypes(include=['object']).columns
categorical_cols = list(categorical_cols)
categorical_cols

['lead_source', 'industry', 'employment_status', 'location']

In [6]:
numeric_cols = df._get_numeric_data().columns
numeric_cols = list(numeric_cols)
numeric_cols

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [7]:
## fill with 'NA' for missing values in categorical columns
for c in categorical_cols:
    df[c] = df[c].fillna('NA')
## fill with '0' for missing values in numerical columns
for c in numeric_cols:
    df[c] = df[c].fillna(0)

In [8]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [9]:
df.industry.mode()

0    retail
Name: industry, dtype: object

#### Q1: What is the most frequent observation (mode) for the column industry?
- retail

In [10]:
for c in numeric_cols:
    if c in ['interaction_count', 'number_of_courses_viewed', 'annual_income']:
        print(f"--------\n{c}\n--------")
        display(df[numeric_cols].corrwith(df[c]).round(4))

--------
number_of_courses_viewed
--------


number_of_courses_viewed    1.0000
annual_income               0.0098
interaction_count          -0.0236
lead_score                 -0.0049
converted                   0.4359
dtype: float64

--------
annual_income
--------


number_of_courses_viewed    0.0098
annual_income               1.0000
interaction_count           0.0270
lead_score                  0.0156
converted                   0.0531
dtype: float64

--------
interaction_count
--------


number_of_courses_viewed   -0.0236
annual_income               0.0270
interaction_count           1.0000
lead_score                  0.0099
converted                   0.3746
dtype: float64

#### Q2: What are the two features that have the biggest correlation?
- `annual_income` and `interaction_count` = `0.027036`

In [11]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size = .25, random_state=42)

In [12]:
df_train.reset_index(drop=True, inplace = True)
df_val.reset_index(drop=True, inplace = True)
df_test.reset_index(drop=True, inplace = True)
df_full_train.reset_index(drop=True, inplace= True)

In [13]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']
numeric_cols.remove('converted')

In [14]:
from sklearn.metrics import mutual_info_score
def mutual_info_converted_score(series):
    return mutual_info_score(series, df_full_train.converted)
mi = df_full_train[categorical_cols].apply(mutual_info_converted_score)
mi.sort_values(ascending=False).round(2)

lead_source          0.03
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

#### Q3: Which of these variables has the biggest mutual information score?
- `lead_source` = `0.03`

In [15]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse = False)

train_dict = df_train[categorical_cols + numeric_cols].to_dict(orient = 'records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_cols + numeric_cols].to_dict(orient = 'records')
X_val = dv.transform(val_dict)

In [16]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', C = 1.0, max_iter = 1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [17]:
y_pred = model.predict_proba(X_val)[: , 1] # [:,0] [p1,p2], p1 = !converted, p2 = converted
y_pred

array([0.61192163, 0.79982617, 0.53021343, 0.47131479, 0.57066131,
       0.44227168, 0.87127669, 0.84883115, 0.83290037, 0.61497801,
       0.54968026, 0.78153088, 0.69039785, 0.77017122, 0.5265944 ,
       0.91706425, 0.53170634, 0.42123048, 0.30146455, 0.84881583,
       0.79488653, 0.73670374, 0.4452721 , 0.64838383, 0.4176882 ,
       0.75393418, 0.90166116, 0.33903048, 0.43181431, 0.9680681 ,
       0.92018714, 0.37487988, 0.65230099, 0.90650057, 0.75164116,
       0.64202121, 0.82250075, 0.83375553, 0.659116  , 0.30978853,
       0.78942264, 0.35546365, 0.96517758, 0.63389304, 0.51274195,
       0.53230533, 0.82287785, 0.744074  , 0.73452313, 0.68955217,
       0.46964443, 0.84539252, 0.55635243, 0.92637871, 0.65258021,
       0.61526273, 0.63816995, 0.28304018, 0.48049824, 0.57890618,
       0.35497342, 0.62175051, 0.38960777, 0.61156056, 0.85304278,
       0.75430136, 0.89185954, 0.71946458, 0.95387623, 0.89209517,
       0.75277087, 0.33850139, 0.61376593, 0.51622275, 0.64088

In [18]:
converted_decision = (y_pred >= 0.5)
converted_decision

array([ True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True, False,  True, False,  True,  True,
       False, False,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True, False, False,  True, False,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True, False,  True,
       False,  True,  True, False,  True,  True, False,  True,  True,
       False,  True,

In [19]:
accuracy_with_all_col = (converted_decision == y_val).mean().round(3)
accuracy_with_all_col

np.float64(0.7)

#### Q4: What accuracy did you get?
- `0.7`

In [20]:
remove_features = ['industry','employment_status','lead_score']
for rf in remove_features:

    new_cat_cols= categorical_cols[:]
    new_num_cols = numeric_cols[:]
    
    if(rf == 'lead_score'):
        new_num_cols.remove(rf)
    else:
        new_cat_cols.remove(rf)
    
    dv = DictVectorizer(sparse = False)

    train_dict_1 = df_full_train[new_cat_cols + new_num_cols].to_dict(orient = 'records')
    X_train_1 = dv.fit_transform(train_dict)

    val_dict_1 = df_val[new_cat_cols + new_num_cols].to_dict(orient = 'records')
    X_val_1 = dv.transform(val_dict)

    model.fit(X_train_1, y_train)

    y_pred = model.predict_proba(X_val_1) [:, 1] #  [p1,p2], p1 = !converted, p2 = converted
    converted_decision = (y_pred >= 0.5)

    print(f"""Removed Col= `{rf}` --> accuracy_without_col = {(converted_decision == y_val).mean()} |
    (accuracy_with_all_col - accuracy_without_col) = {accuracy_with_all_col - ((converted_decision == y_val).mean())}""")

Removed Col= `industry` --> accuracy_without_col = 0.6996587030716723 |
    (accuracy_with_all_col - accuracy_without_col) = 0.0003412969283276279
Removed Col= `employment_status` --> accuracy_without_col = 0.6996587030716723 |
    (accuracy_with_all_col - accuracy_without_col) = 0.0003412969283276279
Removed Col= `lead_score` --> accuracy_without_col = 0.6996587030716723 |
    (accuracy_with_all_col - accuracy_without_col) = 0.0003412969283276279


#### Q5: Which of following feature has the smallest difference?
- industry

In [21]:
dv = DictVectorizer(sparse = False)

train_dict = df_full_train[categorical_cols + numeric_cols].to_dict(orient = 'records')
X_full_train = dv.fit_transform(train_dict)

test_dict = df_test[categorical_cols + numeric_cols].to_dict(orient = 'records')
X_test = dv.transform(test_dict)

y_full_train = df_full_train.converted.values

In [22]:
for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C = c, max_iter = 1000, random_state=42)
    model.fit(X_full_train, y_full_train)
    y_pred = model.predict_proba(X_test)[:, 1]
    converted_decision = (y_pred >= 0.5)
    accuracy = (converted_decision == y_test).mean()
    print(f"""for C= {c}, accuracy = {accuracy}""")

for C= 0.01, accuracy = 0.7337883959044369
for C= 0.1, accuracy = 0.7372013651877133
for C= 1, accuracy = 0.7372013651877133
for C= 10, accuracy = 0.7372013651877133
for C= 100, accuracy = 0.7372013651877133


#### Q6: Which of these `C` leads to the best accuracy on the validation set?
- 0.01