In [2]:
# Import all Libraries

# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

# Importing Pandas and NumPy
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [3]:
# Data Import and Exploration
lead = pd.read_csv("Leads.csv")
lead.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Country,Specialization,How did you hear about X Education,What is your current occupation,What matters most to you in choosing a course,Search,Magazine,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,Receive More Updates About Our Courses,Tags,Lead Quality,Update me on Supply Chain Content,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,Page Visited on Website,,Select,Select,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Interested in other courses,Low in Relevance,No,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,Email Opened,India,Select,Select,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Ringing,,No,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,Email Opened,India,Business Administration,Select,Student,Better Career Prospects,No,No,No,No,No,No,No,No,Will revert after reading the email,Might be,No,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,Unreachable,India,Media and Advertising,Word Of Mouth,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Ringing,Not Sure,No,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,Converted to Lead,India,Select,Other,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Will revert after reading the email,Might be,No,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


In [4]:
# Create Student ID to help with interpretation of results from the model at the end
Student_ID = lead["Prospect ID"]
Student_ID.head()

0    7927b2df-8bba-4d29-b9a2-b6e0beafe620
1    2a272436-5132-4136-86fa-dcc88c88f482
2    8cc8c611-a219-4f35-ad23-fdfd2656bd8a
3    0cc2df48-7cf4-4e39-9de9-19797f9b38cc
4    3256f628-e534-4826-9d63-4a8b88782852
Name: Prospect ID, dtype: object

In [5]:
Student_ID = pd.DataFrame(Student_ID)
Student_ID.head()

Unnamed: 0,Prospect ID
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620
1,2a272436-5132-4136-86fa-dcc88c88f482
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc
4,3256f628-e534-4826-9d63-4a8b88782852


In [6]:
# Inspecting the Dataframe
lead.shape

(9240, 37)

In [7]:
lead.describe()

Unnamed: 0,Lead Number,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Asymmetrique Activity Score,Asymmetrique Profile Score
count,9240.0,9240.0,9103.0,9240.0,9103.0,5022.0,5022.0
mean,617188.435606,0.38539,3.445238,487.698268,2.36282,14.306252,16.344883
std,23405.995698,0.486714,4.854853,548.021466,2.161418,1.386694,1.811395
min,579533.0,0.0,0.0,0.0,0.0,7.0,11.0
25%,596484.5,0.0,1.0,12.0,1.0,14.0,15.0
50%,615479.0,0.0,3.0,248.0,2.0,14.0,16.0
75%,637387.25,1.0,5.0,936.0,3.0,15.0,18.0
max,660737.0,1.0,251.0,2272.0,55.0,18.0,20.0


In [8]:
lead.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
Prospect ID                                      9240 non-null object
Lead Number                                      9240 non-null int64
Lead Origin                                      9240 non-null object
Lead Source                                      9204 non-null object
Do Not Email                                     9240 non-null object
Do Not Call                                      9240 non-null object
Converted                                        9240 non-null int64
TotalVisits                                      9103 non-null float64
Total Time Spent on Website                      9240 non-null int64
Page Views Per Visit                             9103 non-null float64
Last Activity                                    9137 non-null object
Country                                          6779 non-null object
Specialization                                   7802 

In [9]:
# Finding Percentage of Missing Values
(lead.isnull().sum()/len(lead))*100

Prospect ID                                       0.000000
Lead Number                                       0.000000
Lead Origin                                       0.000000
Lead Source                                       0.389610
Do Not Email                                      0.000000
Do Not Call                                       0.000000
Converted                                         0.000000
TotalVisits                                       1.482684
Total Time Spent on Website                       0.000000
Page Views Per Visit                              1.482684
Last Activity                                     1.114719
Country                                          26.634199
Specialization                                   15.562771
How did you hear about X Education               23.885281
What is your current occupation                  29.112554
What matters most to you in choosing a course    29.318182
Search                                            0.0000

In [10]:
# Checking value count for "Country" and "City" levels
lead['Country'].value_counts()

India                   6492
United States             69
United Arab Emirates      53
Singapore                 24
Saudi Arabia              21
United Kingdom            15
Australia                 13
Qatar                     10
Hong Kong                  7
Bahrain                    7
France                     6
Oman                       6
unknown                    5
Kuwait                     4
South Africa               4
Nigeria                    4
Germany                    4
Canada                     4
Sweden                     3
Philippines                2
Uganda                     2
China                      2
Ghana                      2
Bangladesh                 2
Belgium                    2
Netherlands                2
Asia/Pacific Region        2
Italy                      2
Russia                     1
Liberia                    1
Tanzania                   1
Switzerland                1
Malaysia                   1
Kenya                      1
Denmark       

In [11]:
lead['City'].value_counts()

Mumbai                         3222
Select                         2249
Thane & Outskirts               752
Other Cities                    686
Other Cities of Maharashtra     457
Other Metro Cities              380
Tier II Cities                   74
Name: City, dtype: int64

In [12]:
lead['Lead Quality'].value_counts()

Might be             1560
Not Sure             1092
High in Relevance     637
Worst                 601
Low in Relevance      583
Name: Lead Quality, dtype: int64

In [13]:
lead['Last Activity'].value_counts()

Email Opened                    3437
SMS Sent                        2745
Olark Chat Conversation          973
Page Visited on Website          640
Converted to Lead                428
Email Bounced                    326
Email Link Clicked               267
Form Submitted on Website        116
Unreachable                       93
Unsubscribed                      61
Had a Phone Conversation          30
Approached upfront                 9
View in browser link Clicked       6
Email Received                     2
Email Marked Spam                  2
Visited Booth in Tradeshow         1
Resubscribed to emails             1
Name: Last Activity, dtype: int64

In [14]:
# Columns removed at this stage, and the reasoning for each:
# - "Country": the value counts are dominated by India, so the column carries almost no variance.
# - "City": the most frequent values are Mumbai followed by "Select"; judged not to add value.
# - "Prospect ID" and "Lead Number": both are unique identifiers (redundant with each other) and
#   therefore add nothing to the analysis.
# - "Last Notable Activity": redundant with "Last Activity"; "Last Activity" is kept because its
#   levels are well defined and informative.
# - "Lead Quality": a subjective, derived column.

lead = lead.drop(
    ["City", "Country", "Prospect ID", "Lead Number", "Lead Quality", "Last Notable Activity"],
    axis=1,
)

In [15]:
# Finding Percentage of Missing Values
(lead.isnull().sum()/len(lead))*100

Lead Origin                                       0.000000
Lead Source                                       0.389610
Do Not Email                                      0.000000
Do Not Call                                       0.000000
Converted                                         0.000000
TotalVisits                                       1.482684
Total Time Spent on Website                       0.000000
Page Views Per Visit                              1.482684
Last Activity                                     1.114719
Specialization                                   15.562771
How did you hear about X Education               23.885281
What is your current occupation                  29.112554
What matters most to you in choosing a course    29.318182
Search                                            0.000000
Magazine                                          0.000000
Newspaper Article                                 0.000000
X Education Forums                                0.0000

In [16]:
lead['Do Not Email'].value_counts()

No     8506
Yes     734
Name: Do Not Email, dtype: int64

In [17]:
lead['Do Not Call'].value_counts()

No     9238
Yes       2
Name: Do Not Call, dtype: int64

In [18]:
# Dropping further columns/features that do not add value

# The "Do Not Email" and "Do Not Call" columns are dominated by "No" values
# (734 and 2 "Yes" values respectively, out of 9240 rows).
# They are therefore treated as adding no value to the analysis and are removed.

In [19]:
# Remove the two contact-preference flags from the working frame
lead = lead.drop(['Do Not Email', 'Do Not Call'], axis=1)

In [20]:
lead['Specialization'].value_counts()/len(lead['Specialization'])*100

Select                               21.017316
Finance Management                   10.562771
Human Resource Management             9.177489
Marketing Management                  9.069264
Operations Management                 5.443723
Business Administration               4.361472
IT Projects Management                3.961039
Supply Chain Management               3.777056
Banking, Investment And Insurance     3.658009
Travel and Tourism                    2.196970
Media and Advertising                 2.196970
International Business                1.926407
Healthcare Management                 1.720779
Hospitality Management                1.233766
E-COMMERCE                            1.212121
Retail Management                     1.082251
Rural and Agribusiness                0.790043
E-Business                            0.616883
Services Excellence                   0.432900
Name: Specialization, dtype: float64

In [21]:
lead['Lead Profile'].value_counts()/len(lead['Lead Profile'])*100

Select                         44.870130
Potential Lead                 17.456710
Other Leads                     5.270563
Student of SomeSchool           2.608225
Lateral Student                 0.259740
Dual Specialization Student     0.216450
Name: Lead Profile, dtype: float64

In [22]:
lead['How did you hear about X Education'].value_counts()/len(lead['How did you hear about X Education'])*100

Select                   54.577922
Online Search             8.744589
Word Of Mouth             3.766234
Student of SomeSchool     3.354978
Other                     2.012987
Multiple Sources          1.645022
Advertisements            0.757576
Social Media              0.725108
Email                     0.281385
SMS                       0.248918
Name: How did you hear about X Education, dtype: float64

In [23]:
# A value of "Select" means the prospective student did not pick any option, so it is as good as a
# missing value. Turn it into NaN here so that these columns can be imputed later on.

cols = [
    "Specialization",
    "Lead Profile",
    "How did you hear about X Education",
]
lead[cols] = lead[cols].replace('Select', np.nan)

In [24]:
lead['What matters most to you in choosing a course'].value_counts()/len(lead['What matters most to you in choosing a course'])*100

Better Career Prospects      70.649351
Flexibility & Convenience     0.021645
Other                         0.010823
Name: What matters most to you in choosing a course, dtype: float64

In [25]:
# Nearly every lead that answered this question chose "Better Career Prospects", so the column is removed
lead = lead.drop(['What matters most to you in choosing a course'], axis=1)

In [26]:
lead['Search'].value_counts()/len(lead['Search'])*100

No     99.848485
Yes     0.151515
Name: Search, dtype: float64

In [27]:
lead['Magazine'].value_counts()/len(lead['Magazine'])*100

No    100.0
Name: Magazine, dtype: float64

In [28]:
lead['Newspaper Article'].value_counts()/len(lead['Newspaper Article'])*100

No     99.978355
Yes     0.021645
Name: Newspaper Article, dtype: float64

In [29]:
lead['X Education Forums'].value_counts()/len(lead['X Education Forums'])*100

No     99.989177
Yes     0.010823
Name: X Education Forums, dtype: float64

In [30]:
lead['Newspaper'].value_counts()/len(lead['Newspaper'])*100

No     99.989177
Yes     0.010823
Name: Newspaper, dtype: float64

In [31]:
lead['Digital Advertisement'].value_counts()/len(lead['Digital Advertisement'])*100

No     99.95671
Yes     0.04329
Name: Digital Advertisement, dtype: float64

In [32]:
lead['Through Recommendations'].value_counts()/len(lead['Through Recommendations'])*100

No     99.924242
Yes     0.075758
Name: Through Recommendations, dtype: float64

In [33]:
lead['Receive More Updates About Our Courses'].value_counts()/len(lead['Receive More Updates About Our Courses'])*100

No    100.0
Name: Receive More Updates About Our Courses, dtype: float64

In [34]:
lead['Update me on Supply Chain Content'].value_counts()/len(lead['Update me on Supply Chain Content'])*100

No    100.0
Name: Update me on Supply Chain Content, dtype: float64

In [35]:
lead['Get updates on DM Content'].value_counts()/len(lead['Get updates on DM Content'])*100

No    100.0
Name: Get updates on DM Content, dtype: float64

In [36]:
lead['I agree to pay the amount through cheque'].value_counts()/len(lead['I agree to pay the amount through cheque'])*100

No    100.0
Name: I agree to pay the amount through cheque, dtype: float64

In [37]:
lead['A free copy of Mastering The Interview'].value_counts()/len(lead['A free copy of Mastering The Interview'])*100

No     68.744589
Yes    31.255411
Name: A free copy of Mastering The Interview, dtype: float64

In [38]:
# The columns below hold a single value in roughly 99-100% of the rows. They offer no variety,
# and therefore no information, to the model, so they are removed.

lead = lead.drop(
    [
        'Search',
        'Magazine',
        'Newspaper Article',
        'X Education Forums',
        'Newspaper',
        'Digital Advertisement',
        'Through Recommendations',
        'Receive More Updates About Our Courses',
        'Update me on Supply Chain Content',
        'Get updates on DM Content',
        'I agree to pay the amount through cheque',
    ],
    axis=1,
)

In [39]:
lead.shape

(9240, 17)

In [40]:
# % of null values in each column
(lead.isnull().sum()/len(lead))*100

Lead Origin                                0.000000
Lead Source                                0.389610
Converted                                  0.000000
TotalVisits                                1.482684
Total Time Spent on Website                0.000000
Page Views Per Visit                       1.482684
Last Activity                              1.114719
Specialization                            36.580087
How did you hear about X Education        78.463203
What is your current occupation           29.112554
Tags                                      36.287879
Lead Profile                              74.188312
Asymmetrique Activity Index               45.649351
Asymmetrique Profile Index                45.649351
Asymmetrique Activity Score               45.649351
Asymmetrique Profile Score                45.649351
A free copy of Mastering The Interview     0.000000
dtype: float64

In [41]:
lead['What is your current occupation'].value_counts()/len(lead['What is your current occupation'])*100

Unemployed              60.606061
Working Professional     7.640693
Student                  2.272727
Other                    0.173160
Housewife                0.108225
Businessman              0.086580
Name: What is your current occupation, dtype: float64

In [42]:
lead['Tags'].value_counts()/len(lead['Tags'])*100

Will revert after reading the email                  22.424242
Ringing                                              13.019481
Interested in other courses                           5.551948
Already a student                                     5.032468
Closed by Horizzon                                    3.874459
switched off                                          2.597403
Busy                                                  2.012987
Lost to EINS                                          1.893939
Not doing further education                           1.569264
Interested  in full time MBA                          1.266234
Graduation in progress                                1.201299
invalid number                                        0.898268
Diploma holder (Not Eligible)                         0.681818
wrong number given                                    0.508658
opp hangup                                            0.357143
number not provided                                   0

In [43]:
lead['Asymmetrique Activity Index'].value_counts()/len(lead['Asymmetrique Activity Index'])*100

02.Medium    41.547619
01.High       8.885281
03.Low        3.917749
Name: Asymmetrique Activity Index, dtype: float64

In [44]:
lead['Asymmetrique Profile Index'].value_counts()/len(lead['Asymmetrique Profile Index'])*100

02.Medium    30.173160
01.High      23.841991
03.Low        0.335498
Name: Asymmetrique Profile Index, dtype: float64

In [45]:
lead['Asymmetrique Activity Score'].value_counts()/len(lead['Asymmetrique Activity Score'])*100

14.0    19.166667
15.0    13.993506
13.0     8.387446
16.0     5.054113
17.0     3.777056
12.0     2.121212
11.0     1.028139
10.0     0.616883
9.0      0.097403
18.0     0.054113
8.0      0.043290
7.0      0.010823
Name: Asymmetrique Activity Score, dtype: float64

In [46]:
lead['Asymmetrique Profile Score'].value_counts()/len(lead['Asymmetrique Profile Score'])*100

15.0    19.036797
18.0    11.590909
16.0     6.482684
17.0     6.266234
20.0     3.333333
19.0     2.651515
14.0     2.445887
13.0     2.207792
12.0     0.238095
11.0     0.097403
Name: Asymmetrique Profile Score, dtype: float64

In [47]:
# Converting binary values (yes/no to 1/0)

varlist =  ['A free copy of Mastering The Interview']

# Defining the map function
def binary_map(x):
    """Return the Series `x` with 'Yes' replaced by 1 and 'No' replaced by 0.

    Values with no entry in the lookup (anything other than 'Yes'/'No') come back as NaN.
    """
    yes_no_codes = {'Yes': 1, "No": 0}
    return x.map(yes_no_codes)

# Applying the function 
lead[varlist] = lead[varlist].apply(binary_map)

In [48]:
lead.shape

(9240, 17)

In [49]:
lead['Last Activity'].value_counts()

Email Opened                    3437
SMS Sent                        2745
Olark Chat Conversation          973
Page Visited on Website          640
Converted to Lead                428
Email Bounced                    326
Email Link Clicked               267
Form Submitted on Website        116
Unreachable                       93
Unsubscribed                      61
Had a Phone Conversation          30
Approached upfront                 9
View in browser link Clicked       6
Email Received                     2
Email Marked Spam                  2
Visited Booth in Tradeshow         1
Resubscribed to emails             1
Name: Last Activity, dtype: int64

In [50]:
lead['Lead Source'].value_counts()

Google               2868
Direct Traffic       2543
Olark Chat           1755
Organic Search       1154
Reference             534
Welingak Website      142
Referral Sites        125
Facebook               55
bing                    6
google                  5
Click2call              4
Social Media            2
Live Chat               2
Press_Release           2
NC_EDM                  1
Pay per Click Ads       1
blog                    1
welearnblog_Home        1
WeLearn                 1
youtubechannel          1
testone                 1
Name: Lead Source, dtype: int64

In [51]:
# Replace google with Google so the lower-case label is counted together with "Google".
# The result is assigned back to the column instead of calling replace(inplace=True) on
# lead['Lead Source']: that chained in-place form acts on a temporary object under pandas
# Copy-on-Write and would leave `lead` unchanged.
lead['Lead Source'] = lead['Lead Source'].replace("google", 'Google')

In [52]:
# Strip the numeric prefix from the index labels: 01.High -> High, 02.Medium -> Medium, 03.Low -> Low
index_labels = {'01.High': 'High', "02.Medium": 'Medium', '03.Low': 'Low'}
for index_col in ['Asymmetrique Profile Index', 'Asymmetrique Activity Index']:
    lead[index_col] = lead[index_col].map(index_labels)

In [53]:
# Imputing Null values

In [54]:
# Next we impute these NaN values (the former "Select" entries) with each column's most frequent level.
# The filled Series is assigned back to the frame rather than calling fillna(inplace=True) on
# lead[column]: that chained in-place form acts on a temporary object under pandas Copy-on-Write
# and would leave `lead` unchanged.
# NOTE(review): per the null-percentage output above, "Lead Profile" and
# "How did you hear about X Education" are roughly 74% and 78% missing at this point, so filling
# them with the mode makes them near-constant. A separate "Unknown" level, or dropping the
# columns, may be preferable - confirm.

for column in ["Specialization","Lead Profile","How did you hear about X Education"]:
    lead[column] = lead[column].fillna(lead[column].mode()[0])

In [55]:

# Next we impute the NaN values of the remaining categorical columns - 'What is your current occupation',
# 'Tags', 'Asymmetrique Activity Index', 'Asymmetrique Profile Index' and 'Last Activity' - with each
# column's most frequent level.
# The filled Series is assigned back to the frame rather than calling fillna(inplace=True) on
# lead[column]: that chained in-place form acts on a temporary object under pandas Copy-on-Write
# and would leave `lead` unchanged.

for column in ['What is your current occupation','Tags','Asymmetrique Activity Index','Asymmetrique Profile Index','Last Activity']:
    lead[column] = lead[column].fillna(lead[column].mode()[0])

In [56]:
# Column means, rounded to two decimals, used to fill the numeric gaps:
# m1 -> TotalVisits, m2 -> Page Views Per Visit,
# m3 -> Asymmetrique Activity Score, m4 -> Asymmetrique Profile Score
m1, m2, m3, m4 = (
    round(lead[numeric_col].mean(), 2)
    for numeric_col in (
        'TotalVisits',
        'Page Views Per Visit',
        'Asymmetrique Activity Score',
        'Asymmetrique Profile Score',
    )
)

In [57]:
# Next we impute the NaN values for numerical columns with the rounded column means m1..m4.
# Each filled Series is assigned back to the frame rather than calling fillna(inplace=True) on
# the selected column: that chained in-place form acts on a temporary object under pandas
# Copy-on-Write and would leave `lead` unchanged.

lead['TotalVisits'] = lead['TotalVisits'].fillna(m1)
lead['Page Views Per Visit'] = lead['Page Views Per Visit'].fillna(m2)
lead['Asymmetrique Activity Score'] = lead['Asymmetrique Activity Score'].fillna(m3)
lead['Asymmetrique Profile Score'] = lead['Asymmetrique Profile Score'].fillna(m4)

In [58]:
lead.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 17 columns):
Lead Origin                               9240 non-null object
Lead Source                               9204 non-null object
Converted                                 9240 non-null int64
TotalVisits                               9240 non-null float64
Total Time Spent on Website               9240 non-null int64
Page Views Per Visit                      9240 non-null float64
Last Activity                             9240 non-null object
Specialization                            9240 non-null object
How did you hear about X Education        9240 non-null object
What is your current occupation           9240 non-null object
Tags                                      9240 non-null object
Lead Profile                              9240 non-null object
Asymmetrique Activity Index               9240 non-null object
Asymmetrique Profile Index                9240 non-null object
Asymmetriqu

In [59]:
# Create Dummy Variable

In [60]:
# Creating a dummy variable for some of the categorical variables and appending the indicator
# columns to the original frame.
# dtype is pinned to uint8 so the indicator columns stay numeric on every pandas version;
# pandas >= 2.0 would otherwise produce bool columns.
# NOTE(review): "Lead Source" still contains missing values at this point (9204 of 9240 non-null in
# the info() output above); get_dummies leaves all of that variable's indicators at 0 for such rows.
dummy1 = pd.get_dummies(lead[['Lead Origin', 'Lead Source', 'Last Activity', 'Specialization','How did you hear about X Education',
                             'What is your current occupation','Tags','Lead Profile','Asymmetrique Activity Index','Asymmetrique Profile Index']],
                        dtype='uint8')
lead_df = pd.concat([lead, dummy1], axis=1)

In [61]:
lead_df.head()

Unnamed: 0,Lead Origin,Lead Source,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Specialization,How did you hear about X Education,What is your current occupation,Tags,Lead Profile,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,A free copy of Mastering The Interview,Lead Origin_API,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Origin_Quick Add Form,Lead Source_Click2call,Lead Source_Direct Traffic,Lead Source_Facebook,Lead Source_Google,Lead Source_Live Chat,Lead Source_NC_EDM,Lead Source_Olark Chat,Lead Source_Organic Search,Lead Source_Pay per Click Ads,Lead Source_Press_Release,Lead Source_Reference,Lead Source_Referral Sites,Lead Source_Social Media,Lead Source_WeLearn,Lead Source_Welingak Website,Lead Source_bing,Lead Source_blog,Lead Source_testone,Lead Source_welearnblog_Home,Lead Source_youtubechannel,Last Activity_Approached upfront,Last Activity_Converted to Lead,Last Activity_Email Bounced,Last Activity_Email Link Clicked,Last Activity_Email Marked Spam,Last Activity_Email Opened,Last Activity_Email Received,Last Activity_Form Submitted on Website,Last Activity_Had a Phone Conversation,Last Activity_Olark Chat Conversation,Last Activity_Page Visited on Website,Last Activity_Resubscribed to emails,Last Activity_SMS Sent,Last Activity_Unreachable,Last Activity_Unsubscribed,Last Activity_View in browser link Clicked,Last Activity_Visited Booth in Tradeshow,"Specialization_Banking, Investment And Insurance",Specialization_Business Administration,Specialization_E-Business,Specialization_E-COMMERCE,Specialization_Finance Management,Specialization_Healthcare Management,Specialization_Hospitality Management,Specialization_Human Resource Management,Specialization_IT Projects Management,Specialization_International Business,Specialization_Marketing Management,Specialization_Media and 
Advertising,Specialization_Operations Management,Specialization_Retail Management,Specialization_Rural and Agribusiness,Specialization_Services Excellence,Specialization_Supply Chain Management,Specialization_Travel and Tourism,How did you hear about X Education_Advertisements,How did you hear about X Education_Email,How did you hear about X Education_Multiple Sources,How did you hear about X Education_Online Search,How did you hear about X Education_Other,How did you hear about X Education_SMS,How did you hear about X Education_Social Media,How did you hear about X Education_Student of SomeSchool,How did you hear about X Education_Word Of Mouth,What is your current occupation_Businessman,What is your current occupation_Housewife,What is your current occupation_Other,What is your current occupation_Student,What is your current occupation_Unemployed,What is your current occupation_Working Professional,Tags_Already a student,Tags_Busy,Tags_Closed by Horizzon,Tags_Diploma holder (Not Eligible),Tags_Graduation in progress,Tags_In confusion whether part time or DLP,Tags_Interested in full time MBA,Tags_Interested in Next batch,Tags_Interested in other courses,Tags_Lateral student,Tags_Lost to EINS,Tags_Lost to Others,Tags_Not doing further education,Tags_Recognition issue (DEC approval),Tags_Ringing,Tags_Shall take in the next coming month,Tags_Still Thinking,Tags_University not recognized,Tags_Want to take admission but has financial problems,Tags_Will revert after reading the email,Tags_in touch with EINS,Tags_invalid number,Tags_number not provided,Tags_opp hangup,Tags_switched off,Tags_wrong number given,Lead Profile_Dual Specialization Student,Lead Profile_Lateral Student,Lead Profile_Other Leads,Lead Profile_Potential Lead,Lead Profile_Student of SomeSchool,Asymmetrique Activity Index_High,Asymmetrique Activity Index_Low,Asymmetrique Activity Index_Medium,Asymmetrique Profile Index_High,Asymmetrique Profile Index_Low,Asymmetrique Profile Index_Medium
0,API,Olark Chat,0,0.0,0,0.0,Page Visited on Website,Finance Management,Online Search,Unemployed,Interested in other courses,Potential Lead,Medium,Medium,15.0,15.0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1
1,API,Organic Search,0,5.0,674,2.5,Email Opened,Finance Management,Online Search,Unemployed,Ringing,Potential Lead,Medium,Medium,15.0,15.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1
2,Landing Page Submission,Direct Traffic,1,2.0,1532,2.0,Email Opened,Business Administration,Online Search,Student,Will revert after reading the email,Potential Lead,Medium,High,14.0,20.0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0
3,Landing Page Submission,Direct Traffic,0,1.0,305,1.0,Unreachable,Media and Advertising,Word Of Mouth,Unemployed,Ringing,Potential Lead,Medium,High,13.0,17.0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0
4,Landing Page Submission,Google,1,2.0,1428,1.0,Converted to Lead,Finance Management,Other,Unemployed,Will revert after reading the email,Potential Lead,Medium,High,15.0,18.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0


In [62]:
# The original categorical columns are no longer needed once their dummy columns exist
lead_df = lead_df.drop(
    [
        'Lead Origin',
        'Lead Source',
        'Last Activity',
        'Specialization',
        'How did you hear about X Education',
        'What is your current occupation',
        'Tags',
        'Lead Profile',
        'Asymmetrique Profile Index',
        'Asymmetrique Activity Index',
    ],
    axis=1,
)

# For each encoded variable, drop one dummy column: the one with the least number of records
lead_df = lead_df.drop(
    [
        'Lead Origin_Quick Add Form',
        'Lead Source_testone',
        'Last Activity_Visited Booth in Tradeshow',
        'Tags_Recognition issue (DEC approval)',
        'Specialization_Services Excellence',
        'What is your current occupation_Businessman',
        'How did you hear about X Education_SMS',
        'Lead Profile_Dual Specialization Student',
        'Asymmetrique Profile Index_Low',
        'Asymmetrique Activity Index_Low',
    ],
    axis=1,
)

In [63]:
lead_df.info(verbose=True)
# The datatype of all columns is numeric.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 109 columns):
Converted                                                   int64
TotalVisits                                                 float64
Total Time Spent on Website                                 int64
Page Views Per Visit                                        float64
Asymmetrique Activity Score                                 float64
Asymmetrique Profile Score                                  float64
A free copy of Mastering The Interview                      int64
Lead Origin_API                                             uint8
Lead Origin_Landing Page Submission                         uint8
Lead Origin_Lead Add Form                                   uint8
Lead Origin_Lead Import                                     uint8
Lead Source_Click2call                                      uint8
Lead Source_Direct Traffic                                  uint8
Lead Source_Facebook      

In [64]:
# Data Pre-Processing

In [65]:
# Share of leads that got converted, expressed as a percentage
converted_flags = lead_df['Converted']
conversion_rate = (converted_flags.sum() / len(converted_flags)) * 100
conversion_rate

38.53896103896104

In [66]:
# Target vector: the 'Converted' column
y = lead_df.loc[:, 'Converted']
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Converted, dtype: int64

In [67]:
# Feature matrix: every column of lead_df other than the target
X = lead_df.drop(columns='Converted')

In [68]:
# 70/30 train/test split; the fixed random_state keeps the split reproducible
split_parts = train_test_split(
    X, y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)
X_train, X_test, y_train, y_test = split_parts

In [69]:
# (rows, columns) of the test split first, then of the train split
for split_features in (X_test, X_train):
    print(split_features.shape)

(2772, 108)
(6468, 108)


In [70]:
# Scaling data
scaler = StandardScaler()

# Continuous features that get standardised; the 0/1 dummy columns are left untouched
num_cols = ['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit',
            'Asymmetrique Activity Score', 'Asymmetrique Profile Score']

# Work on an explicit copy so the assignment below writes to an independent frame
# rather than to something pandas may treat as a slice of X.
X_train = X_train.copy()

# Cast to float up front: 'Total Time Spent on Website' is int64, and letting the
# scaler convert it implicitly is what produced the conversion warnings.
# The scaler is fitted on the training rows only.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols].astype('float64'))
X_train.head()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Asymmetrique Activity Score,Asymmetrique Profile Score,A free copy of Mastering The Interview,Lead Origin_API,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Source_Click2call,Lead Source_Direct Traffic,Lead Source_Facebook,Lead Source_Google,Lead Source_Live Chat,Lead Source_NC_EDM,Lead Source_Olark Chat,Lead Source_Organic Search,Lead Source_Pay per Click Ads,Lead Source_Press_Release,Lead Source_Reference,Lead Source_Referral Sites,Lead Source_Social Media,Lead Source_WeLearn,Lead Source_Welingak Website,Lead Source_bing,Lead Source_blog,Lead Source_welearnblog_Home,Lead Source_youtubechannel,Last Activity_Approached upfront,Last Activity_Converted to Lead,Last Activity_Email Bounced,Last Activity_Email Link Clicked,Last Activity_Email Marked Spam,Last Activity_Email Opened,Last Activity_Email Received,Last Activity_Form Submitted on Website,Last Activity_Had a Phone Conversation,Last Activity_Olark Chat Conversation,Last Activity_Page Visited on Website,Last Activity_Resubscribed to emails,Last Activity_SMS Sent,Last Activity_Unreachable,Last Activity_Unsubscribed,Last Activity_View in browser link Clicked,"Specialization_Banking, Investment And Insurance",Specialization_Business Administration,Specialization_E-Business,Specialization_E-COMMERCE,Specialization_Finance Management,Specialization_Healthcare Management,Specialization_Hospitality Management,Specialization_Human Resource Management,Specialization_IT Projects Management,Specialization_International Business,Specialization_Marketing Management,Specialization_Media and Advertising,Specialization_Operations Management,Specialization_Retail Management,Specialization_Rural and Agribusiness,Specialization_Supply Chain Management,Specialization_Travel and Tourism,How did you hear about X Education_Advertisements,How did you hear about X Education_Email,How did you hear about X Education_Multiple 
Sources,How did you hear about X Education_Online Search,How did you hear about X Education_Other,How did you hear about X Education_Social Media,How did you hear about X Education_Student of SomeSchool,How did you hear about X Education_Word Of Mouth,What is your current occupation_Housewife,What is your current occupation_Other,What is your current occupation_Student,What is your current occupation_Unemployed,What is your current occupation_Working Professional,Tags_Already a student,Tags_Busy,Tags_Closed by Horizzon,Tags_Diploma holder (Not Eligible),Tags_Graduation in progress,Tags_In confusion whether part time or DLP,Tags_Interested in full time MBA,Tags_Interested in Next batch,Tags_Interested in other courses,Tags_Lateral student,Tags_Lost to EINS,Tags_Lost to Others,Tags_Not doing further education,Tags_Ringing,Tags_Shall take in the next coming month,Tags_Still Thinking,Tags_University not recognized,Tags_Want to take admission but has financial problems,Tags_Will revert after reading the email,Tags_in touch with EINS,Tags_invalid number,Tags_number not provided,Tags_opp hangup,Tags_switched off,Tags_wrong number given,Lead Profile_Lateral Student,Lead Profile_Other Leads,Lead Profile_Potential Lead,Lead Profile_Student of SomeSchool,Asymmetrique Activity Index_High,Asymmetrique Activity Index_Medium,Asymmetrique Profile Index_High,Asymmetrique Profile Index_Medium
1871,-0.657777,-0.885371,-1.088285,0.673271,-1.012828,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1
6795,0.099469,0.005716,-0.473212,-0.003209,-0.008618,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1
3516,0.288781,-0.691418,0.067867,-0.003209,-0.008618,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1
8105,0.288781,1.365219,1.224019,-0.307135,-1.012828,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1
3934,-0.657777,-0.885371,-1.088285,-0.003209,-0.008618,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1


In [71]:
# Correlation matrix 
# Pairwise correlations between all columns of lead_df (DataFrame.corr defaults).
# NOTE(review): this is computed on the full lead_df, i.e. it includes 'Converted'
# and is not restricted to the training rows.
cor=lead_df.corr()

In [72]:
# Fill diagonal values in correlation matrix with NA values, so a column's
# correlation with itself is not picked up by a threshold check.
# Fill a private copy of the values and rebuild the frame: writing through
# `cor.values` only works while pandas hands back a writable view of the data,
# which it does not guarantee (e.g. the array is read-only under Copy-on-Write).
cor_values = cor.values.copy()
np.fill_diagonal(cor_values, np.nan)
cor = pd.DataFrame(cor_values, index=cor.index, columns=cor.columns)

In [73]:
# Columns whose correlation with at least one other column is above 0.95
high_corr_mask = (cor > 0.95).any(axis=0)
to_drop = list(high_corr_mask[high_corr_mask].index)
to_drop

['Lead Origin_Lead Import', 'Lead Source_Facebook']

In [74]:
# 'Lead Origin_Lead Import' and 'Lead Source_Facebook' are the two flagged columns;
# they are highly correlated with each other, so either one of them can be dropped.
# Show their correlations with all columns.
cor.loc[:, to_drop]

Unnamed: 0,Lead Origin_Lead Import,Lead Source_Facebook
Converted,-0.023695,-0.023695
TotalVisits,-0.028817,-0.027649
Total Time Spent on Website,-0.034926,-0.03393
Page Views Per Visit,-0.042836,-0.040212
Asymmetrique Activity Score,-0.052217,-0.05179
Asymmetrique Profile Score,-0.122064,-0.120652
A free copy of Mastering The Interview,-0.052178,-0.052178
Lead Origin_API,-0.061542,-0.061542
Lead Origin_Landing Page Submission,-0.081974,-0.079155
Lead Origin_Lead Add Form,-0.022461,-0.022461


In [75]:
# Remove 'Lead Origin_Lead Import' from both splits so train and test keep the same columns
for split_frame in (X_train, X_test):
    split_frame.drop(columns=['Lead Origin_Lead Import'], inplace=True)

In [76]:
# Build Logistic Regression – first iteration, using every feature plus an intercept
X_train_full_sm = sm.add_constant(X_train)
logm1 = sm.GLM(endog=y_train, exog=X_train_full_sm, family=sm.families.Binomial())
res = logm1.fit()
res.summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,6468
Model:,GLM,Df Residuals:,6360
Model Family:,Binomial,Df Model:,107
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-1607.3
Date:,"Mon, 26 Aug 2019",Deviance:,3214.5
Time:,11:39:05,Pearson chi2:,1.63e+04
No. Iterations:,24,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,30.3111,3.12e+05,9.73e-05,1.000,-6.11e+05,6.11e+05
TotalVisits,0.1461,0.039,3.794,0.000,0.071,0.222
Total Time Spent on Website,1.1455,0.056,20.497,0.000,1.036,1.255
Page Views Per Visit,-0.1690,0.066,-2.568,0.010,-0.298,-0.040
Asymmetrique Activity Score,1.3644,0.112,12.222,0.000,1.146,1.583
Asymmetrique Profile Score,0.5090,0.090,5.637,0.000,0.332,0.686
A free copy of Mastering The Interview,-0.0716,0.154,-0.466,0.641,-0.373,0.229
Lead Origin_API,-22.6938,2.98e+04,-0.001,0.999,-5.84e+04,5.83e+04
Lead Origin_Landing Page Submission,-23.7152,2.98e+04,-0.001,0.999,-5.84e+04,5.83e+04


In [77]:
# Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

# Recursive feature elimination down to 15 features.
# n_features_to_select is passed by keyword: current scikit-learn releases no
# longer accept it positionally, and the keyword form also works on older ones.
rfe = RFE(logreg, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

# View the ranking of all variables (rank 1 = selected)
list(zip(X_train.columns, rfe.ranking_))

[('TotalVisits', 69),
 ('Total Time Spent on Website', 17),
 ('Page Views Per Visit', 68),
 ('Asymmetrique Activity Score', 14),
 ('Asymmetrique Profile Score', 36),
 ('A free copy of Mastering The Interview', 87),
 ('Lead Origin_API', 90),
 ('Lead Origin_Landing Page Submission', 31),
 ('Lead Origin_Lead Add Form', 1),
 ('Lead Source_Click2call', 89),
 ('Lead Source_Direct Traffic', 55),
 ('Lead Source_Facebook', 52),
 ('Lead Source_Google', 57),
 ('Lead Source_Live Chat', 80),
 ('Lead Source_NC_EDM', 49),
 ('Lead Source_Olark Chat', 67),
 ('Lead Source_Organic Search', 58),
 ('Lead Source_Pay per Click Ads', 92),
 ('Lead Source_Press_Release', 22),
 ('Lead Source_Reference', 85),
 ('Lead Source_Referral Sites', 35),
 ('Lead Source_Social Media', 29),
 ('Lead Source_WeLearn', 43),
 ('Lead Source_Welingak Website', 12),
 ('Lead Source_bing', 83),
 ('Lead Source_blog', 54),
 ('Lead Source_welearnblog_Home', 56),
 ('Lead Source_youtubechannel', 60),
 ('Last Activity_Approached upfront', 

In [78]:
# Names of the features RFE kept (its boolean support mask applied to the columns)
col = X_train.loc[:, rfe.support_].columns
col

Index(['Lead Origin_Lead Add Form', 'Last Activity_Email Bounced',
       'Last Activity_Had a Phone Conversation',
       'Last Activity_Olark Chat Conversation', 'Last Activity_SMS Sent',
       'What is your current occupation_Working Professional',
       'Tags_Already a student', 'Tags_Busy', 'Tags_Closed by Horizzon',
       'Tags_Interested  in full time MBA', 'Tags_Lost to EINS',
       'Tags_Ringing', 'Tags_Will revert after reading the email',
       'Tags_switched off', 'Lead Profile_Potential Lead'],
      dtype='object')

In [79]:
# Re-run the model using RFE variables

# Design matrix: the RFE-selected features plus an intercept column
rfe_features = X_train[col]
X_train_sm = sm.add_constant(rfe_features)
logm2 = sm.GLM(endog=y_train, exog=X_train_sm, family=sm.families.Binomial())
res = logm2.fit()
res.summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,6468
Model:,GLM,Df Residuals:,6452
Model Family:,Binomial,Df Model:,15
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2196.4
Date:,"Mon, 26 Aug 2019",Deviance:,4392.9
Time:,11:39:09,Pearson chi2:,1.37e+04
No. Iterations:,9,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.3159,0.222,-10.429,0.000,-2.751,-1.881
Lead Origin_Lead Add Form,2.3791,0.213,11.153,0.000,1.961,2.797
Last Activity_Email Bounced,-1.6687,0.278,-5.995,0.000,-2.214,-1.123
Last Activity_Had a Phone Conversation,2.1490,0.711,3.022,0.003,0.755,3.543
Last Activity_Olark Chat Conversation,-1.6259,0.165,-9.862,0.000,-1.949,-1.303
Last Activity_SMS Sent,1.7694,0.085,20.747,0.000,1.602,1.937
What is your current occupation_Working Professional,2.8925,0.236,12.268,0.000,2.430,3.355
Tags_Already a student,-2.1305,0.738,-2.885,0.004,-3.578,-0.683
Tags_Busy,2.8043,0.279,10.064,0.000,2.258,3.350


In [80]:
# Calculate the VIFs for the model
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One VIF per column of the current design matrix (the 'const' column included)
design_matrix = X_train_sm.values
vif_scores = [variance_inflation_factor(design_matrix, col_idx)
              for col_idx in range(design_matrix.shape[1])]

vif = pd.DataFrame({'Features': X_train_sm.columns, 'VIF': vif_scores})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

Unnamed: 0,Features,VIF
0,const,19.35
13,Tags_Will revert after reading the email,2.65
12,Tags_Ringing,1.93
9,Tags_Closed by Horizzon,1.48
7,Tags_Already a student,1.42
14,Tags_switched off,1.22
15,Lead Profile_Potential Lead,1.21
1,Lead Origin_Lead Add Form,1.2
5,Last Activity_SMS Sent,1.19
8,Tags_Busy,1.17


In [81]:
# Remove this dummy from the design matrix before refitting.
# The double space in the label is intentional: it matches the column name.
# NOTE(review): the reason for dropping it is not visible here — presumably a high p-value; confirm.
X_train_sm.drop(columns=['Tags_Interested  in full time MBA'], inplace=True)

In [82]:
# Refit the binomial GLM on the reduced design matrix and show its summary.
# The fitted result is only displayed here; `res` is not reassigned in this cell.
logm3 = sm.GLM(endog=y_train, exog=X_train_sm, family=sm.families.Binomial())
logm3.fit().summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,6468
Model:,GLM,Df Residuals:,6453
Model Family:,Binomial,Df Model:,14
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2198.4
Date:,"Mon, 26 Aug 2019",Deviance:,4396.8
Time:,11:39:09,Pearson chi2:,1.36e+04
No. Iterations:,9,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.4483,0.216,-11.338,0.000,-2.871,-2.025
Lead Origin_Lead Add Form,2.3855,0.214,11.168,0.000,1.967,2.804
Last Activity_Email Bounced,-1.6646,0.278,-5.980,0.000,-2.210,-1.119
Last Activity_Had a Phone Conversation,2.1544,0.714,3.019,0.003,0.756,3.553
Last Activity_Olark Chat Conversation,-1.6248,0.165,-9.864,0.000,-1.948,-1.302
Last Activity_SMS Sent,1.7686,0.085,20.742,0.000,1.601,1.936
What is your current occupation_Working Professional,2.8503,0.231,12.343,0.000,2.398,3.303
Tags_Already a student,-2.0022,0.737,-2.717,0.007,-3.446,-0.558
Tags_Busy,2.9134,0.277,10.523,0.000,2.371,3.456


In [83]:
# Calculate the VIFs for the model (design matrix as used by logm3)
from statsmodels.stats.outliers_influence import variance_inflation_factor

design_matrix = X_train_sm.values
vif_scores = [variance_inflation_factor(design_matrix, col_idx)
              for col_idx in range(design_matrix.shape[1])]

vif = pd.DataFrame({'Features': X_train_sm.columns, 'VIF': vif_scores})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

Unnamed: 0,Features,VIF
0,const,18.05
12,Tags_Will revert after reading the email,2.48
11,Tags_Ringing,1.84
9,Tags_Closed by Horizzon,1.45
7,Tags_Already a student,1.37
14,Lead Profile_Potential Lead,1.21
1,Lead Origin_Lead Add Form,1.2
13,Tags_switched off,1.2
5,Last Activity_SMS Sent,1.19
8,Tags_Busy,1.15


In [84]:
# Drop 'Tags_switched off' from the design matrix, then refit and show the summary.
# The fitted result is only displayed here; `res` is not reassigned in this cell.
X_train_sm.drop(columns=['Tags_switched off'], inplace=True)
logm4 = sm.GLM(endog=y_train, exog=X_train_sm, family=sm.families.Binomial())
logm4.fit().summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,6468
Model:,GLM,Df Residuals:,6454
Model Family:,Binomial,Df Model:,13
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2200.6
Date:,"Mon, 26 Aug 2019",Deviance:,4401.3
Time:,11:39:10,Pearson chi2:,1.33e+04
No. Iterations:,9,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.5887,0.213,-12.178,0.000,-3.005,-2.172
Lead Origin_Lead Add Form,2.3965,0.214,11.183,0.000,1.976,2.816
Last Activity_Email Bounced,-1.6658,0.279,-5.975,0.000,-2.212,-1.119
Last Activity_Had a Phone Conversation,2.1621,0.718,3.010,0.003,0.754,3.570
Last Activity_Olark Chat Conversation,-1.6244,0.165,-9.855,0.000,-1.948,-1.301
Last Activity_SMS Sent,1.7573,0.085,20.685,0.000,1.591,1.924
What is your current occupation_Working Professional,2.8670,0.232,12.344,0.000,2.412,3.322
Tags_Already a student,-1.8491,0.735,-2.515,0.012,-3.290,-0.408
Tags_Busy,3.1004,0.267,11.605,0.000,2.577,3.624


In [85]:
# Calculate the VIFs for the model (design matrix as used by logm4)
from statsmodels.stats.outliers_influence import variance_inflation_factor

design_matrix = X_train_sm.values
vif_scores = [variance_inflation_factor(design_matrix, col_idx)
              for col_idx in range(design_matrix.shape[1])]

vif = pd.DataFrame({'Features': X_train_sm.columns, 'VIF': vif_scores})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

Unnamed: 0,Features,VIF
0,const,17.16
12,Tags_Will revert after reading the email,2.1
11,Tags_Ringing,1.66
9,Tags_Closed by Horizzon,1.4
7,Tags_Already a student,1.32
1,Lead Origin_Lead Add Form,1.2
13,Lead Profile_Potential Lead,1.2
5,Last Activity_SMS Sent,1.18
8,Tags_Busy,1.12
10,Tags_Lost to EINS,1.1


In [86]:
# Drop 'Tags_Ringing' from the design matrix, refit, and keep the fitted result in `res`
X_train_sm.drop(columns=['Tags_Ringing'], inplace=True)
logm5 = sm.GLM(endog=y_train, exog=X_train_sm, family=sm.families.Binomial())
res = logm5.fit()
res.summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,6468
Model:,GLM,Df Residuals:,6455
Model Family:,Binomial,Df Model:,12
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2202.4
Date:,"Mon, 26 Aug 2019",Deviance:,4404.8
Time:,11:39:10,Pearson chi2:,1.35e+04
No. Iterations:,9,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.7429,0.205,-13.409,0.000,-3.144,-2.342
Lead Origin_Lead Add Form,2.3637,0.211,11.220,0.000,1.951,2.777
Last Activity_Email Bounced,-1.6590,0.279,-5.957,0.000,-2.205,-1.113
Last Activity_Had a Phone Conversation,2.1608,0.718,3.007,0.003,0.753,3.569
Last Activity_Olark Chat Conversation,-1.6247,0.165,-9.849,0.000,-1.948,-1.301
Last Activity_SMS Sent,1.7469,0.085,20.631,0.000,1.581,1.913
What is your current occupation_Working Professional,2.8925,0.235,12.306,0.000,2.432,3.353
Tags_Already a student,-1.6722,0.731,-2.286,0.022,-3.106,-0.239
Tags_Busy,3.3367,0.241,13.831,0.000,2.864,3.810


In [87]:
# Calculate the VIFs for the model (design matrix as used by logm5)
from statsmodels.stats.outliers_influence import variance_inflation_factor

design_matrix = X_train_sm.values
vif_scores = [variance_inflation_factor(design_matrix, col_idx)
              for col_idx in range(design_matrix.shape[1])]

vif = pd.DataFrame({'Features': X_train_sm.columns, 'VIF': vif_scores})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

Unnamed: 0,Features,VIF
0,const,15.82
11,Tags_Will revert after reading the email,1.34
9,Tags_Closed by Horizzon,1.29
7,Tags_Already a student,1.22
1,Lead Origin_Lead Add Form,1.2
5,Last Activity_SMS Sent,1.17
12,Lead Profile_Potential Lead,1.17
4,Last Activity_Olark Chat Conversation,1.09
6,What is your current occupation_Working Profes...,1.09
8,Tags_Busy,1.06


In [88]:
# Drop 'Tags_Already a student' from the design matrix and refit.
# This is the final model since the p values are almost zero for all variables;
# its fitted result stays in `res`, which the prediction cells read.
X_train_sm.drop(columns=['Tags_Already a student'], inplace=True)
logm6 = sm.GLM(endog=y_train, exog=X_train_sm, family=sm.families.Binomial())
res = logm6.fit()
res.summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,6468
Model:,GLM,Df Residuals:,6456
Model Family:,Binomial,Df Model:,11
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2206.7
Date:,"Mon, 26 Aug 2019",Deviance:,4413.4
Time:,11:39:10,Pearson chi2:,1.36e+04
No. Iterations:,9,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.9623,0.190,-15.572,0.000,-3.335,-2.589
Lead Origin_Lead Add Form,2.3873,0.212,11.250,0.000,1.971,2.803
Last Activity_Email Bounced,-1.6643,0.278,-5.978,0.000,-2.210,-1.119
Last Activity_Had a Phone Conversation,2.1706,0.722,3.005,0.003,0.755,3.586
Last Activity_Olark Chat Conversation,-1.6268,0.165,-9.872,0.000,-1.950,-1.304
Last Activity_SMS Sent,1.7605,0.085,20.759,0.000,1.594,1.927
What is your current occupation_Working Professional,2.8878,0.233,12.400,0.000,2.431,3.344
Tags_Busy,3.4428,0.240,14.325,0.000,2.972,3.914
Tags_Closed by Horizzon,9.1737,1.017,9.022,0.000,7.181,11.167


In [89]:
# Calculate the VIFs for the model (design matrix as used by the final model, logm6)
from statsmodels.stats.outliers_influence import variance_inflation_factor

design_matrix = X_train_sm.values
vif_scores = [variance_inflation_factor(design_matrix, col_idx)
              for col_idx in range(design_matrix.shape[1])]

vif = pd.DataFrame({'Features': X_train_sm.columns, 'VIF': vif_scores})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

Unnamed: 0,Features,VIF
0,const,13.1
8,Tags_Closed by Horizzon,1.28
10,Tags_Will revert after reading the email,1.26
1,Lead Origin_Lead Add Form,1.2
5,Last Activity_SMS Sent,1.17
6,What is your current occupation_Working Profes...,1.09
4,Last Activity_Olark Chat Conversation,1.08
11,Lead Profile_Potential Lead,1.08
7,Tags_Busy,1.05
9,Tags_Lost to EINS,1.04


In [90]:
# Predicted conversion probabilities for the training rows, flattened to a 1-D array
train_probabilities = res.predict(X_train_sm)
y_train_pred = train_probabilities.values.ravel()
y_train_pred

array([0.00962263, 0.34713414, 0.00962263, ..., 0.34713414, 0.00962263,
       0.34713414])

In [91]:
# Pair each actual outcome with its predicted probability (positional, fresh 0..n-1 index)
y_train_pred_final = pd.DataFrame(
    dict(Converted=y_train.values, Converted_prob=y_train_pred)
)
y_train_pred_final

Unnamed: 0,Converted,Converted_prob
0,0,0.009623
1,0,0.347134
2,0,0.009623
3,0,0.053482
4,0,0.347134
5,1,0.982306
6,0,0.009623
7,1,0.999994
8,0,0.347134
9,1,0.755628


In [92]:
# 'predicted' is 1 when the probability is above 0.5, otherwise 0;
# 'Lead_Score' is the probability on a 0-100 scale, rounded to a whole number
train_prob = y_train_pred_final['Converted_prob']
y_train_pred_final['predicted'] = (train_prob > 0.5).astype('int64')
y_train_pred_final['Lead_Score'] = (train_prob * 100).round(0)
y_train_pred_final.head()

Unnamed: 0,Converted,Converted_prob,predicted,Lead_Score
0,0,0.009623,0,1.0
1,0,0.347134,0,35.0
2,0,0.009623,0,1.0
3,0,0.053482,0,5.0
4,0,0.347134,0,35.0


In [93]:
# Confusion matrix on the training data
from sklearn import metrics

actual_labels = y_train_pred_final.Converted
predicted_labels = y_train_pred_final.predicted
confusion = metrics.confusion_matrix(actual_labels, predicted_labels)
print(confusion)
# Reading the matrix (rows = actual, columns = predicted):
#   3704 not converted, predicted as not converted
#    298 not converted, predicted as converted
#    643 converted, predicted as not converted
#   1823 converted, predicted as converted

[[3704  298]
 [ 643 1823]]


In [94]:
# Overall accuracy on the training data: share of rows where prediction equals outcome
from sklearn import metrics

train_accuracy = metrics.accuracy_score(
    y_true=y_train_pred_final.Converted,
    y_pred=y_train_pred_final.predicted,
)
print(train_accuracy)

0.8545145330859617


In [95]:
# Final columns: everything in the final design matrix except the intercept
final_col = X_train_sm.columns.drop('const')
final_col

Index(['Lead Origin_Lead Add Form', 'Last Activity_Email Bounced',
       'Last Activity_Had a Phone Conversation',
       'Last Activity_Olark Chat Conversation', 'Last Activity_SMS Sent',
       'What is your current occupation_Working Professional', 'Tags_Busy',
       'Tags_Closed by Horizzon', 'Tags_Lost to EINS',
       'Tags_Will revert after reading the email',
       'Lead Profile_Potential Lead'],
      dtype='object')

In [96]:
# Making predictions on Test Data

In [97]:
# Scale the test data
# Same continuous features that were standardised on the training split
num_cols = ['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit',
            'Asymmetrique Activity Score', 'Asymmetrique Profile Score']

# Work on an explicit copy so the assignment below writes to an independent frame
# rather than to something pandas may treat as a slice of X.
X_test = X_test.copy()

# transform only (no fit): the test rows reuse the statistics stored in `scaler`.
# Cast to float up front so the int64 column is not converted implicitly.
X_test[num_cols] = scaler.transform(X_test[num_cols].astype('float64'))
X_test.head()

  """


Unnamed: 0,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Asymmetrique Activity Score,Asymmetrique Profile Score,A free copy of Mastering The Interview,Lead Origin_API,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Source_Click2call,Lead Source_Direct Traffic,Lead Source_Facebook,Lead Source_Google,Lead Source_Live Chat,Lead Source_NC_EDM,Lead Source_Olark Chat,Lead Source_Organic Search,Lead Source_Pay per Click Ads,Lead Source_Press_Release,Lead Source_Reference,Lead Source_Referral Sites,Lead Source_Social Media,Lead Source_WeLearn,Lead Source_Welingak Website,Lead Source_bing,Lead Source_blog,Lead Source_welearnblog_Home,Lead Source_youtubechannel,Last Activity_Approached upfront,Last Activity_Converted to Lead,Last Activity_Email Bounced,Last Activity_Email Link Clicked,Last Activity_Email Marked Spam,Last Activity_Email Opened,Last Activity_Email Received,Last Activity_Form Submitted on Website,Last Activity_Had a Phone Conversation,Last Activity_Olark Chat Conversation,Last Activity_Page Visited on Website,Last Activity_Resubscribed to emails,Last Activity_SMS Sent,Last Activity_Unreachable,Last Activity_Unsubscribed,Last Activity_View in browser link Clicked,"Specialization_Banking, Investment And Insurance",Specialization_Business Administration,Specialization_E-Business,Specialization_E-COMMERCE,Specialization_Finance Management,Specialization_Healthcare Management,Specialization_Hospitality Management,Specialization_Human Resource Management,Specialization_IT Projects Management,Specialization_International Business,Specialization_Marketing Management,Specialization_Media and Advertising,Specialization_Operations Management,Specialization_Retail Management,Specialization_Rural and Agribusiness,Specialization_Supply Chain Management,Specialization_Travel and Tourism,How did you hear about X Education_Advertisements,How did you hear about X Education_Email,How did you hear about X Education_Multiple Sources,How did you 
hear about X Education_Online Search,How did you hear about X Education_Other,How did you hear about X Education_Social Media,How did you hear about X Education_Student of SomeSchool,How did you hear about X Education_Word Of Mouth,What is your current occupation_Housewife,What is your current occupation_Other,What is your current occupation_Student,What is your current occupation_Unemployed,What is your current occupation_Working Professional,Tags_Already a student,Tags_Busy,Tags_Closed by Horizzon,Tags_Diploma holder (Not Eligible),Tags_Graduation in progress,Tags_In confusion whether part time or DLP,Tags_Interested in full time MBA,Tags_Interested in Next batch,Tags_Interested in other courses,Tags_Lateral student,Tags_Lost to EINS,Tags_Lost to Others,Tags_Not doing further education,Tags_Ringing,Tags_Shall take in the next coming month,Tags_Still Thinking,Tags_University not recognized,Tags_Want to take admission but has financial problems,Tags_Will revert after reading the email,Tags_in touch with EINS,Tags_invalid number,Tags_number not provided,Tags_opp hangup,Tags_switched off,Tags_wrong number given,Lead Profile_Lateral Student,Lead Profile_Other Leads,Lead Profile_Potential Lead,Lead Profile_Student of SomeSchool,Asymmetrique Activity Index_High,Asymmetrique Activity Index_Medium,Asymmetrique Profile Index_High,Asymmetrique Profile Index_Medium
4269,0.856716,0.964504,2.611401,-0.003209,-0.008618,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1
2376,-0.657777,-0.885371,-1.088285,0.673271,0.485993,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0
7766,0.288781,-0.777416,1.224019,1.653676,-1.012828,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1
9199,-0.657777,-0.885371,-1.088285,2.634081,-0.263417,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1
4359,-0.657777,-0.885371,-1.088285,-0.003209,-0.008618,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1


In [98]:
# (rows, columns) of the test features at this point
X_test.shape

(2772, 107)

In [99]:
# Restrict the test features to the final model's columns, in final_col order
X_test = X_test.loc[:, final_col]
X_test.head()

Unnamed: 0,Lead Origin_Lead Add Form,Last Activity_Email Bounced,Last Activity_Had a Phone Conversation,Last Activity_Olark Chat Conversation,Last Activity_SMS Sent,What is your current occupation_Working Professional,Tags_Busy,Tags_Closed by Horizzon,Tags_Lost to EINS,Tags_Will revert after reading the email,Lead Profile_Potential Lead
4269,0,0,0,0,1,0,0,0,0,1,1
2376,1,0,0,0,1,0,0,0,0,1,1
7766,0,0,0,0,0,1,0,0,0,1,1
9199,0,0,0,1,0,0,0,0,0,0,1
4359,1,0,0,0,0,0,0,0,0,1,1


In [100]:
# Add the intercept column the fitted model expects.
# has_constant='add' forces the column to be added. With the default ('skip'),
# add_constant returns the data unchanged when it already finds a constant column,
# e.g. a dummy that happens to be 1 for every row of this test split, and the
# test design matrix would then be missing 'const'.
X_test_sm = sm.add_constant(X_test, has_constant='add')

In [101]:
# Predicted conversion probability for each test row, wrapped in a one-column frame
# (the column is labelled 0 until it is renamed)
y_test_pred = res.predict(X_test_sm)
y_test_pred_df = pd.DataFrame(data=y_test_pred)
y_test_pred_df.head()

Unnamed: 0,0
4269,0.755628
2376,0.971143
7766,0.90518
9199,0.001906
4359,0.852659


In [102]:
# Give the probability column (currently labelled 0) a descriptive name
y_test_pred_df = y_test_pred_df.rename({0: 'Converted_Prob'}, axis='columns')

In [103]:
# Attach `lead` and then `Student_ID` to the test predictions, matching rows on the index
# (default inner join, so only indices present on both sides are kept)
y_test_pred_f1 = y_test_pred_df.merge(lead, left_index=True, right_index=True)
y_test_pred_f2 = y_test_pred_f1.merge(Student_ID, left_index=True, right_index=True)
y_test_pred_f2.head()

Unnamed: 0,Converted_Prob,Lead Origin,Lead Source,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Specialization,How did you hear about X Education,What is your current occupation,Tags,Lead Profile,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,A free copy of Mastering The Interview,Prospect ID
4269,0.755628,API,Direct Traffic,1,8.0,1011,8.0,SMS Sent,Finance Management,Online Search,Unemployed,Will revert after reading the email,Potential Lead,Medium,Medium,14.31,16.34,0,dd53b3eb-ae22-474c-b872-48b05bbe180b
2376,0.971143,Lead Add Form,Reference,1,0.0,0,0.0,SMS Sent,Finance Management,Online Search,Unemployed,Will revert after reading the email,Potential Lead,Medium,High,15.0,17.0,0,7a960b03-466c-4e36-bf12-b755fc77a0b1
7766,0.90518,API,Google,1,5.0,59,5.0,Unreachable,Supply Chain Management,Word Of Mouth,Working Professional,Will revert after reading the email,Potential Lead,High,Medium,16.0,15.0,0,2bd5fd90-a8fe-413e-9b8a-28e8469c5a5c
9199,0.001906,API,Olark Chat,0,0.0,0,0.0,Olark Chat Conversation,Finance Management,Online Search,Unemployed,Already a student,Potential Lead,High,Medium,17.0,16.0,0,6eb89ae5-d1e0-4c19-8661-8f9545e0e408
4359,0.852659,Lead Add Form,Reference,1,0.0,0,0.0,Email Opened,Marketing Management,Online Search,Unemployed,Will revert after reading the email,Potential Lead,Medium,Medium,14.31,16.34,0,f7ed6c72-7d36-413c-b2e0-414213c4ceef


In [104]:
# Keep only the identifier, the predicted probability and the actual outcome.
# .copy() makes this an independent frame, so the column assignments below modify
# y_test_pred_f3 itself instead of a slice of y_test_pred_f2 (chained assignment,
# which pandas flags with SettingWithCopyWarning).
y_test_pred_f3 = y_test_pred_f2[['Prospect ID', 'Converted_Prob', 'Converted']].copy()

# 'predicted' is 1 when the probability is above 0.5, otherwise 0
y_test_pred_f3['predicted'] = y_test_pred_f3.Converted_Prob.map(lambda x: 1 if x > 0.5 else 0)
# 'lead_score' is the probability on a 0-100 scale (not rounded)
y_test_pred_f3['lead_score'] = y_test_pred_f3.Converted_Prob * 100
y_test_pred_f3.head()

Unnamed: 0,Prospect ID,Converted_Prob,Converted,predicted,lead_score
4269,dd53b3eb-ae22-474c-b872-48b05bbe180b,0.755628,1,1,75.562802
2376,7a960b03-466c-4e36-bf12-b755fc77a0b1,0.971143,1,1,97.114325
7766,2bd5fd90-a8fe-413e-9b8a-28e8469c5a5c,0.90518,1,1,90.517985
9199,6eb89ae5-d1e0-4c19-8661-8f9545e0e408,0.001906,0,0,0.190622
4359,f7ed6c72-7d36-413c-b2e0-414213c4ceef,0.852659,1,1,85.265926


In [105]:
# Accuracy on test data
test_accuracy = metrics.accuracy_score(
    y_true=y_test_pred_f3.Converted,
    y_pred=y_test_pred_f3.predicted,
)
print(test_accuracy)

# Test accuracy (about 84.7%) is close to the train accuracy (about 85.5%)

0.8466810966810967


In [125]:
# Scenario 1 - 80% conversion rate
# Hot leads are the test leads with a lead score of 80 or more
# (i.e. a predicted conversion probability of at least 80%), highest score first.
# Original author's note: 936 records in total have been predicted as 1.
# 438 potential students are classified as hot.

is_hot_80 = y_test_pred_f3['lead_score'] >= 80
y80 = y_test_pred_f3.loc[is_hot_80].sort_values(by='lead_score', ascending=False)
y80.shape

(438, 5)

In [107]:
# Scenario-2 (2 months in a year, when interns are hired; more bandwidth for nurturing leads)
# – we assume a lower target conversion rate: every lead predicted as 1 counts as a hot lead.
# Listed with the highest lead score first.

is_predicted_converted = y_test_pred_f3['predicted'] == 1
y_test_pred_f3.loc[is_predicted_converted].sort_values(by='lead_score', ascending=False)

Unnamed: 0,Prospect ID,Converted_Prob,Converted,predicted,lead_score
220,0ad6eef7-7935-499d-903b-bf17566b241e,0.999990,1,1,99.998973
5921,bb3840bf-b040-45f3-a494-68927055dbb1,0.999990,1,1,99.998973
5784,4e50d3ad-ebb2-4943-ac53-6838264b90b6,0.999984,1,1,99.998381
2764,7b00aa14-d76a-448c-bb99-9af1e1088018,0.999984,1,1,99.998381
6736,8377d1f6-b952-4e63-9fb1-aeb388c39662,0.999945,1,1,99.994537
4612,5f4a0ace-c880-4427-8d7b-66d97202b08d,0.999945,1,1,99.994537
3478,59d2f9c6-d03e-4c51-8324-0c76feb19df6,0.999945,1,1,99.994537
7187,f33166e8-d8d3-4e8c-b9d0-8a1922c35910,0.999945,1,1,99.994537
4613,240134a6-7917-4a71-a0ac-909d9f055148,0.999945,1,1,99.994537
818,4cb2a48d-e9d2-4845-acb1-baaad08c8c07,0.999945,1,1,99.994537


In [124]:
# Scenario-3 (target achieved before time) – we assume a very high target conversion rate:
# only leads with a lead score of 99 or above count as hot leads, so the team can
# focus on other activities.
# NOTE: the variable is named y90, but the cut-off applied is a lead score of 99.

is_hot_99 = y_test_pred_f3['lead_score'] >= 99
y90 = y_test_pred_f3.loc[is_hot_99].sort_values(by='lead_score', ascending=False)
y90.shape

(131, 5)