In [2]:
#!pipenv install pandas numpy matplolib seaborn plotly scipy scikit-learn

In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import chi2_contingency

sns.set_style("whitegrid")

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [2]:
df_leads = pd.read_csv('./datasets/leads.csv')

In [3]:
  df_leads.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Country,Specialization,How did you hear about X Education,What is your current occupation,What matters most to you in choosing a course,Search,Magazine,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,Receive More Updates About Our Courses,Tags,Lead Quality,Update me on Supply Chain Content,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,Page Visited on Website,,Select,Select,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Interested in other courses,Low in Relevance,No,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,Email Opened,India,Select,Select,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Ringing,,No,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,Email Opened,India,Business Administration,Select,Student,Better Career Prospects,No,No,No,No,No,No,No,No,Will revert after reading the email,Might be,No,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,Unreachable,India,Media and Advertising,Word Of Mouth,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Ringing,Not Sure,No,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,Converted to Lead,India,Select,Other,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Will revert after reading the email,Might be,No,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


In [4]:
  df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 

In [5]:
df_leads.isna().sum()

Prospect ID                                         0
Lead Number                                         0
Lead Origin                                         0
Lead Source                                        36
Do Not Email                                        0
Do Not Call                                         0
Converted                                           0
TotalVisits                                       137
Total Time Spent on Website                         0
Page Views Per Visit                              137
Last Activity                                     103
Country                                          2461
Specialization                                   1438
How did you hear about X Education               2207
What is your current occupation                  2690
What matters most to you in choosing a course    2709
Search                                              0
Magazine                                            0
Newspaper Article           

In [6]:
df_leads.shape

(9240, 37)

In [7]:
df_leads.describe()

Unnamed: 0,Lead Number,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Asymmetrique Activity Score,Asymmetrique Profile Score
count,9240.0,9240.0,9103.0,9240.0,9103.0,5022.0,5022.0
mean,617188.435606,0.38539,3.445238,487.698268,2.36282,14.306252,16.344883
std,23405.995698,0.486714,4.854853,548.021466,2.161418,1.386694,1.811395
min,579533.0,0.0,0.0,0.0,0.0,7.0,11.0
25%,596484.5,0.0,1.0,12.0,1.0,14.0,15.0
50%,615479.0,0.0,3.0,248.0,2.0,14.0,16.0
75%,637387.25,1.0,5.0,936.0,3.0,15.0,18.0
max,660737.0,1.0,251.0,2272.0,55.0,18.0,20.0


In [8]:
df_leads['Lead Origin'].unique()

array(['API', 'Landing Page Submission', 'Lead Add Form', 'Lead Import',
       'Quick Add Form'], dtype=object)

In [9]:
df_leads['Lead Source'].unique()

array(['Olark Chat', 'Organic Search', 'Direct Traffic', 'Google',
       'Referral Sites', 'Welingak Website', 'Reference', 'google',
       'Facebook', nan, 'blog', 'Pay per Click Ads', 'bing',
       'Social Media', 'WeLearn', 'Click2call', 'Live Chat',
       'welearnblog_Home', 'youtubechannel', 'testone', 'Press_Release',
       'NC_EDM'], dtype=object)

In [10]:
df_leads['Last Notable Activity'].unique()

array(['Modified', 'Email Opened', 'Page Visited on Website',
       'Email Bounced', 'Email Link Clicked', 'Unreachable',
       'Unsubscribed', 'Had a Phone Conversation',
       'Olark Chat Conversation', 'SMS Sent', 'Approached upfront',
       'Resubscribed to emails', 'View in browser link Clicked',
       'Form Submitted on Website', 'Email Received', 'Email Marked Spam'],
      dtype=object)

### FEATURING ENGENIRING AND DATA CLEANING

In [11]:
df_leads.drop(columns=['Prospect ID', 'Lead Number'], axis=1, inplace=True)

In [12]:
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 35 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Lead Origin                                    9240 non-null   object 
 1   Lead Source                                    9204 non-null   object 
 2   Do Not Email                                   9240 non-null   object 
 3   Do Not Call                                    9240 non-null   object 
 4   Converted                                      9240 non-null   int64  
 5   TotalVisits                                    9103 non-null   float64
 6   Total Time Spent on Website                    9240 non-null   int64  
 7   Page Views Per Visit                           9103 non-null   float64
 8   Last Activity                                  9137 non-null   object 
 9   Country                                        6779 

In [13]:
df_leads.shape

(9240, 35)

In [14]:
# Colunas que tem apenas um valor mesmo valor em todas as linhas
# Podemos excluir essas colunas
colunas = []
for column in df_leads.select_dtypes(include=['object']).columns:
  if len(df_leads[column].unique()) == 1:
    colunas.append(column)

In [15]:
colunas

['Magazine',
 'Receive More Updates About Our Courses',
 'Update me on Supply Chain Content',
 'Get updates on DM Content',
 'I agree to pay the amount through cheque']

In [16]:
df_leads.drop(columns=colunas, axis=1, inplace=True)

In [17]:
df_leads.shape

(9240, 30)

In [18]:
for column in df_leads.select_dtypes(include=['object']).columns:
  print(f"Value possiveis para a coluna {column}: ", df_leads[column].unique())

Value possiveis para a coluna Lead Origin:  ['API' 'Landing Page Submission' 'Lead Add Form' 'Lead Import'
 'Quick Add Form']
Value possiveis para a coluna Lead Source:  ['Olark Chat' 'Organic Search' 'Direct Traffic' 'Google' 'Referral Sites'
 'Welingak Website' 'Reference' 'google' 'Facebook' nan 'blog'
 'Pay per Click Ads' 'bing' 'Social Media' 'WeLearn' 'Click2call'
 'Live Chat' 'welearnblog_Home' 'youtubechannel' 'testone' 'Press_Release'
 'NC_EDM']
Value possiveis para a coluna Do Not Email:  ['No' 'Yes']
Value possiveis para a coluna Do Not Call:  ['No' 'Yes']
Value possiveis para a coluna Last Activity:  ['Page Visited on Website' 'Email Opened' 'Unreachable'
 'Converted to Lead' 'Olark Chat Conversation' 'Email Bounced'
 'Email Link Clicked' 'Form Submitted on Website' 'Unsubscribed'
 'Had a Phone Conversation' 'View in browser link Clicked' nan
 'Approached upfront' 'SMS Sent' 'Visited Booth in Tradeshow'
 'Resubscribed to emails' 'Email Received' 'Email Marked Spam']
Value p

In [19]:
# Corrigindo valores 
df_leads['Lead Source'] = df_leads['Lead Source'].apply(lambda x: 'Google' if x == 'google' else x)

In [20]:
df_leads.isna().sum()

Lead Origin                                         0
Lead Source                                        36
Do Not Email                                        0
Do Not Call                                         0
Converted                                           0
TotalVisits                                       137
Total Time Spent on Website                         0
Page Views Per Visit                              137
Last Activity                                     103
Country                                          2461
Specialization                                   1438
How did you hear about X Education               2207
What is your current occupation                  2690
What matters most to you in choosing a course    2709
Search                                              0
Newspaper Article                                   0
X Education Forums                                  0
Newspaper                                           0
Digital Advertisement       

In [21]:
# Calculando porcentagem de valores nulos em colunas object
# Nas colunas object, se o texto 'Select' estiver presente, devemos considerar como nula
object_to_drop = []
for column in df_leads.select_dtypes(include=['object']).columns:
  print(f"Quantidade de nulos na coluna {column}: {(df_leads[column].isnull().sum() + len(df_leads[df_leads[column] == 'Select'])) / len(df_leads[column]) * 100:.2f}\n")
  if ((df_leads[column].isnull().sum() +  len(df_leads[df_leads[column] == 'Select'])) / len(df_leads[column]) * 100) > 25:
    object_to_drop.append(column)
print(f"Colunas com mais de 30% de valores nulos: \n{object_to_drop}")
print("Quantidade de colunas a excluir: ", len(object_to_drop))

Quantidade de nulos na coluna Lead Origin: 0.00

Quantidade de nulos na coluna Lead Source: 0.39

Quantidade de nulos na coluna Do Not Email: 0.00

Quantidade de nulos na coluna Do Not Call: 0.00

Quantidade de nulos na coluna Last Activity: 1.11

Quantidade de nulos na coluna Country: 26.63

Quantidade de nulos na coluna Specialization: 36.58

Quantidade de nulos na coluna How did you hear about X Education: 78.46

Quantidade de nulos na coluna What is your current occupation: 29.11

Quantidade de nulos na coluna What matters most to you in choosing a course: 29.32

Quantidade de nulos na coluna Search: 0.00

Quantidade de nulos na coluna Newspaper Article: 0.00

Quantidade de nulos na coluna X Education Forums: 0.00

Quantidade de nulos na coluna Newspaper: 0.00

Quantidade de nulos na coluna Digital Advertisement: 0.00

Quantidade de nulos na coluna Through Recommendations: 0.00

Quantidade de nulos na coluna Tags: 36.29

Quantidade de nulos na coluna Lead Quality: 51.59

Quantidade

In [22]:
# Drop das colunas categoricas com mais de 25% de nulos
df_leads.drop(columns=object_to_drop, axis=1, inplace=True)

In [23]:
df_leads.shape

(9240, 19)

In [24]:
# Calculando porcentagem de valores nulos em colunas int64
int_to_drop = []
for column in df_leads.select_dtypes(include=['int64']).columns:
  if df_leads[column].isnull().sum() / len(df_leads[column]) * 100 > 30:
    print(df_leads[column].isna().sum() / len(df_leads[column]) * 100)
    int_to_drop.append(column)
int_to_drop

[]

In [25]:
# Calculando porcentagem de valores nulos em colunas number
number_to_drop = []
for column in df_leads.select_dtypes(include=['number']).columns:
  if df_leads[column].isnull().sum() / len(df_leads[column]) * 100 > 30:
    print(df_leads[column].isna().sum() / len(df_leads[column]) * 100)
    number_to_drop.append(column)
number_to_drop

45.64935064935065
45.64935064935065


['Asymmetrique Activity Score', 'Asymmetrique Profile Score']

In [26]:
# Drop das colunas float64 com mais de 25% de nulos
df_leads.drop(columns=number_to_drop, axis=1, inplace=True)

In [27]:
# Convertendo valores Yes/No para 0 e 1
features_binarias = []
for column in df_leads.select_dtypes(include=['object']).columns:
  if len(df_leads[column].unique()) == 2:
    features_binarias.append(column)
    print(df_leads[column].unique())
    df_leads[column] = df_leads[column].apply(lambda x: 1 if x == 'Yes' else 0)
features_binarias

['No' 'Yes']
['No' 'Yes']
['No' 'Yes']
['No' 'Yes']
['No' 'Yes']
['No' 'Yes']
['No' 'Yes']
['No' 'Yes']
['No' 'Yes']


['Do Not Email',
 'Do Not Call',
 'Search',
 'Newspaper Article',
 'X Education Forums',
 'Newspaper',
 'Digital Advertisement',
 'Through Recommendations',
 'A free copy of Mastering The Interview']

In [28]:
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9240 non-null   object 
 1   Lead Source                             9204 non-null   object 
 2   Do Not Email                            9240 non-null   int64  
 3   Do Not Call                             9240 non-null   int64  
 4   Converted                               9240 non-null   int64  
 5   TotalVisits                             9103 non-null   float64
 6   Total Time Spent on Website             9240 non-null   int64  
 7   Page Views Per Visit                    9103 non-null   float64
 8   Last Activity                           9137 non-null   object 
 9   Search                                  9240 non-null   int64  
 10  Newspaper Article                       9240 non-null   int6

In [29]:
for column in df_leads.select_dtypes(include=['int64']).columns:
  print(f"Possiveis valores da coluna {column}", df_leads[column].unique())

Possiveis valores da coluna Do Not Email [0 1]
Possiveis valores da coluna Do Not Call [0 1]
Possiveis valores da coluna Converted [0 1]
Possiveis valores da coluna Total Time Spent on Website [   0  674 1532 ...  603  483  927]
Possiveis valores da coluna Search [0 1]
Possiveis valores da coluna Newspaper Article [0 1]
Possiveis valores da coluna X Education Forums [0 1]
Possiveis valores da coluna Newspaper [0 1]
Possiveis valores da coluna Digital Advertisement [0 1]
Possiveis valores da coluna Through Recommendations [0 1]
Possiveis valores da coluna A free copy of Mastering The Interview [0 1]


In [64]:
df_leads.isnull().sum()

Lead Origin                               0
Lead Source                               0
Do Not Email                              0
Do Not Call                               0
Converted                                 0
TotalVisits                               0
Total Time Spent on Website               0
Page Views Per Visit                      0
Last Activity                             0
Search                                    0
Newspaper Article                         0
X Education Forums                        0
Newspaper                                 0
Digital Advertisement                     0
Through Recommendations                   0
A free copy of Mastering The Interview    0
Last Notable Activity                     0
dtype: int64

In [63]:
df_leads['Page Views Per Visit'].unique()

array([ 0.  ,  2.5 ,  2.  ,  1.  ,  4.  ,  8.  ,  2.67, 11.  ,  5.  ,
        6.  ,  3.  ,  1.33,  1.5 ,  3.5 ,  7.  ,  2.33, 13.  ,  8.5 ,
        5.5 ,  1.67,  4.5 ,  3.33, 16.  , 12.  ,  1.71,  1.8 ,  6.5 ,
        4.33, 14.  ,  3.4 , 10.  ,  1.25,  1.75,  2.63, 15.  ,  2.25,
        3.67,  1.43,  9.  ,  2.6 ,  4.75,  1.27,  3.25,  5.33,  2.57,
        2.17,  2.75,  2.8 ,  2.2 ,  2.86,  3.91,  1.4 ,  5.67,  3.2 ,
        1.38,  2.09,  2.4 ,  5.25,  6.71,  3.57,  2.22,  1.83,  3.6 ,
        1.2 ,  1.57,  1.56,  5.4 ,  4.25,  1.31,  1.6 ,  2.9 ,  1.23,
        1.78,  3.83,  7.5 ,  1.14,  2.71,  1.45,  2.38,  1.86,  2.29,
        1.21, 12.33,  3.43,  2.56,  6.33,  1.64,  8.21,  4.4 ,  3.17,
        8.33,  1.48,  1.22,  3.75,  6.67,  1.54,  2.13,  2.14,  2.45,
        3.29,  4.17,  1.63,  3.38,  1.17, 14.5 ,  3.8 ,  1.19,  3.82,
        2.83,  1.93, 11.5 ,  2.08])

In [32]:
# Remover linhas com valores nulos em colunas categoricas
categoricas = df_leads.select_dtypes(include=['object']).columns
df_leads.dropna(subset=categoricas, inplace=True)

In [33]:
df_leads.shape

(9103, 17)

In [34]:
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9103 entries, 0 to 9239
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9103 non-null   object 
 1   Lead Source                             9103 non-null   object 
 2   Do Not Email                            9103 non-null   int64  
 3   Do Not Call                             9103 non-null   int64  
 4   Converted                               9103 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9103 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9103 non-null   object 
 9   Search                                  9103 non-null   int64  
 10  Newspaper Article                       9103 non-null   int64  
 

In [62]:
for column in df_leads.select_dtypes(include=['float64']).columns:
  df_leads[column] = df_leads[column].fillna(df_leads[column].median())

In [36]:
df_leads.isnull().sum()

Lead Origin                               0
Lead Source                               0
Do Not Email                              0
Do Not Call                               0
Converted                                 0
TotalVisits                               0
Total Time Spent on Website               0
Page Views Per Visit                      0
Last Activity                             0
Search                                    0
Newspaper Article                         0
X Education Forums                        0
Newspaper                                 0
Digital Advertisement                     0
Through Recommendations                   0
A free copy of Mastering The Interview    0
Last Notable Activity                     0
dtype: int64

In [37]:
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9103 entries, 0 to 9239
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9103 non-null   object 
 1   Lead Source                             9103 non-null   object 
 2   Do Not Email                            9103 non-null   int64  
 3   Do Not Call                             9103 non-null   int64  
 4   Converted                               9103 non-null   int64  
 5   TotalVisits                             9103 non-null   float64
 6   Total Time Spent on Website             9103 non-null   int64  
 7   Page Views Per Visit                    9103 non-null   float64
 8   Last Activity                           9103 non-null   object 
 9   Search                                  9103 non-null   int64  
 10  Newspaper Article                       9103 non-null   int64  
 

### EXPLORANDO A VARIAVEL TARGET

In [38]:
df_leads['Converted'].value_counts()

Converted
0    5651
1    3452
Name: count, dtype: int64

In [39]:
fig = px.bar(df_leads['Converted'].value_counts() / len(df_leads) * 100, 
             title = 'Hit Ratio - Fator de conversão',
             labels={'index': 'Converted', 'value': 'Percentual'},
             opacity=0.8
             )
fig.update_layout(showlegend=False)
fig.show()

In [40]:
# Plot da Matriz de correlação das variáveis
corr_matrix = df_leads.select_dtypes(include=['number']).corr()

fig = go.Figure()

fig.add_trace(
  go.Heatmap(
    x = corr_matrix.columns,
    y = corr_matrix.index,
    z = np.array(corr_matrix),
    text = corr_matrix.values,
    texttemplate='%{text:.2f}',
    colorscale=px.colors.diverging.RdBu,
    zmin=-1,
    zmax=2
  )
)

fig.show()

In [41]:
fig = px.box(df_leads, x='Converted', y='TotalVisits', color='Converted')
fig.show()

In [42]:
# Eliminando Outlier de TotalVisits
df_leads['TotalVisits'] = df_leads[df_leads['TotalVisits'] < 40]['TotalVisits']

In [43]:
fig = px.box(df_leads, x='Converted', y='TotalVisits', color='Converted')
fig.show()

In [44]:
fig = px.box(df_leads, x='Converted', y='Total Time Spent on Website', color='Converted')
fig.show()

In [45]:
fig = px.box(df_leads, x='Converted', y='Page Views Per Visit', color='Converted')
fig.show()

In [46]:
# Eliminando Outlier de Page Views Per Visit
df_leads['Page Views Per Visit'] = df_leads[df_leads['Page Views Per Visit'] < 20]['Page Views Per Visit']

In [47]:
fig = px.box(df_leads, x='Converted', y='Page Views Per Visit', color='Converted')
fig.show()

In [48]:
fig = px.scatter(df_leads, x='Page Views Per Visit', y='Total Time Spent on Website')
fig.show()

In [49]:
fig = px.scatter(df_leads, x='Newspaper Article', y='X Education Forums')
fig.show()

In [50]:
contingency_table_lead_source = pd.crosstab(df_leads['Converted'], df_leads['Lead Source'])
contingency_table_lead_source

Lead Source,Click2call,Direct Traffic,Facebook,Google,Live Chat,NC_EDM,Olark Chat,Organic Search,Pay per Click Ads,Press_Release,Reference,Referral Sites,Social Media,WeLearn,Welingak Website,bing,blog,testone,welearnblog_Home,youtubechannel
Converted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,1,1725,29,1726,0,0,1307,718,1,2,36,94,1,0,2,5,1,1,1,1
1,3,818,10,1147,2,1,448,436,0,0,426,31,1,1,127,1,0,0,0,0


### TESTE ESTATÍSTICO PARA VERIFICAR CORRELAÇÃO ENTRE VARIÁVEIS

In [51]:
chi2, p, dof, expected = chi2_contingency(contingency_table_lead_source)
print(chi2)
print("P-Value: ", p)
print(dof)
print(expected)
print(f"\nExiste relação entre as variáveis: {'Sim' if p < 0.05 else 'Não'}")

960.393305511584
P-Value:  1.5011803953129708e-191
19
[[2.48313743e+00 1.57865462e+03 2.42105899e+01 1.78351346e+03
  1.24156871e+00 6.20784357e-01 1.08947655e+03 7.16385148e+02
  6.20784357e-01 1.24156871e+00 2.86802373e+02 7.75980446e+01
  1.24156871e+00 6.20784357e-01 8.00811820e+01 3.72470614e+00
  6.20784357e-01 6.20784357e-01 6.20784357e-01 6.20784357e-01]
 [1.51686257e+00 9.64345381e+02 1.47894101e+01 1.08948654e+03
  7.58431286e-01 3.79215643e-01 6.65523454e+02 4.37614852e+02
  3.79215643e-01 7.58431286e-01 1.75197627e+02 4.74019554e+01
  7.58431286e-01 3.79215643e-01 4.89188180e+01 2.27529386e+00
  3.79215643e-01 3.79215643e-01 3.79215643e-01 3.79215643e-01]]

Existe relação entre as variáveis: Sim


In [52]:
contingency_table_lead_origin = pd.crosstab(df_leads['Converted'], df_leads['Lead Origin'])
contingency_table_lead_origin

Lead Origin,API,Landing Page Submission,Lead Add Form,Lead Import
Converted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2465,3118,40,28
1,1115,1767,560,10


In [53]:
chi2, p, dof, expected = chi2_contingency(contingency_table_lead_origin)
print(chi2)
print("P-Value: ", p)
print(dof)
print(expected)
print(f"\nExiste relação entre as variáveis: {'Sim' if p < 0.05 else 'Não'}")

860.9356520974923
P-Value:  2.6310402182271016e-186
3
[[2222.40799736 3032.53158299  372.47061408   23.58980556]
 [1357.59200264 1852.46841701  227.52938592   14.41019444]]

Existe relação entre as variáveis: Sim


### MODELAGEM DO ALGORITMO

In [65]:
X = df_leads.drop(columns=['Converted'])
y = df_leads['Converted']

#### COLUMN TRANSFORM

In [66]:
numeric_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
  transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
  ]
)

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

X_train_tranformed = preprocessor.fit_transform(X_train)
# Uso somente o transform pois o preprocessor já foi treinado na linha acima
X_test_tranformed = preprocessor.transform(X_test)

In [68]:
X_train_tranformed.shape

(7282, 66)

In [69]:
X_test_tranformed.shape

(1821, 66)

#### TREINAMENTO DO MODELO

In [70]:
bagging_model = BaggingClassifier(
  estimator=LogisticRegression(),
  n_estimators=10,
  random_state=51,
  max_samples=0.3,
  #max_features=0.7
)

In [71]:
bagging_model.fit(X_train_tranformed, y_train)

#### AVALIANDO PERFORMANCE DO MODELO

In [72]:
y_pred = bagging_model.predict(X_test_tranformed)

In [73]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Acurácia {accuracy}")
print(f"Precision {precision}")
print(f"Recall {recall}")
print(f"F1 {f1}")

Acurácia 0.8105436573311368
Precision 0.7986577181208053
Recall 0.6790299572039943
F1 0.7340015420200463


#### TREINANDO SEM ENSEMBLE

In [75]:
log_reg = LogisticRegression()
log_reg.fit(X_train_tranformed, y_train)
y_pred = log_reg.predict(X_test_tranformed)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Acurácia {accuracy}")
print(f"Precision {precision}")
print(f"Recall {recall}")
print(f"F1 {f1}")

Acurácia 0.8083470620538166
Precision 0.794314381270903
Recall 0.6776034236804565
F1 0.7313317936874519


#### PLOTANDO A MATRIZ DE CONFUSÃO

In [76]:
conf_matrix = confusion_matrix(y_test, y_pred)

fig = px.imshow(
  conf_matrix,
  labels=dict(x='Predição', y='Real', color='Contagem'),
  x=['Not Converted', 'Converted'],
  y=['Not Converted', 'Converted'],
  color_continuous_scale='Viridis'
)

fig.update_traces(text=conf_matrix, texttemplate="%{z}")
fig.update_layout(coloraxis_showscale=False)

fig.show()

In [77]:
# Calcular a importancia das variaveis
importances = np.mean([np.abs(estimator.coef_[0]) for estimator in bagging_model.estimators_], axis=0)
importances

array([0.40155423, 0.00824107, 0.2368917 , 1.09052368, 0.11503191,
       0.07315608, 0.05367807, 0.03819041, 0.03098566, 0.04594713,
       0.05930544, 0.03807006, 0.69574736, 0.90553435, 1.64874592,
       0.20180656, 0.09633376, 0.71273578, 0.27708351, 0.409923  ,
       0.02779554, 0.04873471, 0.4442325 , 0.47785373, 0.08448063,
       0.08199455, 1.23083288, 0.8894209 , 0.1723784 , 0.08863506,
       1.4402557 , 0.14156026, 0.09182653, 0.01558852, 0.10624502,
       0.11807771, 0.51164509, 0.75437538, 0.62200475, 0.24580043,
       0.39583292, 0.06809118, 0.2612604 , 0.45982804, 0.82331717,
       0.35053638, 0.05346427, 0.54120701, 0.19740352, 0.61330651,
       0.1974746 , 0.03535592, 0.04892956, 0.34535395, 0.70574683,
       0.5062233 , 0.05825506, 0.89527548, 0.72588527, 0.75200425,
       0.51657729, 0.05346427, 0.86597941, 0.90205678, 0.53939021,
       0.05118287])

In [78]:
# Listar nomes das colunas criadas durante o OneHotEncode
feature_names = (numeric_features.tolist() +
                  preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features).tolist()
)
feature_names

['Do Not Email',
 'Do Not Call',
 'TotalVisits',
 'Total Time Spent on Website',
 'Page Views Per Visit',
 'Search',
 'Newspaper Article',
 'X Education Forums',
 'Newspaper',
 'Digital Advertisement',
 'Through Recommendations',
 'A free copy of Mastering The Interview',
 'Lead Origin_API',
 'Lead Origin_Landing Page Submission',
 'Lead Origin_Lead Add Form',
 'Lead Origin_Lead Import',
 'Lead Source_Click2call',
 'Lead Source_Direct Traffic',
 'Lead Source_Facebook',
 'Lead Source_Google',
 'Lead Source_Live Chat',
 'Lead Source_NC_EDM',
 'Lead Source_Olark Chat',
 'Lead Source_Organic Search',
 'Lead Source_Pay per Click Ads',
 'Lead Source_Press_Release',
 'Lead Source_Reference',
 'Lead Source_Referral Sites',
 'Lead Source_Social Media',
 'Lead Source_WeLearn',
 'Lead Source_Welingak Website',
 'Lead Source_bing',
 'Lead Source_blog',
 'Lead Source_testone',
 'Lead Source_welearnblog_Home',
 'Lead Source_youtubechannel',
 'Last Activity_Approached upfront',
 'Last Activity_Conver

In [79]:
# Criar DataFrame
df_feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
df_feature_importances.sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
14,Lead Origin_Lead Add Form,1.648746
30,Lead Source_Welingak Website,1.440256
26,Lead Source_Reference,1.230833
3,Total Time Spent on Website,1.090524
13,Lead Origin_Landing Page Submission,0.905534
...,...,...
51,Last Activity_Visited Booth in Tradeshow,0.035356
8,Newspaper,0.030986
20,Lead Source_Live Chat,0.027796
33,Lead Source_testone,0.015589


In [80]:
# Plotando as importancias
df_feature_importances = df_feature_importances.sort_values(by='Importance', ascending=True)
fig = px.bar(
  df_feature_importances,
  x='Importance',
  y='Feature',
  orientation='h',
  title='Importancia das Features'
)

fig.update_layout(height=1200, width=1000, yaxis={'categoryorder': 'total ascending'})

fig.show()

Cenário de CRM - Utilidade da probabilidade

- CRM
  Leads concluídos - Resultado Positivo ou Negativo
  Leads em aberto - Não tenho Resultado

Treine um modelo no que está concluído, para que ele generalize bem no que está aberto

Lead em aberto

  - Probabilidade de converter. Quando muito alto, podemos olhar com mais foco pra realmente converter
                                Quando muito baixo, podemos descartar
  - Importânco das Features

### BOOSTING

In [82]:
boosting_model = AdaBoostClassifier(
  estimator=LogisticRegression(),
  n_estimators=50,
  random_state=51,
  learning_rate=1.0
  #max_samples=0.3,
  #max_features=0.7
)

In [83]:
boosting_model.fit(X_train_tranformed, y_train)





In [85]:
y_pred = boosting_model.predict(X_test_tranformed)

In [86]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Acurácia {accuracy}")
print(f"Precision {precision}")
print(f"Recall {recall}")
print(f"F1 {f1}")

Acurácia 0.8056013179571664
Precision 0.7975986277873071
Recall 0.6633380884450785
F1 0.7242990654205608


### STACKING

In [88]:
lr_model = LogisticRegression(random_state=51)

tree_model = DecisionTreeClassifier(random_state=51)

svc_model = SVC(kernel='linear')

sgd_model = SGDClassifier(penalty='elasticnet', random_state=51)

stacking_mode = StackingClassifier(
  estimators= [
    ('sgd classifier', sgd_model),
    ('svc', svc_model),
    ('decision tree', tree_model)
  ],
  final_estimator=lr_model,
  passthrough=False
)

In [89]:
stacking_mode.fit(X_train_tranformed, y_train)

In [90]:
y_pred = stacking_mode.predict(X_test_tranformed)

In [91]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Acurácia {accuracy}")
print(f"Precision {precision}")
print(f"Recall {recall}")
print(f"F1 {f1}")

Acurácia 0.8050521691378364
Precision 0.7763578274760383
Recall 0.6932952924393724
F1 0.7324792765636775
