# 1. Preparación de los datos

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../datasets/Leads.csv')
print(len(df))
df.head().T

9240


Unnamed: 0,0,1,2,3,4
Prospect ID,7927b2df-8bba-4d29-b9a2-b6e0beafe620,2a272436-5132-4136-86fa-dcc88c88f482,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,3256f628-e534-4826-9d63-4a8b88782852
Lead Number,660737,660728,660727,660719,660681
Lead Origin,API,API,Landing Page Submission,Landing Page Submission,Landing Page Submission
Lead Source,Olark Chat,Organic Search,Direct Traffic,Direct Traffic,Google
Do Not Email,No,No,No,No,No
Do Not Call,No,No,No,No,No
Converted,0,0,1,0,1
TotalVisits,0.0,5.0,2.0,1.0,2.0
Total Time Spent on Website,0,674,1532,305,1428
Page Views Per Visit,0.0,2.5,2.0,1.0,1.0


# Preproceso

Aquí irían todas las funciones y transformaciones que permitieran el uso de las variables en los siguientes clasificadores, por ejemplo la transformación de las variables categóricas en numéricas, etc.

In [3]:
pd.isnull(df).any()

Prospect ID                                      False
Lead Number                                      False
Lead Origin                                      False
Lead Source                                       True
Do Not Email                                     False
Do Not Call                                      False
Converted                                        False
TotalVisits                                       True
Total Time Spent on Website                      False
Page Views Per Visit                              True
Last Activity                                     True
Country                                           True
Specialization                                    True
How did you hear about X Education                True
What is your current occupation                   True
What matters most to you in choosing a course     True
Search                                           False
Magazine                                         False
Newspaper 

In [4]:
df.fillna(method='ffill',inplace=True)
pd.isnull(df).any()

Prospect ID                                      False
Lead Number                                      False
Lead Origin                                      False
Lead Source                                      False
Do Not Email                                     False
Do Not Call                                      False
Converted                                        False
TotalVisits                                      False
Total Time Spent on Website                      False
Page Views Per Visit                             False
Last Activity                                    False
Country                                           True
Specialization                                   False
How did you hear about X Education               False
What is your current occupation                  False
What matters most to you in choosing a course    False
Search                                           False
Magazine                                         False
Newspaper 

In [5]:
replacer = lambda str: str.lower().str.replace(' ', '_')
df.columns = replacer(df.columns.str)
for col in list(df.dtypes[df.dtypes == 'object'].index):
    df[col] = replacer(df[col].str)
df.head().T

Unnamed: 0,0,1,2,3,4
prospect_id,7927b2df-8bba-4d29-b9a2-b6e0beafe620,2a272436-5132-4136-86fa-dcc88c88f482,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,3256f628-e534-4826-9d63-4a8b88782852
lead_number,660737,660728,660727,660719,660681
lead_origin,api,api,landing_page_submission,landing_page_submission,landing_page_submission
lead_source,olark_chat,organic_search,direct_traffic,direct_traffic,google
do_not_email,no,no,no,no,no
do_not_call,no,no,no,no,no
converted,0,0,1,0,1
totalvisits,0.0,5.0,2.0,1.0,2.0
total_time_spent_on_website,0,674,1532,305,1428
page_views_per_visit,0.0,2.5,2.0,1.0,1.0


In [6]:
for column in df.columns:
        df[column].replace(('yes', 'no'), (1, 0), inplace=True)
df.head()

Unnamed: 0,prospect_id,lead_number,lead_origin,lead_source,do_not_email,do_not_call,converted,totalvisits,total_time_spent_on_website,page_views_per_visit,...,get_updates_on_dm_content,lead_profile,city,asymmetrique_activity_index,asymmetrique_profile_index,asymmetrique_activity_score,asymmetrique_profile_score,i_agree_to_pay_the_amount_through_cheque,a_free_copy_of_mastering_the_interview,last_notable_activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,api,olark_chat,0,0,0,0.0,0,0.0,...,0,select,select,02.medium,02.medium,15.0,15.0,0,0,modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,api,organic_search,0,0,0,5.0,674,2.5,...,0,select,select,02.medium,02.medium,15.0,15.0,0,0,email_opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,landing_page_submission,direct_traffic,0,0,1,2.0,1532,2.0,...,0,potential_lead,mumbai,02.medium,01.high,14.0,20.0,0,1,email_opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,landing_page_submission,direct_traffic,0,0,0,1.0,305,1.0,...,0,select,mumbai,02.medium,01.high,13.0,17.0,0,0,modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,landing_page_submission,google,0,0,1,2.0,1428,1.0,...,0,select,mumbai,02.medium,01.high,15.0,18.0,0,0,modified


In [7]:
categorical = ['lead_origin', 'lead_source','lead_profile', 'city','asymmetrique_activity_index','asymmetrique_profile_index','last_notable_activity']
numerical = ['do_not_email', 'do_not_call', 'totalvisits','total_time_spent_on_website','page_views_per_visit',
'get_updates_on_dm_content','asymmetrique_activity_score','asymmetrique_profile_score','i_agree_to_pay_the_amount_through_cheque','a_free_copy_of_mastering_the_interview']
df[categorical].nunique()

lead_origin                     5
lead_source                    20
lead_profile                    6
city                            7
asymmetrique_activity_index     3
asymmetrique_profile_index      3
last_notable_activity          16
dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=1)
y_train = df_train.converted.values
y_val = df_val.converted.values

del df_train['converted']
del df_val['converted']

df_train.head().T

Unnamed: 0,3885,6733,577,6067,2859
prospect_id,d69a2f5e-7260-4690-81e6-c1dfec5c57ce,96c5d1c4-b424-49a1-b4f5-f2ffaa4af15a,224db710-8508-4f14-abff-550ceeeb23c3,7b443574-0c7d-4a7c-ac9d-027a1390b0a0,f2cbbb2e-573d-48b6-a95b-91740abc3386
lead_number,622730,598165,654291,603286,632244
lead_origin,landing_page_submission,api,landing_page_submission,landing_page_submission,landing_page_submission
lead_source,direct_traffic,google,google,direct_traffic,google
do_not_email,0,0,0,1,1
do_not_call,0,0,0,0,0
totalvisits,3.0,2.0,3.0,5.0,2.0
total_time_spent_on_website,1625,1345,239,60,1017
page_views_per_visit,3.0,1.0,3.0,5.0,2.0
last_activity,converted_to_lead,email_opened,email_opened,page_visited_on_website,email_bounced


# 2. Análisis de importancia de propiedades

In [9]:
global_mean = df_train_full.converted.mean()
df.groupby('converted').count()
round(global_mean, 3)

0.383

In [10]:
landing_mean = df_train_full[df_train_full.lead_origin == 'landing_page_submission'].converted.mean()
print(round(landing_mean, 3))
api_mean = df_train_full[df_train_full.lead_origin == 'api'].converted.mean()
print(round(api_mean, 3))

0.361
0.308


In [11]:
direct_traffic_mean = df_train_full[df_train_full.lead_source == 'direct_traffic'].converted.mean()
print(round(direct_traffic_mean, 3))
google_mean = df_train_full[df_train_full.lead_source == 'google'].converted.mean()
print(round(google_mean, 3))

0.319
0.401


In [12]:
from sklearn.metrics import mutual_info_score

calculate_mi = lambda col: mutual_info_score(col, df_train_full.converted)

df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
last_notable_activity,0.072517
lead_source,0.061367
lead_origin,0.055599
lead_profile,0.040482
asymmetrique_activity_index,0.005435
asymmetrique_profile_index,0.004365
city,0.002335


In [17]:
print(df_train_full[numerical].corrwith(df_train_full.converted))

print(round(df_train_full[df_train_full.total_time_spent_on_website <= 2].converted.mean(), 3))
print(round(df_train_full[(df_train_full.total_time_spent_on_website > 3) & 
                          (df_train_full.total_time_spent_on_website <= 12)].converted.mean(), 3))
print(round(df_train_full[df_train_full.total_time_spent_on_website > 12].converted.mean(), 3))

print(round(df_train_full[df_train_full.do_not_email != 0].converted.mean(), 3))
print(round(df_train_full[df_train_full.do_not_email == 0].converted.mean(), 3))

do_not_email                               -0.126633
do_not_call                                 0.014760
totalvisits                                 0.033772
total_time_spent_on_website                 0.363470
page_views_per_visit                        0.002021
get_updates_on_dm_content                        NaN
asymmetrique_activity_score                 0.095044
asymmetrique_profile_score                  0.112090
i_agree_to_pay_the_amount_through_cheque         NaN
a_free_copy_of_mastering_the_interview     -0.038502
dtype: float64
0.409
0.122
0.379
0.174
0.401


# 3. Ingeniería de propiedades

In [13]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')
dict(sorted(train_dict[0].items()))

{'a_free_copy_of_mastering_the_interview': 1,
 'asymmetrique_activity_index': '02.medium',
 'asymmetrique_activity_score': 14.0,
 'asymmetrique_profile_index': '02.medium',
 'asymmetrique_profile_score': 16.0,
 'city': 'other_cities_of_maharashtra',
 'do_not_call': 0,
 'do_not_email': 0,
 'get_updates_on_dm_content': 0,
 'i_agree_to_pay_the_amount_through_cheque': 0,
 'last_notable_activity': 'modified',
 'lead_origin': 'landing_page_submission',
 'lead_profile': 'select',
 'lead_source': 'direct_traffic',
 'page_views_per_visit': 3.0,
 'total_time_spent_on_website': 1625,
 'totalvisits': 3.0}

In [14]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [15]:
X_train = dv.transform(train_dict)
X_train[0]

array([1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 1.400e+01, 0.000e+00,
       1.000e+00, 0.000e+00, 1.600e+01, 0.000e+00, 0.000e+00, 1.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 3.000e+00, 1.625e+03,
       3.000e+00])

In [16]:
dv.get_feature_names_out()

array(['a_free_copy_of_mastering_the_interview',
       'asymmetrique_activity_index=01.high',
       'asymmetrique_activity_index=02.medium',
       'asymmetrique_activity_index=03.low',
       'asymmetrique_activity_score',
       'asymmetrique_profile_index=01.high',
       'asymmetrique_profile_index=02.medium',
       'asymmetrique_profile_index=03.low', 'asymmetrique_profile_score',
       'city=mumbai', 'city=other_cities',
       'city=other_cities_of_maharashtra', 'city=other_metro_cities',
       'city=select', 'city=thane_&_outskirts', 'city=tier_ii_cities',
       'do_not_call', 'do_not_email', 'get_updates_on_dm_content',
       'i_agree_to_pay_the_amount_through_cheque',
       'last_notable_activity=email_bounced',
       'last_notable_activity=email_link_clicked',
       'last_notable_activity=email_marked_spam',
       'last_notable_activity=email_opened',
       'last_notable_activity=email_received',
       'last_notable_activity=form_submitted_on_website',
       'l

# 4. Entrenamiento del modelo

In [17]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

In [18]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)
y_pred = model.predict_proba(X_val)
y_pred

array([[0.85215614, 0.14784386],
       [0.77063904, 0.22936096],
       [0.29597393, 0.70402607],
       ...,
       [0.81933945, 0.18066055],
       [0.71375189, 0.28624811],
       [0.74431444, 0.25568556]])

In [19]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.14784386, 0.22936096, 0.70402607, ..., 0.18066055, 0.28624811,
       0.25568556])

In [20]:
converted = y_pred >= 0.5
converted

array([False, False,  True, ..., False, False, False])

In [21]:
round((y_val == converted).mean(), 3) 

0.813

In [26]:
model.score(X_val,y_val)

0.8127049180327869

# 5. Serialización del modelo

In [22]:
import pickle

In [24]:
with open('../models/converted-model.pck', 'wb') as f:
    pickle.dump((dv, model), f)

In [25]:
with open('../models/converted-model.pck', 'rb') as f:
    dv, model = pickle.load(f)
    X_val = dv.transform(val_dict)
    y_pred = model.predict_proba(X_val)

y_pred

array([[0.85215614, 0.14784386],
       [0.77063904, 0.22936096],
       [0.29597393, 0.70402607],
       ...,
       [0.81933945, 0.18066055],
       [0.71375189, 0.28624811],
       [0.74431444, 0.25568556]])

# Pruebas

'lead_origin' 'lead_source','lead_profile', 'city','asymmetrique_activity_index','asymmetrique_profile_index','last_notable_activity'
'do_not_email', 'do_not_call', 'totalvisits','total_time_spent_on_website','page_views_per_visit',
'get_updates_on_dm_content','asymmetrique_activity_score','asymmetrique_profile_score','i_agree_to_pay_the_amount_through_cheque','a_free_copy_of_mastering_the_interview'


He estado probando el curl y no me funciona, al final he ido mejor por Postman que suele funcionar mejor para estas cosas
![alt text](Capture.png "Title")

In [36]:
!curl --request POST 'http://127.0.0.1:5000/predict' \
--header 'Content-Type: application/json' \
--data-raw '{ \
     "lead_origin": "API",\
     "lead_source": "Olark Chat",\
     "lead_profile": "Select",\
     "city": "Select",\
     "asymmetrique_activity_index": "02.Medium",\
     "asymmetrique_profile_index": "02.Medium",\
     "last_notable_activity": "Modified",\
     "do_not_email": "0",\
     "do_not_call": "0",\
     "totalvisits": "0",\
     "total_time_spent_on_website": "0",\
     "page_views_per_visit": "0",\
     "get_updates_on_dm_content": "0" ,\
     "asymmetrique_activity_score": "15",\
     "asymmetrique_profile_score": "15",\
     "i_agree_to_pay_the_amount_through_cheque": "0",\
     "a_free_copy_of_mastering_the_interview": "0"\
 }'

^C


curl: (1) Protocol "'http" not supported or disabled in libcurl
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0curl: (6) Could not resolve host: application

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0curl: (6) Could not resolve host: lead_origin

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0curl: (6) Could not resolve host: API,

  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0curl: (6) Could