In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn import svm, datasets
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer

## Machine Learning Model Building Pipeline: Interpretability

## Prospect dataset: Data Analysis

In the following cells, we will analyse the variables of the prospect Dataset from data.csv. 

In [1]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns; sns.set_theme()

# to display all the columns of the dataframe in the notebook
pd.pandas.set_option('display.max_columns', None)

In [4]:
# load dataset
data = pd.read_csv('../data/data.csv', sep=';')

# rows and columns of the data
print(data.shape)

# visualise the dataset
data.head()

(9193, 35)


Unnamed: 0,ID_CLIENT,ORIGINE_LEAD,SOURCE_LEAD,NIVEAU_LEAD,QUALITE_LEAD,CONTACT_PAR_MAIL,CONTACT_PAR_TELEPHONE,STATUT_ACTUEL,CONVERTI,NB_VISITES,DUREE_SUR_SITEWEB,NB_PAGES_VUES_PAR_VISITE,DERNIERE_ACTIVITE,DERNIERE_ACTIVITE_NOTABLE,PAYS,VILLE,SPECIALISATION,TAGS,INDEX_ACTIVITE,INDEX_PROFIL,SCORE_ACTIVITE,SCORE_PROFIL,ANNONCE_VUE,MAGAZINE,ARTICLE_JOURNAL,FORUM,JOURNAUX,PUB_DIGITALE,RECOMMANDATION,Comment avez-vous entendu parler de nous ?,Souhaites-tu recevoir plus d'infos sur notre cours ?,Souhaites-tu recevoir des mises à jour sur nos programmes ?,Souhaites-tu recevoir des mises à jour par message privé ?,Souhaites-tu payer par chèque ?,Souhaites-tu recevoir une copie de notre livre blanc ?
0,628707,Formulaire Lead Add,Olark Chat,Select,,Non,Non,Sans emploi,0,0.0,0,0.0,Email ouvert,Email ouvert,,Select,Marketing Management,,,,,,Non,Non,Non,Non,Non,Non,Non,Select,Non,Non,Non,Non,Non
1,650444,Soumission landing page,Organic Search,Autre leads,Pas du tout pertinent,Non,Non,Sans emploi,0,3.0,519,3.0,Page visitée sur le site,Modifié,India,Autres villes de Maharashtra,"Banking, Investment And Insurance",Ne pas suivre de formation continue,Moyen,Elevé,15.0,17.0,Non,Non,Non,Non,Non,Non,Non,Select,Non,Non,Non,Non,Oui
2,631159,Soumission landing page,Google,,,Non,Non,,0,3.0,323,3.0,Email ouvert,Email ouvert,India,Mumbai,Hospitality Management,,,,,,Non,Non,Non,Non,Non,Non,Non,,Non,Non,Non,Non,Non
3,637163,Formulaire Lead Add,Reference,Lead potentiel,Pourrait être pertinent,Non,Non,Sans emploi,1,0.0,0,0.0,Email ouvert,Email ouvert,,Select,"Banking, Investment And Insurance",Reviendra après avoir lu le courriel,Moyen,Elevé,15.0,19.0,Non,Non,Non,Non,Non,Non,Non,Select,Non,Non,Non,Non,Non
4,644599,Soumission landing page,Direct Traffic,,,Non,Non,,0,3.0,258,3.0,Email ouvert,Email ouvert,India,Autres villes,Finance Management,,Moyen,Moyen,14.0,16.0,Non,Non,Non,Non,Non,Non,Non,Etudiant d'une certaine école,Non,Non,Non,Non,Non


In [None]:
X_train.isna().sum()

In [16]:
categorical_variables = [
    'ORIGINE_LEAD', 'SOURCE_LEAD', 'NIVEAU_LEAD', 'TAGS', 'VILLE', 'SPECIALISATION', 'INDEX_ACTIVITE',
    'QUALITE_LEAD', 'CONTACT_PAR_MAIL', 'STATUT_ACTUEL', 'DERNIERE_ACTIVITE', 'DERNIERE_ACTIVITE_NOTABLE'
]

numerical_variables = [ 'NB_VISITES', 'SCORE_ACTIVITE', 'NB_PAGES_VUES_PAR_VISITE', 'SCORE_PROFIL', 'DUREE_SUR_SITEWEB' ]

In [17]:
X = data[categorical_variables + numerical_variables]
y = data['CONVERTI']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [5]:
def change_separator(df, categorical_variables, sep_in='_', sep_out='__'):
    column_names_serie = pd.Series(df.columns.tolist())

    for var in categorical_variables:
        column_names_serie = column_names_serie.str.replace(var+sep_in, var+sep_out)
    
    df.columns = column_names_serie.tolist()
    return df

In [19]:
from category_encoders import OneHotEncoder

onehot = OneHotEncoder(
    cols=categorical_variables,
    handle_unknown='ignore',
    use_cat_names=True
).fit(X_train)


X_train_dummified = change_separator(
    onehot.transform(X_train),
    onehot.cols
)

X_test_dummified = change_separator(
    onehot.transform(X_test),
    onehot.cols
)

In [20]:
X_train_dummified

Unnamed: 0,ORIGINE_LEAD__API,ORIGINE_LEAD__Soumission landing page,ORIGINE_LEAD__Formulaire Lead Add,ORIGINE_LEAD__Importation de lead,ORIGINE_LEAD__Formulaire Quick Add,SOURCE_LEAD__Olark Chat,SOURCE_LEAD__Organic Search,SOURCE_LEAD__Google,SOURCE_LEAD__Direct Traffic,SOURCE_LEAD__Reference,SOURCE_LEAD__Welingak Website,SOURCE_LEAD__Referral Sites,SOURCE_LEAD__Facebook,SOURCE_LEAD__nan,SOURCE_LEAD__Pay per Click Ads,SOURCE_LEAD__Click2call,SOURCE_LEAD__google,SOURCE_LEAD__Live Chat,SOURCE_LEAD__testone,SOURCE_LEAD__Social Media,SOURCE_LEAD__blog,SOURCE_LEAD__WeLearn,SOURCE_LEAD__bing,SOURCE_LEAD__youtubechannel,SOURCE_LEAD__Press_Release,SOURCE_LEAD__NC_EDM,NIVEAU_LEAD__Select,NIVEAU_LEAD__nan,NIVEAU_LEAD__Lead potentiel,NIVEAU_LEAD__Autre leads,NIVEAU_LEAD__Etudiant d'une certaine école,NIVEAU_LEAD__Etudiant,NIVEAU_LEAD__Etudiant en double spécialisation,TAGS__Intéressé par d'autres cours,TAGS__nan,TAGS__Appelé,TAGS__Désactivé,TAGS__Reviendra après avoir lu le courriel,TAGS__Intéressé par un MBA full-time,TAGS__Fermé,TAGS__Diplôme en cours,TAGS__Mauvais numéro,TAGS__Occupé,TAGS__Numéro invalide,TAGS__Perdu au profit d'un concurrent,TAGS__Déjà un étudiant,TAGS__Ne pas suivre de formation continue,TAGS__Titulaire d'un diplôme (non éligible),TAGS__Intéressé mais problème financier,TAGS__Raccroché,TAGS__En contact avec un concurrent,TAGS__Université non reconnue,TAGS__Intéressé pour la prochaine session,TAGS__Numéro non fourni,TAGS__Reflexion en cours,TAGS__Hésite entre le part-time et full-time,TAGS__Etudiant en arrivé tardive,TAGS__Va s'inscrire dans le mois à venir,TAGS__Problème de reconnaissance,VILLE__Select,VILLE__Autres villes,VILLE__Mumbai,VILLE__Autres villes métropolitaines,VILLE__nan,VILLE__Thane et sa périphérie,VILLE__Villes de niveau II,VILLE__Autres villes de Maharashtra,SPECIALISATION__Select,SPECIALISATION__Supply Chain Management,SPECIALISATION__Marketing Management,SPECIALISATION__Finance Management,SPECIALISATION__nan,SPECIALISATION__Operations Management,SPECIALISATION__Business Administration,"SPECIALISATION__Banking, Investment And Insurance",SPECIALISATION__E-Business,SPECIALISATION__Hospitality Management,SPECIALISATION__IT Projects Management,SPECIALISATION__Human Resource Management,SPECIALISATION__Healthcare Management,SPECIALISATION__Rural and Agribusiness,SPECIALISATION__Media and Advertising,SPECIALISATION__Travel and Tourism,SPECIALISATION__International Business,SPECIALISATION__Retail Management,SPECIALISATION__Services Excellence,SPECIALISATION__E-COMMERCE,INDEX_ACTIVITE__Moyen,INDEX_ACTIVITE__nan,INDEX_ACTIVITE__Elevé,INDEX_ACTIVITE__Faible,QUALITE_LEAD__nan,QUALITE_LEAD__Pas sur,QUALITE_LEAD__Pourrait être pertinent,QUALITE_LEAD__Très pertinent,QUALITE_LEAD__Pas du tout pertinent,QUALITE_LEAD__Peu pertinent,CONTACT_PAR_MAIL__Non,CONTACT_PAR_MAIL__Oui,STATUT_ACTUEL__Sans emploi,STATUT_ACTUEL__nan,STATUT_ACTUEL__Etudiant,STATUT_ACTUEL__Professionnel en activité,STATUT_ACTUEL__Homme d'affaire,STATUT_ACTUEL__Autre,STATUT_ACTUEL__Femme au foyer,DERNIERE_ACTIVITE__Conversation Chat,DERNIERE_ACTIVITE__SMS envoyé,DERNIERE_ACTIVITE__Email ouvert,DERNIERE_ACTIVITE__Page visitée sur le site,DERNIERE_ACTIVITE__Désinscrit,DERNIERE_ACTIVITE__A cliqué sur le lien dans le mail,DERNIERE_ACTIVITE__Stand visité au salon,DERNIERE_ACTIVITE__Email rejeté,DERNIERE_ACTIVITE__Converti en lead,DERNIERE_ACTIVITE__Approche directe,DERNIERE_ACTIVITE__Formulaire soumis sur le site,DERNIERE_ACTIVITE__nan,DERNIERE_ACTIVITE__Injoignable,DERNIERE_ACTIVITE__Email marqué comme Spam,DERNIERE_ACTIVITE__A eu une conversation téléphonique,DERNIERE_ACTIVITE__A cliqué sur le lien dand le navigateur,DERNIERE_ACTIVITE__Email reçu,DERNIERE_ACTIVITE__Réinscrit aux emails,DERNIERE_ACTIVITE__NOTABLE_Modifié,DERNIERE_ACTIVITE__NOTABLE_SMS envoyé,DERNIERE_ACTIVITE__NOTABLE_Email ouvert,DERNIERE_ACTIVITE__NOTABLE_Page visitée sur le site,DERNIERE_ACTIVITE__NOTABLE_Désinscrit,DERNIERE_ACTIVITE__NOTABLE_A cliqué sur le lien dans le mail,DERNIERE_ACTIVITE__NOTABLE_Conversation Chat,DERNIERE_ACTIVITE__NOTABLE_Email rejeté,DERNIERE_ACTIVITE__NOTABLE_Injoignable,DERNIERE_ACTIVITE__NOTABLE_Email marqué comme Spam,DERNIERE_ACTIVITE__NOTABLE_A eu une conversation téléphonique,DERNIERE_ACTIVITE__NOTABLE_A cliqué sur le lien dand le navigateur,DERNIERE_ACTIVITE__NOTABLE_Réinscrit aux emails,DERNIERE_ACTIVITE__NOTABLE_Email reçu,NB_VISITES,SCORE_ACTIVITE,NB_PAGES_VUES_PAR_VISITE,SCORE_PROFIL,DUREE_SUR_SITEWEB
1369,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,15.0,0.00,15.0,0
1961,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0,15.0,0.00,15.0,0
773,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,11.0,13.0,3.67,16.0,127
6322,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3.0,,3.00,,35
8711,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5.0,14.0,2.50,16.0,343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4373,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,14.0,2.00,20.0,330
7891,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,,2.00,,315
4859,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3.0,,3.00,,117
3264,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,15.0,1.00,18.0,1203


In [21]:
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

In [22]:
model.fit(X_train_dummified, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').