In [137]:
# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
)

from feature_engine.transformation import LogTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import MultiLabelBinarizer

# import preprocessor as pp


from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler

#XGboost classifier
from xgboost import XGBClassifier


#import PCA
from sklearn.decomposition import PCA

#one hot encoding
from sklearn.preprocessing import OneHotEncoder

In [138]:
# load dataset
df = pd.read_csv('../output/saida.csv')

# rows and columns of the data
print(df.shape)

# visualise the dataset
df.head()

(484381, 35)


Unnamed: 0,id_form,id_cliente,data,sistema_operacional,navegador,localizacao,locale_x,cidade_x,estado_x,qtde_lavagem_semana,...,status_pagamento,tipo_pagamento,valor_total,valor_desconto,frete_gratis,cidade_y,estado_y,locale_y,utm_source_y,target
0,c7d453c9c01223a42968d9fe136047,d2620a3e6f00a6dc6e88,2022-01-03 15:42:22,iOS,Safari,"São Paulo, Sao Paulo",BR,SÃO PAULO,SAO PAULO,4,...,Entregue,CARTÃO,124.03,0.0,False,São Paulo,SP,BR,,1.0
1,72830da153824af486be15cd48b3f3,9a48802d8290d19471f3,2022-01-04 08:39:53,Windows,Chrome,"Ribeirão Preto, Sao Paulo",BR,RIBEIRÃO PRETO,SAO PAULO,7,...,Cancelado,PIX,282.22,0.0,False,Jundiaí,SP,BR,,1.0
2,72830da153824af486be15cd48b3f3,9a48802d8290d19471f3,2022-01-04 08:39:53,Windows,Chrome,"Ribeirão Preto, Sao Paulo",BR,RIBEIRÃO PRETO,SAO PAULO,7,...,Cancelado,PIX,282.22,0.0,False,Jundiaí,SP,BR,,1.0
3,d101fc0075dac29db1168cfc61ad44,c3344fad450ee424700d,2022-01-04 19:34:05,Android,Chrome,"São Paulo, Sao Paulo",BR,SÃO PAULO,SAO PAULO,2,...,,,,,,,,,,
4,0957fb9d7be36b4d079b6582bb6785,8105e34124e90300d8b5,2022-01-05 18:29:41,Android,Chrome,"Manchester, England",BR,MANCHESTER,ENGLAND,5,...,,,,,,,,,,


In [139]:
#convert columns caracteristic and procedimentos to list
df['caracteristica'] = df['caracteristica'].apply(lambda x: x.replace(' ',''))
df['caracteristica'] = df['caracteristica'].apply(lambda x: x.split(','))
df.procedimentos.replace('N/C','N_C', inplace=True)
df['procedimentos'] = df['procedimentos'].apply(lambda x: x.split(','))

df.efeitos_desejados.replace('N/C','efeitos_desejados_N_C', inplace=True)
df.efeitos_desejados.fillna('Missing_efeitos_desejados', inplace=True)
df['efeitos_desejados'] = df['efeitos_desejados'].apply(lambda x: x.replace(' ',''))
df['efeitos_desejados'] = df['efeitos_desejados'].apply(lambda x: x.split(','))


In [140]:
from sklearn.preprocessing import MultiLabelBinarizer


mlb = MultiLabelBinarizer(sparse_output=True)

df = df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(df.pop('caracteristica')),
                index=df.index,
                columns=mlb.classes_))

In [141]:
mlb = MultiLabelBinarizer(sparse_output=True)

df = df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(df.pop('efeitos_desejados')),
                index=df.index,
                columns=mlb.classes_))

mlb = MultiLabelBinarizer(sparse_output=True)

df = df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(df.pop('procedimentos')),
                index=df.index,
                columns=mlb.classes_))

In [142]:
# Let's separate into train and test set
# Remember to set the seed (random_state for this sklearn function)

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['target'], axis=1), # predictive variables
    df['target'], # target
    test_size=0.2, # portion of dataset to allocate to test set
    random_state=0, # we are setting the seed here
)

X_train.shape, X_test.shape

((387504, 63), (96877, 63))

In [143]:
y_train.fillna(0, inplace=True)
y_test.fillna(0, inplace=True)

In [144]:
# categorical variables with NA in train set
CATEGORICAL_VARS_WITH_NA_MISSING = [
 'estado_x',
]

# categorical variables with NA in train set
CATEGORICAL_VARS_WITH_NA_FREQUENT = [
 'navegador',
 'utm_source_x','cidade_x'
]

# numerical variables
NUMERICAL_VARS = ['qtde_lavagem_semana']



# this variable is to calculate the temporal variable,
# can be dropped afterwards
DROP_FEATURES = ['cidade_y',
 'Característica5',
 'Efeito4',
 'locale_y',
 'Característica8',
 'id_cliente',
 'N_C',
 'Corte agressivo',
 'Efeito5',
 'N/C',
 ' Exposição ao sol',
 'frete_gratis',
 'Tintura',
 'Efeito6',
 ' Corte agressivo',
 'Efeito1',
 'tipo_pagamento',
 ' Nenhum',
 'status_pagamento',
 'Missing_efeitos_desejados',
 'utm_source_y',
 'Efeito2',
 'Característica9',
 'estado_y',
 'localizacao',
 'valor_total',
 'Característica3',
 'Característica6',
 ' Tintura',
 'Efeito3',
 'id_form',
 'Característica7',
 'Característica4',
 'Efeito7',
 'Característica10',
 'efeitos_desejados_N_C',
 'data_pedido',
 'Efeito8',
 'valor_desconto',
 'Efeito10',
 'Característica2',
 'Exposição ao sol',
 'data',
 'Nenhum',
 'id_pedido',
 'Todos acima',
 'Característica1']




# categorical variables to encode
CATEGORICAL_VARS = [
                     'sistema_operacional',
                     'navegador',
                     'locale_x',
                     'cidade_x',
                     'estado_x',
                     'utm_source_x',
                     'tipo_cabelo',
                     'comprimento',
                     'tipo_fios',
                     'tempo_procedimento',
                     'dieta',
                     'atividade_fisica',
                     'frequencia_estresse',
                     'faixa_etaria',
                     'fragancia',

]

#RESULT = set(df.columns.tolist()) - set(FEATURES)
FEATURES = ['faixa_etaria','utm_source_x','atividade_fisica','estado_x','qtde_lavagem_semana','dieta','frequencia_estresse','tipo_cabelo','navegador','tempo_procedimento','tipo_fios','comprimento','sistema_operacional','locale_x', 'cidade_x', 'fragancia']



In [145]:
under = RandomUnderSampler(sampling_strategy='majority')
X_train, y_train = under.fit_resample(X_train, y_train)
X_test, y_test = under.fit_resample(X_test, y_test)



In [146]:
class_pipe = Pipeline([

    #drop features
    ('drop_features', DropFeatures(features_to_drop=DROP_FEATURES)),

    # ===== IMPUTATION =====
    # impute categorical variables with string missing
    ('missing_imputation', CategoricalImputer(
        imputation_method='missing', variables=CATEGORICAL_VARS_WITH_NA_MISSING)),

    #TODO: Entender como iremos fazer isso
    ('frequent_imputation', CategoricalImputer(
        imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_FREQUENT)),

     # == CATEGORICAL ENCODING
    ('rare_label_encoder', RareLabelEncoder(
        tol=0.01, n_categories=1, variables=CATEGORICAL_VARS
    )),

    # encode categorical and discrete variables using the target mean
    ('categorical_encoder', OrdinalEncoder(
        encoding_method='ordered', variables=CATEGORICAL_VARS)),

    #MODEL Xgboost
    ('xgboost', XGBClassifier(
        learning_rate= 0.5,
        max_depth= 7,
        n_estimators= 200,
        verbose=True
    )),


])



In [147]:
# train the pipeline
class_pipe.fit(X_train, y_train)



Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [148]:
pred_train = class_pipe.predict(X_train)

In [149]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, pred_train)

array([[103065,  20881],
       [ 36085,  87861]], dtype=int64)

In [150]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

print(metrics.classification_report(y_train, pred_train))
print('ROC: %.3f' % metrics.roc_auc_score(y_train, pred_train, average='micro'))

              precision    recall  f1-score   support

         0.0       0.74      0.83      0.78    123946
         1.0       0.81      0.71      0.76    123946

    accuracy                           0.77    247892
   macro avg       0.77      0.77      0.77    247892
weighted avg       0.77      0.77      0.77    247892

ROC: 0.770


In [151]:
pred_test = class_pipe.predict(X_test)


In [152]:
from sklearn import metrics

print(metrics.classification_report(y_test, pred_test))

              precision    recall  f1-score   support

         0.0       0.68      0.75      0.71     30987
         1.0       0.72      0.64      0.68     30987

    accuracy                           0.70     61974
   macro avg       0.70      0.70      0.70     61974
weighted avg       0.70      0.70      0.70     61974



In [153]:
from sklearn import metrics


print('ROC: %.3f' % metrics.roc_auc_score(y_test, pred_test, average='micro'))

ROC: 0.698


In [154]:
#save model
import joblib

joblib.dump(class_pipe, r'C:\Users\cassio.reis_saudeid\Desktop\CASSIO\leads_classification\app\models\class_pipe.pkl')

['C:\\Users\\cassio.reis_saudeid\\Desktop\\CASSIO\\leads_classification\\app\\models\\class_pipe.pkl']