In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split


In [4]:
# load dataset
df = pd.read_csv('../output/saida.csv')

# rows and columns of the data
print(df.shape)

# visualise the dataset
df.head()

(484381, 35)


Unnamed: 0,id_form,id_cliente,data,sistema_operacional,navegador,localizacao,locale_x,cidade_x,estado_x,qtde_lavagem_semana,...,status_pagamento,tipo_pagamento,valor_total,valor_desconto,frete_gratis,cidade_y,estado_y,locale_y,utm_source_y,target
0,c7d453c9c01223a42968d9fe136047,d2620a3e6f00a6dc6e88,2022-01-03 15:42:22,iOS,Safari,"São Paulo, Sao Paulo",BR,SÃO PAULO,SAO PAULO,4,...,Entregue,CARTÃO,124.03,0.0,False,São Paulo,SP,BR,,1.0
1,72830da153824af486be15cd48b3f3,9a48802d8290d19471f3,2022-01-04 08:39:53,Windows,Chrome,"Ribeirão Preto, Sao Paulo",BR,RIBEIRÃO PRETO,SAO PAULO,7,...,Cancelado,PIX,282.22,0.0,False,Jundiaí,SP,BR,,1.0
2,72830da153824af486be15cd48b3f3,9a48802d8290d19471f3,2022-01-04 08:39:53,Windows,Chrome,"Ribeirão Preto, Sao Paulo",BR,RIBEIRÃO PRETO,SAO PAULO,7,...,Cancelado,PIX,282.22,0.0,False,Jundiaí,SP,BR,,1.0
3,d101fc0075dac29db1168cfc61ad44,c3344fad450ee424700d,2022-01-04 19:34:05,Android,Chrome,"São Paulo, Sao Paulo",BR,SÃO PAULO,SAO PAULO,2,...,,,,,,,,,,
4,0957fb9d7be36b4d079b6582bb6785,8105e34124e90300d8b5,2022-01-05 18:29:41,Android,Chrome,"Manchester, England",BR,MANCHESTER,ENGLAND,5,...,,,,,,,,,,


### Fillna - Variável Target

In [5]:
df.target.fillna(0, inplace=True)

In [6]:
#convert columns caracteristic and procedimentos to list
df['caracteristica'] = df['caracteristica'].apply(lambda x: x.replace(' ',''))
df['caracteristica'] = df['caracteristica'].apply(lambda x: x.split(','))
df.procedimentos.replace('N/C','N_C', inplace=True)
df['procedimentos'] = df['procedimentos'].apply(lambda x: x.split(','))

df.efeitos_desejados.replace('N/C','efeitos_desejados_N_C', inplace=True)
df.efeitos_desejados.fillna('Missing_efeitos_desejados', inplace=True)
df['efeitos_desejados'] = df['efeitos_desejados'].apply(lambda x: x.replace(' ',''))
df['efeitos_desejados'] = df['efeitos_desejados'].apply(lambda x: x.split(','))



In [7]:
from sklearn.preprocessing import MultiLabelBinarizer


mlb = MultiLabelBinarizer(sparse_output=True)

df = df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(df.pop('caracteristica')),
                index=df.index,
                columns=mlb.classes_))

In [8]:
mlb = MultiLabelBinarizer(sparse_output=True)

df = df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(df.pop('efeitos_desejados')),
                index=df.index,
                columns=mlb.classes_))

mlb = MultiLabelBinarizer(sparse_output=True)

df = df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(df.pop('procedimentos')),
                index=df.index,
                columns=mlb.classes_))

### Split data into train and test

In [9]:
# Let's separate into train and test set
# Remember to set the seed (random_state for this sklearn function)

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['target'], axis=1), # predictive variables
    df['target'], # target
    test_size=0.2, # portion of dataset to allocate to test set
    random_state=0, # we are setting the seed here
)

X_train.shape, X_test.shape

((387504, 63), (96877, 63))

In [10]:
from imblearn.under_sampling import RandomUnderSampler

under = RandomUnderSampler(sampling_strategy='majority')
X_train, y_train = under.fit_resample(X_train, y_train)
X_test, y_test = under.fit_resample(X_test, y_test)



In [11]:
####CONFIGURATION

DROP_FEATURES = [
    'id_cliente',
    'localizacao',
    'id_form',
    'status_pagamento',
    'tipo_pagamento',
    'valor_total',
    'valor_desconto',
    'frete_gratis',
    'cidade_y',
    'estado_y',
    'locale_y',
    'utm_source_y',
    'id_pedido',
    'data',
    'data_pedido',
    'locale_x',
    'cidade_x',
    'sistema_operacional',
]

# categorical variables with NA in train set
CATEGORICAL_VARS_WITH_NA_FREQUENT = [
 'navegador',
 'utm_source_x',

]


CATEGORICAL_VARS = [
                     'estado_x',
                     'utm_source_x',
                     'tipo_cabelo',
                     'comprimento',
                     'tipo_fios',
                     'tempo_procedimento',
                     'dieta',
                     'atividade_fisica',
                     'frequencia_estresse',
                     'faixa_etaria',
                     'fragancia',
                     'navegador'
]



#### Drop columns

In [12]:
from feature_engine.selection import DropFeatures

X_train = DropFeatures(DROP_FEATURES).fit_transform(X_train)
X_test = DropFeatures(DROP_FEATURES).fit_transform(X_test)


#### Missing Values Input

In [13]:
### Imputation
from feature_engine.imputation import (
    CategoricalImputer,
)

for col in ['estado_x']:
    inpt = CategoricalImputer(imputation_method='missing')
    X_train[col] = inpt.fit_transform(X_train[[col]])
    X_test[col] = inpt.transform(X_test[[col]])




In [14]:
# ('frequent_imputation', CategoricalImputer(
#         imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_FREQUENT)

for col in CATEGORICAL_VARS_WITH_NA_FREQUENT:
    inpt = CategoricalImputer(imputation_method='frequent')
    X_train[col] = inpt.fit_transform(X_train[[col]])
    X_test[col] = inpt.transform(X_test[[col]])

#### Encoder

In [15]:
pd.DataFrame({
    'col': X_train.columns,
    'type': X_train.dtypes,
    'missing': X_train.isnull().sum().values / X_train.shape[0]
}).sort_values('missing', ascending=False)

Unnamed: 0,col,type,missing
navegador,navegador,object,0.0
N/C,N/C,"Sparse[int32, 0]",0.0
Efeito10,Efeito10,"Sparse[int32, 0]",0.0
Efeito2,Efeito2,"Sparse[int32, 0]",0.0
Efeito3,Efeito3,"Sparse[int32, 0]",0.0
Efeito4,Efeito4,"Sparse[int32, 0]",0.0
Efeito5,Efeito5,"Sparse[int32, 0]",0.0
Efeito6,Efeito6,"Sparse[int32, 0]",0.0
Efeito7,Efeito7,"Sparse[int32, 0]",0.0
Efeito8,Efeito8,"Sparse[int32, 0]",0.0


#### Rare Labels

In [16]:
from feature_engine.encoding import (
    RareLabelEncoder,
)

for col in CATEGORICAL_VARS:
    encoder = RareLabelEncoder(tol=0.01)
    X_train[col] = encoder.fit_transform(X_train[[col]])
    X_test[col] = encoder.transform(X_test[[col]])



In [17]:
from feature_engine.encoding import (
    OrdinalEncoder,
)

encoder = OrdinalEncoder(encoding_method='ordered', variables=CATEGORICAL_VARS)
encoder.fit(X_train, y_train)

X_train= encoder.transform(X_train)
X_test= encoder.transform(X_test)


#### Redução de dimensionalidade

In [18]:
#### Analise de variancia

In [19]:
# from sklearn.feature_selection import VarianceThreshold
#
# def variance_threshold_selector(data, threshold=0.5):
#     selector = VarianceThreshold(threshold)
#     selector.fit(data)
#     return data[data.columns[selector.get_support(indices=True)]]
#
# X_train = variance_threshold_selector(X_train, threshold=0.5)
# X_test = variance_threshold_selector(X_test, threshold=0.5)

In [20]:
pd.DataFrame({
    'col': X_train.columns,
    'type': X_train.dtypes,
    'missing': X_train.isnull().sum().values / X_train.shape[0]
}).sort_values('missing', ascending=False)

Unnamed: 0,col,type,missing
navegador,navegador,int64,0.0
N/C,N/C,"Sparse[int32, 0]",0.0
Efeito10,Efeito10,"Sparse[int32, 0]",0.0
Efeito2,Efeito2,"Sparse[int32, 0]",0.0
Efeito3,Efeito3,"Sparse[int32, 0]",0.0
Efeito4,Efeito4,"Sparse[int32, 0]",0.0
Efeito5,Efeito5,"Sparse[int32, 0]",0.0
Efeito6,Efeito6,"Sparse[int32, 0]",0.0
Efeito7,Efeito7,"Sparse[int32, 0]",0.0
Efeito8,Efeito8,"Sparse[int32, 0]",0.0


In [21]:
from xgboost import XGBClassifier

clf = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    objective='binary:logistic',
    subsample=0.8,
    colsample_bytree=0.8,
    seed=0,
    nthread=4,
    silent=True,
)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import *

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1: {f1_score(y_test, y_pred)}')


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Accuracy: 0.661632297415045
Precision: 0.6895651186556149
Recall: 0.5879562397134283
F1: 0.634719899665552


In [22]:
print(classification_report(y_test, y_pred))
print('ROC: %.3f' % roc_auc_score(y_test, y_pred, average='micro'))


              precision    recall  f1-score   support

         0.0       0.64      0.74      0.68     30987
         1.0       0.69      0.59      0.63     30987

    accuracy                           0.66     61974
   macro avg       0.67      0.66      0.66     61974
weighted avg       0.67      0.66      0.66     61974

ROC: 0.662


In [None]:
print i

### PyCaret

In [23]:
#concat X_train and y train
X_train_y = pd.concat([X_train, y_train], axis=1)


In [None]:
from pycaret.classification import *

pycaret_automl = setup(data=X_train_y,
                       target='target',
                       fold_shuffle=True,
                       session_id=2,
                       remove_multicollinearity = True,
                       multicollinearity_threshold = 0.95,
                       fix_imbalance = True,
                       imputation_type='iterative'
                       )



In [None]:
pycaret_automl

In [None]:
X_train.columns
cols=['']

In [None]:
pd.DataFrame({
    'col': X_train.columns,
    'type': X_train.dtypes,
    'missing': X_train.isnull().sum().values / X_train.shape[0],
    'cardinality' : X_train.nunique().values,
}).sort_values('missing', ascending=False)