<a href="https://colab.research.google.com/github/caioita/training/blob/main/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data manipulation
import numpy as np
import pandas as pd

# Data Visualiation
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

# System
import os
import glob

## Pipeline and sklearn
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.impute import SimpleImputer

# Machine Learning
from sklearn.neighbors import KNeighborsClassifier

#Metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
### Download just one to test
data = pd.read_csv('/content/drive/MyDrive/raw_data/solicitacoes_servicos_amazonas.csv',delimiter=',',header=0)

In [4]:
data.columns = ['id_null','date','client_id','service_id','device','platform','data_source', 'source','medium','channel_grouping','campaign','keyword','landing_page']

In [5]:
data.head()

Unnamed: 0,id_null,date,client_id,service_id,device,platform,data_source,source,medium,channel_grouping,campaign,keyword,landing_page
0,0,2022-04-07,1930474000.0,/pt-br/servicos/consultar-restituicao-de-impos...,desktop,Windows,web,servicos.receita.fazenda.gov.br,referral,Referral,(not set),(not set),/receitafederal/pt-br
1,0,2022-04-07,1930474000.0,/pt-br/servicos/consultar-cadastro-de-pessoas-...,desktop,Windows,web,servicos.receita.fazenda.gov.br,referral,Referral,(not set),(not set),/receitafederal/pt-br
2,0,2022-04-07,1930474000.0,/pt-br/servicos/consultar-cadastro-de-pessoas-...,desktop,Windows,web,servicos.receita.fazenda.gov.br,referral,Referral,(not set),(not set),/receitafederal/pt-br
3,0,2022-04-06,1930474000.0,/receitafederal/pt-br/assuntos/meu-cpf,desktop,Windows,web,servicos.receita.fazenda.gov.br,referral,Referral,(not set),(not set),/receitafederal/pt-br
4,0,2022-04-06,1930474000.0,/pt-br/servicos/consultar-cadastro-de-pessoas-...,desktop,Windows,web,servicos.receita.fazenda.gov.br,referral,Referral,(not set),(not set),/receitafederal/pt-br


In [6]:
data = data.drop_duplicates()

In [7]:
# filtragem de serviços acima de 50 solicitações
#data = data[['service_id'].map(data['service_id'].value_counts() >= 10)]
data = data[data['service_id'].map(data['service_id'].value_counts() > 5)]

In [8]:
data.isnull().sum()

id_null             0
date                0
client_id           0
service_id          0
device              0
platform            0
data_source         0
source              0
medium              0
channel_grouping    0
campaign            0
keyword             0
landing_page        0
dtype: int64

In [9]:
data.head()

Unnamed: 0,id_null,date,client_id,service_id,device,platform,data_source,source,medium,channel_grouping,campaign,keyword,landing_page
0,0,2022-04-07,1930474000.0,/pt-br/servicos/consultar-restituicao-de-impos...,desktop,Windows,web,servicos.receita.fazenda.gov.br,referral,Referral,(not set),(not set),/receitafederal/pt-br
1,0,2022-04-07,1930474000.0,/pt-br/servicos/consultar-cadastro-de-pessoas-...,desktop,Windows,web,servicos.receita.fazenda.gov.br,referral,Referral,(not set),(not set),/receitafederal/pt-br
3,0,2022-04-06,1930474000.0,/receitafederal/pt-br/assuntos/meu-cpf,desktop,Windows,web,servicos.receita.fazenda.gov.br,referral,Referral,(not set),(not set),/receitafederal/pt-br
4,0,2022-04-06,1930474000.0,/pt-br/servicos/consultar-cadastro-de-pessoas-...,desktop,Windows,web,servicos.receita.fazenda.gov.br,referral,Referral,(not set),(not set),/receitafederal/pt-br
5,0,2022-04-05,1930474000.0,/pt-br/servicos/consultar-restituicao-de-impos...,desktop,Windows,web,servicos.receita.fazenda.gov.br,referral,Referral,(not set),(not set),/receitafederal/pt-br


Fazendo o pipeline e transformação das features

In [10]:
## separar X e Y
## separar teste e train
## no treino separar o que é categorico do que é não é. 

In [11]:
X = data[['date','device','platform','source','landing_page']]
y = np.array(data['service_id'])

In [12]:
# Split data into train, test and validation sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 42  # TEST = 30%
)

In [13]:
cat_features_train = X_train[['device','platform','source','landing_page']]
cat_features_test = X_test[['device','platform','source','landing_page']]

In [14]:
X_train['date'].apply(lambda x: str(x)[5:7]).value_counts()

03    1031
02     773
05     760
04     632
06     477
01     396
07     357
11     226
12     225
10     218
09      52
Name: date, dtype: int64

In [15]:
def date_transform(data: pd.DataFrame):
    
    data['month'] = data['date'].apply(lambda x: int(str(x)[4:6]))
    data['sin_month'] = data['month'].apply(lambda x: np.sin(2*np.pi*(1-x)/12))
    data['cos_month'] = data['month'].apply(lambda x: np.cos(2*np.pi*(1-x)/12))
    
    dt_datetime = pd.to_datetime(
        data['date'],
        format="%Y-%m-%d") # provavelmente será necessário realizar um ajuste nesse formato, já que é diferente do anterior
                        # "%Y%m%d%s%Z"
    
    data['day'] = dt_datetime.dt.weekday + 1 # segunda = 1 e domingo = 7
    
    data = data.drop(columns=['month','date',])
    #data = data.drop(columns='date')
   

    return data [['sin_month','cos_month','day']]

In [16]:
print(X_train.info())
print(X_test.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5147 entries, 4654 to 9906
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          5147 non-null   object
 1   device        5147 non-null   object
 2   platform      5147 non-null   object
 3   source        5147 non-null   object
 4   landing_page  5147 non-null   object
dtypes: object(5)
memory usage: 241.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2207 entries, 3880 to 3980
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          2207 non-null   object
 1   device        2207 non-null   object
 2   platform      2207 non-null   object
 3   source        2207 non-null   object
 4   landing_page  2207 non-null   object
dtypes: object(5)
memory usage: 103.5+ KB
None


In [17]:
datas_train = date_transform(X_train)


In [18]:
X_train['date'].value_counts()

2022-03-17    62
2022-02-14    62
2022-03-16    60
2022-03-21    53
2022-03-28    49
              ..
2021-10-03     1
2021-10-01     1
2021-12-05     1
2021-10-09     1
2021-09-25     1
Name: date, Length: 302, dtype: int64

In [19]:
X_test['date'].value_counts()

2022-02-14    35
2022-03-21    33
2022-03-16    32
2022-03-15    29
2022-03-17    24
              ..
2022-01-30     1
2022-01-29     1
2021-12-19     1
2021-11-26     1
2021-10-12     1
Name: date, Length: 295, dtype: int64

In [20]:
datas_test = date_transform(X_test)

In [21]:
ohc = OneHotEncoder(handle_unknown="ignore",sparse = False)
cat_train_ohc = ohc.fit_transform(cat_features_train)
cat_test_ohc = ohc.transform(cat_features_test)


In [22]:
cat_train_ohc = pd.DataFrame(cat_train_ohc)

In [23]:
cat_test_ohc = pd.DataFrame(cat_test_ohc)

In [24]:
X_train_prepoc = pd.concat([datas_train.reset_index(drop=True),cat_train_ohc.reset_index(drop=True)],axis=1)
X_test_prepoc = pd.concat([datas_test.reset_index(drop=True),cat_test_ohc.reset_index(drop=True)],axis=1)

In [25]:
print(datas_train.shape)
print(cat_train_ohc.shape)
print(X_train.shape)
print(X_train_prepoc.shape)

(5147, 3)
(5147, 544)
(5147, 9)
(5147, 547)


In [26]:
print(X_test_prepoc.shape)
print(X_test.shape)

(2207, 547)
(2207, 9)


In [27]:
lbe_target = LabelEncoder()
ohc_target = OneHotEncoder(handle_unknown="ignore",sparse = False)

In [28]:
y_train

array(['/pt-br/servicos/solicitar-o-seguro-desemprego',
       '/pt-br/servicos/consultar-restituicao-de-imposto-de-renda',
       '/pt-br/servicos/consultar-dividas-e-pendencias-fiscais', ...,
       '/pt-br/servicos/contestar-o-resultado-do-auxilio-emergencial-coronavirus-covid-19',
       '/pt-br/servicos/receber-o-abono-salarial',
       '/pt-br/servicos/consultar-dividas-e-pendencias-fiscais'],
      dtype=object)

In [29]:
target_lbe_train = lbe_target.fit_transform(y_train)
target_lbe_test = lbe_target.transform(y_test)

In [30]:
target_ohc_train = ohc_target.fit_transform(target_lbe_train.reshape(-1,1))
target_ohc_test = ohc_target.transform(target_lbe_test.reshape(-1,1))


Testando o XGBoost

In [38]:
X_train_prepoc.shape

(5147, 547)

In [33]:
target_ohc_train.shape

(5147, 115)

In [34]:
target_ohc_train.shape

(5147, 115)

In [31]:
from xgboost import XGBClassifier

xgb_reg = XGBClassifier(objective = "multi:softmax", max_depth=10, n_estimators=100, learning_rate=0.1)

xgb_reg.fit(X_train_prepoc, target_lbe_train,
    #evaluate loss at each iteration
    eval_set=[(X_train_prepoc, target_lbe_train), (X_test_prepoc, target_lbe_test)],
    #stop iterating when eval loss increases 5 times in a row
    early_stopping_rounds=5
)

y_pred = xgb_reg.predict(X_test_prepoc)

[0]	validation_0-merror:0.514086	validation_1-merror:0.530131
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.498931	validation_1-merror:0.511101
[2]	validation_0-merror:0.480668	validation_1-merror:0.487993
[3]	validation_0-merror:0.474257	validation_1-merror:0.482556
[4]	validation_0-merror:0.470177	validation_1-merror:0.483009
[5]	validation_0-merror:0.460074	validation_1-merror:0.475306
[6]	validation_0-merror:0.45852	validation_1-merror:0.472134
[7]	validation_0-merror:0.453857	validation_1-merror:0.470775
[8]	validation_0-merror:0.450748	validation_1-merror:0.468962
[9]	validation_0-merror:0.448417	validation_1-merror:0.467603
[10]	validation_0-merror:0.447057	validation_1-merror:0.465791
[11]	validation_0-merror:0.442782	validation_1-merror:0.462619
[12]	validation_0-merror:0.440451	validation_1-merror:0.460353
[13]	validation_0-merror:0.438

In [32]:
from sklearn.metrics import top_k_accuracy_score
y_pred_proba = xgb_reg.predict_proba(X_test_prepoc)
top_k_accuracy_score(y_test,y_pred_proba,labels = lbe_target.classes_, k=6)


0.7195287720888084

In [39]:
recomendados

array(['/pt-br/servicos/consultar-dividas-e-pendencias-fiscais',
       '/pt-br/servicos/sacar-o-abono-salarial',
       '/pt-br/servicos/consultar-restituicao-de-imposto-de-renda',
       '/pt-br/servicos/consultar-cadastro-de-pessoas-fisicas',
       '/pt-br/servicos/consultar-e-solicitar-a-devolucao-de-valores-a-receber'],
      dtype=object)

In [34]:
range(y_pred_proba.shape[0])[3]

3

In [42]:
for i in range(1):
  recomendados = lbe_target.inverse_transform(np.argsort(y_pred_proba[i,:])[-5:])
  print("_________________________________________________")
  print(y_pred_proba)
  print("____________5_Recomendacoes______________________________")
  print(recomendados)
  print("**************************************")

_________________________________________________
[[0.00439072 0.00429974 0.00533262 ... 0.00391761 0.00407609 0.00386145]
 [0.01917453 0.00729104 0.00339097 ... 0.00249118 0.00259196 0.00245547]
 [0.00269082 0.00665046 0.00340501 ... 0.00240088 0.002498   0.00236646]
 ...
 [0.0023631  0.00231413 0.01325828 ... 0.00210847 0.00219376 0.00207824]
 [0.00455007 0.00425933 0.01459937 ... 0.00595516 0.00422402 0.00400159]
 [0.0222144  0.00827544 0.00417062 ... 0.00294072 0.00305968 0.00289856]]
____________5_Recomendacoes______________________________
['/pt-br/servicos/consultar-dividas-e-pendencias-fiscais'
 '/pt-br/servicos/sacar-o-abono-salarial'
 '/pt-br/servicos/consultar-restituicao-de-imposto-de-renda'
 '/pt-br/servicos/consultar-cadastro-de-pessoas-fisicas'
 '/pt-br/servicos/consultar-e-solicitar-a-devolucao-de-valores-a-receber']
**************************************


In [47]:
  recomendados = lbe_target.inverse_transform(np.argsort(y_pred_proba[100,:])[-5:])
  print("_________________________________________________")
  print("____________5_Recomendacoes______________________________")
  print(recomendados)
  print("**************************************")

_________________________________________________
____________5_Recomendacoes______________________________
['/pt-br/servicos/consultar-debitos-inscritos-em-divida-ativa-da-uniao'
 '/pt-br/servicos/consultar-cadastro-nacional-de-pessoas-juridicas'
 '/pt-br/servicos/validar-certidao-de-antecedentes-criminais'
 '/pt-br/servicos/consultar-multas-aplicadas-pelo-dnit'
 '/pt-br/servicos/consultar-cadastro-de-pessoas-fisicas']
**************************************


In [None]:
from xgboost import XGBClassifier
xgb_reg_2 = XGBClassifier(objective = "multi:softmax", max_depth=10, n_estimators=1000, learning_rate=0.1)

xgb_reg_2.fit(X_train_prepoc, target_lbe_train,
    #evaluate loss at each iteration
    eval_set=[(X_train_prepoc, target_lbe_train), (X_test_prepoc, target_lbe_test)],
    #stop iterating when eval loss increases 5 times in a row
    early_stopping_rounds=10
)

y_pred_2 = xgb_reg_2.predict(X_test_prepoc)

In [None]:
from sklearn.metrics import top_k_accuracy_score
y_pred_proba_2 = xgb_reg_2.predict_proba(X_test_prepoc)
top_k_accuracy_score(y_test,y_pred_proba_2,labels = lbe_target.classes_, k=6)