In [1]:
###################################################################################################
#Auxiliares
import os
import re
import pickle
import random
from time import time
import numpy as np
import tensorflow as tf
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Now using tensorflow 2.1.0, so no need to patch
# from tfdeterminism import patch
#patch()

seed = 42
os.environ['PYTHONHASHSEED']=str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
###################################################################################################
# utils
from distutils.version import LooseVersion
from tqdm import tqdm_notebook
from datetime import datetime
from time import time

import warnings
import pickle
import gc
import sys
from json import dumps
import itertools
import re

# Data
import spacy

# Viz
import matplotlib.pyplot as plt

# Machine Learning
import tensorflow.keras.backend as K
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Bidirectional, LSTM, Flatten, Dense, Reshape
from keras.layers.embeddings import Embedding
from keras.models import Sequential, Model, load_model
from keras.optimizers import Adam
from keras.callbacks.callbacks import EarlyStopping
from keras_self_attention import SeqSelfAttention
from keras_multi_head import MultiHead, MultiHeadAttention
###################################################################################################
#Dados
import pandas as pd
import matplotlib as pl

#preprocessing and transformation
from sklearn.preprocessing import normalize, MaxAbsScaler, MinMaxScaler, StandardScaler
from nltk.corpus import stopwords
#from nltk import word_tokenize, pos_tag

#Machine learning
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import LabelEncoder

#Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, precision_recall_fscore_support, f1_score

seed = 42

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

Using TensorFlow backend.


In [2]:
#Variables
label='Atendimento'
exp = label + '-Balanced-Multiclass'
base_path = 'D:/03. Documentos/Mestrado/22032020 - Experimentos/05. Organizado/03. Datasets/'+exp
save_path = 'output'
data='resp-text'

# Models to be loaded
reglog_model_path = 'D:/03. Documentos/Mestrado/22032020 - Experimentos/05. Organizado/02. Notebooks/01. '+label+'/03. Resposta/output'
reglog_model_name = '2020_05_20_00_27_55_Atendimento_Resposta_Multiclass_Balanced_word.sav'
bilstm_mha_model_path = 'D:/Outputs_Mestrado/resultados_Atendimento/checkpoins_resposta_keras_mh_att'
bilstm_mha_model_name = '20200516_215511/model.h5'

x_train_file = 'X_train.csv'
y_train_file = 'y_train.csv'
x_test_file = 'X_test.csv'
y_test_file = 'y_test.csv'

#Load data
X_train = pd.read_csv(os.path.join(base_path, x_train_file), sep=';', encoding='utf-8')
y_train = pd.read_csv(os.path.join(base_path, y_train_file), sep=';', encoding='utf-8')
X_test = pd.read_csv(os.path.join(base_path, x_test_file), sep=';', encoding='utf-8')
y_test = pd.read_csv(os.path.join(base_path, y_test_file), sep=';', encoding='utf-8')

#Checking on data
print(X_test.columns)
print(X_test.shape)
print(y_test[label].value_counts())
print(y_test[label].value_counts())

Index(['pid', 'req-text', 'resp-text', '1funct-request', '2pronoun-request',
       '3ppron-request', '4i-request', '5we-request', '6you-request',
       '7shehe-request',
       ...
       '58home-response', '59money-response', '60relig-response',
       '61death-response', '62assent-response', '63nonfl-response',
       '64filler-response', 'Clareza', 'Atendimento', 'tempo_resposta'],
      dtype='object', length=134)
(2993, 134)
2    1029
1    1006
0     958
Name: Atendimento, dtype: int64
2    1029
1    1006
0     958
Name: Atendimento, dtype: int64


In [3]:
# load the RegLog model from disk
reglog_model = pickle.load(open(os.path.join(reglog_model_path, reglog_model_name), 'rb'))
bilstm_mha = load_model(os.path.join(bilstm_mha_model_path, bilstm_mha_model_name),
                        custom_objects={'MultiHeadAttention': MultiHeadAttention,
                                        'recall_m': recall_m,
                                        'precision_m': precision_m,
                                        'f1_m': f1_m})

In [4]:
max_num_words = bilstm_mha._layers[1]._trainable_weights[0].shape[0]
max_length = bilstm_mha.inputs[0].shape[1]

In [5]:
# PREDICT TRAIN
y_pred_reglog_train = reglog_model.predict(X_train[data])
y_pred_reglog_prob_train = reglog_model.predict_proba(X_train[data])

# PREDICT TEST
y_pred_reglog_test = reglog_model.predict(X_test[data])
y_pred_reglog_prob_test = reglog_model.predict_proba(X_test[data])

In [6]:
# Define tokenizer and fit train data
t = Tokenizer(num_words=max_num_words)
t.fit_on_texts(X_train[data].append(X_test[data]))
word_index = t.word_index
vocab_size = len(word_index) + 1
print('Found %s unique tokens.' % len(word_index))
    
def get_seqs(text):    
    sequences = t.texts_to_sequences(text)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    return padded_sequences

# prepare target
def prepare_targets(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    return pd.get_dummies(y_train_enc), pd.get_dummies(y_test_enc)

# X and Y
label_train, label_test = prepare_targets(y_train[label].values, y_test[label].values)
num_labels = len(set(label_train))
input_train = get_seqs(X_train[data])
input_test = get_seqs(X_test[data])

Found 36763 unique tokens.


In [7]:
# PREDICT TRAIN
y_pred_bilstm_mha_prob_train = bilstm_mha.predict(input_train)
y_pred_bilstm_mha_train = np.argmax(y_pred_bilstm_mha_prob_train, axis=1)

# PREDICT TEST
y_pred_bilstm_mha_prob_test = bilstm_mha.predict(input_test)
y_pred_bilstm_mha_test = np.argmax(y_pred_bilstm_mha_prob_test, axis=1)

In [8]:
pipe = Pipeline([
    ('clf', LogisticRegression(random_state=seed, max_iter=1000, n_jobs=6, solver='lbfgs'))
])

parameters = {
        'clf__C': (0.001, 0.01, 0.1, 1, 10, 100, 1000),
}

grid_search_ensemble = GridSearchCV(pipe,
                               parameters,
                               cv=10,
                               scoring='f1_macro',
                               n_jobs=-1,
                               verbose=False)

In [9]:
df_prob_reglog_train = pd.DataFrame(y_pred_reglog_prob_train, columns=['reglog_0', 'reglog_1', 'reglog_2'])
df_prob_bilstm_mha_train = pd.DataFrame(y_pred_bilstm_mha_prob_train, columns=['bilstm_mha_0', 'bilstm_mha_1', 'bilstm_mha_2'])
df_y_train = pd.DataFrame(y_train[label], columns=[label]).reset_index(drop=True)
df_probs_train = pd.concat([df_prob_reglog_train, df_prob_bilstm_mha_train, df_y_train], axis=1)

df_probs_train.head()

Unnamed: 0,reglog_0,reglog_1,reglog_2,bilstm_mha_0,bilstm_mha_1,bilstm_mha_2,Atendimento
0,0.109557,0.733358,0.157085,0.19018,0.388625,0.421195,1
1,0.112849,0.855987,0.031164,0.239681,0.568711,0.191608,1
2,0.604796,0.251669,0.143535,0.460415,0.383559,0.156025,0
3,0.40286,0.365054,0.232086,0.225136,0.299651,0.475213,0
4,0.075019,0.757206,0.167775,0.140205,0.405719,0.454076,1


In [10]:
print("Executando Gridsearch para Classe " + label + " - " + data)

# Time now
now = datetime.now().strftime("%Y%m%d_%H%M%S")
print(now)

t0 = time()
grid_search_ensemble.fit(df_probs_train.iloc[:, :-1], df_probs_train[label])
print("done in %0.3fs" % (time() - t0))
print("Best score: %0.3f" % grid_search_ensemble.best_score_)
print("Best parameters set:")
best_parameters_ensemble = grid_search_ensemble.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters_ensemble[param_name]))

Executando Gridsearch para Classe Atendimento - resp-text
20200607_001404
done in 5.245s
Best score: 0.941
Best parameters set:
	clf__C: 1


In [11]:
f_save = now + '_Ensemble_LR_' + exp + '_' + label + '.sav'
pickle.dump(grid_search_ensemble, open(f_save,'wb'))

In [12]:
df_prob_reglog_test = pd.DataFrame(y_pred_reglog_prob_test, columns=['reglog_0', 'reglog_1', 'reglog_2'])
df_prob_bilstm_mha_test = pd.DataFrame(y_pred_bilstm_mha_prob_test, columns=['bilstm_mha_0', 'bilstm_mha_1', 'bilstm_mha_2'])
df_y_test = pd.DataFrame(y_test[label], columns=[label]).reset_index(drop=True)
df_probs_test = pd.concat([df_prob_reglog_test, df_prob_bilstm_mha_test, df_y_test], axis=1)

df_probs_test.head()

Unnamed: 0,reglog_0,reglog_1,reglog_2,bilstm_mha_0,bilstm_mha_1,bilstm_mha_2,Atendimento
0,0.513663,0.313459,0.172878,0.571354,0.315296,0.11335,1
1,0.47709,0.456902,0.066008,0.544615,0.365261,0.090124,1
2,0.508545,0.373423,0.118033,0.332954,0.338737,0.328309,1
3,0.184952,0.445563,0.369485,0.275219,0.287289,0.437493,2
4,0.408795,0.569731,0.021474,0.35906,0.391429,0.249512,1


In [13]:
y_pred_ensemble = grid_search_ensemble.predict(df_probs_test.iloc[:, :-1])

In [14]:
print('#'*50)
print('Report for TEST')
print('#'*50)
print(classification_report(df_probs_test.iloc[:,-1], y_pred_ensemble))

##################################################
Report for TEST
##################################################
              precision    recall  f1-score   support

           0       0.43      0.44      0.43       958
           1       0.35      0.38      0.37      1006
           2       0.46      0.41      0.43      1029

    accuracy                           0.41      2993
   macro avg       0.41      0.41      0.41      2993
weighted avg       0.41      0.41      0.41      2993

