In [1]:
# https://keras.io/examples/nlp/text_classification_with_transformer/
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np



from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
tf.__version__

'2.2.0-rc3'

In [2]:
# Definindo as variáveis do projeto
vocab_size = 200000  # Considerar 200k palavras
maxlen = 200  # Considerar apenas as 100 primeiras palavras do texto da review

embed_dim = 50 # tamanho do Embedding de cada token ( também do word2vec da NILC)
num_heads = 2  # N. de cabeças de atenção
ff_dim = 32   # tamanho da camada oculta nas redes feed forward dentro do transformer

# Path para o arquivo de dados da b2w
B2W_DATAFILE = "/home/wseidel/workspaces/usp/b2w-reviews01/B2W-Reviews01.csv"
# B2W_DATAFILE = "/home/wseidel/workspaces/usp/b2w-reviews01/B2W-10k.csv"

# Path para o arquivo de dados de embeddings do NILC
NILC_W2V_DATAFILE = "/home/wseidel/workspaces/usp/NILC/word2vec_200k.txt"

# Quantidade de epocas para o treino
QNT_EPOCAS_A_TREINAR = 2

In [3]:
# Carregar dados a serem analisados
b2wCorpus = pd.read_csv(B2W_DATAFILE, sep=';', usecols=["review_text", "overall_rating"])

# Carregar o Word2Vec do NILC
model_w2v = KeyedVectors.load_word2vec_format(NILC_W2V_DATAFILE)

In [4]:
b2wCorpus.groupby(['overall_rating']).count()
# b2wCorpus.describe()

Unnamed: 0_level_0,review_text
overall_rating,Unnamed: 1_level_1
1,27369
2,8389
3,16315
4,32345
5,47955


In [5]:
def train_test_val_split(dataset, train_size=0.6, test_size=0.3, colname_stratify='overall_rating',random_seed=29):
    val_size = 1 - round((train_size + test_size),1)
    split_train_test_size = test_size + val_size

    train, val = train_test_split(dataset, 
                                  test_size=split_train_test_size, 
                                  stratify=dataset[colname_stratify], 
                                  random_state=random_seed)

    test, val = train_test_split(val, 
                                  test_size=val_size/split_train_test_size, 
                                  stratify=val[colname_stratify], 
                                  random_state=random_seed)
    return train.reset_index(), test, val


def sentence_to_nilc_index_token(text, stem=False):
    # Traduzindo os tokens do B2W para o index do NILC
    tokens = text.lower().split() # Pegar um tokenizer decente...
    tokens = [model_w2v.vocab[t].index if t in model_w2v.vocab else 1 for t in tokens ]
    return tokens

def sort_by_size(df, col_to_sort):
    df['sentence_length'] = df[col_to_sort].apply(lambda x: len(x))
    df.sort_values(by=['sentence_length'], inplace=True, ignore_index=True)
    return df

def getXY(serieX, serieY, padding_maxlen=50):
    x_train = keras.preprocessing.sequence.pad_sequences(train['review_text_clean'], maxlen=padding_maxlen, padding='post')
    y_train = train['overall_rating']
    return x_train, y_train


# ------ main ----
df_to_work = b2wCorpus

TAMMAX_SENTENCE=50

values_to_retain=[1,2,3,4,5]
df_to_work = df_to_work[df_to_work['overall_rating'].isin(values_to_retain)]
# df_to_work
df_to_work['overall_rating'] = df_to_work.overall_rating.apply(lambda x: x-1)

# Aplicando o sentence_to_nilc_index_token
df_to_work['review_text_clean'] = df_to_work.review_text.apply(lambda x: sentence_to_nilc_index_token(x))

# train, test, val = train_test_val_split(df_to_work, train_size=0.75, test_size=0.15)
train, test, val = train_test_val_split(df_to_work)

sort_by_size(train, 'review_text_clean')


x_train, y_train = getXY(train['review_text_clean'], train['overall_rating'], padding_maxlen=TAMMAX_SENTENCE)
x_test,  y_test  = getXY(test['review_text_clean'], test['overall_rating'], padding_maxlen=TAMMAX_SENTENCE)
x_val,   y_val   = getXY(val['review_text_clean'], val['overall_rating'], padding_maxlen=TAMMAX_SENTENCE)


print("train..:", len(train), round(len(train) / len(df_to_work),3) ) 
print("test...:", len(test), round(len(test) / len(df_to_work),3) )
print("val....:", len(val), round(len(val) / len(df_to_work),3) )
print("--" * 20) 
print("x_train..:", len(x_train[-1]), ) 
print("x_test...:", len(x_test[-1]), ) 
print("x_val....:", len(x_val[-1]), ) 
# train = train.reset_index(drop=True)
# train = train.reset_index(inplace=True)
# train = train.copy()

# df_to_work.groupby
df_to_work.groupby(['overall_rating']).count()
# b2wCorpus.describe()

train..: 79423 0.6
test...: 39712 0.3
val....: 13238 0.1
----------------------------------------
x_train..: 50
x_test...: 50
x_val....: 50


Unnamed: 0_level_0,review_text,review_text_clean
overall_rating,Unnamed: 1_level_1,Unnamed: 2_level_1
0,27369,27369
1,8389,8389
2,16315,16315
3,32345,32345
4,47955,47955


In [6]:
dados = list(range(19))
lote_size = 3
lote_count = int(np.ceil(len(dados)/ lote_size))
print("qnt dados..:", len(dados))
print("lote size..:", lote_size)
print("lote count..:", lote_count)
for i in range(0,lote_count):
    print(f"Pegando lote {i} de {lote_count}:", end="")
    print(dados[ i*lote_size : i*lote_size+lote_size ] )

qnt dados..: 19
lote size..: 3
lote count..: 7
Pegando lote 0 de 7:[0, 1, 2]
Pegando lote 1 de 7:[3, 4, 5]
Pegando lote 2 de 7:[6, 7, 8]
Pegando lote 3 de 7:[9, 10, 11]
Pegando lote 4 de 7:[12, 13, 14]
Pegando lote 5 de 7:[15, 16, 17]
Pegando lote 6 de 7:[18]


In [7]:
# from keras import Sequential
# from keras.utils import Sequence
# from keras.layers import LSTM, Dense, Masking
# import numpy as np
from tensorflow.keras import layers
# model = tf.keras.Sequential([
from tensorflow import keras

def get_lstm_model(dropout_prob=0.0):
    embedding_layer = model_w2v.get_keras_embedding()
    embedding_layer.trainable = True

    model = keras.Sequential()
    model.add(layers.Input(shape=(TAMMAX_SENTENCE, )))
    model.add(embedding_layer)
    model.add(layers.LSTM(64))
    model.add(layers.Dropout(dropout_prob))
    model.add(keras.layers.Dense(5, activation='softmax'))
    model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

model = get_lstm_model()
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            10000000  
_________________________________________________________________
lstm (LSTM)                  (None, 64)                29440     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 5)                 325       
Total params: 10,029,765
Trainable params: 10,029,765
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Ver lista06

# Ler aqui pro batch generator:
#     https://datascience.stackexchange.com/questions/48796/how-to-feed-lstm-with-different-input-array-sizes

# Seu código aqui

QNT_EPOCAS_TREINO = 10


model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    x_train, y_train, batch_size=32, epochs=QNT_EPOCAS_TREINO, validation_data=(x_val, y_val)
)

loss, accuracy = model.evaluate(x=x_test,y=y_test)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss:  0.5268877744674683
Accuracy:  0.7812220454216003
