Observação: é necessário baixar a base de dados Sentiment140 e colocá-la na pasta input:
https://www.kaggle.com/datasets/kazanova/sentiment140

In [None]:
import re
import emoji
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt

import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, Dense, LSTM, Conv1D, Embedding
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.metrics import confusion_matrix, classification_report

# Column names assigned to the CSV, in file order (passed to read_csv as
# `names`).
DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
dataset = pd.read_csv('./input/training.1600000.processed.noemoticon.csv',
                      encoding=DATASET_ENCODING, names=DATASET_COLUMNS)
dataset.head()

# spaCy English pipeline; only its default stop-word set is read from it here.
nlp = spacy.load('en_core_web_sm')

stop_words = nlp.Defaults.stop_words

# WordNet lemmatizer used by the tweet-cleaning function.
lemmatizer = WordNetLemmatizer()

# Keep only the label and the raw tweet text. The explicit copy makes the
# two-column frame independent of the original one, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
dataset = dataset[['sentiment', 'text']].copy()

# Remap the labels: 0 -> 1 and 4 -> 0. The 0 -> 1 step has to run first,
# otherwise the zeros produced by 4 -> 0 would be remapped a second time.
# NOTE(review): in Sentiment140 0 is documented as negative and 4 as positive,
# so after this remap 1 would mean negative and 0 positive -- confirm that this
# inverted encoding is intended.
dataset['sentiment'] = dataset['sentiment'].replace(0, 1)

dataset['sentiment'] = dataset['sentiment'].replace(4, 0)

# Contraction lookup table: lower-cased contraction -> lower-cased meaning.
contractions = pd.read_csv(
    './input/contractions.csv', index_col='Contraction')
contractions.index = contractions.index.str.lower()
contractions['Meaning'] = contractions['Meaning'].str.lower()
contractions_dict = contractions.to_dict()['Meaning']

# Regular expressions used by the tweet-cleaning step. Every pattern is a raw
# string so that backslash sequences such as \s and \1 reach the regex engine
# untouched instead of being parsed as (invalid) Python string escapes.
# "http://", "https://" or "www." followed by everything up to the next space.
urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
# "@" followed by a run of non-whitespace characters.
userPattern = r'@[^\s]+'
# "#" followed by a run of non-whitespace characters.
hashtagPattern = r'#[^\s]+'
# Any single character other than a-z, 0-9, "<" or ">".
alphaPattern = r"[^a-z0-9<>]"
# The same character repeated three or more times in a row ...
sequencePattern = r"(.)\1\1+"
# ... which is replaced by exactly two occurrences of that character.
seqReplacePattern = r"\1\1"


# Emoji character class, compiled once at import time instead of on every call
# (the cleaning function runs once per tweet). The ranges are unchanged.
# NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also covers
# non-emoji characters such as CJK ideographs -- confirm that this is intended.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)


def preprocess_apply(tweet):
    """Clean a single tweet and return it as a lemmatized token string.

    The steps run in this order: lower-casing, emoji removal, URL removal,
    mention removal, trailing-hashtag removal, stripping of the "#" sign from
    the remaining hashtags, squeezing of characters repeated three or more
    times down to two, whitespace collapsing, digit removal, contraction
    expansion, replacement of every character not allowed by ``alphaPattern``
    with a space, and finally tokenization plus lemmatization.

    Args:
        tweet: The raw tweet text (a ``str``).

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # Transform the tweet to lower case.
    tweet = tweet.lower()

    # Remove emojis.
    tweet = _EMOJI_PATTERN.sub(r'', tweet)

    # Strip all URLs.
    tweet = re.sub(urlPattern, '', tweet)

    # Remove mentions.
    tweet = re.sub(userPattern, '', tweet)

    # Remove hashtags from the end of the sentence.
    tweet = re.sub(r'(\s+#[\w-]+)+\s*$', '', tweet).strip()

    # Remove the # symbol from hashtags in the middle of the sentence.
    tweet = re.sub(r'#([\w-]+)', r'\1', tweet).strip()

    # Replace 3 or more consecutive identical characters by 2.
    tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

    # Collapse runs of whitespace into a single space.
    tweet = re.sub(r"\s\s+", " ", tweet)

    # Remove numbers.
    tweet = re.sub(r'\d+', '', tweet)

    # Expand contractions. This is a plain substring replacement, so a
    # contraction is also expanded when it occurs inside a longer token.
    for contraction, replacement in contractions_dict.items():
        tweet = tweet.replace(contraction, replacement)

    # Replace every character rejected by alphaPattern with a space. This
    # includes "/", so no separate slash-padding step is needed afterwards:
    # there is no slash left to pad once this substitution has run.
    tweet = re.sub(alphaPattern, ' ', tweet)

    # Lemmatize the text token by token.
    words = word_tokenize(tweet)
    return ' '.join(lemmatizer.lemmatize(word) for word in words)


# Register tqdm with pandas so that `progress_apply` is available.
tqdm.pandas()

# Clean every tweet; the result is stored next to the raw text.
dataset['processed_text'] = dataset['text'].progress_apply(preprocess_apply)
print("[!] Cleaning Done!")

# Features are the cleaned tweets, targets are the sentiment labels.
X_data = np.array(dataset['processed_text'])
y_data = np.array(dataset['sentiment'])

# Hold out 30% of the samples for testing; the seed is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.3,
    random_state=0,
)
print('[!] Data Split done.')

# Size of each word vector (handed to Word2Vec as `vector_size`).
Embedding_dimensions = 100

# Whitespace-split training tweets: one list of tokens per tweet.
Word2vec_train_data = [tweet.split() for tweet in X_train]

# Train the word vectors on the training split only; words seen fewer than
# five times are dropped (min_count=5).
word2vec_model = Word2Vec(
    sentences=Word2vec_train_data,
    vector_size=Embedding_dimensions,
    workers=8,
    min_count=5,
)
print("[*] Vocabulary Length:", len(word2vec_model.wv.key_to_index))

# Length every tweet is padded/truncated to, and the vocabulary size cap.
input_length = 60
vocab_length = 60000

# No character filtering and no lower-casing: the text has been cleaned
# already. Unknown words map to the "<oov>" token.
tokenizer = Tokenizer(filters="", lower=False, oov_token="<oov>")
# NOTE(review): the tokenizer is fitted on X_data, i.e. on the training AND
# the test tweets -- confirm that this is intended.
tokenizer.fit_on_texts(X_data)
# The cap is applied after fitting.
tokenizer.num_words = vocab_length
print("[*] Tokenizer vocab length:", vocab_length)


def _encode(texts):
    """Turn texts into integer sequences of length `input_length`."""
    return pad_sequences(
        tokenizer.texts_to_sequences(texts), maxlen=input_length)


X_train = _encode(X_train)
X_test = _encode(X_test)

print("[!] X_train.shape:", X_train.shape)
print("[!] X_test.shape :", X_test.shape)

# One row per tokenizer index. Rows stay all-zero for indices whose word has
# no Word2Vec vector.
embedding_matrix = np.zeros((vocab_length, Embedding_dimensions))

for word, token in tokenizer.word_index.items():
    # `word_index` contains every word seen while fitting, not only the
    # `vocab_length` most frequent ones, so indices beyond the last matrix row
    # are skipped instead of raising an IndexError.
    if token < vocab_length and word in word2vec_model.wv:
        embedding_matrix[token] = word2vec_model.wv[word]

print("[*] Embedding Matrix Shape:", embedding_matrix.shape)


def getModel():
    """Assemble the (uncompiled) sentiment classification network.

    Layer stack, in order: a frozen Embedding layer initialised from
    ``embedding_matrix``, two bidirectional LSTM layers (100 units, dropout
    0.3, returning full sequences), a Conv1D layer (100 filters, kernel size
    5, ReLU), global max pooling, a 16-unit ReLU Dense layer and a single
    sigmoid output unit.

    Returns:
        A Keras ``Sequential`` model named "Sentiment_Model".
    """
    def _bilstm():
        # Bidirectional LSTM that returns the whole output sequence.
        return Bidirectional(LSTM(100, dropout=0.3, return_sequences=True))

    layer_stack = [
        # Pre-trained vectors; not updated during training.
        Embedding(
            input_dim=vocab_length,
            output_dim=Embedding_dimensions,
            weights=[embedding_matrix],
            input_length=input_length,
            trainable=False,
        ),
        _bilstm(),
        _bilstm(),
        Conv1D(100, 5, activation='relu'),
        GlobalMaxPool1D(),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layer_stack, name="Sentiment_Model")


# Build the network and print its layer summary.
training_model = getModel()
training_model.summary()

# Learning-rate reduction watches the validation loss; early stopping watches
# the validation accuracy. Both use a patience of 5 epochs.
lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0)
early_stopping = EarlyStopping(
    monitor='val_accuracy', min_delta=1e-4, patience=5)
callbacks = [lr_on_plateau, early_stopping]

training_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# 10% of the training data is held back for validation.
fit_options = {
    'batch_size': 1024,
    'epochs': 12,
    'validation_split': 0.1,
    'callbacks': callbacks,
    'verbose': 1,
}
history = training_model.fit(X_train, y_train, **fit_options)

# Persist the trained word vectors twice: via `save` and via
# `save_word2vec_format`.
keyed_vectors = word2vec_model.wv
keyed_vectors.save('Word2Vec-twitter-100')
keyed_vectors.save_word2vec_format('Word2Vec-twitter-100-trainable')

# Pickle the fitted tokenizer.
with open('Tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Store the full model and, separately, only its weights.
training_model.save('Sentiment-BiLSTM')
training_model.save_weights("Model Weights/weights")

# Per-epoch metrics recorded by `fit`.
recorded = history.history
acc = recorded['accuracy']
val_acc = recorded['val_accuracy']
loss = recorded['loss']
val_loss = recorded['val_loss']
epochs = range(len(acc))


def _plot_curves(train_values, train_label, val_values, val_label, title):
    """Draw a training curve (blue) and a validation curve (red) on the current figure."""
    plt.plot(epochs, train_values, 'b', label=train_label)
    plt.plot(epochs, val_values, 'r', label=val_label)
    plt.title(title)
    plt.legend()


_plot_curves(acc, 'Training acc', val_acc, 'Validation acc',
             'Training and validation accuracy')

# The loss curves go on a second figure.
plt.figure()
_plot_curves(loss, 'Training loss', val_loss, 'Validation loss',
             'Training and validation loss')

plt.show()