### IMPORTS

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import gensim
import nltk
from nltk.tokenize import word_tokenize
from modules.preprocess import *
from modules.utils import build_dataset, text_to_word2vec, evaluate
from modules.rnn_model import TextRNN
import gensim.downloader as api
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter

[nltk_data] Downloading package stopwords to /home/xavier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2024-03-28 16:36:17.114153: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-28 16:36:19.060929: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### LOAD DATASET

In [2]:
# Load the raw samples from the spreadsheet shipped under archive/.
# NOTE(review): num_class_samples=-1 presumably means "keep every sample per
# class" — confirm against modules.utils.build_dataset.
dataset = build_dataset(
    'archive/truth_seeker.xlsx',
    num_class_samples=-1,
    rnd_state=10,
)

### PREPROCESS DATA

In [3]:
# Keyword switches forwarded verbatim to text_edit.
# NOTE(review): each flag presumably toggles one cleaning step — confirm the
# exact meaning of each in modules.preprocess.
text_edit_options = dict(
    grp_num=False,
    rm_newline=True,
    rm_punctuation=True,
    rm_stop_words=False,
    lowercase=True,
    lemmatize=False,
    expand=False,
    html_=True,
    symb_to_text=False,
    convert_entities=False,
    reduce_mentions=False,
)
dataset = text_edit(dataset, **text_edit_options)

### CREATE SAMPLE AND TARGET LISTS

In [4]:
# Pull the tweet text and its binary label out of every record, keeping the
# two lists index-aligned (both iterate the same record sequence).
records = list(dataset.values())
X = [record['tweet'] for record in records]
Y = [record['BinaryNumTarget'] for record in records]

### TRAIN/TEST SPLIT

In [5]:
# Hold out 30% of the samples; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Pretrained embedding model, loaded by name through the gensim downloader
# (imported as `api`).
model_name = 'fasttext-wiki-news-subwords-300'
word2vec_model = api.load(name=model_name)

In [7]:
# Embed one throwaway sentence; the resulting vector is only used to read off
# the embedding width when sizing the network.
text = "Ceci est un texte exemple"
vector = text_to_word2vec(text, word2vec_model)

In [8]:
# Network dimensions: the input width is taken from the probe embedding,
# the hidden width is fixed at 128, and there is a single output unit.
input_size, hidden_size, output_size = vector.shape[0], 128, 1

In [9]:
# Single-layer bidirectional RNN with ReLU nonlinearity and no dropout.
model = TextRNN(
    input_size,
    hidden_size,
    output_size,
    num_layers=1,
    nonlinearity='relu',
    bidirectional=True,
    dropout=0,
    batch_first=True,
)

# BCELoss consumes probabilities in [0, 1].
# NOTE(review): this presumes TextRNN ends with a sigmoid — confirm in modules.rnn_model.
criterion = nn.BCELoss()

optimizer = optim.Adam(model.parameters(), lr=0.01)

In [10]:
def _embed_texts(texts):
    """Embed every text and stack the results into one float32 tensor.

    Each embedding is viewed as (1, -1) before stacking along dim 0, so the
    result has shape (len(texts), 1, embedding_width).
    """
    rows = []
    for sample in texts:
        embedding = torch.tensor(text_to_word2vec(sample, word2vec_model), dtype=torch.float32)
        rows.append(embedding.view(1, -1))
    return torch.stack(rows, dim=0)


# NOTE: this cell rebinds X_train / X_test from lists of strings to tensors,
# so it must not be re-run without re-running the train/test split first.
X_train = _embed_texts(X_train)
X_test = _embed_texts(X_test)

In [11]:
# Convert both label lists to float32 tensors (the BCE criterion compares
# them against float model outputs).
Y_train, Y_test = (
    torch.tensor(labels, dtype=torch.float32) for labels in (Y_train, Y_test)
)

In [12]:
# Two TensorBoard writers, both created with the default log directory.
# NOTE(review): in the cells visible here only `writer` receives scalars;
# `test_writer` is merely flushed and closed later.
writer, test_writer = SummaryWriter(), SummaryWriter()

In [13]:
# Training hyper-parameters.
batch_size = 32
epochs = 100
patience = 10  # epochs to wait for a lower test loss before stopping early

# Use dedicated names so the preprocessed `dataset` dict and the X / Y sample
# lists built in earlier cells are not overwritten when this cell runs.
train_dataset = TensorDataset(X_train, Y_train)
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(X_test, Y_test)
# Evaluation does not need shuffling; a fixed order keeps the per-epoch mean
# of batch losses deterministic (the last batch may be smaller than the rest).
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

best_test_loss = float('inf')
epochs_without_improvement = 0

# NOTE(review): the test split is also used for checkpoint selection and early
# stopping, so metrics reported on it afterwards are optimistic. Consider
# carving a separate validation split out of the training data.
for epoch in range(epochs):
    # ---- training pass -----------------------------------------------------
    model.train()
    train_losses = []
    for batch_x, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_x).view(-1)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    # ---- evaluation pass (no gradients needed) -----------------------------
    model.eval()
    test_losses = []
    with torch.no_grad():
        for batch_x, batch_y in test_dataloader:
            outputs = model(batch_x).view(-1)
            test_losses.append(criterion(outputs, batch_y).item())

    mean_train_loss = float(np.mean(train_losses))
    mean_test_loss = float(np.mean(test_losses))

    # One point per epoch and per curve; logging every batch with the same
    # global step would stack many values on a single x position.
    writer.add_scalar("Loss/train", mean_train_loss, epoch)
    writer.add_scalar("Loss/test", mean_test_loss, epoch)

    print(f'Results for epoch {epoch}:')
    print(f'Mean train loss for epoch: {mean_train_loss}')
    print(f'Mean test loss for epoch: {mean_test_loss}')

    if mean_test_loss < best_test_loss:
        best_test_loss = mean_test_loss
        epochs_without_improvement = 0
        # Keep only the best weights seen so far.
        torch.save(model.state_dict(), 'rnn_best.pt')
        print(f'Model saved at epoch {epoch} with test loss {mean_test_loss}')
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= patience:
        print(f'Stopping early at epoch {epoch} due to no improvement in test loss for {patience} consecutive epochs.')
        break

Results for epoch 0:
Mean train loss for epoch: 0.43203526735305786
Mean test loss for epoch: 0.3752400577068329
Model saved at epoch 0 with test loss 0.3752400577068329
Results for epoch 1:
Mean train loss for epoch: 0.3340316712856293
Mean test loss for epoch: 0.30630311369895935
Model saved at epoch 1 with test loss 0.30630311369895935
Results for epoch 2:
Mean train loss for epoch: 0.28202348947525024
Mean test loss for epoch: 0.28220826387405396
Model saved at epoch 2 with test loss 0.28220826387405396
Results for epoch 3:
Mean train loss for epoch: 0.2536272704601288
Mean test loss for epoch: 0.24460996687412262
Model saved at epoch 3 with test loss 0.24460996687412262
Results for epoch 4:
Mean train loss for epoch: 0.23049144446849823
Mean test loss for epoch: 0.22559095919132233
Model saved at epoch 4 with test loss 0.22559095919132233
Results for epoch 5:
Mean train loss for epoch: 0.21539407968521118
Mean test loss for epoch: 0.22569037973880768
Results for epoch 6:
Mean trai

In [14]:
# Flush pending events on both writers first, then close both of them.
for summary_writer in (writer, test_writer):
    summary_writer.flush()
for summary_writer in (writer, test_writer):
    summary_writer.close()

In [15]:
# Pick the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Rebuild the architecture with the same hyper-parameters used for training,
# then move it to the selected device.
model = TextRNN(
    input_size,
    hidden_size,
    output_size,
    num_layers=1,
    nonlinearity='relu',
    bidirectional=True,
    dropout=0,
    batch_first=True,
)
model = model.to(device)

# NOTE(review): torch.load unpickles the file — only load checkpoints you
# trust (consider weights_only=True on PyTorch versions that support it).
state_dict = torch.load('rnn_best.pt', map_location=device)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [16]:
# Run the reloaded model over the test set and threshold the outputs at 0.5.
model.eval()
eval_batch_size = 256  # inference only, so a larger batch than in training is fine

pred_outputs = []
# no_grad: inference does not need autograd graphs; without it every stored
# output would keep its graph alive.
with torch.no_grad():
    for (batch_x,) in DataLoader(TensorDataset(X_test), batch_size=eval_batch_size):
        # The model was moved to `device`; inputs must live on the same device.
        probs = model(batch_x.to(device)).view(-1)
        # Plain Python ints (1 if prob > 0.5 else 0), same format as before.
        pred_outputs.extend((probs > 0.5).long().cpu().tolist())

In [17]:
# Compare the thresholded predictions with the ground-truth labels.
y_true = Y_test.numpy()
evaluate(y_true, pred_outputs)

Precision:  0.93095735162541
Recall:  0.9395505167051269
F1_score:  0.9352341955457905
accuracy:  0.9326950520225215


In [18]:
# Uncomment to load the TensorBoard notebook extension (needed before the
# %tensorboard magic can be used).
#%load_ext tensorboard

In [19]:
# Uncomment to open TensorBoard inline on the `runs` log directory, served on port 6044.
#%tensorboard --logdir=runs --port=6044