In [11]:
import numpy as np
import torch
import pickle
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import tqdm
from spoilernet import SpoilerNet

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## Read the preprocessed data

In [3]:
df = pd.read_json("../data/processed/review_context_final.json.gz", compression='infer', lines=True)
print(df.shape)
df.head()

(4967400, 2)


Unnamed: 0,Processed_Sentence,Is_Spoiler
0,special book,0
1,start slow first third middl third start get i...,0
2,love good scienc fiction push think thing go,0
3,2015 hugo winner translat origin chines made i...,0
4,instanc intermix chines revolutionari histori ...,0


## Tokenization

In [4]:
vocabulary = set()
word_to_index = {}
index_to_word = {}
current_index = 1

for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    sentence = row['Processed_Sentence']
    words = sentence.split()
    sentence_numeric = []

    for word in words:
        if word not in vocabulary:
            vocabulary.add(word)
            word_to_index[word] = current_index
            index_to_word[current_index] = word
            sentence_numeric.append(current_index)
            current_index += 1
        else:
            sentence_numeric.append(word_to_index[word])

    df.at[idx, 'Processed_Sentence'] = sentence_numeric

100%|██████████| 4967400/4967400 [04:16<00:00, 19335.64it/s]


In [12]:
with open('../static/word_to_index.pkl', 'wb') as f:
    pickle.dump(word_to_index, f)

with open('../static/index_to_word.pkl', 'wb') as f:
    pickle.dump(index_to_word, f)

## Train-test split

In [5]:
def extract_and_pad_data(dataframe):
    sequences = list(dataframe['Processed_Sentence'])
    labels = torch.tensor(dataframe['Is_Spoiler'].values).to(device)
    padded_sequences = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(sequence).to(device) for sequence in tqdm.tqdm(sequences, total=len(sequences))],
        batch_first=True, padding_value=0).type(torch.LongTensor)

    return padded_sequences, labels

training_data, remaining_data = train_test_split(df, test_size=0.3, random_state=42)
validation_data, test_data = train_test_split(remaining_data, test_size=0.5, random_state=42)

train_features, train_labels = extract_and_pad_data(training_data)
validation_features, validation_labels = extract_and_pad_data(validation_data)
test_features, test_labels = extract_and_pad_data(test_data)

print(f"Training data shape: {train_features.shape}, Labels shape: {train_labels.shape}")
print(f"Validation data shape: {validation_features.shape}, Labels shape: {validation_labels.shape}")
print(f"Test data shape: {test_features.shape}, Labels shape: {test_labels.shape}")

100%|██████████| 3477180/3477180 [01:24<00:00, 41010.25it/s]
100%|██████████| 745110/745110 [00:16<00:00, 45372.50it/s]
100%|██████████| 745110/745110 [00:17<00:00, 41597.16it/s]


Training data shape: torch.Size([3477180, 122]), Labels shape: torch.Size([3477180])
Validation data shape: torch.Size([745110, 76]), Labels shape: torch.Size([745110])
Test data shape: torch.Size([745110, 78]), Labels shape: torch.Size([745110])


## Embedding generation

In [5]:
# # this is a cpu intensive task as fasttext cannot run on gpus
# # I ran this on my own computer and saved the embeddings
# import fasttext
# import fasttext.util
# ft = fasttext.load_model('./static/cc.es.300.bin')

# embedding_matrix = torch.zeros((current_index, 300))

# for i, word in tqdm.tqdm(enumerate(vocabulary), total=len(vocabulary)):
#     if word in ft:
#         embedding_vector = torch.tensor(ft.get_word_vector(word))
#         embedding_matrix[i] = embedding_vector
#     else:
#         embedding_matrix[i] = torch.randn(300)

# torch.save(embedding_matrix, './static/embedding_matrix.pth')

100%|████████████████████████████████████████████████████| 313517/313517 [1:55:22<00:00, 45.29it/s]


In [6]:
embedding_matrix = torch.load('../static/embedding_matrix.pth')

## SpoilerNet

In [7]:
# Parameters (proposed in the original paper)
batch_size = 128
embedding_dim = 300
vocab_size = len(embedding_matrix)
hidden_dim = 50
gradient_clip_threshold = 50.0
learning_rate = 1e-3
num_epochs = 5

model = SpoilerNet(embedding_dim, hidden_dim, embedding_matrix).to(device)
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


train_labels = train_labels.to(device)
train_features = train_features.to(device)

train_dataset = TensorDataset(train_features, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for features_batch, labels_batch in tqdm.tqdm(train_loader):
        features_batch = features_batch.to(device)
        labels_batch = labels_batch.to(device)
        
        optimizer.zero_grad()
        predictions = model(features_batch)
        loss = loss_function(predictions.squeeze(), labels_batch)
        epoch_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip_threshold)
        optimizer.step()

    model.eval()
    print(f"Loss at epoch {epoch}: {epoch_loss / len(train_loader)}\t")

100%|██████████| 27166/27166 [10:28<00:00, 43.23it/s]


Loss at epoch 0: 0.12911670388046248	


100%|██████████| 27166/27166 [10:27<00:00, 43.29it/s]


Loss at epoch 1: 0.12290414831694538	


100%|██████████| 27166/27166 [10:25<00:00, 43.45it/s]


Loss at epoch 2: 0.11942349402570614	


100%|██████████| 27166/27166 [10:25<00:00, 43.43it/s]


Loss at epoch 3: 0.11629931952764284	


100%|██████████| 27166/27166 [10:25<00:00, 43.45it/s]

Loss at epoch 4: 0.11342593664740183	





In [8]:
test_features = test_features.to(device)
test_labels = test_labels.to(device)

y_predictions = []
y_probabilities = []
correct_predictions = 0
sigmoid = torch.nn.Sigmoid().to(device)

model.eval()

for index in tqdm.tqdm(range(len(test_features))):
    probabilities = sigmoid(model.forward(test_features[index].unsqueeze(0)))
    prediction = torch.argmax(probabilities)
    y_predictions.append(prediction.item())
    y_probabilities.append(probabilities[0][0][1].item())

    if prediction == test_labels[index]:
        correct_predictions += 1

100%|██████████| 745110/745110 [11:56<00:00, 1039.85it/s]


In [9]:
roc_auc = roc_auc_score(test_labels.cpu().numpy(), y_probabilities)
print(f"ROC_AUC: {roc_auc}")

ROC_AUC: 0.7458641313194289


In [10]:
torch.save(model, '../models/complete_model.pth')
torch.save(model.state_dict(), '../models/model_state_dict.pth')