In [None]:
## Importing required modules

import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, roc_curve
from sklearn.metrics import auc

In [None]:
## Reading dataset

# Path to the zipped CSV holding the cleaned reviews and summaries.
reviews_csv_path = "/content/drive/MyDrive/cleaned_reviews_summaries.zip"
full_df = pd.read_csv(reviews_csv_path)

Generating vocab and embeddings

In [None]:
from sklearn.model_selection import train_test_split
import torch

In [None]:
# Show the first rows to check which columns the loaded frame contains.
full_df.head()

In [None]:
# Down-sample to at most SAMPLE_SIZE rows.
# Capping n avoids the ValueError that DataFrame.sample raises when more rows
# are requested than exist, and the fixed seed makes the subset (and every
# result derived from it) reproducible across kernel restarts.
SAMPLE_SIZE = 100000
full_df = full_df.sample(n=min(SAMPLE_SIZE, len(full_df)), random_state=42)

In [None]:
# Vocabulary state that generate_vocab() fills in while scanning the reviews.
vocab = set()        # every distinct token seen so far
word_to_id = {}      # token -> integer id
id_to_word = {}      # integer id -> token
word_to_count = {}   # token -> occurrence count (counting is currently disabled)
vocab_size = 1       # next id to hand out; starts at 1 so that 0 stays free for padding

In [None]:
# NOTE(review): no other visible cell reads max_len, so reviews are not
# truncated to this length anywhere — confirm whether truncation was intended.
max_len = 30 # placeholder value

In [None]:
# Names of the full_df text columns that generate_vocab(cols) tokenises and
# replaces with word-id lists.
cols = ['cleaned_reviews'] # Use if you want vocab to include just review text

Generate vocab

In [None]:
def generate_vocab(cols):
    """Build the vocabulary and replace the text in ``cols`` with word-id lists.

    Every whitespace-separated token found in the given ``full_df`` columns
    gets the next free integer id (continuing from the current value of the
    module-level ``vocab_size``) and is recorded in the module-level ``vocab``,
    ``word_to_id`` and ``id_to_word``. Each processed cell of ``full_df`` is
    then overwritten with the list of ids of its tokens.

    Rows are scanned in frame order and, within a row, columns in the order
    given, so ids reflect first appearance in that traversal.

    Args:
        cols: list of ``full_df`` column names to encode.

    Side effects:
        Mutates ``vocab``, ``word_to_id``, ``id_to_word`` and the listed
        columns of ``full_df``; rebinds ``vocab_size``.
    """
    global vocab_size  # the only name that is rebound; the rest are mutated in place

    encoded = {c: [] for c in cols}

    for row_values in zip(*(full_df[c] for c in cols)):
        for c, text in zip(cols, row_values):

            if isinstance(text, list):
                # Cell was already encoded by an earlier run of this function;
                # keep it so that re-running the cell is harmless.
                encoded[c].append(text)
                continue

            # Missing text (NaN / None) has no tokens; encode it as an empty
            # sequence instead of failing on ``.split()``.
            tokens = text.split() if isinstance(text, str) else []

            s2n = []
            for word in tokens:
                word_id = word_to_id.get(word)
                if word_id is None:
                    word_id = vocab_size
                    vocab.add(word)
                    word_to_id[word] = word_id
                    id_to_word[word_id] = word
                    vocab_size += 1
                s2n.append(word_id)

            encoded[c].append(s2n)

    # Write each column back once instead of one ``.at`` call per cell.
    # dtype=object keeps every cell a plain Python list.
    for c in cols:
        full_df[c] = pd.Series(encoded[c], index=full_df.index, dtype=object)

   

In [None]:
import time as time

In [None]:
# Takes about 100 seconds with just cleaned reviews column, will take longer if you include cleaned summaries
vocab_started_at = time.time()
generate_vocab(cols)
vocab_elapsed = time.time() - vocab_started_at
print(vocab_elapsed)

Calculating Item Specificity: DF and IIF

In [None]:
# def convert_movie_ids():

#   movie_to_idx = {}
#   new_id = 0
#   for index, row in full_df.iterrows():
#     key = row['movie_id']
#     # if key in movie_to_idx:
#       full_df.at[index, 'movie_id'] = movie_to_idx[key]
#     else:
#       full_df.at[index, 'movie_id'] = new_id
#       movie_to_idx[key] = new_id
#       new_id += 1


In [None]:
# convert_movie_ids()

In [None]:
# num_movies = full_df.movie_id.unique().shape[0]

In [None]:
# d = np.zeros(num_movies)
# dw_i = np.zeros((vocab_size, num_movies))
# # dw_i = {}
# for index, row in full_df.iterrows():
#   item = row['movie_id']
#   # d[item] = d.get(item, 0) + 1 # number of reviews for movie i
#   d[item] += 1
#   words = row['cleaned_reviews']
#   movie_id = row['movie_id']
#   seen = set()
#   for word in words:
#     if word not in seen:
#       seen.add(word)
#       # key = str(word) + str(movie_id)
#       dw_i[word][item] += 1 # number of reviews of movie i that contain the given word
#       # dw_i[key] = dw_i.get(key, 0) + 1


In [None]:
# dw_i.shape, d.shape

In [None]:
# DF = dw_i/d # (vocab_size, num_movies)

In [None]:
# # Iw = np.count_nonzero(dw_i, axis = 1)
# I = num_movies
# IIF = np.log((I+1)/(Iw + 1)) # (vocab_size,)

In [None]:
# print(DF.shape, IIF.shape)  

In [None]:
# print(len(vocab))

Train, Test, Val Split

In [None]:
# Empty placeholders for the three splits; split_data() rebinds them.
X_train, y_train = [], []
X_test, y_test = [], []
X_val, y_val = [], []

In [None]:
# Replace missing labels with 0 so the column can later be cast to int.
full_df['is_spoiler'] = full_df['is_spoiler'].fillna(0)
# Displays the number of labels that are still missing (0 after the fill above).
full_df['is_spoiler'].isna().sum()

In [None]:
def split_data(cols, random_state=None):
    """Split the encoded reviews and labels into train / validation / test sets.

    The data is first split 85% / 15% into (train + validation) / test, and the
    85% part is then split 80% / 20% into train / validation, i.e. roughly
    68% / 17% / 15% of the rows overall. The results are stored in the
    module-level ``X_train``/``y_train``, ``X_val``/``y_val`` and
    ``X_test``/``y_test`` as Python lists.

    Args:
        cols: not used; kept so existing ``split_data(cols)`` calls keep
            working. Features are always taken from ``full_df['cleaned_reviews']``
            and labels from ``full_df['is_spoiler']``.
        random_state: optional seed forwarded to both ``train_test_split``
            calls. The default ``None`` keeps the previous unseeded behaviour;
            pass an int to make the split reproducible.
    """
    global X_train, y_train
    global X_test, y_test
    global X_val, y_val

    train_val_fraction = 0.85  # share of all rows kept for train + validation
    train_fraction = 0.8       # share of the train + validation rows used for training

    features = list(full_df['cleaned_reviews'])
    labels = list(full_df['is_spoiler'].astype(int))

    X_rem, X_test, y_rem, y_test = train_test_split(
        features, labels, train_size=train_val_fraction, random_state=random_state
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_rem, y_rem, train_size=train_fraction, random_state=random_state
    )

In [None]:
# Time the train / validation / test split.
split_started_at = time.time()
split_data(cols)
split_elapsed = time.time() - split_started_at
print(split_elapsed)

In [None]:
# Sizes of the train and test features / labels, in that order.
split_sizes = [len(part) for part in (X_train, y_train, X_test, y_test)]
print(*split_sizes)

In [None]:
def pad_sequences(X_list):
    """Zero-pad variable-length id sequences to the length of the longest one.

    Args:
        X_list: list of sequences of integer ids.

    Returns:
        A tuple ``(X_padded, X_mask)``:
          * ``X_padded`` - ``LongTensor`` of shape (longest_length, len(X_list))
            holding the ids, with 0 in the padded positions.
          * ``X_mask`` - ``FloatTensor`` of the same shape with 1.0 where
            ``X_padded`` holds a real id and 0.0 where it is padding.
    """
    pad = torch.nn.utils.rnn.pad_sequence

    id_tensors = [torch.as_tensor(seq) for seq in X_list]
    X_padded = pad(id_tensors).type(torch.LongTensor)

    # One tensor of ones per sequence; padding them with zeros yields the mask.
    one_tensors = [torch.as_tensor([1.0] * len(seq)) for seq in X_list]
    X_mask = pad(one_tensors).type(torch.FloatTensor)

    return X_padded, X_mask

In [None]:
# Takes about 20 seconds to run
pad_started_at = time.time()
train_padded_and_mask = pad_sequences(X_train)
X_train = train_padded_and_mask[0]  # keep the padded ids only
del train_padded_and_mask
print(time.time() - pad_started_at)

In [None]:
# Pad the held-out splits as well; each split is padded to its own longest review.
X_test, X_val = (pad_sequences(split)[0] for split in (X_test, X_val))

In [None]:
# y_train = torch.Tensor(y_train)
# y_test = torch.Tensor(y_test)
# y_val = torch.Tensor(y_val)

In [None]:
# pad_sequences returns sequence-major tensors, so the batch is the second axis here.
X_train.shape # [max_sentence_len, num_data_points]

In [None]:
# Swap the two axes so each row holds one review: (num_data_points, max_sentence_len).
X_train, X_test, X_val = X_train.t(), X_test.t(), X_val.t()

print(X_train.shape, X_test.shape, X_val.shape)

## Spoiler Net

In [None]:
class SpoilerNet(torch.nn.Module):
  """Bidirectional-GRU review classifier with word-level attention.

  Pipeline: embedding -> bidirectional word GRU -> attention over the words of
  each review -> bidirectional sentence GRU -> dropout -> linear layer that
  produces two unnormalised class scores per review.

  Every review in a batch is processed independently: both GRUs run with the
  batch as the first axis, so the scores of one review do not depend on which
  other reviews share its batch.

  Args:
    vocab_size: number of rows of the embedding table (largest word id + 1).
    emb_dim: size of the word embeddings.
    hid_dim: hidden size of each GRU direction and of the attention layers.
  """

  def __init__(self,vocab_size, emb_dim = 300, hid_dim = 50):
    super(SpoilerNet, self).__init__()

    # initialize parameters
    self.EMB_DIM = emb_dim
    self.DIM_HIDDEN = hid_dim
    self.VOCAB_SIZE = vocab_size

    # initialize layers

    self.embedding = torch.nn.Embedding(self.VOCAB_SIZE, self.EMB_DIM)
    # batch_first=True: forward() feeds (batch, seq_len, emb_dim). Without it
    # the GRU would treat the batch axis as time and recur across reviews.
    self.word_encoder = torch.nn.GRU(self.EMB_DIM, self.DIM_HIDDEN, bidirectional = True, batch_first = True)

    ## for word attention ##
    self.mu = torch.nn.Linear(self.DIM_HIDDEN, self.DIM_HIDDEN)
    self.tanh = torch.nn.Tanh()
    # Maps hidden -> hidden, so one attention weight is produced per hidden
    # dimension of every word.
    self.v = torch.nn.Linear(self.DIM_HIDDEN, self.DIM_HIDDEN, bias = False)
    self.alpha = torch.nn.Softmax(dim = 1) # normalises over the word axis

    self.sentence_encoder = torch.nn.GRU(self.DIM_HIDDEN, self.DIM_HIDDEN, bidirectional = True, batch_first = True)
    self.dropout = torch.nn.Dropout(0.5) # according to paper
    self.output = torch.nn.Linear(self.DIM_HIDDEN, 2)
    self.activation = torch.nn.Sigmoid() # not used by forward(); kept so the attribute stays available

  def forward(self, X):
    """Compute class scores for a batch of padded reviews.

    Args:
      X: LongTensor of word ids with shape (batch_size, seq_len).

    Returns:
      Tensor of shape (batch_size, 2) with unnormalised class scores.
    """
    hidden = self.DIM_HIDDEN

    text_embeddings = self.embedding(X) # (batch, seq_len, emb_dim)

    word_enc_out, _ = self.word_encoder(text_embeddings) # (batch, seq_len, 2 * hidden)

    # sum the forward and backward GRU outputs -> (batch, seq_len, hidden)
    h_w = word_enc_out[:, :, :hidden] + word_enc_out[:, :, hidden:]

    # word attention
    mu_w = self.tanh(self.mu(h_w))
    v_out = self.v(mu_w)
    alpha_w = self.alpha(v_out) # weights sum to 1 over the words of each review

    # attention-weighted sum of the word states -> (batch, hidden)
    v_s = torch.sum(alpha_w * h_w, dim = 1)

    # Each review is fed as a one-step sequence (batch, 1, hidden). Passing the
    # 2-D tensor directly would be read as a single unbatched sequence whose
    # time steps are the reviews of the batch.
    sent_enc_out, _ = self.sentence_encoder(v_s.unsqueeze(1)) # (batch, 1, 2 * hidden)
    sent_enc_out = sent_enc_out.squeeze(1) # (batch, 2 * hidden)

    # sum the forward and backward GRU outputs -> (batch, hidden)
    h_s = sent_enc_out[:, :hidden] + sent_enc_out[:, hidden:]

    out = self.output(self.dropout(h_s)) # unnormalized class scores (batchSize, 2)

    return out
  


In [None]:
def predict(model, x, y): # function for predicting
  """Predict a class for every row of ``x`` and print the accuracy against ``y``.

  Rows are scored one at a time as batches of size 1.

  Args:
    model: torch module that maps a (1, seq_len) LongTensor to class scores.
    x: 2-D tensor of padded word ids, one review per row.
    y: sequence of integer labels, indexable in parallel with the rows of ``x``.

  Returns:
    List with the predicted class index (int) of every row of ``x``.
  """
  # Run on whatever device the model lives on instead of assuming a GPU.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else x.device
  x = x.to(device)

  y_pred = []
  num_correct = 0
  num_rows = len(x)

  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    for i in range(num_rows):
      scores = model(x[i].unsqueeze(0))
      # argmax is taken on the raw scores: a sigmoid keeps the ordering but
      # saturates to exactly 1.0 in float32 for large scores, which creates
      # ties that argmax would resolve towards class 0.
      pred = int(torch.argmax(scores).item())
      y_pred.append(pred)

      if pred == int(y[i]):
        num_correct += 1

  # An empty input has no defined accuracy; report nan instead of dividing by zero.
  accuracy = float(num_correct) / float(num_rows) if num_rows else float("nan")
  print("Accuracy: %s" % accuracy)
  return y_pred


## TRAINING

## Parameters according to paper ##

NUM_EPOCHS = 4
batchSize = 64
LEARNING_RATE = 1e-3

# Use the GPU when one is present, otherwise fall back to the CPU so that
# building and training the model does not fail on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SpoilerNet(vocab_size).to(device)

loss_func = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# as_tensor accepts both the original list and an already converted tensor,
# so re-running this cell keeps working.
y_train = torch.as_tensor(y_train, dtype=torch.long)
clip = 50.0 # maximum gradient norm

for epoch in range(NUM_EPOCHS):
  model.train()
  total_loss = 0.0
  for s in range(0, X_train.shape[0], batchSize):
    optimizer.zero_grad()
    e = s + batchSize # slicing clamps the end index, so the last batch is simply shorter

    X_batch = X_train[s:e, :]
    Y_batch = y_train[s:e]
    output = model(X_batch.to(device))
    loss = loss_func(output, Y_batch.to(device))
    total_loss += loss.item()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # gradient clipping
    optimizer.step()
  model.eval() # switch off dropout for the validation pass
  # total_loss is the sum of the per-batch mean losses of this epoch
  print("Loss at epoch", epoch,":", total_loss, end = '\t')
  predict(model, X_val, y_val)

In [None]:
# Put the model in evaluation mode (disables dropout) and score the test split;
# predict() prints the accuracy and returns the predicted class per review.
model.eval()
y_pred = predict(model, X_test, y_test)

## Results

In [None]:
# Keep only the first element (precision) of the
# (precision, recall, f-score, support) tuple.
prf_support = precision_recall_fscore_support(y_test, y_pred)
precision = prf_support[0]

In [None]:
# Unweighted mean of the precision values computed above.
print(precision.mean())

In [None]:
## ROC

# roc_curve needs a continuous score per sample. Hard 0/1 predictions leave a
# single threshold, which collapses the curve (and the AUC computed from it)
# to one operating point. Score every test review with the softmax
# probability of label 1 instead, one review at a time as a batch of size 1.
model.eval()
score_device = next(model.parameters()).device
with torch.no_grad():
    y_score = [
        torch.softmax(model(X_test[i].unsqueeze(0).to(score_device)), dim=1)[0, 1].item()
        for i in range(len(X_test))
    ]

fpr, tpr, thresholds = roc_curve(y_test, y_score)


In [None]:
# Draw the ROC curve on an explicit figure / axes pair.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr)
roc_ax.set_title("ROC")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.grid()
plt.show()

In [None]:
# Area under the ROC curve, computed from the (fpr, tpr) points above.
print(auc(fpr, tpr))