In [None]:
import json
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
from nltk import word_tokenize
import torch.nn as nn
import torch.optim as optim
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
import os

def parse_data(file):
    for l in open(file, 'r'):
        yield json.loads(l)
        
data = pd.DataFrame(parse_data('..\data\\Sarcasm_Headlines_Dataset_v2.json'))

In [3]:
data.columns = ['Sarcastic', 'Headline', 'Link']
data.head(5)

Unnamed: 0,Sarcastic,Headline,Link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [4]:
data['Link'] = data['Link'].apply(lambda x: x.split('.')[1].capitalize()) #extracts publications
data.rename(columns = {'Link': 'Publication'}, inplace = True) #This is not immediately useful, but I'm doing this just in case we use this later

In [5]:
stop_words_removed = []
for i in data['Headline']:
    i = re.sub(r"[^a-zA-Z\s]", "", i)
    main_words = ' '.join([j for j in i.split() if j.lower() not in ENGLISH_STOP_WORDS])
    stop_words_removed.append(main_words)

tokenized_corpus = [word_tokenize(i) for i in stop_words_removed]
vocab = set([word for sentence in tokenized_corpus for word in sentence])

In [6]:
class CBOW(nn.Module):
    def __init__(self, embedding_size, vocab_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocab_size)
        
    def forward(self, inputs):
        embeddings = self.embeddings(inputs).mean(1)
        return self.linear(embeddings)
    
 
        

In [7]:
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for i, word in enumerate(vocab)}


In [8]:
context_data = []
for i in tokenized_corpus:
    for j in range(2, len(i)-2):
        context = [
            i[j-2],
            i[j-1],
            i[j+1],
            i[j+2]]
        target = i[j]
        
        context_idxs = [word_to_idx[k] for k in context]
        target_idx = word_to_idx[target]
        context_data.append((context_idxs, target_idx))

#We will use a 60 - 20 - 20 split, defined in training function



In [None]:
def trainingCBOW():
    embedding_size = 50
    batch_size = 32
    learning_rate = 0.001
    epochs = 50

    print(f"Training CBOW with Embedding Size: {embedding_size}, LR: {learning_rate}")

    vocab_size = len(word_to_idx)
    net = CBOW(embedding_size=embedding_size, vocab_size=vocab_size)
    net.train()

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)

    context, target = zip(*context_data)
    context_tensor = torch.tensor(context, dtype=torch.long)
    target_tensor = torch.tensor(target, dtype=torch.long)

    final_dataset = TensorDataset(context_tensor, target_tensor)

    dataset_size = len(final_dataset)
    train_size = int(0.6 * dataset_size)
    test_size = dataset_size - train_size

    train_dataset, test_dataset = random_split(final_dataset, [train_size, test_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    for epoch in range(epochs):
        total_loss = 0
        net.train()

        for con, tar in train_loader:
            optimizer.zero_grad()  
            output = net(con)  
            loss = loss_function(output, tar)  
            loss.backward()  
            optimizer.step()  

            total_loss += loss.item()  

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {total_loss:.4f}")

    torch.save(net.state_dict(), os.path.join(os.getcwd(), "cbow_model.pth"))
    print("Model saved!")

    return net

    
    
    

In [15]:
trainingCBOW()

Training CBOW with Embedding Size: 50, LR: 0.001
Epoch 1/50 | Train Loss: 15084.3544
Epoch 2/50 | Train Loss: 13359.2523
Epoch 3/50 | Train Loss: 12498.0950
Epoch 4/50 | Train Loss: 11825.5349
Epoch 5/50 | Train Loss: 11191.7801
Epoch 6/50 | Train Loss: 10573.1579
Epoch 7/50 | Train Loss: 9964.2801
Epoch 8/50 | Train Loss: 9366.3468
Epoch 9/50 | Train Loss: 8781.3365
Epoch 10/50 | Train Loss: 8211.8560
Epoch 11/50 | Train Loss: 7659.3167
Epoch 12/50 | Train Loss: 7126.3360
Epoch 13/50 | Train Loss: 6615.4802
Epoch 14/50 | Train Loss: 6130.5224
Epoch 15/50 | Train Loss: 5673.4378
Epoch 16/50 | Train Loss: 5248.2726
Epoch 17/50 | Train Loss: 4853.9102
Epoch 18/50 | Train Loss: 4493.1320
Epoch 19/50 | Train Loss: 4160.8545
Epoch 20/50 | Train Loss: 3857.3161
Epoch 21/50 | Train Loss: 3578.2639
Epoch 22/50 | Train Loss: 3319.7355
Epoch 23/50 | Train Loss: 3081.4636
Epoch 24/50 | Train Loss: 2859.6389
Epoch 25/50 | Train Loss: 2653.4197
Epoch 26/50 | Train Loss: 2460.4354
Epoch 27/50 | Trai

CBOW(
  (embeddings): Embedding(28514, 50)
  (linear): Linear(in_features=50, out_features=28514, bias=True)
)

In [16]:
os.getcwd()

'c:\\Users\\david\\OneDrive\\Desktop\\Python Projects\\134'