In [1]:
pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
class PoemClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(PoemClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """
        x: (batch_size, seq_len) tensor of token indices
        """
        embeds = self.embedding(x)  # Shape: (batch_size, seq_len, embed_dim)
        pooled = embeds.mean(dim=1) # Mean pooling over sequence length
        result = self.fc(pooled)    # Shape: (batch_size, num_classes)
        return result

In [6]:
df = pd.read_csv("https://happy-research.s3.ap-southeast-1.amazonaws.com/unique_poem_nlp_dataset.csv")

df

Unnamed: 0,Poem,Label
0,"Laugh laugh in the dance, dance whispers in th...",1
1,"Cold lost in the lost, empty whispers in the s...",0
2,"Shadow night in the lost, night whispers in th...",0
3,"Shadow shadow in the farewell, cold whispers i...",0
4,"Dance sky in the morning, laugh whispers in th...",1
...,...,...
95,"Night farewell in the tears, farewell whispers...",0
96,"Tears farewell in the shadow, cold whispers in...",0
97,"Empty night in the farewell, rain whispers in ...",0
98,"Light sun in the sky, bright whispers in the l...",1


# Build Vocabulary from Dataset

In [7]:
def build_vocab(texts):
    word_counter = Counter()
    for text in texts:
        words = text.split()  # Simple word splitting
        word_counter.update(words)

    vocab = {word: idx+1 for idx, (word, _) in enumerate(word_counter.most_common())}
    vocab["<unk>"] = 0  # Assign index 0 to unknown words
    return vocab

In [8]:
vocab = build_vocab(df['Poem'])

vocab

{'in': 1,
 'the': 2,
 'whispers': 3,
 'bright': 4,
 'shadow': 5,
 'sun': 6,
 'love': 7,
 'dark': 8,
 'sky': 9,
 'farewell': 10,
 'light': 11,
 'silent': 12,
 'empty': 13,
 'farewell,': 14,
 'cold': 15,
 'morning': 16,
 'laugh': 17,
 'tears': 18,
 'Light': 19,
 'dance,': 20,
 'lost': 21,
 'silent.': 22,
 'Shadow': 23,
 'night': 24,
 'rain': 25,
 'dream': 26,
 'laugh.': 27,
 'joy': 28,
 'bright.': 29,
 'Laugh': 30,
 'love.': 31,
 'lost,': 32,
 'empty.': 33,
 'rain,': 34,
 'Silent': 35,
 'light,': 36,
 'dance.': 37,
 'Dream': 38,
 'cold.': 39,
 'Night': 40,
 'laugh,': 41,
 'Bright': 42,
 'shadow.': 43,
 'Cold': 44,
 'morning,': 45,
 'Sky': 46,
 'farewell.': 47,
 'sky,': 48,
 'cold,': 49,
 'Tears': 50,
 'Love': 51,
 'shadow,': 52,
 'rain.': 53,
 'Rain': 54,
 'dark,': 55,
 'sun.': 56,
 'joy.': 57,
 'lost.': 58,
 'love,': 59,
 'morning.': 60,
 'dream,': 61,
 'dance': 62,
 'Dance': 63,
 'Lost': 64,
 'silent,': 65,
 'Dark': 66,
 'bright,': 67,
 'Sun': 68,
 'empty,': 69,
 'Empty': 70,
 'dark.':

# Convert text to tensor using our vocabulary

In [9]:
def text_pipeline(text):
    return torch.tensor([vocab.get(word, vocab["<unk>"]) for word in text.split()], dtype=torch.long)

# Custom Dataset

In [10]:
class PoemDataset(Dataset):
    def __init__(self, df):
        self.texts = df['Poem'].apply(text_pipeline).tolist() # Convert to list
        self.labels = torch.tensor(df['Label'].values, dtype=torch.long).tolist() # Convert to list

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Train-Test Split (80% Train, 20% Test)

In [11]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Create datasets

In [12]:
train_dataset = PoemDataset(train_df)
test_dataset = PoemDataset(test_df)

# Hyperparameters

In [13]:
embed_dim = 128
num_classes = 2
batch_size = 32
epochs = 10
learning_rate = 0.001

# Padding function


In [14]:
def collate_batch(batch):
    texts, labels = zip(*batch)
    max_len = max(len(t) for t in texts)
    padded_texts = [torch.cat([t, torch.zeros(max_len - len(t), dtype=torch.long)], dim=0) for t in texts]

    return torch.stack(padded_texts), torch.tensor(labels)



# Create dataset and DataLoader

In [15]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

# Model, Loss, Optimizer

In [16]:
vocab_size = len(vocab)
model = PoemClassifier(vocab_size, embed_dim, num_classes)
criterion = nn.CrossEntropyLoss() # determine how well we did when we input to x
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training Loop


In [17]:
for epoch in range(epochs):
    total_loss = 0
    for texts, labels in train_loader:
        print(texts)
        optimizer.zero_grad()
        outputs = model.forward(texts)
        print(outputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

print("Training complete!")

tensor([[44, 13,  1,  2, 49, 13,  3,  1,  2, 39],
        [66, 15,  1,  2, 55, 25,  3,  1,  2, 80],
        [46,  9,  1,  2, 61,  7,  3,  1,  2, 37],
        [23,  8,  1,  2, 49, 12,  3,  1,  2, 53],
        [23,  8,  1,  2, 32, 18,  3,  1,  2, 22],
        [38, 26,  1,  2, 20, 26,  3,  1,  2, 31],
        [23,  5,  1,  2, 78, 15,  3,  1,  2, 58],
        [35, 13,  1,  2, 49, 15,  3,  1,  2, 39],
        [68,  6,  1,  2, 74,  4,  3,  1,  2, 72],
        [70, 24,  1,  2, 14, 25,  3,  1,  2, 22],
        [35, 18,  1,  2, 14,  5,  3,  1,  2, 79],
        [38, 16,  1,  2, 20,  6,  3,  1,  2, 56],
        [50, 15,  1,  2, 55, 12,  3,  1,  2, 39],
        [51, 11,  1,  2, 61,  7,  3,  1,  2, 27],
        [70, 21,  1,  2, 14, 15,  3,  1,  2, 71],
        [23,  8,  1,  2, 52, 25,  3,  1,  2, 43],
        [73,  9,  1,  2, 36, 11,  3,  1,  2, 31],
        [63,  6,  1,  2, 59,  7,  3,  1,  2, 29],
        [66,  8,  1,  2, 34, 13,  3,  1,  2, 79],
        [19, 16,  1,  2, 45, 62,  3,  1,  2, 29],


# Evaluation Function

In [18]:
def evaluate(model, dataloader):
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():
        for texts, labels in dataloader:
            outputs = model(texts)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total * 100
    print(f"Test Accuracy: {accuracy:.2f}%")
    model.train()  # Set model back to training mode

# Evaluate on Test Data
evaluate(model, test_loader)

Test Accuracy: 100.00%
