In [5]:
import os
import re
from collections import Counter

import nltk
import psycopg2
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
# Tokenizer data required by word_tokenize; nltk reports the package as
# up-to-date and skips the download when it is already installed.
nltk.download('punkt_tab')

# -----------------------------
# 1. Load and preprocess Hacker News data
# -----------------------------
# The connection string embeds the database user and password, so it is read
# from the environment instead of being committed with the notebook.
HN_DATABASE_URL = os.environ.get("HN_DATABASE_URL")
if not HN_DATABASE_URL:
    raise RuntimeError(
        "Set the HN_DATABASE_URL environment variable to the Postgres "
        "connection string for the Hacker News database."
    )

print("Fetching Hacker News titles...")
conn = psycopg2.connect(HN_DATABASE_URL)
try:
    # The cursor context manager closes the cursor; the connection itself is
    # closed in `finally` so it is released even if the query fails.
    with conn.cursor() as cur:
        cur.execute("SELECT title, score FROM hacker_news.items WHERE title IS NOT NULL AND score IS NOT NULL LIMIT 10000;")
        # Rows are (title, score) tuples.
        data = cur.fetchall()
finally:
    conn.close()

[nltk_data] Downloading package punkt_tab to /home/usa/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Fetching Hacker News titles...


In [6]:
def preprocess(text):
    """Normalize a title and split it into word tokens.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, and the remainder is handed to
    NLTK's ``word_tokenize``.

    Args:
        text: Raw title string.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    tokens = word_tokenize(without_punctuation)
    return tokens

In [7]:
# Split the (title, score) rows into two parallel lists: the tokenized title
# and the score as a float. Position i in both lists refers to the same row.
tokenized_titles = list(map(preprocess, (row[0] for row in data)))
upvotes = list(map(float, (row[1] for row in data)))

In [8]:
# Show only a small preview; printing every tokenized title floods the cell
# output and bloats the saved notebook.
print(tokenized_titles[:5])



In [9]:
# Number of tokenized titles; one entry was produced per row in `data`.
len(tokenized_titles)

10000

In [14]:
# -----------------------------
# 2. Build Vocabulary
# -----------------------------
# Minimum number of occurrences (across all titles) a word needs in order to
# get its own vocabulary entry; rarer words are left out of `word_to_ix`.
MIN_WORD_FREQ = 5

# Flatten the per-title token lists into one stream and count each word.
all_tokens = [token for title in tokenized_titles for token in title]
token_counts = Counter(all_tokens)

# Vocabulary order follows first appearance in the token stream, so the
# word -> index mapping is stable for a given `tokenized_titles`.
vocab = [word for word, freq in token_counts.items() if freq >= MIN_WORD_FREQ]
word_to_ix = {word: i for i, word in enumerate(vocab)}
vocab_size = len(vocab)

print(f"Vocab size: {vocab_size}")



Vocab size: 2588


In [15]:
# -----------------------------
# 3. Prepare CBOW training data
# -----------------------------
# Each training pair is (context word ids, centre word id), where the context
# is the `window_size` ids on either side of the centre word.
window_size = 2
cbow_data = []

for tokens in tokenized_titles:
    # Out-of-vocabulary words are dropped before the windows are formed.
    ids = [word_to_ix[tok] for tok in tokens if tok in word_to_ix]
    # Only positions with a full window on both sides become centre words.
    for center in range(window_size, len(ids) - window_size):
        left = ids[center - window_size:center]
        right = ids[center + 1:center + window_size + 1]
        cbow_data.append((left + right, ids[center]))

print(f"Training CBOW pairs: {len(cbow_data)}")


Training CBOW pairs: 24174


In [16]:

# -----------------------------
# 4. CBOW Model
# -----------------------------
# Width of each word vector; used as the CBOW embedding size and as the
# input width of the upvote regressor.
embedding_dim = 100

class CBOW(nn.Module):
    """Continuous bag-of-words model: scores every vocabulary word from a context.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits.
        embedding_dim: Width of each word embedding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_idxs):
        """Return vocabulary logits for the averaged context embedding.

        Args:
            context_idxs: Long tensor of word indices shaped
                (batch_size, context_len), or (context_len,) for a single
                unbatched context.

        Returns:
            Logits shaped (batch_size, vocab_size), or (vocab_size,) for an
            unbatched input.
        """
        embeds = self.embeddings(context_idxs)  # (..., context_len, emb_dim)
        # Average over the context axis, addressed from the end so that both
        # batched and unbatched index tensors are reduced along the same axis.
        avg_embed = embeds.mean(dim=-2)         # (..., emb_dim)
        return self.linear(avg_embed)           # (..., vocab_size)

# Seed PyTorch's RNG before the first stochastic step (weight initialisation)
# so that a fresh Restart & Run All reproduces the same results.
SEED = 42
torch.manual_seed(SEED)

# CBOW network, trained as a classifier over the vocabulary with Adam.
cbow_model = CBOW(vocab_size, embedding_dim)
cbow_loss_fn = nn.CrossEntropyLoss()
cbow_optimizer = optim.Adam(cbow_model.parameters(), lr=0.001)

In [None]:
# -----------------------------
# 5. Train CBOW Model
# -----------------------------
# Convert the (context, target) pairs to tensors once, instead of building
# two fresh tensors from Python lists for every sample in every epoch.
context_tensor = torch.tensor([context for context, _ in cbow_data], dtype=torch.long)
target_tensor = torch.tensor([target for _, target in cbow_data], dtype=torch.long)

print("Training CBOW model on HN titles...")
for epoch in range(5):
    total_loss = 0
    # One optimizer step per training pair, in the order the pairs were built.
    for i in range(len(cbow_data)):
        # Length-1 slices keep the leading batch axis the model expects:
        # (1, context_len) for the context and (1,) for the target.
        context_var = context_tensor[i:i + 1]
        target_var = target_tensor[i:i + 1]

        # The optimizer owns every CBOW parameter, so this clears the same
        # gradients as zeroing them through the model.
        cbow_optimizer.zero_grad()
        logits = cbow_model(context_var)
        loss = cbow_loss_fn(logits, target_var)
        loss.backward()
        cbow_optimizer.step()
        total_loss += loss.item()
    # Reported value is the sum of per-sample losses over the epoch.
    print(f"Epoch {epoch+1}, CBOW Loss: {total_loss:.2f}")


Training CBOW model on HN titles...


In [None]:

# -----------------------------
# 6. Create averaged title embeddings using trained CBOW
# -----------------------------
title_embeddings = []
valid_labels = []

# The CBOW embeddings are used here as fixed features. Looking them up under
# no_grad keeps X detached from the CBOW autograd graph; otherwise every
# regression backward pass would propagate into that graph, which is released
# after its first backward.
with torch.no_grad():
    for tokens, label in zip(tokenized_titles, upvotes):
        token_ids = [word_to_ix[t] for t in tokens if t in word_to_ix]
        # Titles with no in-vocabulary word have nothing to average and are
        # skipped together with their label.
        if token_ids:
            vectors = cbow_model.embeddings(torch.tensor(token_ids))
            avg_vector = vectors.mean(dim=0)
            title_embeddings.append(avg_vector)
            valid_labels.append(label)

# X: (num_titles_kept, embedding_dim); y: (num_titles_kept, 1)
X = torch.stack(title_embeddings)
y = torch.tensor(valid_labels, dtype=torch.float32).unsqueeze(1)


In [None]:

# -----------------------------
# 7. Dataset and Dataloader
# -----------------------------
class HNTitleDataset(Dataset):
    """Pairs each row of a feature collection with the matching label row.

    Args:
        X: Indexable collection of features, one entry per sample.
        y: Indexable collection of labels, aligned with ``X``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the length of ``X``."""
        sample_count = len(self.X)
        return sample_count

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

# Wrap the feature/label tensors and serve them as shuffled mini-batches of 32.
dataset = HNTitleDataset(X=X, y=y)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)


In [None]:

# -----------------------------
# 8. Regression Model
# -----------------------------
class UpvotePredictor(nn.Module):
    """Feed-forward regressor that maps a feature vector to a single value.

    The network is ``input_dim -> 128 -> 64 -> 1`` with a ReLU after each of
    the two hidden layers and no activation on the output.

    Args:
        input_dim: Width of the input feature vector.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        prediction = self.model(x)
        return prediction

# Regressor over the title embeddings, trained with mean-squared error and Adam.
model = UpvotePredictor(input_dim=embedding_dim)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)



In [None]:
# -----------------------------
# 9. Train Regression Model
# -----------------------------
print("Training upvote regression model...")
for epoch in range(10):
    # Running sum of the per-batch losses for this epoch.
    total_loss = 0
    for features, labels in dataloader:
        # Clear stale gradients, then forward, backward and parameter update.
        optimizer.zero_grad()
        batch_loss = loss_fn(model(features), labels)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

