In [10]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import psycopg2
from urllib.parse import urlparse
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import re
from collections import Counter
from datasets import load_dataset
import json



In [7]:
# -----------------------------
# 1. LOAD 80% TRAINING & VALIDATION DATA - LIMITED TO 100K ROWS & RECENT DATA (>2023)
# -----------------------------
print("Fetching Hacker News data...")
conn = psycopg2.connect("postgres://sy91dhb:g5t49ao@178.156.142.230:5432/hd64m1ki")
cur = conn.cursor()
cur.execute('''SELECT i.title, i.url, i.score, u.id, u.karma 
            FROM hacker_news.items AS i
            LEFT JOIN hacker_news.users AS u 
                ON i.by = u.id
            WHERE title IS NOT NULL AND score IS NOT NULL AND (abs(hashtext(i.id::text)) % 100) >= 20
            LIMIT 100000;
            ''')
rows = cur.fetchall()
conn.close()

titles, urls, scores, by, karmas = zip(*rows)


# -----------------------------
# 2. Preprocess titles
# -----------------------------
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9 ]+', '', text)
    return text.split()

tokenized_titles = [preprocess(title) for title in titles]
word_counts = Counter(word for title in tokenized_titles for word in title)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for word, i in word_to_ix.items()}
vocab_size = len(word_to_ix)

Fetching Hacker News data...


In [8]:
# -----------------------------
# 3. Load Vocabulary
# -----------------------------
with open("vocab-w2-200titles.json", "r", encoding="utf-8") as f:
    word_to_ix = json.load(f)

ix_to_word = {int(i): w for w, i in word_to_ix.items()}
vocab_size = len(word_to_ix)


# -----------------------------
# 3. Load Pre-trained Embeddings
# -----------------------------
embed_dim = 300  
embeddings = torch.load("embeddings-w2-200titles-300dim-10e.pt", map_location='cpu')  # Shape: [vocab_size, embed_dim]
assert embeddings.shape[0] == vocab_size, "Vocab size mismatch!"

In [12]:
# -----------------------------
# 4. CBOW Model
# -----------------------------
class CBOW(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.linear = nn.Linear(embed_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).mean(dim=1)
        return self.linear(embeds)

cbow_model = CBOW(vocab_size, embed_dim)
cbow_model.embeddings.weight.data.copy_(embeddings)
cbow_model.embeddings.weight.requires_grad = False  


# -----------------------------
# 6. Create title embeddings
# -----------------------------
title_embeddings = []
valid_indices = []
for i, tokens in enumerate(tokenized_titles):
    token_ids = [word_to_ix[t] for t in tokens if t in word_to_ix]
    if token_ids:
        with torch.no_grad():
            vectors = cbow_model.embeddings(torch.tensor(token_ids))
            avg_vector = vectors.mean(dim=0)
        title_embeddings.append(avg_vector)
        valid_indices.append(i)

X_title = torch.stack(title_embeddings)
y = torch.tensor([scores[i] for i in valid_indices], dtype=torch.float32).unsqueeze(1)



karmas_tensor = torch.tensor([karmas[i] for i in valid_indices], dtype=torch.float32).unsqueeze(1)  # NEW
user_ids = [by[i] for i in valid_indices]  # NEW
user_karma_lookup = {user_ids[i]: karmas_tensor[i].item() for i in range(len(user_ids))}  # NEW


# -----------------------------
# 7. Process domain names
# -----------------------------
parsed_domains = []
for url in urls:
    try:
        parsed = urlparse(url)
        domain = parsed.netloc or 'unknown'
    except:
        domain = 'unknown'
    parsed_domains.append(domain)

le = LabelEncoder()
domain_ids = le.fit_transform(parsed_domains)
domain_ids_tensor = torch.tensor(domain_ids, dtype=torch.long)[valid_indices]
domain_vocab_size = len(le.classes_)
domain_embed_dim = 3


# -----------------------------
# 8. Regression Model
# -----------------------------


class UpvotePredictor(nn.Module):
    def __init__(self, title_embed_dim, domain_vocab_size, domain_embed_dim):
        super().__init__()
        self.domain_embedding = nn.Embedding(domain_vocab_size, domain_embed_dim)
        self.model = nn.Sequential(
            nn.Linear(title_embed_dim + domain_embed_dim + 1, 128),  # CHANGED
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, title_embed, domain_id, karma):  # CHANGED
        domain_vec = self.domain_embedding(domain_id)
        x = torch.cat([title_embed, domain_vec, karma], dim=1)  # CHANGED
        return self.model(x)



model = UpvotePredictor(embed_dim, domain_vocab_size, domain_embed_dim)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


# -----------------------------
# 8b. Train-validation split
# -----------------------------

# Assume you already have these full N-length tensors from V8 preprocessing:
#   X_title           â€“ (N, title_embed_dim) torch.Tensor  
#   domain_ids_tensor â€“ (N,)               torch.LongTensor
#   karma_tensor      â€“ (N,)               torch.FloatTensor
#   y                 â€“ (N,)               torch.FloatTensor (scores)

# Build an array of row indices
all_idx = list(range(y.size(0)))

# Split indices into 80% train / 20% val
train_idx, val_idx = train_test_split(
    all_idx,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# Index into each tensor
X_title_train      = X_title[train_idx]
X_title_val        = X_title[val_idx]

domain_ids_train   = domain_ids_tensor[train_idx]
domain_ids_val     = domain_ids_tensor[val_idx]

karmas_train        = karmas_tensor[train_idx]
karmas_val          = karmas_tensor[val_idx]

y_train            = y[train_idx]
y_val              = y[val_idx]

# Print sizes of training and validation sets
print(f"Train / Val sizes: {len(train_idx)} / {len(val_idx)}")


# -----------------------------
# 9. Dataset and Training
# -----------------------------
class HNDataset(Dataset):
    def __init__(self, title_embeds, domain_ids, karmas, labels):  # CHANGED
        self.title_embeds = title_embeds
        self.domain_ids = domain_ids
        self.karmas = karmas  # NEW
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.title_embeds[idx], self.domain_ids[idx], self.karmas[idx], self.labels[idx]  # CHANGED

dataset = HNDataset(X_title_train, domain_ids_train, karmas_train, y_train)  # CHANGED
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

print("Training regression model...")
for epoch in range(40):
    total_loss = 0
    for title_embed, domain_id, karma, label in dataloader:  # CHANGED
        pred = model(title_embed, domain_id, karma)  # CHANGED
        loss = loss_fn(pred, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss:.4f}")



# -----------------------------
# 10. Prediction Function
# -----------------------------
def predict_upvotes(title, url, user_id):  # CHANGED
    tokens = preprocess(title)
    token_ids = [word_to_ix.get(t) for t in tokens if t in word_to_ix]
    token_ids = [i for i in token_ids if i is not None]
    if not token_ids:
        return None

    with torch.no_grad():
        vectors = cbow_model.embeddings(torch.tensor(token_ids))
        avg_embed = vectors.mean(dim=0)

    try:
        parsed = urlparse(url)
        domain = parsed.netloc or 'unknown'
    except:
        domain = 'unknown'

    try:
        domain_id = le.transform([domain])[0]
    except:
        domain_id = 0  # fallback

    domain_tensor = torch.tensor([domain_id], dtype=torch.long)
    karma_value = user_karma_lookup.get(user_id, 0)  # NEW
    karma_tensor = torch.tensor([[karma_value]], dtype=torch.float32)  # NEW

    model.eval()
    with torch.no_grad():
        prediction = model(avg_embed.unsqueeze(0), domain_tensor, karma_tensor).item()  # CHANGED
    return max(prediction, 0)




Train / Val sizes: 78465 / 19617
Training regression model...
Epoch 0, Loss: 6253856.9892
Epoch 1, Loss: 5771655.0751
Epoch 2, Loss: 5683041.1936
Epoch 3, Loss: 5661360.7728
Epoch 4, Loss: 5641861.5457
Epoch 5, Loss: 5641470.2598
Epoch 6, Loss: 5610264.6250
Epoch 7, Loss: 5562382.0188
Epoch 8, Loss: 5527211.4172
Epoch 9, Loss: 5501900.3622
Epoch 10, Loss: 5476339.7773
Epoch 11, Loss: 5429195.1267
Epoch 12, Loss: 5395250.6327
Epoch 13, Loss: 5386188.8530
Epoch 14, Loss: 5346337.8243
Epoch 15, Loss: 5323652.2957
Epoch 16, Loss: 5271429.8059
Epoch 17, Loss: 5237640.1682
Epoch 18, Loss: 5180524.0599
Epoch 19, Loss: 5205181.3213
Epoch 20, Loss: 5172474.0960
Epoch 21, Loss: 5108987.4730
Epoch 22, Loss: 5815037.4241
Epoch 23, Loss: 4988263.6972
Epoch 24, Loss: 5135064.5453
Epoch 25, Loss: 5022083.6730
Epoch 26, Loss: 4915307.4249
Epoch 27, Loss: 4884120.4278
Epoch 28, Loss: 4835498.2604
Epoch 29, Loss: 4786737.5684
Epoch 30, Loss: 4783229.3823
Epoch 31, Loss: 4753886.8878
Epoch 32, Loss: 4675

In [14]:
# -----------------------------
# 11. Example
# -----------------------------
print("\nPredicted upvotes for new post:")
example_title = "Generates $1 billion today!"
example_title = "Close Votes Are A Feature, Not A Bug"
example_title = "what do you mean? okay"
example_title = "Show HN: AI Hacker generates $1 billion"
example_url = "https://openai.com"
user_id = "hackerl33t"
pred = predict_upvotes(example_title, example_url, user_id)
print(f"{example_title} â†’ Predicted Upvotes: {pred:.2f}")
print("Eat. Pray. Deploy. Blame the data. ðŸ˜Ž Â© Bayesian Buccaneers")


Predicted upvotes for new post:
Show HN: AI Hacker generates $1 billion â†’ Predicted Upvotes: 1.68
Eat. Pray. Deploy. Blame the data. ðŸ˜Ž Â© Bayesian Buccaneers


In [15]:
# -----------------------------
# 12. Validation on held-out split
# -----------------------------

# Build a DataLoader for the validation set
dataset_val = HNDataset(
    X_title_val,
    domain_ids_val,
    karmas_val,
    y_val
)
val_loader = DataLoader(dataset_val, batch_size=32, shuffle=False)

model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for title_emb, dom_id, krm, target in val_loader:
        pred = model(title_emb, dom_id, krm).squeeze()
        pred = pred.view(-1)
        all_preds.extend(pred.tolist())
        all_targets.extend(target.tolist())

# Flatten any singleton lists in all_targets (and preds, just in case)
all_preds_flat   = [p[0] if isinstance(p, list) else p   for p in all_preds]
all_targets_flat = [t[0] if isinstance(t, list) else t   for t in all_targets]

# Compute metrics
mse = mean_squared_error(all_targets, all_preds)
r2  = r2_score(all_targets, all_preds)
print(f"Validation MSE: {mse:.4f}")
print(f"Validation RÂ² : {r2:.4f}")


# Optional: inspect a few examples
print("\nSample predictions vs. actuals:")
for i in range(100):
    print(f"  Predicted: {all_preds_flat[i]:.2f}  |  Actual: {all_targets_flat[i]:.0f}")


Validation MSE: 2391.8670
Validation RÂ² : -0.2494

Sample predictions vs. actuals:
  Predicted: 4.66  |  Actual: 2
  Predicted: 7.09  |  Actual: 3
  Predicted: 4.21  |  Actual: 1
  Predicted: 0.52  |  Actual: 1
  Predicted: 2.04  |  Actual: 3
  Predicted: 15.76  |  Actual: 1
  Predicted: -0.75  |  Actual: 1
  Predicted: 15.76  |  Actual: 1
  Predicted: 5.64  |  Actual: 1
  Predicted: 14.48  |  Actual: 1
  Predicted: 11.57  |  Actual: 1
  Predicted: 6.90  |  Actual: 3
  Predicted: 18.33  |  Actual: 2
  Predicted: 5.72  |  Actual: 1
  Predicted: 1.57  |  Actual: 1
  Predicted: -1.59  |  Actual: 1
  Predicted: 2.12  |  Actual: 1
  Predicted: 93.37  |  Actual: 1
  Predicted: 5.82  |  Actual: 1
  Predicted: 15.76  |  Actual: 3
  Predicted: 2.69  |  Actual: 2
  Predicted: 12.02  |  Actual: 1
  Predicted: 4.79  |  Actual: 1
  Predicted: 10.84  |  Actual: 2
  Predicted: 15.76  |  Actual: 19
  Predicted: 15.76  |  Actual: 3
  Predicted: 11.06  |  Actual: 3
  Predicted: 3.32  |  Actual: 2
  Pre