In [1]:

import json
import os
import re
from collections import Counter
from urllib.parse import urlparse

import numpy as np
import psycopg2
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader



In [2]:
# -----------------------------
# 1. LOAD TRAINING & VALIDATION DATA (~80% HASH BUCKET, LIMITED TO 100K ROWS)
# -----------------------------
# The connection string is read from the environment so database credentials
# never live in the notebook source or its saved outputs.
HN_DATABASE_URL_ENV = "HN_DATABASE_URL"
database_url = os.environ.get(HN_DATABASE_URL_ENV)
if not database_url:
    raise RuntimeError(
        f"Set the {HN_DATABASE_URL_ENV} environment variable to the Postgres "
        "connection string (postgres://user:password@host:port/dbname)."
    )

print("Fetching Hacker News data...")
conn = psycopg2.connect(database_url)
try:
    with conn.cursor() as cur:
        # Rows whose id hashes into buckets 20..99 (80 of 100 buckets) are used
        # here. The query has no date filter and no ORDER BY, so which 100,000
        # rows come back is up to the database.
        cur.execute('''SELECT i.title, i.url, i.score, u.id, u.karma
            FROM hacker_news.items AS i
            LEFT JOIN hacker_news.users AS u
                ON i.by = u.id
            WHERE title IS NOT NULL AND score IS NOT NULL AND karma IS NOT NULL AND (abs(hashtext(i.id::text)) % 100) >= 20
            LIMIT 100000;
            ''')
        rows = cur.fetchall()
finally:
    # Always release the connection, even if the query fails.
    conn.close()

if not rows:
    raise RuntimeError("The Hacker News query returned no rows; nothing to train on.")

# Transpose the row tuples into one tuple per column; `by` holds u.id.
titles, urls, scores, by, karmas = zip(*rows)


# -----------------------------
# 2. Preprocess titles
# -----------------------------
def preprocess(text):
    """Turn a title into a list of lowercase alphanumeric tokens.

    The text is lower-cased, every run of characters other than a-z, 0-9 and
    the space character is deleted (not replaced by a space), and the result
    is split on whitespace.
    """
    cleaned = re.sub(r'[^a-z0-9 ]+', '', text.lower())
    return cleaned.split()

# Tokenise every fetched title once; later cells reuse tokenized_titles.
tokenized_titles = list(map(preprocess, titles))

# Corpus-wide token frequencies.
word_counts = Counter()
for tokens in tokenized_titles:
    word_counts.update(tokens)

# Most frequent word gets index 0; ties keep first-seen order.
vocab = [word for word, _ in word_counts.most_common()]
word_to_ix = dict(zip(vocab, range(len(vocab))))
ix_to_word = dict(enumerate(vocab))
vocab_size = len(word_to_ix)

Fetching Hacker News data...


In [3]:
# -----------------------------
# 3. Load Vocabulary
# -----------------------------
# Overwrites word_to_ix / ix_to_word / vocab_size with the vocabulary stored on
# disk (presumably the one the embeddings below were trained with - the shape
# assertion at the end checks that the two agree in size).
with open("vocab-w2-200titles.json", "r", encoding="utf-8") as f:
    word_to_ix = json.load(f)

ix_to_word = {int(i): w for w, i in word_to_ix.items()}
vocab_size = len(word_to_ix)

# -----------------------------
# 3b. Load Pre-trained Embeddings
# -----------------------------
embed_dim = 300
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; consider weights_only=True if the installed torch supports it.
embeddings = torch.load("embeddings-w2-200titles-300dim-10e.pt", map_location='cpu')  # Shape: [vocab_size, embed_dim]

# Fail early with a clear message instead of a shape error further down.
assert embeddings.shape[0] == vocab_size, "Vocab size mismatch!"
assert embeddings.shape[1] == embed_dim, "Embedding dimension mismatch!"

In [None]:
# -----------------------------
# 4. CBOW Model
# -----------------------------
class CBOW(nn.Module):
    """Continuous bag-of-words network: embed, average over dim 1, project to vocabulary logits."""

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.linear = nn.Linear(in_features=embed_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Map a batch of token-id rows to one row of vocabulary logits each."""
        context_vectors = self.embeddings(inputs)
        pooled = torch.mean(context_vectors, dim=1)
        logits = self.linear(pooled)
        return logits

# The CBOW instance is only used as a frozen lookup table: its embedding matrix
# is overwritten with the pre-trained vectors and excluded from gradients.
cbow_model = CBOW(vocab_size, embed_dim)
cbow_model.embeddings.weight.data.copy_(embeddings)
cbow_model.embeddings.weight.requires_grad_(False)


# -----------------------------
# 6. Create title embeddings
# -----------------------------
# A title is represented by the mean embedding of its in-vocabulary tokens.
# Titles with no known token are skipped; valid_indices records which original
# rows were kept so the other columns can be filtered the same way.
title_embeddings, valid_indices = [], []
with torch.no_grad():
    for row, tokens in enumerate(tokenized_titles):
        known_ids = [word_to_ix[tok] for tok in tokens if tok in word_to_ix]
        if not known_ids:
            continue
        title_embeddings.append(cbow_model.embeddings(torch.tensor(known_ids)).mean(dim=0))
        valid_indices.append(row)

X_title = torch.stack(title_embeddings)  # (N, embed_dim)
y = torch.tensor([scores[row] for row in valid_indices], dtype=torch.float32).unsqueeze(1)  # (N, 1)


# Author features for the kept rows: raw karma as an (N, 1) float column, the
# usernames, and a username -> karma map (a later row for the same user wins).
karmas_tensor = torch.tensor([karmas[row] for row in valid_indices], dtype=torch.float32).unsqueeze(1)
user_ids = [by[row] for row in valid_indices]
user_karma_lookup = dict(zip(user_ids, karmas_tensor.squeeze(1).tolist()))


# -----------------------------
# 7. Process domain names
# -----------------------------
# Reduce every URL to its network location, lower-cased and without a leading
# "www."; rows with a missing, empty or unparsable URL share the 'unknown' bucket.
parsed_domains = []
for url in urls:
    try:
        domain = (urlparse(url).netloc if url else '') or 'unknown'
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError for malformed URLs (e.g. a bad IPv6
        # literal); the other two cover non-string values. Anything else,
        # such as KeyboardInterrupt, is deliberately allowed to propagate.
        domain = 'unknown'
    # Clean and normalize domain names
    domain = domain.lower()
    if domain.startswith("www."):
        domain = domain[4:]
    parsed_domains.append(domain)

# The encoder is fitted on every fetched row, then the ids are restricted to
# the rows that produced a title embedding.
le = LabelEncoder()
domain_ids = le.fit_transform(parsed_domains)
domain_ids_tensor = torch.tensor(domain_ids, dtype=torch.long)[valid_indices]
domain_vocab_size = len(le.classes_)
domain_embed_dim = 3


# -----------------------------
# 7b. Process usernames
# -----------------------------
# Map each username to an integer id, in the same row order as user_ids.
user_le = LabelEncoder().fit(user_ids)
user_ids_encoded = user_le.transform(user_ids)
user_ids_tensor = torch.tensor(user_ids_encoded, dtype=torch.long)

user_embed_dim = 4  # chosen based on typical embedding heuristics
user_vocab_size = len(user_le.classes_)


# -----------------------------
# 8. Regression Model
# -----------------------------


class UpvotePredictor(nn.Module):
    """MLP regressor over a title embedding, learned domain/user embeddings and karma.

    The four inputs are concatenated along dim 1 and passed through a
    128 -> 64 -> 1 network with ReLU activations, giving one value per row.
    """

    def __init__(self, title_embed_dim, domain_vocab_size, domain_embed_dim, users_vocab_size, users_embed_dim):
        super().__init__()
        self.domain_embedding = nn.Embedding(domain_vocab_size, domain_embed_dim)
        self.user_embedding = nn.Embedding(users_vocab_size, users_embed_dim)

        # +1 is the single karma column appended in forward().
        in_features = title_embed_dim + domain_embed_dim + users_embed_dim + 1
        layers = [
            nn.Linear(in_features, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, title_embed, domain_id, user_id, karma):
        """Concatenate the four feature groups and run them through the MLP."""
        features = (
            title_embed,
            self.domain_embedding(domain_id),
            self.user_embedding(user_id),
            karma,
        )
        return self.model(torch.cat(features, dim=1))



# Seed torch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable across Restart & Run All, in line with the fixed random_state
# used for the train/validation split.
torch.manual_seed(42)

model = UpvotePredictor(embed_dim, domain_vocab_size, domain_embed_dim, user_vocab_size, user_embed_dim)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)    # CAN CHANGE LR TO 0.1 FOR DYNAMIC LR WITH SCHEDULER

### Uncomment to activate learning rate scheduler for dynamic LR ###
# Wrap with a StepLR scheduler:
#    every 5 epochs, multiply the LR by 0.5
#scheduler = StepLR(optimizer, step_size=5, gamma=0.5)

# -----------------------------
# 8b. Train-validation split
# -----------------------------

# Inputs, all aligned on the N rows that produced a title embedding:
#   X_title           – (N, title_embed_dim) float tensor
#   domain_ids_tensor – (N,)   long tensor
#   user_ids_tensor   – (N,)   long tensor
#   karmas_tensor     – (N, 1) float tensor
#   y                 – (N, 1) float tensor (scores)

# Row positions 0..N-1, shuffled into an 80% / 20% split with a fixed seed
all_idx = list(range(y.size(0)))
train_idx, val_idx = train_test_split(all_idx, test_size=0.2, random_state=42, shuffle=True)

# Slice every aligned tensor with the same two index lists
X_title_train, X_title_val = X_title[train_idx], X_title[val_idx]
domain_ids_train, domain_ids_val = domain_ids_tensor[train_idx], domain_ids_tensor[val_idx]
user_ids_train, user_ids_val = user_ids_tensor[train_idx], user_ids_tensor[val_idx]
karmas_train, karmas_val = karmas_tensor[train_idx], karmas_tensor[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

# Report the split sizes
print(f"Train / Val sizes: {len(train_idx)} / {len(val_idx)}")
print('\n')


# -----------------------------
# 9. Dataset and Training
# -----------------------------
class HNDataset(Dataset):
    """Row-aligned view over the five training columns.

    Indexing returns the tuple
    (title_embed, domain_id, user_id, karma, label) for one row.
    """

    def __init__(self, title_embeds, domain_ids, user_ids, karmas, labels):
        self.title_embeds = title_embeds
        self.domain_ids = domain_ids
        self.user_ids = user_ids
        self.karmas = karmas
        self.labels = labels

    def __len__(self):
        # The label column defines the number of rows.
        return len(self.labels)

    def __getitem__(self, idx):
        columns = (self.title_embeds, self.domain_ids, self.user_ids, self.karmas, self.labels)
        return tuple(column[idx] for column in columns)

dataset = HNDataset(X_title_train, domain_ids_train, user_ids_train, karmas_train, y_train)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

print("Training regression model...")
for epoch in range(40):
    # Running sum of the per-batch MSE values (not averaged over batches).
    total_loss = 0
    for *features, label in dataloader:
        # features = (title_embed, domain_id, user_id, karma), in forward() order
        optimizer.zero_grad()
        loss = loss_fn(model(*features), label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

    ### Uncomment to activate learning rate scheduler ###
    #scheduler.step()
    #print(f"Epoch {epoch:2d} — LR: {scheduler.get_last_lr()[0]:.4f}")



# -----------------------------
# 10. Prediction Function
# -----------------------------
def _domain_from_url(url):
    """Return the lower-cased network location of ``url`` without a leading "www.".

    Mirrors the normalisation applied to the training URLs. Falls back to
    'unknown' when the URL is missing, has no network location, or cannot be
    parsed.
    """
    try:
        domain = (urlparse(url).netloc if url else '') or 'unknown'
    except (ValueError, TypeError, AttributeError):
        domain = 'unknown'
    domain = domain.lower()
    if domain.startswith("www."):
        domain = domain[4:]
    return domain


def _encode_label(encoder, label, default=0):
    """Encode ``label`` with a fitted label encoder; return ``default`` if it was never seen."""
    try:
        return int(encoder.transform([label])[0])
    except (ValueError, TypeError):
        return default


def predict_upvotes(title, url, user_id):
    """Predict the upvote count for a single post.

    Args:
        title: Raw title string; tokenised with ``preprocess``.
        url: Raw URL string (may be empty or None).
        user_id: Raw username of the poster.

    Returns:
        The model output rounded to the nearest integer and clamped to at
        least 1, or ``None`` when no token of the title is in the vocabulary.
    """
    token_ids = [word_to_ix[tok] for tok in preprocess(title) if tok in word_to_ix]
    if not token_ids:
        return None

    # Same title representation as in training: mean of the token embeddings.
    with torch.no_grad():
        avg_embed = cbow_model.embeddings(torch.tensor(token_ids)).mean(dim=0)

    # NOTE: unseen domains and users fall back to id 0, which is whatever
    # class the encoder happened to sort first - not a dedicated "unknown" slot.
    domain_id = _encode_label(le, _domain_from_url(url))
    user_enc = _encode_label(user_le, user_id)

    domain_tensor = torch.tensor([domain_id], dtype=torch.long)
    user_tensor = torch.tensor([user_enc], dtype=torch.long)
    # Users absent from the training rows are given a karma of 0.
    karma_value = user_karma_lookup.get(user_id, 0)
    karma_tensor = torch.tensor([[karma_value]], dtype=torch.float32)

    model.eval()
    with torch.no_grad():
        prediction = model(avg_embed.unsqueeze(0), domain_tensor, user_tensor, karma_tensor).item()

    return max(round(prediction), 1)


Train / Val sizes: 78551 / 19638


Training regression model...
Epoch 0, Loss: 13006680.1485
Epoch 1, Loss: 11821869.4514
Epoch 2, Loss: 11829296.1812
Epoch 3, Loss: 11656421.9206
Epoch 4, Loss: 11660789.3601
Epoch 5, Loss: 11566518.1992
Epoch 6, Loss: 11376788.1167
Epoch 7, Loss: 11298377.6361
Epoch 8, Loss: 11329899.2690
Epoch 9, Loss: 11094391.6639
Epoch 10, Loss: 10932208.9637
Epoch 11, Loss: 10979498.1269
Epoch 12, Loss: 10704100.0467
Epoch 13, Loss: 10532755.1593
Epoch 14, Loss: 10318310.4890
Epoch 15, Loss: 10223898.4981
Epoch 16, Loss: 10063215.8978
Epoch 17, Loss: 9969270.1662
Epoch 18, Loss: 10076711.1704
Epoch 19, Loss: 9711859.2655
Epoch 20, Loss: 9686440.7860
Epoch 21, Loss: 9433740.7262
Epoch 22, Loss: 9440820.8809
Epoch 23, Loss: 9227475.2588
Epoch 24, Loss: 9357782.2570
Epoch 25, Loss: 9801562.2012
Epoch 26, Loss: 9383798.5309
Epoch 27, Loss: 8997426.0554
Epoch 28, Loss: 8682737.0963
Epoch 29, Loss: 8714353.7903
Epoch 30, Loss: 9001108.7360
Epoch 31, Loss: 8834451.8839


In [5]:
# -----------------------------
# 11. Example
# -----------------------------
print("\nPredicted upvotes for new post:")
# Other titles tried while exploring:
#   "Generates $1 billion today!"
#   "Close Votes Are A Feature, Not A Bug"
#   "what do you mean? okay"
example_title = "Show HN: AI Hacker generates $1 billion"
example_url = "https://openai.com"
user_id = "hackerl33t"
pred = predict_upvotes(example_title, example_url, user_id)
if pred is None:
    # predict_upvotes returns None when no title token is in the vocabulary;
    # formatting None with :.2f would raise a TypeError.
    print(f"{example_title} → no prediction (no known words in title)")
else:
    print(f"{example_title} → Predicted Upvotes: {pred:.2f}")


Predicted upvotes for new post:
Show HN: AI Hacker generates $1 billion → Predicted Upvotes: 5.55


In [None]:
# -----------------------------
# 12. Prediction on 10,000 rows of validation data
# -----------------------------

####### 10,000 predictions takes ~17m ######

all_preds   = []
all_targets = []

print("Evaluating model on limited rows using predict_upvotes...")

# Cap the evaluation at the first 10,000 validation rows; predict_upvotes
# handles one row at a time.
test_idx = val_idx[:10000]

for split_pos in test_idx:
    # val_idx indexes the filtered tensors; valid_indices maps those positions
    # back to rows of the raw query result, which hold the untokenised title,
    # URL and username that predict_upvotes expects.
    row = valid_indices[split_pos]
    pred = predict_upvotes(titles[row], urls[row], by[row])
    if pred is not None:
        all_preds.append(pred)
        all_targets.append(scores[row])

if not all_preds:
    raise RuntimeError("predict_upvotes produced no predictions for the selected validation rows.")

# Compute metrics
mae = mean_absolute_error(all_targets, all_preds)
r2  = r2_score(all_targets, all_preds)

print(f"Validation MAE: {mae:.4f}")
print(f"Validation R² : {r2:.4f}")

# Optional: inspect a few examples
print("\nSample predictions vs. actuals:")
for p, t in list(zip(all_preds, all_targets))[:100]:
    print(f"  Predicted: {p}  |  Actual: {t}")


Evaluating model on limited rows using predict_upvotes...
Validation MAE: 21.4463
Validation R² : 0.0357

Sample predictions vs. actuals:
  Predicted: 21  |  Actual: 5
  Predicted: 21  |  Actual: 1
  Predicted: 3  |  Actual: 1
  Predicted: 1  |  Actual: 8
  Predicted: 5  |  Actual: 1
  Predicted: 9  |  Actual: 7
  Predicted: 21  |  Actual: 1
  Predicted: 21  |  Actual: 1
  Predicted: 1  |  Actual: 1
  Predicted: 8  |  Actual: 4
  Predicted: 21  |  Actual: 37
  Predicted: 3  |  Actual: 1
  Predicted: 4  |  Actual: 1
  Predicted: 6  |  Actual: 1
  Predicted: 1  |  Actual: 1
  Predicted: 207  |  Actual: 146
  Predicted: 5  |  Actual: 2
  Predicted: 21  |  Actual: 25
  Predicted: 2  |  Actual: 2
  Predicted: 3  |  Actual: 1
  Predicted: 8  |  Actual: 1
  Predicted: 4  |  Actual: 1
  Predicted: 21  |  Actual: 1
  Predicted: 4  |  Actual: 1
  Predicted: 4  |  Actual: 2
  Predicted: 21  |  Actual: 1
  Predicted: 11  |  Actual: 1
  Predicted: 21  |  Actual: 2
  Predicted: 21  |  Actual: 1
  Pr