In [15]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader
import psycopg2
from urllib.parse import urlparse
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
import re
from collections import Counter
from datasets import load_dataset
import json



In [16]:
# -----------------------------
# 1. LOAD 80% TRAINING & VALIDATION DATA - LIMITED TO 100K ROWS & RECENT DATA (>2023)
# -----------------------------
print("Fetching Hacker News data...")
conn = psycopg2.connect("postgres://sy91dhb:g5t49ao@178.156.142.230:5432/hd64m1ki")
cur = conn.cursor()
cur.execute('''SELECT i.title, i.url, i.score, u.id, u.karma 
            FROM hacker_news.items AS i
            LEFT JOIN hacker_news.users AS u 
                ON i.by = u.id
            WHERE title IS NOT NULL AND score IS NOT NULL AND (abs(hashtext(i.id::text)) % 100) >= 20
            LIMIT 100000;
            ''')
rows = cur.fetchall()
conn.close()

titles, urls, scores, by, karmas = zip(*rows)


# -----------------------------
# 2. Preprocess titles
# -----------------------------
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9 ]+', '', text)
    return text.split()

tokenized_titles = [preprocess(title) for title in titles]
word_counts = Counter(word for title in tokenized_titles for word in title)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for word, i in word_to_ix.items()}
vocab_size = len(word_to_ix)

Fetching Hacker News data...


In [17]:
# -----------------------------
# 3. Load Vocabulary
# -----------------------------
with open("vocab-w2-200titles.json", "r", encoding="utf-8") as f:
    word_to_ix = json.load(f)

ix_to_word = {int(i): w for w, i in word_to_ix.items()}
vocab_size = len(word_to_ix)

# -----------------------------
# 3. Load Pre-trained Embeddings
# -----------------------------
embed_dim = 300  
embeddings = torch.load("embeddings-w2-200titles-300dim-10e.pt", map_location='cpu')  # Shape: [vocab_size, embed_dim]

assert embeddings.shape[0] == vocab_size, "Vocab size mismatch!"

In [None]:
# -----------------------------
# 4. CBOW Model
# -----------------------------
class CBOW(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.linear = nn.Linear(embed_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).mean(dim=1)
        return self.linear(embeds)

cbow_model = CBOW(vocab_size, embed_dim)
cbow_model.embeddings.weight.data.copy_(embeddings)
cbow_model.embeddings.weight.requires_grad = False  


# -----------------------------
# 6. Create title embeddings
# -----------------------------
title_embeddings = []
valid_indices = []
for i, tokens in enumerate(tokenized_titles):
    token_ids = [word_to_ix[t] for t in tokens if t in word_to_ix]
    if token_ids:
        with torch.no_grad():
            vectors = cbow_model.embeddings(torch.tensor(token_ids))
            avg_vector = vectors.mean(dim=0)
        title_embeddings.append(avg_vector)
        valid_indices.append(i)

X_title = torch.stack(title_embeddings)
y = torch.tensor([scores[i] for i in valid_indices], dtype=torch.float32).unsqueeze(1)



karmas_tensor = torch.tensor([karmas[i] for i in valid_indices], dtype=torch.float32).unsqueeze(1)  # NEW
user_ids = [by[i] for i in valid_indices]  # NEW
user_karma_lookup = {user_ids[i]: karmas_tensor[i].item() for i in range(len(user_ids))}  # NEW


# -----------------------------
# 7. Process domain names
# -----------------------------
parsed_domains = []
for url in urls:
    try:
        parsed = urlparse(url)
        domain = parsed.netloc or 'unknown'
    except:
        domain = 'unknown'
    # Clean and normalize domain names
    domain = domain.lower()
    if domain.startswith("www."):
        domain = domain[4:]
    parsed_domains.append(domain)


le = LabelEncoder()
domain_ids = le.fit_transform(parsed_domains)
domain_ids_tensor = torch.tensor(domain_ids, dtype=torch.long)[valid_indices]

# # Create index to word mapping for domains
# domain_to_ix = {domain: i for i, domain in enumerate(le.classes_)}
# domain_ix_to_word = {i: domain for domain, i in domain_to_ix.items()}
# # parsed_domains is your list of domain strings
# domain_counts = Counter(parsed_domains)
# # Sort descending by count
# domain_counts_desc = dict(domain_counts.most_common())
# # Example: print top 10 domains
# for domain, cnt in list(domain_counts_desc.items())[:10]:
#     print(f"{domain}: {cnt}")

domain_vocab_size = len(le.classes_)

domain_embed_dim = 3


# -----------------------------
# 8. Regression Model
# -----------------------------


class UpvotePredictor(nn.Module):
    def __init__(self, title_embed_dim, domain_vocab_size, domain_embed_dim):
        super().__init__()
        self.domain_embedding = nn.Embedding(domain_vocab_size, domain_embed_dim)
        self.model = nn.Sequential(
            nn.Linear(title_embed_dim + domain_embed_dim + 1, 128),  # CHANGED
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, title_embed, domain_id, karma):  # CHANGED
        domain_vec = self.domain_embedding(domain_id)
        x = torch.cat([title_embed, domain_vec, karma], dim=1)  # CHANGED
        return self.model(x)



model = UpvotePredictor(embed_dim, domain_vocab_size, domain_embed_dim)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)    # CAN CHANGE LR TO 0.1 FOR DYNAMIC LR WITH SCHEDULER

### Uncomment to activate learning rate scheduler for dynamic LR ###
# Wrap with a StepLR scheduler:
#    every 5 epochs, multiply the LR by 0.5
#scheduler = StepLR(optimizer, step_size=5, gamma=0.5)

# -----------------------------
# 8b. Train-validation split
# -----------------------------

# Assume you already have these full N-length tensors from V8 preprocessing:
#   X_title           – (N, title_embed_dim) torch.Tensor  
#   domain_ids_tensor – (N,)               torch.LongTensor
#   karma_tensor      – (N,)               torch.FloatTensor
#   y                 – (N,)               torch.FloatTensor (scores)

# Build an array of row indices
all_idx = list(range(y.size(0)))

# Split indices into 80% train / 20% val
train_idx, val_idx = train_test_split(
    all_idx,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# Index into each tensor
X_title_train      = X_title[train_idx]
X_title_val        = X_title[val_idx]

domain_ids_train   = domain_ids_tensor[train_idx]
domain_ids_val     = domain_ids_tensor[val_idx]

karmas_train        = karmas_tensor[train_idx]
karmas_val          = karmas_tensor[val_idx]

y_train            = y[train_idx]
y_val              = y[val_idx]

# Print sizes of training and validation sets
print(f"Train / Val sizes: {len(train_idx)} / {len(val_idx)}")
print('\n') 


# -----------------------------
# 9. Dataset and Training
# -----------------------------
class HNDataset(Dataset):
    def __init__(self, title_embeds, domain_ids, karmas, labels):  # CHANGED
        self.title_embeds = title_embeds
        self.domain_ids = domain_ids
        self.karmas = karmas  # NEW
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.title_embeds[idx], self.domain_ids[idx], self.karmas[idx], self.labels[idx]  # CHANGED

dataset = HNDataset(X_title_train, domain_ids_train, karmas_train, y_train)  # CHANGED
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

print("Training regression model...")
for epoch in range(40):
    total_loss = 0
    for title_embed, domain_id, karma, label in dataloader:  # CHANGED
        pred = model(title_embed, domain_id, karma)  # CHANGED
        loss = loss_fn(pred, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

    ### Uncomment to activate learning rate schedular ###
    #scheduler.step()
    #print(f"Epoch {epoch:2d} — LR: {scheduler.get_last_lr()[0]:.4f}")



# -----------------------------
# 10. Prediction Function
# -----------------------------
def predict_upvotes(title, url, user_id):  # CHANGED
    tokens = preprocess(title)
    token_ids = [word_to_ix.get(t) for t in tokens if t in word_to_ix]
    token_ids = [i for i in token_ids if i is not None]
    if not token_ids:
        return None

    with torch.no_grad():
        vectors = cbow_model.embeddings(torch.tensor(token_ids))
        avg_embed = vectors.mean(dim=0)

    try:
        parsed = urlparse(url)
        domain = parsed.netloc or 'unknown'
    except:
        domain = 'unknown'

    try:
        domain_id = le.transform([domain])[0]
    except:
        domain_id = 0  # fallback

    domain_tensor = torch.tensor([domain_id], dtype=torch.long)
    karma_value = user_karma_lookup.get(user_id, 0)  # NEW
    karma_tensor = torch.tensor([[karma_value]], dtype=torch.float32)  # NEW

    model.eval()
    with torch.no_grad():
        prediction = model(avg_embed.unsqueeze(0), domain_tensor, karma_tensor).item()  # CHANGED
    return max(prediction, 0)




Train / Val sizes: 78298 / 19575


Training regression model...
Epoch 0, Loss: 7363481.3937
Epoch 1, Loss: 6559593.4087
Epoch 2, Loss: 6489907.9647
Epoch 3, Loss: 6535337.1466
Epoch 4, Loss: 6502045.6733
Epoch 5, Loss: 6492359.2790
Epoch 6, Loss: 6462600.0565
Epoch 7, Loss: 6421978.1232
Epoch 8, Loss: 6402853.1522
Epoch 9, Loss: 6373585.7467
Epoch 10, Loss: 6332903.6531
Epoch 11, Loss: 6277093.5212
Epoch 12, Loss: 6264564.4477
Epoch 13, Loss: 6211051.8333
Epoch 14, Loss: 6183152.2103
Epoch 15, Loss: 6153330.9888
Epoch 16, Loss: 6088615.8522
Epoch 17, Loss: 6027872.6625
Epoch 18, Loss: 6005364.0397
Epoch 19, Loss: 5943246.3846
Epoch 20, Loss: 5892781.4818
Epoch 21, Loss: 5831646.1819
Epoch 22, Loss: 5745409.8031
Epoch 23, Loss: 5710093.2609
Epoch 24, Loss: 5639549.6791
Epoch 25, Loss: 5600129.3221
Epoch 26, Loss: 5551885.5971
Epoch 27, Loss: 5499896.4121
Epoch 28, Loss: 5436266.0857
Epoch 29, Loss: 5589235.7866
Epoch 30, Loss: 5410087.3565
Epoch 31, Loss: 5327318.2017
Epoch 32, Loss: 52

In [19]:
# -----------------------------
# 11. Example
# -----------------------------
print("\nPredicted upvotes for new post:")
example_title = "Generates $1 billion today!"
example_title = "Close Votes Are A Feature, Not A Bug"
example_title = "what do you mean? okay"
example_title = "Show HN: AI Hacker generates $1 billion"
example_url = "https://openai.com"
user_id = "hackerl33t"
pred = predict_upvotes(example_title, example_url, user_id)
print(f"{example_title} → Predicted Upvotes: {pred:.2f}")
print("Eat. Pray. Deploy. Blame the data. 😎 © Bayesian Buccaneers")


Predicted upvotes for new post:
Show HN: AI Hacker generates $1 billion → Predicted Upvotes: 0.00
Eat. Pray. Deploy. Blame the data. 😎 © Bayesian Buccaneers


In [None]:
# -----------------------------
# 12. Prediction on validation data
# -----------------------------

# Build a DataLoader for the validation set
dataset_val = HNDataset(
    X_title_val,
    domain_ids_val,
    karmas_val,
    y_val
)
val_loader = DataLoader(dataset_val, batch_size=32, shuffle=False)

model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for title_emb, dom_id, krm, target in val_loader:
        pred = model(title_emb, dom_id, krm).squeeze()
        pred = pred.view(-1)
        all_preds.extend(pred.tolist())
        all_targets.extend(target.tolist())

# Flatten any singleton lists in all_targets (and preds, just in case)
all_preds_flat   = [p[0] if isinstance(p, list) else p   for p in all_preds]
all_targets_flat = [t[0] if isinstance(t, list) else t   for t in all_targets]

# Compute metrics
mae = mean_absolute_error(all_targets_flat, all_preds_flat)
r2  = r2_score(all_targets_flat, all_preds_flat)

# Average number of upvotes away from target
print(f"Validation MAE: {mae:.4f}")

print(f"Validation R² : {r2:.4f}")


# Optional: inspect a few examples
print("\nSample predictions vs. actuals:")
for i in range(100, 130):
    print(f"  Predicted: {all_preds_flat[i]:.2f}  |  Actual: {all_targets_flat[i]:.0f}")


Validation MAE: 19.9363
Validation R² : -0.6847

Sample predictions vs. actuals:
  Predicted: 3.16  |  Actual: 1
  Predicted: 11.66  |  Actual: 1
  Predicted: 18.15  |  Actual: 40
  Predicted: 2.41  |  Actual: 1
  Predicted: 7.81  |  Actual: 1
  Predicted: 6.34  |  Actual: 1
  Predicted: 10.88  |  Actual: 1
  Predicted: 6.55  |  Actual: 1
  Predicted: 2.42  |  Actual: 3
  Predicted: 18.15  |  Actual: 1
  Predicted: 6.17  |  Actual: 2
  Predicted: -25.23  |  Actual: 2
  Predicted: 18.15  |  Actual: 1
  Predicted: 13.47  |  Actual: 2
  Predicted: 18.15  |  Actual: 1
  Predicted: 4.44  |  Actual: 1
  Predicted: 8.69  |  Actual: 1
  Predicted: 8.11  |  Actual: 1
  Predicted: 18.15  |  Actual: 113
  Predicted: 127.65  |  Actual: 2
  Predicted: 18.15  |  Actual: 1
  Predicted: 4.73  |  Actual: 1
  Predicted: 0.33  |  Actual: 1
  Predicted: 13.98  |  Actual: 2
  Predicted: 18.15  |  Actual: 5
  Predicted: 15.69  |  Actual: 3
  Predicted: 4.83  |  Actual: 2
  Predicted: 2.24  |  Actual: 1
  Pr

In [None]:
### BASE MODEL EVALUATION ###
### DO NOT RUN THIS CELL ###

# -----------------------------
# 12. Prediction on validation data
# -----------------------------

# Build a DataLoader for the validation set
dataset_val = HNDataset(
    X_title_val,
    domain_ids_val,
    karmas_val,
    y_val
)
val_loader = DataLoader(dataset_val, batch_size=32, shuffle=False)

model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for title_emb, dom_id, krm, target in val_loader:
        pred = model(title_emb, dom_id, krm).squeeze()
        pred = pred.view(-1)
        all_preds.extend(pred.tolist())
        all_targets.extend(target.tolist())

# Flatten any singleton lists in all_targets (and preds, just in case)
all_preds_flat   = [p[0] if isinstance(p, list) else p   for p in all_preds]
all_targets_flat = [t[0] if isinstance(t, list) else t   for t in all_targets]

# Compute metrics
mae = mean_absolute_error(all_targets_flat, all_preds_flat)
r2  = r2_score(all_targets_flat, all_preds_flat)

# Average number of upvotes away from target
print(f"Validation MAE: {mae:.4f}")

print(f"Validation R² : {r2:.4f}")


# Optional: inspect a few examples
print("\nSample predictions vs. actuals:")
for i in range(10):
    print(f"  Predicted: {all_preds_flat[i]:.2f}  |  Actual: {all_targets_flat[i]:.0f}")


Validation MAE: 13.0061
Validation R² : -0.0650

Sample predictions vs. actuals:
  Predicted: 1.72  |  Actual: 1
  Predicted: 8.44  |  Actual: 6
  Predicted: 2.10  |  Actual: 1
  Predicted: 14.29  |  Actual: 443
  Predicted: 21.57  |  Actual: 14
  Predicted: 1.73  |  Actual: 3
  Predicted: 0.80  |  Actual: 1
  Predicted: 3.10  |  Actual: 41
  Predicted: 3.24  |  Actual: 1
  Predicted: 14.29  |  Actual: 1
