In [7]:

import json
import os
import re
from collections import Counter
from urllib.parse import urlparse

import numpy as np
import psycopg2
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader


In [8]:
# -----------------------------
# 1. Fetch data from PostgreSQL
# -----------------------------
print("Fetching Hacker News data from PostgreSQL...")

# The connection string is read from the environment so that credentials are
# never stored in the notebook. Export it before starting the kernel, e.g.
#   export HN_DATABASE_URL="postgres://<user>:<password>@<host>:5432/<db>"
# NOTE(review): a user/password pair used to be hard-coded here; it should be
# treated as leaked and rotated.
HN_DATABASE_URL = os.environ.get("HN_DATABASE_URL")
if not HN_DATABASE_URL:
    raise RuntimeError(
        "Set the HN_DATABASE_URL environment variable to the PostgreSQL connection string."
    )

conn = psycopg2.connect(HN_DATABASE_URL)
try:
    # The cursor context manager closes the cursor; the finally clause closes
    # the connection even when the query fails.
    with conn.cursor() as cur:
        # karma is filtered for NULL as well because it is later converted to
        # a float tensor, which cannot hold None.
        cur.execute("""
            SELECT i.title, i.url, i.score, i.by, u.karma
            FROM hacker_news.items i
            JOIN hacker_news.users u ON i.by = u.id
            WHERE i.title IS NOT NULL AND i.score IS NOT NULL AND i.by IS NOT NULL
              AND u.karma IS NOT NULL
            LIMIT 20000;
        """)
        rows = cur.fetchall()
finally:
    conn.close()

# zip(*rows) on an empty result would fail with an opaque unpacking error.
if not rows:
    raise RuntimeError("The Hacker News query returned no rows; nothing to train on.")

# Transpose the row tuples into one tuple per selected column.
titles, urls, scores, by, karmas = zip(*rows)


# -----------------------------
# 2. Preprocess titles
# -----------------------------
def preprocess(text):
    """Turn a title into a list of lower-case alphanumeric tokens.

    The text is lower-cased, every run of characters other than a-z, 0-9 and
    the space character is deleted (not replaced by a space), and the result
    is split on whitespace.

    Args:
        text: The raw title string.

    Returns:
        A list of token strings; empty when nothing survives the filtering.
    """
    disallowed = re.compile(r'[^a-z0-9 ]+')
    cleaned = disallowed.sub('', text.lower())
    tokens = cleaned.split()
    return tokens

# Tokenise every fetched title once; the token lists are reused further down.
tokenized_titles = list(map(preprocess, titles))

# Count how often each word occurs across all titles.
word_counts = Counter()
word_counts.update(token for title_tokens in tokenized_titles for token in title_tokens)

# Vocabulary ordered from most to least frequent word; ties keep the order in
# which the words were first seen.
vocab = [word for word, _count in word_counts.most_common()]

# Word <-> index lookup tables over that ordering.
word_to_ix = dict(zip(vocab, range(len(vocab))))
ix_to_word = dict(enumerate(vocab))
vocab_size = len(word_to_ix)

Fetching Hacker News data from PostgreSQL...


In [9]:
# -----------------------------
# 3. Load Vocabulary
# -----------------------------
# NOTE: this overwrites the word_to_ix / ix_to_word / vocab_size that were
# built from the fetched titles; from here on the vocabulary is the one stored
# alongside the pre-trained embeddings.
with open("vocab-w2-200titles.json", "r", encoding="utf-8") as f:
    word_to_ix = json.load(f)

ix_to_word = {int(i): w for w, i in word_to_ix.items()}
vocab_size = len(word_to_ix)

# Every index is used as a row number into the embedding matrix below, so an
# out-of-range value would otherwise only surface when that word shows up.
assert all(0 <= i < vocab_size for i in ix_to_word), "Vocabulary index out of range!"


# -----------------------------
# 3. Load Pre-trained Embeddings
# -----------------------------
embed_dim = 300
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints from a trusted source (consider
# weights_only=True if the installed torch version supports it).
embeddings = torch.load("embeddings-w2-200titles-300dim-10e.pt", map_location='cpu')  # Shape: [vocab_size, embed_dim]
assert embeddings.shape[0] == vocab_size, "Vocab size mismatch!"
# embed_dim is hard-coded above; check it against the file instead of failing
# later with a shape error when the weights are copied into the model.
assert embeddings.shape[1] == embed_dim, "Embedding dimension mismatch!"

In [11]:
# -----------------------------
# 4. CBOW Model
# -----------------------------
class CBOW(nn.Module):
    """Continuous bag-of-words model: averaged context embeddings -> vocabulary logits."""

    def __init__(self, vocab_size, embed_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of rows in the embedding table and number of
                output logits.
            embed_dim: Width of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.linear = nn.Linear(in_features=embed_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Score every vocabulary word given the context word indices.

        Args:
            inputs: Integer index tensor; the embeddings are averaged over
                dim 1 (presumably laid out as (batch, context) — confirm
                against the training code).

        Returns:
            The output of the linear layer applied to the averaged embeddings.
        """
        context_vectors = self.embeddings(inputs)
        pooled = torch.mean(context_vectors, dim=1)
        logits = self.linear(pooled)
        return logits

# Build the CBOW network, overwrite its embedding table with the pre-trained
# vectors, and freeze the table so it is excluded from gradient computation.
cbow_model = CBOW(vocab_size=vocab_size, embed_dim=embed_dim)
with torch.no_grad():
    cbow_model.embeddings.weight.copy_(embeddings)
cbow_model.embeddings.weight.requires_grad_(False)


# -----------------------------
# 6. Create title embeddings
# -----------------------------
# A title is represented by the mean of the pre-trained vectors of its
# in-vocabulary tokens. Titles without any known token are skipped, and
# valid_indices records which original rows were kept so that every other
# per-row feature can be aligned with X_title.
title_embeddings = []
valid_indices = []
with torch.no_grad():
    for row, words in enumerate(tokenized_titles):
        known_ids = [word_to_ix[w] for w in words if w in word_to_ix]
        if not known_ids:
            continue
        word_vectors = cbow_model.embeddings(torch.tensor(known_ids))
        title_embeddings.append(word_vectors.mean(dim=0))
        valid_indices.append(row)

X_title = torch.stack(title_embeddings)

# Regression target: the score of every kept row, as a column vector.
kept_scores = [scores[row] for row in valid_indices]
y = torch.tensor(kept_scores, dtype=torch.float32).view(-1, 1)

# Author karma for every kept row, as a column vector.
kept_karmas = [karmas[row] for row in valid_indices]
karmas_tensor = torch.tensor(kept_karmas, dtype=torch.float32).view(-1, 1)

# Author id -> karma, used at prediction time; when an author appears more
# than once the last occurrence wins.
user_ids = [by[row] for row in valid_indices]
user_karma_lookup = dict(zip(user_ids, karmas_tensor.squeeze(1).tolist()))


# -----------------------------
# 7. Process domain names
# -----------------------------
# Reduce every URL to its network location (host[:port]). Rows whose URL is
# missing, has no network location, or cannot be parsed all share the
# 'unknown' bucket.
parsed_domains = []
for url in urls:
    try:
        # Falsy URLs (None / '') are handled explicitly rather than relying on
        # how urlparse coerces non-string input.
        netloc = urlparse(url).netloc if url else ''
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError for malformed URLs (e.g. an unbalanced
        # IPv6 bracket); the other two cover non-string values. A bare
        # `except:` would also have swallowed KeyboardInterrupt.
        netloc = ''
    parsed_domains.append(netloc or 'unknown')

# Map each distinct domain string to an integer id for the embedding layer.
le = LabelEncoder()
domain_ids = le.fit_transform(parsed_domains)
# Keep only the rows that produced a title embedding so the features stay aligned.
domain_ids_tensor = torch.tensor(domain_ids, dtype=torch.long)[valid_indices]
domain_vocab_size = len(le.classes_)
domain_embed_dim = 3

# -----------------------------
# 8. Regression Model
# -----------------------------


class UpvotePredictor(nn.Module):
    """Feed-forward regressor over a title embedding, a learned domain embedding and karma."""

    def __init__(self, title_embed_dim, domain_vocab_size, domain_embed_dim):
        """Build the domain embedding table and the three-layer MLP head.

        Args:
            title_embed_dim: Width of the title embedding fed to forward().
            domain_vocab_size: Number of distinct domain ids.
            domain_embed_dim: Width of the learned domain embedding.
        """
        super().__init__()
        self.domain_embedding = nn.Embedding(domain_vocab_size, domain_embed_dim)

        # The extra input column carries the karma value.
        input_width = title_embed_dim + domain_embed_dim + 1
        layers = [
            nn.Linear(input_width, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, title_embed, domain_id, karma):
        """Predict one value per row.

        Args:
            title_embed: Title embeddings, concatenated along dim 1.
            domain_id: Integer domain ids looked up in the embedding table.
            karma: Karma values, concatenated along dim 1 (so it must be 2-D
                with one column to match the first linear layer).

        Returns:
            The MLP output for the concatenated features.
        """
        domain_vec = self.domain_embedding(domain_id)
        features = torch.cat((title_embed, domain_vec, karma), dim=1)
        return self.model(features)



# Regression network, mean-squared-error objective, and Adam optimiser over
# all of the network's parameters.
model = UpvotePredictor(
    title_embed_dim=embed_dim,
    domain_vocab_size=domain_vocab_size,
    domain_embed_dim=domain_embed_dim,
)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# -----------------------------
# 9. Dataset and Training
# -----------------------------
class HNDataset(Dataset):
    """Index-aligned view over title embeddings, domain ids, karmas and labels."""

    def __init__(self, title_embeds, domain_ids, karmas, labels):
        """Store the four parallel sequences; nothing is copied or validated.

        Args:
            title_embeds: Per-sample title embeddings.
            domain_ids: Per-sample domain ids.
            karmas: Per-sample karma values.
            labels: Per-sample regression targets.
        """
        self.title_embeds = title_embeds
        self.domain_ids = domain_ids
        self.karmas = karmas
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the labels."""
        n_samples = len(self.labels)
        return n_samples

    def __getitem__(self, idx):
        """Return the (title_embed, domain_id, karma, label) tuple at position idx."""
        fields = (self.title_embeds, self.domain_ids, self.karmas, self.labels)
        return tuple(field[idx] for field in fields)

# Wrap the aligned feature tensors and iterate over them in shuffled mini-batches.
dataset = HNDataset(X_title, domain_ids_tensor, karmas_tensor, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

NUM_EPOCHS = 40

print("Training regression model...")
# predict_upvotes() switches the model to eval mode, so put it back into
# training mode explicitly in case this cell is re-run after a prediction.
model.train()
for epoch in range(NUM_EPOCHS):
    # Sum of the per-batch mean losses for this epoch (not an average over batches).
    total_loss = 0
    for title_embed, domain_id, karma, label in dataloader:
        pred = model(title_embed, domain_id, karma)
        loss = loss_fn(pred, label)
        # Clear gradients left over from the previous step before backpropagating.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss:.4f}")



# -----------------------------
# 10. Prediction Function
# -----------------------------
def predict_upvotes(title, url, user_id):
    """Predict the upvote count for a prospective post.

    Args:
        title: Post title; tokenised with preprocess() and embedded as the
            mean of the pre-trained vectors of its in-vocabulary tokens.
        url: Post URL; only its network location is used. May be None.
        user_id: Author id, looked up in user_karma_lookup; authors that are
            not in the lookup are given a karma of 0.

    Returns:
        The model output clamped to be non-negative, or None when no token of
        the title is in the vocabulary.
    """
    # Keep only tokens the embedding table knows about.
    token_ids = [word_to_ix[t] for t in preprocess(title) if t in word_to_ix]
    if not token_ids:
        return None

    with torch.no_grad():
        avg_embed = cbow_model.embeddings(torch.tensor(token_ids)).mean(dim=0)

    # Missing, empty or unparseable URLs all map to the 'unknown' domain.
    try:
        netloc = urlparse(url).netloc if url else ''
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed URL; the others cover non-string input.
        netloc = ''
    domain = netloc or 'unknown'

    try:
        domain_id = le.transform([domain])[0]
    except (ValueError, TypeError):
        # The encoder rejects labels it was not fitted on. Prefer the
        # 'unknown' bucket over index 0, which is just whichever real domain
        # happens to sort first; fall back to 0 only when the encoder has no
        # 'unknown' class.
        if 'unknown' in le.classes_:
            domain_id = le.transform(['unknown'])[0]
        else:
            domain_id = 0

    domain_tensor = torch.tensor([domain_id], dtype=torch.long)
    karma_value = user_karma_lookup.get(user_id, 0)
    karma_tensor = torch.tensor([[karma_value]], dtype=torch.float32)

    # Note: this leaves the model in eval mode after the call.
    model.eval()
    with torch.no_grad():
        prediction = model(avg_embed.unsqueeze(0), domain_tensor, karma_tensor).item()
    # Clamp: the regressor is unconstrained and may output a negative value.
    return max(prediction, 0)




Training regression model...
Epoch 0, Loss: 1135843.9902
Epoch 1, Loss: 847319.9918
Epoch 2, Loss: 728133.7476
Epoch 3, Loss: 739053.3910
Epoch 4, Loss: 726677.7067
Epoch 5, Loss: 725502.8285
Epoch 6, Loss: 717715.7767
Epoch 7, Loss: 713087.0940
Epoch 8, Loss: 724224.6546
Epoch 9, Loss: 712961.7338
Epoch 10, Loss: 708985.8999
Epoch 11, Loss: 695394.8283
Epoch 12, Loss: 695832.0603
Epoch 13, Loss: 704587.1406
Epoch 14, Loss: 694023.5571
Epoch 15, Loss: 675457.8665
Epoch 16, Loss: 670517.3938
Epoch 17, Loss: 662820.6890
Epoch 18, Loss: 671402.3465
Epoch 19, Loss: 650162.2706
Epoch 20, Loss: 670692.0327
Epoch 21, Loss: 648065.9160
Epoch 22, Loss: 642095.5467
Epoch 23, Loss: 651686.3117
Epoch 24, Loss: 637660.4649
Epoch 25, Loss: 633404.4638
Epoch 26, Loss: 618947.4540
Epoch 27, Loss: 614007.5907
Epoch 28, Loss: 609817.5699
Epoch 29, Loss: 609713.2072
Epoch 30, Loss: 609642.0688
Epoch 31, Loss: 598740.7749
Epoch 32, Loss: 588326.4620
Epoch 33, Loss: 585273.9178
Epoch 34, Loss: 582644.8749


In [18]:
# -----------------------------
# 11. Example
# -----------------------------
print("\nPredicted upvotes for new post:")
# Other titles tried while experimenting:
#   "Generates $1 billion today!"
#   "Close Votes Are A Feature, Not A Bug"
#   "what do you mean? okay"
example_title = "Show HN: AI Hacker generates $1 billion"
example_url = "https://openai.com"
user_id = "hackerl33t"
pred = predict_upvotes(example_title, example_url, user_id)
if pred is None:
    # predict_upvotes returns None when no title word is in the vocabulary;
    # formatting None with :.2f would raise a TypeError.
    print(f"{example_title} → no prediction (no title words found in the vocabulary)")
else:
    print(f"{example_title} → Predicted Upvotes: {pred:.2f}")
print("Eat. Pray. Deploy. Blame the data. 😎 © Bayesian Buccaneers")


Predicted upvotes for new post:
Show HN: AI Hacker generates $1 billion → Predicted Upvotes: 2.75
Eat. Pray. Deploy. Blame the data. 😎 © Bayesian Buccaneers
