In [21]:
# NOTE: umuts_cbow_full.pth and umuts_cbow.pth must both be in the notebook's working directory for this code to work!


import os
import re
from collections import Counter
from urllib.parse import urlparse

import nltk
import numpy as np
import psycopg2
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
nltk.download('punkt')

# -----------------------------
# 1. Fetch data from PostgreSQL
# -----------------------------
print("Fetching Hacker News data from PostgreSQL...")

# The connection URI (which embeds the password) is read from the environment
# instead of being written into the notebook, so the file can be shared or
# committed safely. The password that used to be hardcoded here should be
# rotated, since it has already been exposed.
database_url = os.environ.get("HN_DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "Set the HN_DATABASE_URL environment variable to the PostgreSQL "
        "connection URI (postgres://user:password@host:port/dbname)."
    )

conn = psycopg2.connect(database_url)
try:
    cur = conn.cursor()
    # NOTE(review): there is no ORDER BY, so which 20000 rows come back is up
    # to the database and may differ between runs.
    cur.execute("""
    SELECT i.title, i.url, i.score, i.by, u.karma
    FROM hacker_news.items i
    JOIN hacker_news.users u ON i.by = u.id
    WHERE i.title IS NOT NULL AND i.score IS NOT NULL AND i.by IS NOT NULL
    LIMIT 20000;
""")
    rows = cur.fetchall()
finally:
    # Always release the connection, even when the query fails.
    conn.close()

if not rows:
    # zip(*rows) on an empty result would fail with an opaque unpacking error.
    raise RuntimeError("The Hacker News query returned no rows; nothing to train on.")

# Transpose the row tuples into one tuple per selected column.
titles, urls, scores, by, karmas = zip(*rows)

# -----------------------------
# 2. Preprocess titles
# -----------------------------
def preprocess(text):
    """Turn a title into a list of lowercase alphanumeric tokens.

    The text is lowercased, every character other than ``a-z``, ``0-9`` and
    the space character is deleted (not replaced by a space, so ``"foo-bar"``
    becomes ``"foobar"``), and the remainder is split on whitespace.
    """
    return re.sub(r'[^a-z0-9 ]+', '', text.lower()).split()

# map_location="cpu" lets a checkpoint that was saved from a GPU session load
# on a CPU-only machine; everything below runs on the CPU anyway.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
checkpoint = torch.load("umuts_cbow_full.pth", map_location="cpu")
word_to_ix = checkpoint['word_to_ix']

# Tokenise every fetched title with the same preprocessing used at inference.
tokenized_titles = [preprocess(title) for title in titles]
# Reverse mapping (index -> word) and vocabulary size for the CBOW model.
ix_to_word = {i: word for word, i in word_to_ix.items()}
vocab_size = len(word_to_ix)

# -----------------------------
# 3. CBOW Model definition
# -----------------------------
class CBOW(nn.Module):
    """Continuous-bag-of-words network: an embedding table plus a linear layer.

    The attribute names ``embeddings`` and ``linear`` are the state-dict keys,
    so they must match the checkpoint that is loaded into this model.
    """

    def __init__(self, vocab_size, embed_dim):
        """Create a ``vocab_size x embed_dim`` embedding and a projection back to ``vocab_size``."""
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.linear = nn.Linear(embed_dim, vocab_size)

    def forward(self, inputs):
        """Embed ``inputs``, average over dimension 1, and return the linear layer's output.

        Presumably ``inputs`` is a (batch, context) tensor of word indices --
        TODO confirm against the CBOW training code.
        """
        looked_up = self.embeddings(inputs)
        pooled = looked_up.mean(dim=1)
        return self.linear(pooled)

# embed_dim has to equal the embedding width stored in umuts_cbow.pth;
# load_state_dict raises a size-mismatch error otherwise.
embed_dim = 5
cbow_model = CBOW(vocab_size, embed_dim)
# map_location="cpu" keeps the load working on machines without CUDA even if
# the weights were saved from a GPU.
cbow_state = torch.load("umuts_cbow.pth", map_location="cpu")
cbow_model.load_state_dict(cbow_state)
# The CBOW model is only used as a frozen embedding lookup from here on.
cbow_model.eval()

# -----------------------------
# 4. Create title embeddings
# -----------------------------
# One averaged CBOW word vector per title. Titles with no in-vocabulary word
# are skipped, and valid_indices records which rows were kept so the other
# features can be filtered the same way.
title_embeddings = []
valid_indices = []
with torch.no_grad():
    for row_idx, title_tokens in enumerate(tokenized_titles):
        known_ids = [word_to_ix[tok] for tok in title_tokens if tok in word_to_ix]
        if not known_ids:
            continue
        word_vectors = cbow_model.embeddings(torch.tensor(known_ids))
        title_embeddings.append(word_vectors.mean(dim=0))
        valid_indices.append(row_idx)

X_title = torch.stack(title_embeddings)
# Regression target: the score of every kept row, as a column vector.
kept_scores = [scores[row_idx] for row_idx in valid_indices]
y = torch.tensor(kept_scores, dtype=torch.float32).unsqueeze(1)

# -----------------------------
# 5. Process domains
# -----------------------------
# Reduce every URL to its network location; missing or unparseable URLs all
# share the 'unknown' bucket.
parsed_domains = []
for url in urls:
    try:
        domain = urlparse(url).netloc or 'unknown'
    except (ValueError, TypeError, AttributeError):
        # Only parsing failures are treated as "unknown"; a bare except would
        # also swallow KeyboardInterrupt and SystemExit.
        domain = 'unknown'
    parsed_domains.append(domain)

# Map each distinct domain string to an integer id for the embedding layer.
le = LabelEncoder()
domain_ids = le.fit_transform(parsed_domains)
# Keep only the rows that also produced a title embedding.
domain_ids_tensor = torch.tensor(domain_ids, dtype=torch.long)[valid_indices]
domain_vocab_size = len(le.classes_)
domain_embed_dim = 3

# -----------------------------
# 6. User karma
# -----------------------------
# Karma of each kept row as a float32 column vector, plus a username -> karma
# lookup used at prediction time (a later row for the same user overwrites an
# earlier one).
kept_karmas = [karmas[idx] for idx in valid_indices]
karmas_tensor = torch.tensor(kept_karmas, dtype=torch.float32).unsqueeze(1)
user_ids = [by[idx] for idx in valid_indices]
user_karma_lookup = dict(zip(user_ids, karmas_tensor.squeeze(1).tolist()))

# -----------------------------
# 7. Model definition
# -----------------------------
class UpvotePredictor(nn.Module):
    """Regression network over a title embedding, a learned domain embedding and karma."""

    def __init__(self, title_embed_dim, domain_vocab_size, domain_embed_dim):
        """Build the domain embedding table and a 128 -> 64 -> 1 ReLU stack."""
        super().__init__()
        self.domain_embedding = nn.Embedding(domain_vocab_size, domain_embed_dim)
        # The extra input column is the scalar karma feature.
        input_width = title_embed_dim + domain_embed_dim + 1
        layers = [
            nn.Linear(input_width, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, title_embed, domain_id, karma):
        """Concatenate the three feature groups along dim 1 and run the stack."""
        features = torch.cat(
            (title_embed, self.domain_embedding(domain_id), karma),
            dim=1,
        )
        return self.model(features)

# Seed torch's global RNG so the weight initialisation below and the
# DataLoader shuffling that follows are repeatable from run to run.
SEED = 42
torch.manual_seed(SEED)

model = UpvotePredictor(embed_dim, domain_vocab_size, domain_embed_dim)
# Mean squared error between the predicted and the actual score.
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# -----------------------------
# 8. Dataset and training
# -----------------------------
class HNDataset(Dataset):
    """Index-aligned view over title embeddings, domain ids, karmas and labels."""

    def __init__(self, title_embeds, domain_ids, karmas, labels):
        """Store the four parallel containers without copying them."""
        self.title_embeds = title_embeds
        self.domain_ids = domain_ids
        self.karmas = karmas
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the labels container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(title_embed, domain_id, karma, label)`` for position ``idx``."""
        columns = (self.title_embeds, self.domain_ids, self.karmas, self.labels)
        return tuple(column[idx] for column in columns)

dataset = HNDataset(X_title, domain_ids_tensor, karmas_tensor, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

print("Training regression model...")
for epoch in range(20):
    # The reported figure is the sum of the per-batch mean losses for the
    # epoch, not an average over batches.
    epoch_loss = 0
    for batch_titles, batch_domains, batch_karma, batch_scores in dataloader:
        optimizer.zero_grad()
        batch_pred = model(batch_titles, batch_domains, batch_karma)
        batch_loss = loss_fn(batch_pred, batch_scores)
        batch_loss.backward()
        optimizer.step()
        epoch_loss += batch_loss.item()
    print(f"Epoch {epoch}, Loss: {epoch_loss:.4f}")

# -----------------------------
# 9. Prediction function
# -----------------------------
def predict_upvotes(title, url, user_id):
    """Predict the score of a post from its title, URL and author.

    Args:
        title: Post title; tokenised with ``preprocess``.
        url: Post URL; only its network location (domain) is used.
        user_id: Author name, looked up in ``user_karma_lookup``. Users that
            are not in the lookup are given a karma of 0.

    Returns:
        ``None`` when none of the title's tokens is in the CBOW vocabulary,
        otherwise the model output clamped to be non-negative.

    Side effects:
        Switches the module-level ``model`` to evaluation mode.
    """
    tokens = preprocess(title)
    token_ids = [word_to_ix[t] for t in tokens if t in word_to_ix]
    if not token_ids:
        return None

    # Title embedding = mean of the frozen CBOW vectors of the known words.
    with torch.no_grad():
        vectors = cbow_model.embeddings(torch.tensor(token_ids))
        avg_embed = vectors.mean(dim=0)

    try:
        domain = urlparse(url).netloc or 'unknown'
    except (ValueError, TypeError, AttributeError):
        # Only parsing failures mean "unknown"; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        domain = 'unknown'

    try:
        domain_id = le.transform([domain])[0]
    except ValueError:
        # LabelEncoder raises ValueError for labels it was not fitted on.
        # Prefer the 'unknown' bucket over id 0, which is just whichever real
        # domain sorts first; keep 0 only when no such bucket exists.
        if 'unknown' in le.classes_:
            domain_id = le.transform(['unknown'])[0]
        else:
            domain_id = 0

    domain_tensor = torch.tensor([domain_id], dtype=torch.long)
    karma_value = user_karma_lookup.get(user_id, 0)
    karma = torch.tensor([[karma_value]], dtype=torch.float32)

    model.eval()
    with torch.no_grad():
        prediction = model(avg_embed.unsqueeze(0), domain_tensor, karma).item()
    return max(prediction, 0)  # no negative scores

# -----------------------------
# 10. Example
# -----------------------------
print("\nPredicted upvotes for new post:")
example_title = "Show HN: AI Hacker generates $1 billion"
example_url = "https://openai.com"
pred = predict_upvotes(example_title, example_url, "ingve")
# predict_upvotes returns None when no title word is in the CBOW vocabulary;
# formatting None with :.2f would raise a TypeError.
if pred is None:
    print(f"{example_title} \u2192 no prediction (no title word is in the CBOW vocabulary)")
else:
    print(f"{example_title} \u2192 Predicted Upvotes: {pred:.2f}")


[nltk_data] Downloading package punkt to /home/usa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Fetching Hacker News data from PostgreSQL...
Training regression model...
Epoch 0, Loss: 13172101.0013
Epoch 1, Loss: 16254787.7412
Epoch 2, Loss: 4171538.0912
Epoch 3, Loss: 4520816.8846
Epoch 4, Loss: 4624699.6260
Epoch 5, Loss: 5295892.1440
Epoch 6, Loss: 4062451.7632
Epoch 7, Loss: 3997113.3077
Epoch 8, Loss: 3976671.2220
Epoch 9, Loss: 3913383.2743
Epoch 10, Loss: 3922213.5948
Epoch 11, Loss: 3839923.3516
Epoch 12, Loss: 3749362.8172
Epoch 13, Loss: 3733857.8913
Epoch 14, Loss: 3718686.9486
Epoch 15, Loss: 3713940.0997
Epoch 16, Loss: 3728649.3564
Epoch 17, Loss: 3723918.7060
Epoch 18, Loss: 3782209.7143
Epoch 19, Loss: 3732282.1179

Predicted upvotes for new post:
Show HN: AI Hacker generates $1 billion → Predicted Upvotes: 24.29


In [47]:
# -----------------------------
# 11. Example
# -----------------------------
print("\nPredicted upvotes for new post:")
# Other titles tried in this cell ("Generates $1 billion today!",
# "Close Votes Are A Feature, Not A Bug", "what do you mean? okay") were each
# overwritten before use; only the assignment below ever took effect.
example_title = "Show HN: AI Hacker generates $1 billion"
example_url = "https://openai.com"
pred = predict_upvotes(example_title, example_url, "ingve")
# predict_upvotes returns None when no title word is in the CBOW vocabulary;
# formatting None with :.2f would raise a TypeError.
if pred is None:
    print(f"{example_title} → no prediction (no title word is in the CBOW vocabulary)")
else:
    print(f"{example_title} → Predicted Upvotes: {pred:.2f}")



Predicted upvotes for new post:
Show HN: AI Hacker generates $1 billion → Predicted Upvotes: 24.29
