In [None]:
# NOTE: umuts_cbow_full.pth and umuts_cbow.pth are loaded by relative path, so both must be in the current working directory for this code to work.


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import psycopg2
from urllib.parse import urlparse
from collections import Counter
from sklearn.preprocessing import LabelEncoder
import numpy as np
import re
import nltk
nltk.download('punkt')

# # -----------------------------
# # 1. Fetch data from PostgreSQL
# # -----------------------------
# print("Fetching Hacker News data from PostgreSQL...")
# conn = psycopg2.connect("postgres://sy91dhb:g5t49ao@178.156.142.230:5432/hd64m1ki")
# cur = conn.cursor()
# cur.execute("""
#     SELECT i.title, i.url, i.score, i.by, u.karma
#     FROM hacker_news.items i
#     JOIN hacker_news.users u ON i.by = u.id
#     WHERE i.title IS NOT NULL AND i.score IS NOT NULL AND i.by IS NOT NULL
#     LIMIT 20000;
# """)
# rows = cur.fetchall()
# conn.close()

# titles, urls, scores, by, karmas = zip(*rows)

# -----------------------------
# 1b. TRAINING & VALIDATION DATA LOAD: ~80% HASH BUCKET, LIMITED TO 100K ROWS
# -----------------------------
print("Fetching Hacker News data...")
# NOTE(review): this connection string embeds credentials in source; consider
# reading it from an environment variable or a secrets store instead.
conn = psycopg2.connect("postgres://sy91dhb:g5t49ao@178.156.142.230:5432/hd64m1ki")
try:
    # The cursor context manager closes the cursor when the block exits.
    with conn.cursor() as cur:
        # The hash filter keeps rows whose abs(hashtext(id)) % 100 is >= 20.
        # There is no ORDER BY, so which matching rows fill the 100000-row
        # limit is decided by the database.
        cur.execute('''SELECT i.title, i.url, i.score, u.id, u.karma 
            FROM hacker_news.items AS i
            LEFT JOIN hacker_news.users AS u 
                ON i.by = u.id
            WHERE title IS NOT NULL AND score IS NOT NULL AND (abs(hashtext(i.id::text)) % 100) >= 20
            LIMIT 100000;
            ''')
        rows = cur.fetchall()
finally:
    # Release the connection even when the query itself fails.
    conn.close()

# zip(*rows) on an empty result cannot be unpacked into five names; fail with
# a message that says what actually went wrong.
if not rows:
    raise ValueError("Query returned no rows; cannot build the training set.")

# Transpose the row tuples into one tuple per selected column.
titles, urls, scores, by, karmas = zip(*rows)

# -----------------------------
# 2. Preprocess titles
# -----------------------------
def preprocess(text):
    """Turn a raw title into a list of lower-case tokens.

    The text is lower-cased, every character other than ``a-z``, ``0-9``
    and the space is deleted (not replaced), and the remainder is split
    on whitespace.

    Args:
        text: Raw title string.

    Returns:
        List of token strings; empty when nothing survives the filter.
    """
    lowered = text.lower()
    # Deleting (rather than spacing out) punctuation glues hyphenated words
    # together, e.g. "open-source" becomes "opensource".
    cleaned = re.compile(r'[^a-z0-9 ]+').sub('', lowered)
    return cleaned.split()

# Load the saved CBOW checkpoint to recover the training vocabulary.
# map_location="cpu" lets a checkpoint written on a GPU machine be read on a
# CPU-only one.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
checkpoint = torch.load("umuts_cbow_full.pth", map_location="cpu")
word_to_ix = checkpoint['word_to_ix']

# Tokenise every fetched title with the same preprocessing used at prediction time.
tokenized_titles = [preprocess(title) for title in titles]
# Reverse mapping: vocabulary index -> word.
ix_to_word = {i: word for word, i in word_to_ix.items()}
vocab_size = len(word_to_ix)

# -----------------------------
# 3. CBOW Model definition
# -----------------------------
class CBOW(nn.Module):
    """Continuous bag-of-words model: averaged embeddings fed to a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits.
        embed_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.linear = nn.Linear(in_features=embed_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Return one row of vocabulary logits per input row.

        Args:
            inputs: Integer index tensor; the embeddings are averaged over
                dim 1 (assumes a (batch, context) layout - TODO confirm
                against the CBOW training code).
        """
        looked_up = self.embeddings(inputs)
        pooled = torch.mean(looked_up, dim=1)
        logits = self.linear(pooled)
        return logits

# Embedding width used to build the CBOW model whose weights are loaded below;
# load_state_dict rejects a state dict whose tensor shapes do not match.
embed_dim = 5
cbow_model = CBOW(vocab_size, embed_dim)
# map_location="cpu" lets weights saved from a GPU run load on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
cbow_model.load_state_dict(torch.load("umuts_cbow.pth", map_location="cpu"))
# The CBOW model is only used as a frozen embedding lookup from here on.
cbow_model.eval()

# -----------------------------
# 4. Create title embeddings
# -----------------------------
# One averaged CBOW vector per title that has at least one in-vocabulary
# token; valid_indices records which rows of the fetched data those are so the
# other features can be aligned with them.
title_embeddings = []
valid_indices = []
with torch.no_grad():
    for row, words in enumerate(tokenized_titles):
        known_ids = [word_to_ix[w] for w in words if w in word_to_ix]
        if not known_ids:
            # Nothing to average - this title is dropped from the dataset.
            continue
        word_vectors = cbow_model.embeddings(torch.tensor(known_ids))
        title_embeddings.append(word_vectors.mean(dim=0))
        valid_indices.append(row)

X_title = torch.stack(title_embeddings)
# Scores of the kept rows as a float column (unsqueeze adds the trailing dim).
y = torch.tensor([scores[row] for row in valid_indices], dtype=torch.float32).unsqueeze(1)

# -----------------------------
# 5. Process domains
# -----------------------------
# Reduce every URL to its network location; rows without a usable one share
# the 'unknown' bucket.
parsed_domains = []
for url in urls:
    try:
        parsed = urlparse(url)
        domain = parsed.netloc or 'unknown'
    except (ValueError, TypeError, AttributeError):
        # Only parsing failures are treated as "no domain"; a bare except
        # here would also swallow KeyboardInterrupt/SystemExit.
        domain = 'unknown'
    parsed_domains.append(domain)

# Map each distinct domain string to an integer id for the embedding layer.
le = LabelEncoder()
domain_ids = le.fit_transform(parsed_domains)
# Keep only the rows that also produced a title embedding.
domain_ids_tensor = torch.tensor(domain_ids, dtype=torch.long)[valid_indices]
domain_vocab_size = len(le.classes_)
domain_embed_dim = 3

# -----------------------------
# 6. User karma
# -----------------------------
# Karma of the author of each kept row, as a float column.
# The query LEFT JOINs users, so karma can be NULL (None) for an item whose
# author has no users row; torch.tensor cannot convert None, so those rows get
# 0 - the same default predict_upvotes uses for authors it does not know.
karmas_tensor = torch.tensor(
    [karmas[i] if karmas[i] is not None else 0 for i in valid_indices],
    dtype=torch.float32,
).unsqueeze(1)
user_ids = [by[i] for i in valid_indices]
# user id -> karma; when an author appears several times the last row wins.
user_karma_lookup = dict(zip(user_ids, karmas_tensor.squeeze(1).tolist()))

# -----------------------------
# 7. Model definition
# -----------------------------
class UpvotePredictor(nn.Module):
    """Feed-forward regressor over title embedding, domain embedding and karma.

    Args:
        title_embed_dim: Width of the title embedding passed to ``forward``.
        domain_vocab_size: Number of distinct domain ids.
        domain_embed_dim: Width of the learned domain embedding.
    """

    def __init__(self, title_embed_dim, domain_vocab_size, domain_embed_dim):
        super().__init__()
        self.domain_embedding = nn.Embedding(domain_vocab_size, domain_embed_dim)
        # The extra 1 is the single karma column concatenated in forward().
        input_width = title_embed_dim + domain_embed_dim + 1
        layers = [
            nn.Linear(input_width, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, title_embed, domain_id, karma):
        """Return one predicted score per row.

        Args:
            title_embed: Title embedding tensor.
            domain_id: Integer tensor of domain ids.
            karma: Karma tensor; concatenated along dim 1 with the other two
                feature groups, so it must share their leading dimension.
        """
        features = (title_embed, self.domain_embedding(domain_id), karma)
        return self.model(torch.cat(features, dim=1))

# Regressor, its mean-squared-error objective and the Adam optimiser that
# updates the regressor's parameters.
model = UpvotePredictor(
    title_embed_dim=embed_dim,
    domain_vocab_size=domain_vocab_size,
    domain_embed_dim=domain_embed_dim,
)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


# -----------------------------
# 7b. Train-validation split
# -----------------------------
from sklearn.model_selection import train_test_split

# Inputs, all aligned row-for-row and built by the earlier sections:
#   X_title           - stacked title embeddings
#   domain_ids_tensor - encoded domain ids
#   karmas_tensor     - karma column (built with unsqueeze(1))
#   y                 - score column (built with unsqueeze(1))

# Row positions 0..N-1; the split is done on positions so that every tensor
# is cut at exactly the same rows.
all_idx = list(range(len(y)))

# Shuffled 80% train / 20% validation split with a fixed seed.
train_idx, val_idx = train_test_split(all_idx, test_size=0.2, random_state=42, shuffle=True)

# Slice each tensor with the two index lists.
X_title_train, X_title_val = X_title[train_idx], X_title[val_idx]
domain_ids_train, domain_ids_val = domain_ids_tensor[train_idx], domain_ids_tensor[val_idx]
karma_train, karma_val = karmas_tensor[train_idx], karmas_tensor[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

# Report how many rows landed in each part.
print(f"Train / Val sizes: {len(train_idx)} / {len(val_idx)}")

# -----------------------------
# 8. Dataset and training
# -----------------------------
class HNDataset(Dataset):
    """Row-aligned view over the four per-post feature/label containers.

    Args:
        title_embeds: Indexable container of title embeddings.
        domain_ids: Indexable container of domain ids.
        karmas: Indexable container of karma values.
        labels: Indexable container of target scores; its length defines
            the dataset length.
    """

    def __init__(self, title_embeds, domain_ids, karmas, labels):
        self.title_embeds, self.domain_ids = title_embeds, domain_ids
        self.karmas, self.labels = karmas, labels

    def __len__(self):
        """Number of samples, taken from the labels container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(title_embed, domain_id, karma, label)`` for position ``idx``."""
        columns = (self.title_embeds, self.domain_ids, self.karmas, self.labels)
        return tuple(column[idx] for column in columns)

# Training split wrapped for shuffled mini-batches of 32 rows.
dataset = HNDataset(X_title_train, domain_ids_train, karma_train, y_train)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

print("Training regression model...")
for epoch in range(20):
    # Running sum of the per-batch losses; the value printed below is this
    # sum, not a per-sample or per-batch average.
    total_loss = 0
    for batch in dataloader:
        title_embed, domain_id, karma, label = batch
        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()
        loss = loss_fn(model(title_embed, domain_id, karma), label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# -----------------------------
# 9. Prediction function
# -----------------------------
def _encode_domain(domain):
    """Return the encoder id for ``domain``, falling back for unseen domains.

    LabelEncoder.transform raises ValueError for a label it was not fitted
    on. In that case the id of the 'unknown' bucket is used when the encoder
    has one, and 0 only as a last resort (id 0 is just whichever fitted
    domain sorts first, not a dedicated placeholder).
    """
    for candidate in (domain, 'unknown'):
        try:
            return int(le.transform([candidate])[0])
        except ValueError:
            continue
    return 0


def predict_upvotes(title, url, user_id):
    """Predict the score of a hypothetical post with the trained regressor.

    Args:
        title: Post title; tokenised with ``preprocess``.
        url: Post URL; only its network location (domain) is used.
        user_id: Author id looked up in ``user_karma_lookup``; an author
            that is not in the lookup is given karma 0.

    Returns:
        The predicted score clamped to be non-negative, or ``None`` when no
        token of the title is in the CBOW vocabulary.
    """
    token_ids = [word_to_ix[t] for t in preprocess(title) if t in word_to_ix]
    if not token_ids:
        return None

    # Title feature: mean of the CBOW embeddings of the known tokens.
    with torch.no_grad():
        avg_embed = cbow_model.embeddings(torch.tensor(token_ids)).mean(dim=0)

    try:
        domain = urlparse(url).netloc or 'unknown'
    except (ValueError, TypeError, AttributeError):
        # Unparseable URL: treat like a post without a domain.
        domain = 'unknown'

    domain_tensor = torch.tensor([_encode_domain(domain)], dtype=torch.long)
    karma_value = user_karma_lookup.get(user_id, 0)
    karma = torch.tensor([[karma_value]], dtype=torch.float32)

    # Note: this leaves the regressor in eval mode after the call.
    model.eval()
    with torch.no_grad():
        prediction = model(avg_embed.unsqueeze(0), domain_tensor, karma).item()
    return max(prediction, 0)  # no negative scores

# -----------------------------
# 10. Example
# -----------------------------
print("\nPredicted upvotes for new post:")
example_title = "Show HN: AI Hacker generates $1 billion"
example_url = "https://openai.com"
pred = predict_upvotes(example_title, example_url, "ingve")
# predict_upvotes returns None when no title word is in the vocabulary;
# formatting None with :.2f would raise a TypeError.
if pred is None:
    print(f"{example_title} \u2192 No prediction (no title words in vocabulary)")
else:
    print(f"{example_title} \u2192 Predicted Upvotes: {pred:.2f}")


[nltk_data] Downloading package punkt to /Users/Arjuna/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Fetching Hacker News data...
Train / Val sizes: 79071 / 19768
Training regression model...
Epoch 0, Loss: 31971868.8735
Epoch 1, Loss: 15392660.3683
Epoch 2, Loss: 13560051.1591
Epoch 3, Loss: 11704623.0207
Epoch 4, Loss: 11427061.4933
Epoch 5, Loss: 11437943.9736
Epoch 6, Loss: 11405943.6225
Epoch 7, Loss: 11306153.9255
Epoch 8, Loss: 11296491.5370
Epoch 9, Loss: 11228262.6810
Epoch 10, Loss: 11148410.6384
Epoch 11, Loss: 11120974.2863
Epoch 12, Loss: 11134450.9298
Epoch 13, Loss: 11056007.9696
Epoch 14, Loss: 11070961.5587
Epoch 15, Loss: 11110049.7772
Epoch 16, Loss: 10973050.9664
Epoch 17, Loss: 10926563.1366
Epoch 18, Loss: 10901681.3685
Epoch 19, Loss: 10877998.9545

Predicted upvotes for new post:
Show HN: AI Hacker generates $1 billion → Predicted Upvotes: 17.99


In [3]:
# -----------------------------
# 11. Example
# -----------------------------
print("\nPredicted upvotes for new post:")
# Other titles tried here (each was immediately overwritten by the next
# assignment, so only the last one was ever used):
#   "Generates $1 billion today!"
#   "Close Votes Are A Feature, Not A Bug"
#   "what do you mean? okay"
example_title = "Show HN: AI Hacker generates $1 billion"
example_url = "https://openai.com"
pred = predict_upvotes(example_title, example_url, "ingve")
# predict_upvotes returns None when no title word is in the vocabulary;
# formatting None with :.2f would raise a TypeError.
if pred is None:
    print(f"{example_title} → No prediction (no title words in vocabulary)")
else:
    print(f"{example_title} → Predicted Upvotes: {pred:.2f}")



Predicted upvotes for new post:
Show HN: AI Hacker generates $1 billion → Predicted Upvotes: 17.99


In [16]:
# -----------------------------
# 12. Validation on held-out split
# -----------------------------
from sklearn.metrics import mean_squared_error, r2_score

# Validation rows in their original order (no shuffling needed for scoring).
dataset_val = HNDataset(X_title_val, domain_ids_val, karma_val, y_val)
val_loader = DataLoader(dataset_val, batch_size=32, shuffle=False)

model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for title_emb, dom_id, krm, target in val_loader:
        # reshape(-1) turns the single-column output into a 1-D tensor and,
        # unlike squeeze(), stays 1-D when the last batch has one row.
        batch_pred = model(title_emb, dom_id, krm).reshape(-1)
        all_preds.extend(batch_pred.tolist())
        # Targets are a single column too; flatten them the same way so both
        # lists hold plain floats.
        all_targets.extend(target.reshape(-1).tolist())

# Both lists are already flat; the *_flat names are kept for any later cell
# that refers to them.
all_preds_flat = all_preds
all_targets_flat = all_targets

# Compute metrics on the flat lists
mse = mean_squared_error(all_targets_flat, all_preds_flat)
r2  = r2_score(all_targets_flat, all_preds_flat)
print(f"Validation MSE: {mse:.4f}")
print(f"Validation R² : {r2:.4f}")
# NOTE(review): the recorded run of this cell printed a negative R² and mostly
# identical predictions. Karma and score are fed to the model unscaled, which
# may be worth revisiting (to be confirmed by experiment).


# Optional: inspect a few examples (at most 20, fewer if the split is smaller)
print("\nSample predictions vs. actuals:")
for predicted, actual in zip(all_preds_flat[:20], all_targets_flat[:20]):
    print(f"  Predicted: {predicted:.2f}  |  Actual: {actual:.0f}")


Validation MSE: 4744.5015
Validation R² : -0.0087

Sample predictions vs. actuals:
  Predicted: 17.99  |  Actual: 63
  Predicted: 34.91  |  Actual: 1
  Predicted: 17.99  |  Actual: 26
  Predicted: 17.99  |  Actual: 5
  Predicted: 17.99  |  Actual: 2
  Predicted: 17.99  |  Actual: 9
  Predicted: 17.99  |  Actual: 33
  Predicted: 58.47  |  Actual: 3
  Predicted: 17.99  |  Actual: 2
  Predicted: 17.99  |  Actual: 1
  Predicted: 17.99  |  Actual: 3
  Predicted: 17.99  |  Actual: 18
  Predicted: 17.99  |  Actual: 4
  Predicted: 17.99  |  Actual: 3
  Predicted: 17.99  |  Actual: 2
  Predicted: 17.99  |  Actual: 104
  Predicted: 17.99  |  Actual: 6
  Predicted: -0.71  |  Actual: 1
  Predicted: 17.99  |  Actual: 3
  Predicted: 17.99  |  Actual: 5
