In [1]:
import json
import os
import re
from collections import Counter
from urllib.parse import urlparse

import numpy as np
import psycopg2
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# -----------------------------
# 1. Fetch data from PostgreSQL
# -----------------------------
print("Fetching Hacker News data from PostgreSQL...")

# The connection string is read from the environment so that credentials are
# never stored in the notebook or its saved outputs.
db_url = os.environ.get("HN_DATABASE_URL")
if not db_url:
    raise RuntimeError(
        "Set the HN_DATABASE_URL environment variable to the PostgreSQL "
        "connection string before running this cell."
    )

conn = psycopg2.connect(db_url)
try:
    cur = conn.cursor()
    cur.execute("SELECT title, url, score FROM hacker_news.items WHERE title IS NOT NULL AND score IS NOT NULL LIMIT 20000;")
    rows = cur.fetchall()
finally:
    # Always release the connection, even when the query fails.
    conn.close()

# zip(*rows) on an empty result would fail with an opaque unpacking error.
if not rows:
    raise RuntimeError("The Hacker News query returned no rows.")

# Transpose the list of (title, url, score) rows into three parallel tuples.
titles, urls, scores = zip(*rows)

# -----------------------------
# 2. Preprocess titles
# -----------------------------
def preprocess(text):
    """Lower-case ``text``, drop every character that is not a-z, 0-9 or a
    space, and return the whitespace-separated tokens as a list.

    Removed characters are deleted rather than replaced by a space, so
    "foo-bar" becomes the single token "foobar".
    """
    return re.sub(r'[^a-z0-9 ]+', '', text.lower()).split()

# Tokenise every title once; tokenized_titles is reused when building the
# title embeddings.
tokenized_titles = list(map(preprocess, titles))

# Count token occurrences across all titles and order the vocabulary from
# most to least frequent (ties keep first-seen order).
word_counts = Counter()
for title_tokens in tokenized_titles:
    word_counts.update(title_tokens)
vocab = [word for word, _ in word_counts.most_common()]

# Index <-> word lookup tables. NOTE: word_to_ix, ix_to_word and vocab_size
# are reassigned from the saved vocabulary file in the "Load Vocabulary" step.
word_to_ix = {word: position for position, word in enumerate(vocab)}
ix_to_word = dict(enumerate(vocab))
vocab_size = len(vocab)


Fetching Hacker News data from PostgreSQL...


In [12]:

# -----------------------------
# 3. Load Vocabulary
# -----------------------------
# Read the saved word -> index mapping (JSON object) from disk; it replaces
# any word_to_ix built earlier in the notebook.
vocab_path = "vocab-w2-200titles.json"
with open(vocab_path, "r", encoding="utf-8") as vocab_file:
    word_to_ix = json.load(vocab_file)

vocab_size = len(word_to_ix)
# Inverse lookup; indices are coerced to int.
ix_to_word = {int(index): word for word, index in word_to_ix.items()}


# -----------------------------
# 3. Load Pre-trained Embeddings
# -----------------------------
embed_dim = 300
# NOTE(review): torch.load unpickles the file and can therefore execute
# arbitrary code; only load checkpoints from a trusted source (consider
# passing weights_only=True on PyTorch versions that support it).
embeddings = torch.load("embeddings-w2-200titles-300dim-10e.pt", map_location='cpu')  # Shape: [vocab_size, embed_dim]
# Validate both dimensions up front so a wrong file fails here with a clear
# message instead of later inside the embedding copy.
assert embeddings.shape[0] == vocab_size, "Vocab size mismatch!"
assert embeddings.shape[1] == embed_dim, "Embedding dimension mismatch!"


In [13]:


# -----------------------------
# 4. CBOW Model
# -----------------------------
class CBOW(nn.Module):
    """Embedding table followed by a linear projection to vocabulary size.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits.
        embed_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.linear = nn.Linear(in_features=embed_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Average the embeddings of ``inputs`` over dim 1 and project them.

        Assumes ``inputs`` is an integer index tensor shaped (batch, context),
        as implied by the mean over dim 1 -- TODO confirm against the trainer.
        """
        context_vectors = self.embeddings(inputs)
        pooled = context_vectors.mean(dim=1)
        return self.linear(pooled)

# Build the CBOW model, overwrite its embedding table with the pre-trained
# weights, and freeze the table so it is not updated by any optimizer.
cbow_model = CBOW(vocab_size, embed_dim)
with torch.no_grad():
    cbow_model.embeddings.weight.copy_(embeddings)
cbow_model.embeddings.weight.requires_grad_(False)


# -----------------------------
# 6. Create title embeddings
# -----------------------------
# Represent each title by the mean of its in-vocabulary word embeddings.
# Titles with no known token are skipped; valid_indices records which rows
# of the original data survived so the other features can be aligned.
title_embeddings, valid_indices = [], []
embedding_table = cbow_model.embeddings
with torch.no_grad():
    for row_idx, title_tokens in enumerate(tokenized_titles):
        known_ids = [word_to_ix[tok] for tok in title_tokens if tok in word_to_ix]
        if not known_ids:
            continue
        word_vectors = embedding_table(torch.tensor(known_ids))
        title_embeddings.append(word_vectors.mean(dim=0))
        valid_indices.append(row_idx)

# One row per retained title.
X_title = torch.stack(title_embeddings)
# Regression targets as a float column vector, aligned with X_title.
kept_scores = [scores[row_idx] for row_idx in valid_indices]
y = torch.tensor(kept_scores, dtype=torch.float32).unsqueeze(1)

# -----------------------------
# 7. Process domain names
# -----------------------------
def _extract_domain(url):
    """Return the network location (host[:port]) of ``url``.

    Falls back to 'unknown' when ``url`` is empty/None, has no network
    location, or cannot be parsed.
    """
    if not url:
        return 'unknown'
    try:
        return urlparse(url).netloc or 'unknown'
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed URL (e.g. unbalanced IPv6 brackets);
        # TypeError/AttributeError: non-string value. Narrower than a bare
        # except, so KeyboardInterrupt/SystemExit still propagate.
        return 'unknown'


# One domain string per fetched row (same order as urls).
parsed_domains = [_extract_domain(url) for url in urls]

# Map each distinct domain to an integer id, then keep only the rows whose
# title produced an embedding (valid_indices).
le = LabelEncoder()
domain_ids = le.fit_transform(parsed_domains)
domain_ids_tensor = torch.tensor(domain_ids, dtype=torch.long)[valid_indices]
domain_vocab_size = len(le.classes_)
domain_embed_dim = 3

# -----------------------------
# 8. Regression Model
# -----------------------------
class UpvotePredictor(nn.Module):
    def __init__(self, title_embed_dim, domain_vocab_size, domain_embed_dim):
        super().__init__()
        self.domain_embedding = nn.Embedding(domain_vocab_size, domain_embed_dim)
        self.model = nn.Sequential(
            nn.Linear(title_embed_dim + domain_embed_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, title_embed, domain_id):
        domain_vec = self.domain_embedding(domain_id)
        x = torch.cat([title_embed, domain_vec], dim=1)
        return self.model(x)

# Seed PyTorch's global RNG before creating the model so weight
# initialisation (and later shuffling that draws from the global RNG) is
# repeatable across "Restart & Run All".
SEED = 42
torch.manual_seed(SEED)

model = UpvotePredictor(embed_dim, domain_vocab_size, domain_embed_dim)
loss_fn = nn.MSELoss()  # mean squared error on the raw score
optimizer = optim.Adam(model.parameters(), lr=0.001)

# -----------------------------
# 9. Dataset and Training
# -----------------------------
class HNDataset(Dataset):
    """Index-aligned view over title embeddings, domain ids and labels.

    Item ``idx`` is the tuple
    ``(title_embeds[idx], domain_ids[idx], labels[idx])``; the dataset length
    is ``len(labels)``.
    """

    def __init__(self, title_embeds, domain_ids, labels):
        self.title_embeds = title_embeds
        self.domain_ids = domain_ids
        self.labels = labels

    def __getitem__(self, idx):
        columns = (self.title_embeds, self.domain_ids, self.labels)
        return tuple(column[idx] for column in columns)

    def __len__(self):
        return len(self.labels)

# Wrap the aligned tensors and iterate over them in shuffled mini-batches.
dataset = HNDataset(X_title, domain_ids_tensor, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

print("Training regression model...")
num_epochs = 20
for epoch in range(num_epochs):
    # Running sum of the per-batch mean losses (not a per-sample average).
    total_loss = 0
    for batch_titles, batch_domains, batch_scores in dataloader:
        optimizer.zero_grad()
        batch_loss = loss_fn(model(batch_titles, batch_domains), batch_scores)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# -----------------------------
# 10. Prediction Function
# -----------------------------
def predict_upvotes(title, url):
    """Predict the score of a post from its title and URL.

    Args:
        title: Post title; tokenised with ``preprocess``.
        url: Post URL. Scheme-less values such as ``"example.com/page"`` are
            accepted. An empty, missing or unparseable URL is treated as the
            domain 'unknown'.

    Returns:
        The model output as a Python float, or ``None`` when none of the
        title's tokens are in ``word_to_ix``.
    """
    token_ids = [word_to_ix[t] for t in preprocess(title) if t in word_to_ix]
    if not token_ids:
        return None

    # Title representation: mean of the frozen word embeddings.
    with torch.no_grad():
        avg_embed = cbow_model.embeddings(torch.tensor(token_ids)).mean(dim=0)

    domain = 'unknown'
    if url:
        try:
            parsed = urlparse(url)
            if not parsed.netloc and not parsed.scheme:
                # urlparse only recognises a host after "//"; without this,
                # "example.com" would parse as a bare path with empty netloc.
                parsed = urlparse('//' + url)
            domain = parsed.netloc or 'unknown'
        except (ValueError, TypeError, AttributeError):
            # Malformed or non-string URL: keep the 'unknown' bucket.
            domain = 'unknown'

    known_domains = le.classes_
    if domain in known_domains:
        domain_id = le.transform([domain])[0]
    elif 'unknown' in known_domains:
        # Unseen domain: prefer the explicit 'unknown' bucket over id 0,
        # which is merely whichever real domain sorts first.
        domain_id = le.transform(['unknown'])[0]
    else:
        domain_id = 0  # last-resort fallback when no 'unknown' class exists

    domain_tensor = torch.tensor([domain_id], dtype=torch.long)
    model.eval()
    with torch.no_grad():
        prediction = model(avg_embed.unsqueeze(0), domain_tensor).item()
    return prediction



Training regression model...
Epoch 0, Loss: 3505053.8513
Epoch 1, Loss: 3465769.0129
Epoch 2, Loss: 3422931.6892
Epoch 3, Loss: 3379705.4300
Epoch 4, Loss: 3305101.5542
Epoch 5, Loss: 3220818.2889
Epoch 6, Loss: 3120895.7736
Epoch 7, Loss: 3012050.1520
Epoch 8, Loss: 2894646.6732
Epoch 9, Loss: 2759575.0025
Epoch 10, Loss: 2607334.1094
Epoch 11, Loss: 2423618.8444
Epoch 12, Loss: 2194236.4211
Epoch 13, Loss: 1999496.9757
Epoch 14, Loss: 1737605.4961
Epoch 15, Loss: 1517183.8288
Epoch 16, Loss: 1334066.1755
Epoch 17, Loss: 1275192.3249
Epoch 18, Loss: 1076709.5625
Epoch 19, Loss: 936962.6045


In [16]:
# -----------------------------
# 11. Example
# -----------------------------
print("\nPredicted upvotes for new post:")

example_title = "The librarian immediately attempts to sell you a vuvuzela"
example_url = "kaveland.no"
pred = predict_upvotes(example_title, example_url)
# predict_upvotes returns None when no title token is in the vocabulary;
# formatting None with ":.2f" would raise a TypeError.
if pred is None:
    print(f"{example_title} → no known words in title; cannot predict")
else:
    print(f"{example_title} → Predicted Upvotes: {pred:.2f}")



Predicted upvotes for new post:
The librarian immediately attempts to sell you a vuvuzela → Predicted Upvotes: 4.75
