In [None]:
import os
import re
from collections import Counter
from urllib.parse import urlparse

import nltk
import numpy as np
import psycopg2
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
nltk.download('punkt')

# -----------------------------
# 1. Fetch data from PostgreSQL
# -----------------------------
# print("Fetching Hacker News data from PostgreSQL...")
# conn = psycopg2.connect("postgres://sy91dhb:g5t49ao@178.156.142.230:5432/hd64m1ki")
# cur = conn.cursor()
# cur.execute("SELECT title, url, score FROM hacker_news.items WHERE title IS NOT NULL AND score IS NOT NULL LIMIT 20000;")
# rows = cur.fetchall()
# conn.close()


# -----------------------------
# 1a. 20% TESTING DATA LOAD
# -----------------------------
def _fetch_rows(query):
    """Run ``query`` against the Hacker News database and return every row.

    The connection string is read from the ``HN_DATABASE_URL`` environment
    variable so that credentials are not stored in the notebook.

    Args:
        query: SQL text passed unchanged to ``cursor.execute`` (no parameters
            are bound, so literal ``%`` characters in the SQL are left alone).

    Returns:
        Whatever ``cursor.fetchall()`` returns for the query.

    Raises:
        RuntimeError: if ``HN_DATABASE_URL`` is unset or empty.
    """
    dsn = os.environ.get("HN_DATABASE_URL")
    if not dsn:
        raise RuntimeError(
            "Set the HN_DATABASE_URL environment variable to the PostgreSQL "
            "connection string before running this cell."
        )
    conn = psycopg2.connect(dsn)
    try:
        # The cursor context manager closes the cursor; the connection has to
        # be closed explicitly, and the finally clause guarantees that even
        # when the query fails.
        with conn.cursor() as cur:
            cur.execute(query)
            return cur.fetchall()
    finally:
        conn.close()


print("Fetching Hacker News data...")
# Held-out split: rows whose hashed id modulo 100 is below 20. Unlike the
# training query further down, no year filter or LIMIT is applied here.
rows_test = _fetch_rows('''SELECT title, url, score
            FROM hacker_news.items
            WHERE title IS NOT NULL AND score IS NOT NULL AND (abs(hashtext(id::text)) % 100) < 20;
            ''')

# cur.execute('''SELECT i.title, i.url, i.score, u.id, u.karma 
#             FROM hacker_news.items AS i
#             LEFT JOIN hacker_news.users AS u 
#                 ON i.by = u.id
#             WHERE title IS NOT NULL AND score IS NOT NULL AND (abs(hashtext(i.id::text)) % 100) < 20;
#             ''')


# -----------------------------
# 1b. 80% TRAINING & VALIDATION DATA LOAD LIMITED TO 100K ROWS & RECENT DATA
# -----------------------------
print("Fetching Hacker News data...")
# Complementary split (hashed id modulo 100 of 20 or more), restricted to
# items from 2023 onwards and capped at 100000 rows. The query has no
# ORDER BY, so which rows the LIMIT keeps is left to the database.
rows = _fetch_rows('''SELECT title, url, score
            FROM hacker_news.items
            WHERE title IS NOT NULL 
                AND score IS NOT NULL 
                AND (abs(hashtext(id::text)) % 100) >= 20
                AND EXTRACT(YEAR FROM time) >= 2023
            LIMIT 100000;
            ''')

# cur.execute('''SELECT i.title, i.url, i.score, u.id, u.karma 
#             FROM hacker_news.items AS i
#             LEFT JOIN hacker_news.users AS u 
#                 ON i.by = u.id
#             WHERE title IS NOT NULL AND score IS NOT NULL AND (abs(hashtext(i.id::text)) % 100) >= 20
#             LIMIT 100000;
#             ''')

# Unpacking an empty result would fail with an opaque "not enough values to
# unpack" message, so report the real cause instead.
if not rows:
    raise ValueError("The training query returned no rows; there is nothing to train on.")

# Transpose the (title, url, score) row tuples into three parallel tuples.
titles, urls, scores = zip(*rows)

# -----------------------------
# 2. Preprocess titles
# -----------------------------
def preprocess(text):
    """Turn a title into a list of lowercase alphanumeric tokens.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    the space character is deleted (not replaced by a space, so ``"a-b"``
    becomes ``"ab"``), and what remains is split on whitespace.

    Args:
        text: the raw title string.

    Returns:
        A list of token strings; empty when nothing survives the filtering.
    """
    stripped = re.sub(r'[^a-z0-9 ]+', '', text.lower())
    return stripped.split()

# Tokenise every title once; the vocabulary and the CBOW samples both derive
# from this list.
tokenized_titles = list(map(preprocess, titles))

# Count how often each token occurs across the whole corpus.
word_counts = Counter()
for title_tokens in tokenized_titles:
    word_counts.update(title_tokens)

# Index words by descending frequency (ties keep first-seen order), and keep
# the inverse mapping for turning indices back into words.
vocab = [word for word, _ in word_counts.most_common()]
word_to_ix = dict(zip(vocab, range(len(vocab))))
ix_to_word = dict(enumerate(vocab))
vocab_size = len(word_to_ix)


# -----------------------------
# 3. CBOW Model
# -----------------------------
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    Scores every vocabulary word from the mean of the context-word embeddings.
    """

    def __init__(self, vocab_size, embed_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: number of distinct words; sizes both the embedding
                table and the output layer.
            embed_dim: width of each word embedding.
        """
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.linear = nn.Linear(in_features=embed_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Return vocabulary logits for a batch of context windows.

        The looked-up embeddings are averaged over dimension 1 (the context
        positions) before the linear projection is applied.
        """
        context_vectors = self.embeddings(inputs)
        pooled = context_vectors.mean(dim=1)
        return self.linear(pooled)

# Seed PyTorch's global RNG before any parameters are created so that the
# embedding initialisation, and later draws from the default generator, are
# reproducible across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

embed_dim = 5  # width of each learned word embedding
cbow_model = CBOW(vocab_size, embed_dim)
cbow_loss_fn = nn.CrossEntropyLoss()  # takes the model's raw logits and a target word index
cbow_optimizer = optim.Adam(cbow_model.parameters(), lr=0.01)

# -----------------------------
# 4. Prepare CBOW training data
# -----------------------------
context_size = 2  # default number of words taken on each side of the target


def make_cbow_data(tokenized_titles, window=None, vocab=None):
    """Build (context, target) training pairs for the CBOW model.

    For every position that has ``window`` tokens on both sides, the
    surrounding ``2 * window`` tokens form the context and the token at that
    position is the target. Titles shorter than ``2 * window + 1`` tokens
    contribute no pairs.

    Args:
        tokenized_titles: iterable of token lists.
        window: tokens taken on each side of the target. Defaults to the
            module-level ``context_size``.
        vocab: mapping from token to integer index. Defaults to the
            module-level ``word_to_ix``.

    Returns:
        A list of ``(context, target)`` tuples, where ``context`` is a 1-D
        tensor of ``2 * window`` word indices and ``target`` is a 0-D tensor
        holding the centre word's index.

    Raises:
        ValueError: if ``window`` is smaller than 1 (the context would be empty).
        KeyError: if a token of a sufficiently long title is missing from ``vocab``.
    """
    # Resolve the defaults at call time so that later changes to the
    # notebook-level settings are picked up.
    if window is None:
        window = context_size
    if vocab is None:
        vocab = word_to_ix
    if window < 1:
        raise ValueError("window must be at least 1")

    samples = []
    for tokens in tokenized_titles:
        if len(tokens) < 2 * window + 1:
            continue
        # Look every token up once per title rather than once per window.
        ids = [vocab[tok] for tok in tokens]
        for centre in range(window, len(ids) - window):
            context_ids = ids[centre - window:centre] + ids[centre + 1:centre + window + 1]
            samples.append((torch.tensor(context_ids), torch.tensor(ids[centre])))
    return samples

# One (context, target) tensor pair per usable position in every title.
cbow_data = make_cbow_data(tokenized_titles)

# -----------------------------
# 5. Train CBOW model
# -----------------------------
print("Training CBOW model...")
for epoch in range(10):
    epoch_loss = 0
    # Each sample is its own batch of size 1, hence the unsqueeze(0) calls.
    for context_ids, target_id in cbow_data:
        cbow_optimizer.zero_grad()
        logits = cbow_model(context_ids.unsqueeze(0))
        step_loss = cbow_loss_fn(logits, target_id.unsqueeze(0))
        step_loss.backward()
        cbow_optimizer.step()
        epoch_loss += step_loss.item()
    # The figure printed is the sum of per-sample losses, not their mean.
    print(f"CBOW Epoch {epoch}, Loss: {epoch_loss:.4f}")

# -----------------------------
# 6. Create title embeddings
# -----------------------------
# Represent each title by the mean of its words' CBOW embeddings. Titles with
# no in-vocabulary token are skipped; valid_indices records which rows were
# kept so that the other features can be aligned to them.
title_embeddings = []
valid_indices = []
with torch.no_grad():
    for row_idx, title_tokens in enumerate(tokenized_titles):
        ids = [word_to_ix[tok] for tok in title_tokens if tok in word_to_ix]
        if not ids:
            continue
        word_vectors = cbow_model.embeddings(torch.tensor(ids))
        title_embeddings.append(word_vectors.mean(dim=0))
        valid_indices.append(row_idx)

X_title = torch.stack(title_embeddings)

# Regression targets as a float column vector, one row per kept title.
kept_scores = [scores[row_idx] for row_idx in valid_indices]
y = torch.tensor(kept_scores, dtype=torch.float32).unsqueeze(1)

# -----------------------------
# 7. Process domain names
# -----------------------------
def _extract_domain(url):
    """Return the network location of ``url``, or ``'unknown'`` when there is none.

    Missing (non-string) URLs, URLs without a netloc and URLs that
    ``urlparse`` rejects all map to ``'unknown'``.
    """
    if not isinstance(url, str):
        return 'unknown'
    try:
        return urlparse(url).netloc or 'unknown'
    except ValueError:
        # urlparse signals malformed input (for example a broken IPv6 literal)
        # with ValueError. Catching only that keeps Ctrl-C and real bugs visible.
        return 'unknown'


parsed_domains = [_extract_domain(url) for url in urls]

# Encode each domain string as an integer id, then keep only the rows that
# also produced a title embedding.
le = LabelEncoder()
domain_ids = le.fit_transform(parsed_domains)
domain_ids_tensor = torch.tensor(domain_ids, dtype=torch.long)[valid_indices]
domain_vocab_size = len(le.classes_)
domain_embed_dim = 3  # width of the learned per-domain embedding

# -----------------------------
# 8. Regression Model
# -----------------------------
class UpvotePredictor(nn.Module):
    """Regress a score from a title embedding and a learned domain embedding."""

    def __init__(self, title_embed_dim, domain_vocab_size, domain_embed_dim):
        """Build the domain embedding table and the feed-forward regressor.

        Args:
            title_embed_dim: width of the title embedding passed to ``forward``.
            domain_vocab_size: number of distinct domain ids.
            domain_embed_dim: width of the learned domain embedding.
        """
        super().__init__()
        self.domain_embedding = nn.Embedding(domain_vocab_size, domain_embed_dim)
        # The title and domain vectors are concatenated, so the first layer
        # takes the sum of both widths.
        layers = [
            nn.Linear(title_embed_dim + domain_embed_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, title_embed, domain_id):
        """Return one predicted value per row of the batch."""
        features = torch.cat((title_embed, self.domain_embedding(domain_id)), dim=1)
        return self.model(features)

# Regressor, objective and optimiser for the upvote-prediction stage.
model = UpvotePredictor(
    title_embed_dim=embed_dim,
    domain_vocab_size=domain_vocab_size,
    domain_embed_dim=domain_embed_dim,
)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# -----------------------------
# 9. Dataset and Training
# -----------------------------
class HNDataset(Dataset):
    """Index-aligned view over title embeddings, domain ids and labels."""

    def __init__(self, title_embeds, domain_ids, labels):
        """Store the three parallel sequences; all are indexed by the same position."""
        self.title_embeds, self.domain_ids, self.labels = title_embeds, domain_ids, labels

    def __len__(self):
        """Number of samples, taken from the labels sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(title_embed, domain_id, label)`` triple at ``idx``."""
        columns = (self.title_embeds, self.domain_ids, self.labels)
        return tuple(column[idx] for column in columns)

# Wrap the aligned feature and label tensors and serve them in shuffled
# mini-batches of 32.
dataset = HNDataset(title_embeds=X_title, domain_ids=domain_ids_tensor, labels=y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

print("Training regression model...")
for epoch in range(20):
    running_loss = 0
    for batch_titles, batch_domains, batch_scores in dataloader:
        optimizer.zero_grad()
        batch_loss = loss_fn(model(batch_titles, batch_domains), batch_scores)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    # The figure printed is the sum of per-batch MSE values for the epoch.
    print(f"Epoch {epoch}, Loss: {running_loss:.4f}")

# -----------------------------
# 10. Prediction Function
# -----------------------------
def predict_upvotes(title, url):
    """Predict the score of a post from its title and URL.

    The title is tokenised with ``preprocess`` and represented by the mean
    CBOW embedding of its in-vocabulary words. The URL's network location is
    mapped to a domain id through the fitted label encoder ``le``.

    Args:
        title: the post title.
        url: the post URL. A missing (non-string) or unparseable URL is
            treated as the domain ``'unknown'``.

    Returns:
        The model output as a Python float, or ``None`` when no word of the
        title is in the vocabulary.
    """
    token_ids = [word_to_ix[tok] for tok in preprocess(title) if tok in word_to_ix]
    if not token_ids:
        return None

    # Only ValueError is expected from urlparse on string input; catching it
    # alone keeps KeyboardInterrupt and real bugs visible.
    try:
        domain = urlparse(url).netloc if isinstance(url, str) else ''
    except ValueError:
        domain = ''
    domain = domain or 'unknown'

    try:
        domain_id = le.transform([domain])[0]
    except ValueError:
        # Unseen domain. Prefer the explicit 'unknown' class over id 0, which
        # is just whichever real domain the encoder happened to place first.
        try:
            domain_id = le.transform(['unknown'])[0]
        except ValueError:
            domain_id = 0  # 'unknown' never occurred in the fitted data

    model.eval()
    with torch.no_grad():
        avg_embed = cbow_model.embeddings(torch.tensor(token_ids)).mean(dim=0)
        domain_tensor = torch.tensor([int(domain_id)], dtype=torch.long)
        return model(avg_embed.unsqueeze(0), domain_tensor).item()



[nltk_data] Downloading package punkt to /home/usa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Fetching Hacker News data from PostgreSQL...
Training CBOW model...
CBOW Epoch 0, Loss: 861068.2161
CBOW Epoch 1, Loss: 860621.4569
CBOW Epoch 2, Loss: 847706.8341
CBOW Epoch 3, Loss: 836108.4245
CBOW Epoch 4, Loss: 826832.6198
CBOW Epoch 5, Loss: 819517.2792
CBOW Epoch 6, Loss: 813776.5789
CBOW Epoch 7, Loss: 809481.7710
CBOW Epoch 8, Loss: 806728.2729
CBOW Epoch 9, Loss: 804931.5705
Training regression model...
Epoch 0, Loss: 3585472.1901
Epoch 1, Loss: 3557292.7703
Epoch 2, Loss: 3550885.7382
Epoch 3, Loss: 3540995.4077
Epoch 4, Loss: 3532520.7850
Epoch 5, Loss: 3521245.4537
Epoch 6, Loss: 3504623.2575
Epoch 7, Loss: 3491089.4931
Epoch 8, Loss: 3471266.5018
Epoch 9, Loss: 3451778.7582
Epoch 10, Loss: 3428752.0967
Epoch 11, Loss: 3404225.3358
Epoch 12, Loss: 3379119.6823
Epoch 13, Loss: 3351952.3654
Epoch 14, Loss: 3329757.9417
Epoch 15, Loss: 3289924.9066
Epoch 16, Loss: 3254076.5346
Epoch 17, Loss: 3216272.4020
Epoch 18, Loss: 3178803.4402
Epoch 19, Loss: 3137114.1441


In [11]:
# -----------------------------
# 11. Example
# -----------------------------
print("\nPredicted upvotes for new post:")

# Another title to try: "what do you mean? okay"
example_title = "Show HN: AI Hacker generates $1 billion"
example_url = "https://openwall.com"
pred = predict_upvotes(example_title, example_url)
# predict_upvotes returns None when no title word is in the vocabulary;
# formatting None with :.2f would raise TypeError.
if pred is None:
    print(f"{example_title} → no prediction (no title word is in the vocabulary)")
else:
    print(f"{example_title} → Predicted Upvotes: {pred:.2f}")



Predicted upvotes for new post:
Show HN: AI Hacker generates $1 billion → Predicted Upvotes: 7.63
