In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import psycopg2
import re
import nltk
from urllib.parse import urlparse
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import numpy as np

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/usa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# -----------------------------
# 1. Load and preprocess Hacker News data with URLs
# -----------------------------
print("Fetching Hacker News data...")
conn = psycopg2.connect("postgres://sy91dhb:g5t49ao@178.156.142.230:5432/hd64m1ki")
cur = conn.cursor()
cur.execute("SELECT title, url, score FROM hacker_news.items WHERE title IS NOT NULL AND score IS NOT NULL LIMIT 10000;")
data = cur.fetchall()
conn.close()


Fetching Hacker News data...


In [3]:

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return word_tokenize(text)

# Extract and process data
tokenized_titles = [preprocess(title) for title, _, _ in data]
upvotes = [float(up) for _, _, up in data]
urls = [url for _, url, _ in data]


In [4]:
print(upvotes[:20])

[1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0, 18.0, 1.0, 3.0, 8.0, 5.0, 2.0, 1.0, 3.0, 1.0, 9.0, 1.0, 1.0]


In [5]:
print(tokenized_titles[:20])

[['android', 'chrome', 'update', 'brings', 'stability', 'and', 'performance', 'improvements'], ['lifelikes', 'religious', 'extremists', 'or', 'marketing', 'through', 'intellectual', 'stimulus'], ['3d', 'printer', 'on', 'moon', 'or', 'mars', 'could', 'make', 'tools', 'from', 'local', 'rocks'], ['the', 'bookstore', 'strikes', 'back'], ['a', 'new', 'model', 'for', 'the', 'musiciantolistener', 'relationship'], ['palestines', 'bid', 'to', 'statehood'], ['learn', 'more', 'by', 'asking', 'fewer', 'questions'], ['latest', 'trend', 'in', 'piling', 'builidngs', 'ontop', 'of', 'each', 'other'], ['ask', 'hn', 'how', 'much', 'money', 'does', 'your', 'ios', 'app', 'make'], ['the', 'security', 'mindset', 'engineers', 'vs', 'security', 'professionals'], ['att', 'ceo', 'says', 'hard', 'to', 'find', 'skilled', 'us', 'workers'], ['fear', 'of', 'housing', 'slump', 'may', 'be', 'seriously', 'overdone'], ['rubiks', 'cube', 'proof', 'cut', 'to', '25', 'moves'], ['acid3', 'was', 'broken'], ['use', 'case', 'sc

In [6]:
print(urls[:20])

['http://technobb.com/google/android-chrome-update-brings-stability-and-performance-improvements/', 'http://www.watchlifelike.com/index.php/lifelikes-religious-extremists-or-marketing-through-intellectual-stimulus/', 'http://news.cnet.com/8301-17938_105-57556017-1/3d-printer-on-moon-or-mars-could-make-tools-from-local-rocks/', 'http://www.theatlantic.com/magazine/archive/2012/12/the-bookstore-strikes-back/309164/?single_page=true', 'https://medium.com/the-future-of-publishing/44e778827320', 'http://www.avaaz.org/en/palestine_worlds_next_nation_a/?fp', 'http://blogs.hbr.org/schrage/2012/08/learn-more-by-asking-fewer-questions.html', 'http://www.dezeen.com/2012/11/19/peruri-88-by-mvrdvthe-jerde-partnership-and-arup-dublin/', None, 'http://www.schneier.com/blog/archives/2008/03/the_security_mi_1.html', 'http://news.yahoo.com/s/nm/20080327/tc_nm/att_workforce_dc', 'http://online.wsj.com/article/SB120640528180260969.html?mod=googlenews_wsj', 'http://arxivblog.com/?p=332', 'http://ln.hixie.c

In [7]:

# Extract domain features from URLs
def extract_domain_features(url_list):
    domains = []
    domain_lengths = []
    is_https = []
    
    for url in url_list:
        if not url or not isinstance(url, str):  # Handle missing/None URLs
            domains.append('unknown')
            domain_lengths.append(0)
            is_https.append(0)
            continue
            
        try:
            parsed = urlparse(url)
            domain = parsed.netloc
            domains.append(domain)
            domain_lengths.append(len(domain))
            is_https.append(1 if parsed.scheme == 'https' else 0)
        except:
            domains.append('unknown')
            domain_lengths.append(0)
            is_https.append(0)
    
    # Encode domains numerically
    le = LabelEncoder()
    domain_encoded = le.fit_transform(domains)
    
    return np.column_stack([
        domain_encoded,
        domain_lengths,
        is_https
    ])



In [8]:
# Get URL features
url_features = extract_domain_features(urls)
url_features = torch.tensor(url_features, dtype=torch.float32)

# -----------------------------
# 2. Build Vocabulary
# -----------------------------
from collections import Counter
all_tokens = [token for title in tokenized_titles for token in title]
vocab = [word for word, freq in Counter(all_tokens).items() if freq >= 5]
word_to_ix = {word: i for i, word in enumerate(vocab)}
vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")

Vocab size: 2301


In [9]:
print(vocab[:20])

['android', 'chrome', 'update', 'brings', 'and', 'performance', 'or', 'marketing', 'through', 'intellectual', '3d', 'printer', 'on', 'moon', 'mars', 'could', 'make', 'tools', 'from', 'local']


In [10]:

# -----------------------------
# 3. Prepare CBOW training data
# -----------------------------
window_size = 2
cbow_data = []
for title in tokenized_titles:
    indexed = [word_to_ix[word] for word in title if word in word_to_ix]
    for i in range(window_size, len(indexed) - window_size):
        context = indexed[i - window_size:i] + indexed[i + 1:i + window_size + 1]
        target = indexed[i]
        cbow_data.append((context, target))
print(f"Training CBOW pairs: {len(cbow_data)}")

Training CBOW pairs: 19077


In [11]:
# -----------------------------
# 4. CBOW Model
# -----------------------------
embedding_dim = 100
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)
    def forward(self, context_idxs):
        embeds = self.embeddings(context_idxs)
        avg_embed = embeds.mean(dim=1)
        out = self.linear(avg_embed)
        return out

cbow_model = CBOW(vocab_size, embedding_dim)
cbow_loss_fn = nn.CrossEntropyLoss()
cbow_optimizer = optim.Adam(cbow_model.parameters(), lr=0.001)


In [12]:

# -----------------------------
# 5. Train CBOW Model
# -----------------------------
print("Training CBOW model on HN titles...")
for epoch in range(5):
    total_loss = 0
    for context, target in cbow_data:
        context_var = torch.tensor([context], dtype=torch.long)
        target_var = torch.tensor([target], dtype=torch.long)
        cbow_model.zero_grad()
        logits = cbow_model(context_var)
        loss = cbow_loss_fn(logits, target_var)
        loss.backward()
        cbow_optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, CBOW Loss: {total_loss:.2f}")

Training CBOW model on HN titles...
Epoch 1, CBOW Loss: 127278.03
Epoch 2, CBOW Loss: 111313.49
Epoch 3, CBOW Loss: 104768.17
Epoch 4, CBOW Loss: 98382.85
Epoch 5, CBOW Loss: 92343.16


In [17]:
# -----------------------------
# 6. Create averaged title embeddings and combine with URL features
# -----------------------------
title_embeddings = []
valid_labels = []
valid_indices = []  # To keep track of which samples we're keeping

for idx, (tokens, label) in enumerate(zip(tokenized_titles, upvotes)):
    token_ids = [word_to_ix[t] for t in tokens if t in word_to_ix]
    if token_ids:
        with torch.no_grad():
            vectors = cbow_model.embeddings(torch.tensor(token_ids))
            avg_vector = vectors.mean(dim=0)
        title_embeddings.append(avg_vector)
        valid_labels.append(label)
        valid_indices.append(idx)


In [18]:
# Stack embeddings and get corresponding URL features
X_title = torch.stack(title_embeddings)
X_url = url_features[valid_indices]  # Only keep URL features for valid samples
y = torch.tensor(valid_labels, dtype=torch.float32).unsqueeze(1)

# Combine title embeddings and URL features
X_combined = torch.cat([X_title, X_url], dim=1)

# -----------------------------
# 7. Dataset and Dataloader
# -----------------------------
class HNDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = HNDataset(X_combined, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [19]:
# -----------------------------
# 8. Enhanced Regression Model (now with URL features)
# -----------------------------
class UpvotePredictor(nn.Module):
    def __init__(self, title_embed_dim, url_feat_dim):
        super().__init__()
        combined_dim = title_embed_dim + url_feat_dim
        self.model = nn.Sequential(
            nn.Linear(combined_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.model(x)

model = UpvotePredictor(embedding_dim, url_features.shape[1])
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [20]:
# -----------------------------
# 9. Train Enhanced Regression Model
# -----------------------------
print("Training enhanced upvote regression model...")
for epoch in range(10):
    total_loss = 0
    for batch_x, batch_y in dataloader:
        pred = model(batch_x)
        loss = loss_fn(pred, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Training enhanced upvote regression model...
Epoch 1, Loss: 530761.3510
Epoch 2, Loss: 526978.8422
Epoch 3, Loss: 524142.1476
Epoch 4, Loss: 522481.2534
Epoch 5, Loss: 522720.1001
Epoch 6, Loss: 521357.9524
Epoch 7, Loss: 521041.1453
Epoch 8, Loss: 520943.0590
Epoch 9, Loss: 517661.1490
Epoch 10, Loss: 516180.8094


In [21]:
def predict_upvotes(title, url):
    # Preprocess and tokenize the title
    tokens = preprocess(title)
    token_ids = [word_to_ix[t] for t in tokens if t in word_to_ix]
    if not token_ids:
        print("No known tokens in title.")
        return None

    with torch.no_grad():
        title_vecs = cbow_model.embeddings(torch.tensor(token_ids))
        avg_embed = title_vecs.mean(dim=0)

    # Extract URL features
    parsed = urlparse(url)
    domain = parsed.netloc or "unknown"
    domain_len = len(domain)
    https_flag = 1 if parsed.scheme == 'https' else 0
    try:
        domain_encoded = le.transform([domain])[0]
    except:
        domain_encoded = 0  # Handle unseen domain

    url_feat = torch.tensor([[domain_encoded, domain_len, https_flag]], dtype=torch.float32)

    # Combine title embedding and URL features
    x_combined = torch.cat([avg_embed.unsqueeze(0), url_feat], dim=1)

    # Predict
    model.eval()
    with torch.no_grad():
        prediction = model(x_combined).item()
    return prediction


In [55]:
# Sample prediction
title_input = "AI Hacker Cracks the Code"
url_input = "https://google.com"
predicted_score = predict_upvotes(title_input, url_input)
print(f"Predicted upvotes for \"{title_input}\" → {predicted_score:.2f}")

Predicted upvotes for "AI Hacker Cracks the Code" → 36.95
