In [1]:
import os
import re
from collections import Counter
from urllib.parse import urlparse

import nltk
import numpy as np
import psycopg2
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

# Download the NLTK 'punkt' data package (NLTK reports it as up-to-date when it is already cached).
# NOTE(review): presumably required by word_tokenize -- confirm for the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/usa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# -----------------------------
# 1. Load and preprocess Hacker News data with URLs
# -----------------------------
print("Fetching Hacker News data...")
# The connection string (user, password, host, database) is read from the
# environment so that credentials are never stored in the notebook itself.
# Set it before starting the kernel, e.g.:
#   export HN_DATABASE_URL="postgres://<user>:<password>@<host>:5432/<database>"
# A missing variable raises KeyError here, before any connection is attempted.
conn = psycopg2.connect(os.environ["HN_DATABASE_URL"])
try:
    cur = conn.cursor()
    cur.execute("SELECT title, url, score FROM hacker_news.items WHERE title IS NOT NULL AND score IS NOT NULL LIMIT 10000;")
    # Each row is a (title, url, score) tuple; the query does not filter url,
    # so it can be None.
    data = cur.fetchall()
finally:
    # Release the connection even when the query or fetch fails.
    conn.close()


Fetching Hacker News data...


In [3]:

def preprocess(text):
    """Lower-case ``text``, strip punctuation, and split it into tokens.

    Every character that is neither a regex word character nor whitespace is
    removed before the cleaned string is passed to ``word_tokenize``.

    Args:
        text: The raw string to normalise.

    Returns:
        Whatever ``word_tokenize`` returns for the cleaned, lower-cased text.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    tokens = word_tokenize(without_punctuation)
    return tokens

# Split the fetched (title, url, score) rows into three parallel lists
# that share the same ordering as `data`.
tokenized_titles = list(map(preprocess, (row[0] for row in data)))
upvotes = [float(row[2]) for row in data]
urls = [row[1] for row in data]


In [7]:
# Sanity check: show the first 20 target values (scores converted to float).
print(upvotes[:20])

[144.0, 1.0, 3.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 2.0, 5.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 1.0, 180.0]


In [8]:
# Sanity check: show the first 20 tokenised titles produced by preprocess().
print(tokenized_titles[:20])

[['breaking', 'textbook', 'rsa', 'used', 'to', 'protect', 'the', 'privacy', 'of', 'millions', 'of', 'users'], ['evernote', 'is', 'in', 'deep', 'trouble'], ['found', 'a', '90s', 'chatroom', 'for', 'horses', 'on', 'a', 'bunch', 'of', 'old', 'floppy', 'disks'], ['theorytheory'], ['ask', 'hn', 'what', 'are', 'the', 'best', 'book', 'on', 'startup'], ['is', 'there', 'a', 'me', 'inside', 'of', 'here', 'that', 'is', 'dying', 'to', 'get', 'out'], ['political', 'correctness', 'is', 'stifling', 'innovation'], ['semisupervised', 'image', 'classification', 'explained'], ['earthday2016whattodoandwear'], ['chinas', 'next', 'target', 'us', 'microchip', 'hegemony'], ['graphql', 'at', 'braintree'], ['this', 'is', 'americas', 'richest', 'zip', 'code'], ['angular', '6', 'universal', 'pwa', 'installable', 'from', 'npm'], ['goes', 'satellite', 'view', 'of', 'eclipse'], ['shaaaaaaaaaaaaa'], ['here', 'be', 'dragons', 'the', 'mythic', 'bite', 'of', 'the', 'komodo', 'science', 'sushi'], ['how', 'the', 'mig31', 

In [5]:
# Sanity check: show the first 20 raw URLs (entries may be None).
print(urls[:20])

['https://arxiv.org/abs/1802.03367', 'http://businessinsider.com/evernote-is-in-deep-trouble-2015-10', 'http://www.deathandtaxesmag.com/265340/horsechat-horsechat-horsechat-horsechat/', 'https://en.wikipedia.org/wiki/Theory-theory', None, 'http://uber.la/2016/10/inside-dying-get/', 'http://www.breitbart.com/california/2017/12/19/silicon-valley-political-correctness-stifling-innovation/', 'https://thecuriousaicompany.com/mean-teacher/', 'http://claudettepesterine.com/2016/04/earth-day-2016-wear/?utm_campaign=shareaholic&utm_medium=yc_hacker_news&utm_source=news', 'https://www.wsj.com/articles/chinas-next-target-u-s-microchip-hegemony-1501168303', 'https://www.braintreepayments.com/blog/graphql-at-braintree/', 'https://www.bloomberg.com/news/articles/2018-04-10/to-visit-america-s-richest-zip-code-first-you-ll-need-a-boat', None, 'http://www.goes.noaa.gov/GSSLOOPS/wcvs.html', 'https://shaaaaaaaaaaaaa.com/', 'http://blogs.discovermagazine.com/science-sushi/2013/06/25/here-be-dragons-the-my

In [6]:

# Extract domain features from URLs
def extract_domain_features(url_list):
    """Turn a sequence of URLs into a numeric feature matrix.

    Args:
        url_list: Iterable of URL values. Entries that are falsy (``None``,
            empty string), not ``str`` instances, or rejected by ``urlparse``
            are recorded under the placeholder domain ``'unknown'``.

    Returns:
        A NumPy array with one row per entry of ``url_list`` and three columns:

        0. integer code of the URL's ``netloc``; codes are positions in the
           sorted set of distinct domains seen in this call,
        1. length of the ``netloc`` string (0 for ``'unknown'`` rows),
        2. 1 when the URL scheme is exactly ``https``, otherwise 0.
    """
    domains = []
    domain_lengths = []
    is_https = []

    def add_row(domain, length, https_flag):
        # Keep the three parallel feature lists in lock-step.
        domains.append(domain)
        domain_lengths.append(length)
        is_https.append(https_flag)

    for url in url_list:
        if not url or not isinstance(url, str):  # Handle missing/None URLs
            add_row('unknown', 0, 0)
            continue

        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse signals malformed input (e.g. an unbalanced IPv6
            # bracket) with ValueError. Catching only that keeps
            # KeyboardInterrupt and genuine bugs visible, which the previous
            # bare `except:` would have hidden.
            add_row('unknown', 0, 0)
            continue

        domain = parsed.netloc
        add_row(domain, len(domain), 1 if parsed.scheme == 'https' else 0)

    # Encode domains numerically: np.unique sorts the distinct domains and
    # return_inverse gives, for each row, the index of its domain in that
    # sorted list.
    _, domain_encoded = np.unique(domains, return_inverse=True)

    return np.column_stack([
        domain_encoded,
        domain_lengths,
        is_https
    ])



In [9]:
# Build the per-row URL feature matrix and convert it straight to a float32 tensor.
url_features = torch.tensor(extract_domain_features(urls), dtype=torch.float32)

# -----------------------------
# 2. Build Vocabulary
# -----------------------------
# Minimum number of occurrences a token needs in order to get a vocabulary entry.
min_token_freq = 5

# Flatten every tokenised title into one token stream and count occurrences.
# (Counter is imported in the notebook's import cell.)
all_tokens = [token for title in tokenized_titles for token in title]
token_counts = Counter(all_tokens)
# Counter iterates in first-seen order, so vocabulary indices follow the order
# in which tokens first appear in the titles.
vocab = [word for word, freq in token_counts.items() if freq >= min_token_freq]
word_to_ix = {word: i for i, word in enumerate(vocab)}
vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")

Vocab size: 2489


In [10]:
# Sanity check: show the first 20 vocabulary entries.
print(vocab[:20])

['breaking', 'used', 'to', 'protect', 'the', 'privacy', 'of', 'millions', 'users', 'is', 'in', 'deep', 'trouble', 'found', 'a', 'for', 'on', 'old', 'ask', 'hn']


In [11]:

# -----------------------------
# 3. Prepare CBOW training data
# -----------------------------
window_size = 2  # number of context words taken on each side of the target
cbow_data = []
for title_tokens in tokenized_titles:
    # Out-of-vocabulary tokens are dropped before the windows are formed.
    ids = [word_to_ix[tok] for tok in title_tokens if tok in word_to_ix]
    # Only positions with a full window on both sides become targets.
    stop = len(ids) - window_size
    for center in range(window_size, stop):
        left = ids[center - window_size:center]
        right = ids[center + 1:center + 1 + window_size]
        cbow_data.append((left + right, ids[center]))
print(f"Training CBOW pairs: {len(cbow_data)}")

Training CBOW pairs: 24450


In [12]:
# -----------------------------
# 4. CBOW Model
# -----------------------------
embedding_dim = 100


class CBOW(nn.Module):
    """Continuous bag-of-words model.

    The embeddings of the context tokens are averaged and projected by a
    linear layer to one logit per vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # One embedding_dim-sized vector per vocabulary index.
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # Projects the pooled context vector to vocabulary-sized logits.
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, context_idxs):
        """Return vocabulary logits for a batch of context index rows.

        The embedded input is averaged over dimension 1, so ``context_idxs``
        is expected to carry the context positions on that dimension.
        """
        pooled = self.embeddings(context_idxs).mean(dim=1)
        return self.linear(pooled)

# Seed PyTorch's global RNG before any weights are created so that parameter
# initialisation (and later draws from the same global generator) is
# repeatable across "Restart Kernel -> Run All".
torch.manual_seed(42)

# CBOW network, its classification loss, and the optimizer that updates it.
cbow_model = CBOW(vocab_size, embedding_dim)
cbow_loss_fn = nn.CrossEntropyLoss()
cbow_optimizer = optim.Adam(cbow_model.parameters(), lr=0.001)


In [15]:

# -----------------------------
# 5. Train CBOW Model
# -----------------------------
print("Training CBOW model on HN titles...")
# Convert the (context, target) pairs to tensors once, instead of rebuilding
# two tiny tensors for every sample in every epoch.
cbow_contexts = torch.tensor([context for context, _ in cbow_data], dtype=torch.long)
cbow_targets = torch.tensor([target for _, target in cbow_data], dtype=torch.long)

for epoch in range(5):
    total_loss = 0
    # One optimizer step per training pair, in the original pair order.
    for i in range(len(cbow_data)):
        # Slicing (rather than indexing) keeps the leading batch dimension of 1.
        context_var = cbow_contexts[i:i + 1]
        target_var = cbow_targets[i:i + 1]
        cbow_model.zero_grad()
        logits = cbow_model(context_var)
        loss = cbow_loss_fn(logits, target_var)
        loss.backward()
        cbow_optimizer.step()
        total_loss += loss.item()
    # The reported figure is the sum of the per-sample losses over the epoch.
    print(f"Epoch {epoch+1}, CBOW Loss: {total_loss:.2f}")

Training CBOW model on HN titles...
Epoch 1, CBOW Loss: 163973.86
Epoch 2, CBOW Loss: 147424.31
Epoch 3, CBOW Loss: 139970.70
Epoch 4, CBOW Loss: 132397.04
Epoch 5, CBOW Loss: 126074.64


In [16]:
# -----------------------------
# 6. Create averaged title embeddings and combine with URL features
# -----------------------------
title_embeddings = []
valid_labels = []
valid_indices = []  # row positions of the titles that were kept

# Embedding lookups here are for feature extraction only, so gradient
# tracking is switched off for the whole pass.
with torch.no_grad():
    for row, (title_tokens, score) in enumerate(zip(tokenized_titles, upvotes)):
        ids = [word_to_ix[tok] for tok in title_tokens if tok in word_to_ix]
        if not ids:
            # Titles without any in-vocabulary token are skipped entirely.
            continue
        mean_vector = cbow_model.embeddings(torch.tensor(ids)).mean(dim=0)
        title_embeddings.append(mean_vector)
        valid_labels.append(score)
        valid_indices.append(row)


In [17]:
# Assemble the model inputs: one row per title that produced an embedding.
X_title = torch.stack(title_embeddings, dim=0)
# valid_indices picks the URL-feature rows that belong to those same titles.
X_url = url_features[valid_indices]
# Targets as a column vector (one value per row).
y = torch.tensor(valid_labels, dtype=torch.float32)[:, None]

# Title-embedding columns first, URL-feature columns after them.
X_combined = torch.cat((X_title, X_url), dim=1)

# -----------------------------
# 7. Dataset and Dataloader
# -----------------------------
class HNDataset(Dataset):
    """Dataset that pairs each entry of ``X`` with the entry of ``y`` at the same index."""

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # The sample count is taken from the features container.
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.y[idx]
        return features, target

# Wrap the combined features and targets, then serve them in shuffled mini-batches of 32.
dataset = HNDataset(X=X_combined, y=y)
dataloader = DataLoader(dataset, shuffle=True, batch_size=32)


In [18]:
# -----------------------------
# 8. Enhanced Regression Model (now with URL features)
# -----------------------------
class UpvotePredictor(nn.Module):
    """Feed-forward regressor over concatenated title-embedding and URL features.

    Three hidden layers (256, 128, 64 units), each followed by ReLU, and a
    final linear layer producing a single output value.
    """

    def __init__(self, title_embed_dim, url_feat_dim):
        super().__init__()
        # Input width followed by the hidden-layer widths.
        widths = [title_embed_dim + url_feat_dim, 256, 128, 64]
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Single-value regression head.
        layers.append(nn.Linear(widths[-1], 1))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return its output."""
        output = self.model(x)
        return output

# Regressor sized from the embedding width and the number of URL feature columns,
# trained with mean-squared error and Adam.
model = UpvotePredictor(title_embed_dim=embedding_dim, url_feat_dim=url_features.shape[1])
loss_fn = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)


In [19]:
# -----------------------------
# 9. Train Enhanced Regression Model
# -----------------------------
print("Training enhanced upvote regression model...")
for epoch in range(10):
    total_loss = 0
    for features, targets in dataloader:
        # Clear gradients left over from the previous step before computing new ones.
        optimizer.zero_grad()
        batch_loss = loss_fn(model(features), targets)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()
    # The printed value is the sum of the per-batch losses for the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Training enhanced upvote regression model...
Epoch 1, Loss: 942215.6909
Epoch 2, Loss: 935179.0526
Epoch 3, Loss: 934054.3995
Epoch 4, Loss: 930922.0938
Epoch 5, Loss: 929337.5126
Epoch 6, Loss: 929907.0703
Epoch 7, Loss: 927330.7702
Epoch 8, Loss: 924687.2913
Epoch 9, Loss: 925493.6873
Epoch 10, Loss: 927113.7655
