In [1]:
import os
import re

import nltk
import psycopg2
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
# Fetch the tokenizer data that `word_tokenize` needs at runtime.
# NOTE(review): the commented-out 'punkt' call is presumably the resource name
# used by older NLTK releases — confirm before deleting it.
#nltk.download('punkt')
nltk.download('punkt_tab')

# -----------------------------
# 1. Load and preprocess Hacker News data
# -----------------------------
# The connection string is read from the HN_DATABASE_URL environment variable
# so that database credentials are never stored in the notebook file.
print("Fetching Hacker News titles...")
hn_dsn = os.environ.get("HN_DATABASE_URL")
if not hn_dsn:
    raise RuntimeError(
        "Set the HN_DATABASE_URL environment variable to the PostgreSQL "
        "connection string (postgres://user:password@host:port/dbname)."
    )

conn = psycopg2.connect(hn_dsn)
try:
    # The cursor context manager closes the cursor; the connection itself is
    # closed in `finally` so it is released even when the query fails.
    with conn.cursor() as cur:
        cur.execute(
            "SELECT title, score FROM hacker_news.items "
            "WHERE title IS NOT NULL AND score IS NOT NULL LIMIT 1000;"
        )
        # One (title, score) tuple per row, matching the SELECT column list.
        data = cur.fetchall()
finally:
    conn.close()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/Arjuna/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Fetching Hacker News titles...


In [2]:
def preprocess(text):
    """Lower-case `text`, drop every character that is neither a word
    character nor whitespace, and return the result of `word_tokenize`."""
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return word_tokenize(without_punctuation)

In [3]:
# Column 0 of every fetched row is the title, column 1 the score.
tokenized_titles = list(map(preprocess, (row[0] for row in data)))
# Scores are converted to float so they can serve as regression targets.
upvotes = list(map(float, (row[1] for row in data)))

In [4]:
# Preview only the first few tokenized titles; printing the entire list floods
# the cell output and bloats the saved notebook.
print(tokenized_titles[:5])



In [5]:
# Sanity check: number of tokenized titles (one entry per fetched row).
len(tokenized_titles)

1000

In [45]:
# -----------------------------
# 2. Build Vocabulary
# -----------------------------
from collections import Counter

# Flatten all titles into a single token stream and count occurrences.
all_tokens = [token for title in tokenized_titles for token in title]
token_counts = Counter(all_tokens)

# Only words seen at least 5 times make it into the vocabulary; indices follow
# the order in which words were first encountered.
vocab_list = [word for word in token_counts if token_counts[word] >= 5]
word_to_ix = dict(zip(vocab_list, range(len(vocab_list))))
ix_to_word = dict(enumerate(vocab_list))
vocab_size = len(vocab_list)

print(f"Vocabulary Size: {vocab_size}")



Vocabulary Size: 229


In [46]:
# ---------------------------------
# 3. Prepare Skip-gram training data
# ---------------------------------
# Every in-vocabulary word is paired with each neighbour that lies at most
# `window_size` positions away, giving (target, context) index pairs.
window_size = 2
skipgram_data = []
for title in tokenized_titles:
    # Out-of-vocabulary words are dropped before the window is applied.
    indexed_title = [word_to_ix[word] for word in title if word in word_to_ix]
    title_len = len(indexed_title)
    for center_pos in range(title_len):
        # Clip the window to the bounds of the title.
        window_lo = max(center_pos - window_size, 0)
        window_hi = min(center_pos + window_size + 1, title_len)
        skipgram_data.extend(
            (indexed_title[center_pos], indexed_title[context_pos])
            for context_pos in range(window_lo, window_hi)
            if context_pos != center_pos  # a word is never its own context
        )

print(f"Total Skip-gram training pairs: {len(skipgram_data)}")


Total Skip-gram training pairs: 8818


In [47]:
# ----------------------------
# 4. Skip-gram Model Definition
# ----------------------------
# This class defines the architecture for our Skip-gram model.
class SkipGram(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear projection
    that yields one score per vocabulary word."""

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Input side: one `embedding_dim`-sized vector per vocabulary index.
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # Output side: projects an embedding onto `vocab_size` scores.
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, target_word_idx):
        """Return the projected scores for every index in `target_word_idx`."""
        return self.linear(self.embeddings(target_word_idx))

In [48]:
# ----------------------------
# 5. Train Skip-gram Model
# ----------------------------
# Each (target, context) pair is one training example: the model receives the
# target index and is trained to score the context index highest.

# Instantiate the model, loss function, and optimizer
embedding_dim = 100
skipgram_model = SkipGram(vocab_size, embedding_dim)
# CrossEntropyLoss expects raw (unnormalised) scores, which is what the model returns.
loss_function = nn.CrossEntropyLoss()
# Deliberately NOT named `optimizer`: the regression section uses that global
# name for its own model. Sharing it means that re-running this cell after the
# regression model was built silently re-points the regression training loop
# at the skip-gram parameters, so the regressor never learns.
skipgram_optimizer = optim.Adam(skipgram_model.parameters(), lr=0.01)

# Build the index tensors once instead of allocating two tensors per pair per
# epoch. reshape(-1, 2) keeps the shape valid even when there are no pairs.
pair_tensor = torch.tensor(skipgram_data, dtype=torch.long).reshape(-1, 2)
target_batches = pair_tensor[:, 0:1]   # iterating yields shape-(1,) index tensors
context_batches = pair_tensor[:, 1:2]

print("\nTraining Skip-gram model on HN titles...")

# Training loop: one optimizer step per (target, context) pair.
for epoch in range(10): # You can adjust the number of epochs
    total_loss = 0.0
    for target_tensor, context_tensor in zip(target_batches, context_batches):
        skipgram_optimizer.zero_grad()

        # Forward pass: one score per vocabulary word for this target.
        logits = skipgram_model(target_tensor)
        loss = loss_function(logits, context_tensor)

        # Backward pass and parameter update.
        loss.backward()
        skipgram_optimizer.step()

        total_loss += loss.item()

    # Sum of the per-pair losses over the epoch.
    print(f"Epoch {epoch+1}, Skip-gram Loss: {total_loss:.2f}")


Training Skip-gram model on HN titles...
Epoch 1, Skip-gram Loss: 60923.61
Epoch 2, Skip-gram Loss: 51049.32
Epoch 3, Skip-gram Loss: 48225.10
Epoch 4, Skip-gram Loss: 47859.97
Epoch 5, Skip-gram Loss: 47769.68
Epoch 6, Skip-gram Loss: 47583.38
Epoch 7, Skip-gram Loss: 47623.76
Epoch 8, Skip-gram Loss: 47663.39
Epoch 9, Skip-gram Loss: 47510.43
Epoch 10, Skip-gram Loss: 47656.68


In [49]:
# ---------------------------------
# 5b. Test the Skip-gram Embeddings
# ---------------------------------
# Inspect the learned embeddings by listing, for a few probe words, the
# vocabulary words whose embedding vectors have the highest cosine similarity.
import torch.nn.functional as F  # provides cosine_similarity, used by get_similar_words

print("\n--- Testing Skip-gram Embeddings ---")

def get_similar_words(target_word, top_n=5):
    """
    Print the vocabulary words whose embeddings are closest to `target_word`.

    Closeness is the cosine similarity between rows of the trained
    `skipgram_model.embeddings` weight matrix.

    Args:
        target_word: Word to look up; must be a key of `word_to_ix`.
        top_n: Maximum number of neighbours to print. It is clamped to the
            number of *other* words in the vocabulary, so asking for more
            neighbours than exist no longer raises.

    Returns:
        None. The neighbours (or a not-in-vocabulary notice) are printed.
    """
    if target_word not in word_to_ix:
        print(f"'{target_word}' is not in the vocabulary.")
        return

    # detach() gives a view of the embedding matrix outside the autograd graph.
    word_embeddings = skipgram_model.embeddings.weight.detach()
    target_ix = word_to_ix[target_word]
    target_vec = word_embeddings[target_ix].unsqueeze(0)

    # One similarity per vocabulary word (the single target row broadcasts).
    similarities = F.cosine_similarity(target_vec, word_embeddings)

    # Exclude the target explicitly rather than assuming it ranks first: ties
    # (duplicate vectors) or an all-zero target vector would break that assumption.
    ranking_scores = similarities.clone()
    ranking_scores[target_ix] = float("-inf")
    k = max(0, min(top_n, ranking_scores.numel() - 1))

    print(f"Words most similar to '{target_word}':")
    if k == 0:
        return
    for neighbour_ix in torch.topk(ranking_scores, k).indices.tolist():
        similarity_score = similarities[neighbour_ix].item()
        print(f"  - {ix_to_word[neighbour_ix]} (Similarity: {similarity_score:.4f})")

# --- Example Tests ---
# Probe a handful of words that are likely to be in the vocabulary.
# What to expect: For 'google', you might see 'apple', 'facebook', 'microsoft', or 'startup'.
# For 'security', you might see 'hacked', 'data', 'privacy', 'vulnerability'.
for probe_position, probe_word in enumerate(('google', 'security', 'app')):
    if probe_position:
        # Separator between consecutive result lists.
        print("-" * 20)
    get_similar_words(probe_word)


--- Testing Skip-gram Embeddings ---
Words most similar to 'google':
  - things (Similarity: 0.3077)
  - found (Similarity: 0.3059)
  - you (Similarity: 0.2930)
  - us (Similarity: 0.2859)
  - code (Similarity: 0.2629)
--------------------
Words most similar to 'security':
  - open (Similarity: 0.3283)
  - content (Similarity: 0.3085)
  - its (Similarity: 0.3016)
  - web (Similarity: 0.2948)
  - internet (Similarity: 0.2847)
--------------------
Words most similar to 'app':
  - apple (Similarity: 0.5491)
  - with (Similarity: 0.5017)
  - new (Similarity: 0.4172)
  - now (Similarity: 0.3980)
  - for (Similarity: 0.3746)


In [50]:
# -----------------------------
# 6. Create averaged title embeddings using trained Skip-gram
# -----------------------------
title_embeddings, valid_labels = [], []

# Pure inference: a single no_grad scope covers the whole pass.
with torch.no_grad():
    for tokens, label in zip(tokenized_titles, upvotes):
        token_ids = [ix for ix in map(word_to_ix.get, tokens) if ix is not None]
        if not token_ids:
            # Titles without any in-vocabulary word have no embedding; skip them
            # together with their label so X and y stay aligned.
            continue
        token_vectors = skipgram_model.embeddings(torch.tensor(token_ids))
        title_embeddings.append(token_vectors.mean(dim=0))
        valid_labels.append(label)

# X: one averaged embedding per kept title; y: matching upvotes as a column.
X = torch.stack(title_embeddings)
y = torch.tensor(valid_labels, dtype=torch.float32).unsqueeze(1)

In [51]:

# -----------------------------
# 7. Dataset and Dataloader
# -----------------------------
class HNTitleDataset(Dataset):
    """Pairs each row of `X` with the entry of `y` at the same index."""

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from `X`."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the `(features, label)` pair stored at `idx`."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

# Wrap the tensors and iterate over them in shuffled mini-batches of 32.
dataset = HNTitleDataset(X=X, y=y)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)


In [20]:
# -----------------------------
# 8. Regression Model
# -----------------------------
class UpvotePredictor(nn.Module):
    """Feed-forward regressor: input_dim -> 128 -> 64 -> 1 with ReLU between
    the linear layers."""

    def __init__(self, input_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the layer stack and return its output."""
        prediction = self.model(x)
        return prediction

# Regressor over the title embeddings, trained with mean-squared error and Adam.
model = UpvotePredictor(input_dim=embedding_dim)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [52]:
# -----------------------------
# 9. Train Regression Model
# -----------------------------
# Guard against stale kernel state. `optimizer` is a plain global, so running
# cells out of order can leave it bound to a different model's parameters; the
# loop below would then run without ever updating `model`, and the loss would
# only fluctuate with batch composition instead of decreasing.
model_param_ids = {id(param) for param in model.parameters()}
optimizer_param_ids = {
    id(param) for group in optimizer.param_groups for param in group["params"]
}
if not model_param_ids <= optimizer_param_ids:
    raise RuntimeError(
        "`optimizer` is not bound to `model`'s parameters; re-run the cell "
        "that builds the regression model and its optimizer before training."
    )

print("Training upvote regression model...")
model.train()
for epoch in range(10):
    total_loss = 0.0
    for batch_x, batch_y in dataloader:
        # Clear gradients left over from the previous step before the new pass.
        optimizer.zero_grad()
        pred = model(batch_x)
        loss = loss_fn(pred, batch_y)
        loss.backward()
        optimizer.step()
        # Sum of the per-batch mean losses over the epoch.
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Training upvote regression model...
Epoch 1, Loss: 939909.7212
Epoch 2, Loss: 1053218.1377
Epoch 3, Loss: 995011.5347
Epoch 4, Loss: 982420.6025
Epoch 5, Loss: 1079733.1826
Epoch 6, Loss: 968749.1401
Epoch 7, Loss: 1009177.9487
Epoch 8, Loss: 939447.0181
Epoch 9, Loss: 964039.8613
Epoch 10, Loss: 1115826.7578
