In [3]:
import torch
import torch.nn as nn
import torch
import torch.optim as optim
import sentencepiece as spm
import psycopg2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [19]:
# Step 1: Connect to the PostgreSQL database and fetch the data
conn = psycopg2.connect('postgres://arcanum:nz2TBHLHl8VSBTSxznk@pg.mlx.institute:5433/arcanum')

# Open a cursor to perform database operations
cur = conn.cursor()

In [36]:
def fetch_data(conn):
    cur = conn.cursor()
    try:
        conn.rollback()  # Rollback the previous transaction
        
        cur.execute("SELECT title, score FROM hackernews.items")
        rows = cur.fetchall()
    except Exception as e:
        print("Failed to fetch data from database", e)
        rows = []
    finally:
        cur.close()  # Close the cursor
    return rows

data = fetch_data(conn)

In [44]:
# Step 2: Preprocess the data: Tokenize the titles using SentencePiece
titles = [data[0] for row in data]
upvotes = [data[1] for row in data]

In [48]:
# Write titles to a file as SentencePiece requires file input
with open('titles.txt', 'w') as f:
    for title in titles:
        f.write(str(title) + '\n')

In [51]:
spm.SentencePieceTrainer.Train('--input=titles.txt --model_prefix=m --vocab_size=12')
sp = spm.SentencePieceProcessor()
sp.Load("m.model")

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=titles.txt --model_prefix=m --vocab_size=12
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: titles.txt
  input_format: 
  model_prefix: m
  model_type: UNIGRAM
  vocab_size: 12
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s

True

In [53]:
tokenized_titles = [sp.EncodeAsIds(str(title)) for title in titles]

In [93]:
# Step 3: Implement the Word2Vec model (Skip-gram)
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(SkipGramModel, self).__init__()
        self.embed_v = nn.Embedding(vocab_size, embed_size)
        self.embed_u = nn.Embedding(vocab_size, embed_size)

    def forward(self, center_words, target_words, outer_words):
        center_embeds = self.embed_v(center_words).unsqueeze(2)  # Get embeddings for center word
        target_embeds = self.embed_u(target_words).unsqueeze(1)  # Get embeddings for target word
        outer_embeds = self.embed_u(outer_words).unsqueeze(2)  # Get embeddings for outer words

        scores = target_embeds.bmm(center_embeds).squeeze()  # Calculate score for target words
        neg_scores = outer_embeds.bmm(center_embeds).squeeze().neg()  # Calculate score for negative words

        return scores, neg_scores


In [94]:
def generate_pairs_and_neg_samples(title):
    center_words = []
    target_words = []
    outer_words = []

    # Convert lists to Tensors
    center_words = torch.tensor(center_words)
    target_words = torch.tensor(target_words)
    outer_words = torch.tensor(outer_words)
    return center_words, target_words, outer_words

In [95]:
def generate_pairs_and_neg_samples(title, window_size=2, num_neg_samples=5):
    center_words = []
    target_words = []
    outer_words = []

    for i in range(len(title)):
        for j in range(i - window_size, i + window_size + 1):
            if j != i and j >= 0 and j < len(title):
                center_words.append(title[i])
                target_words.append(title[j])
                outer_words.extend(np.random.choice(len(sp), num_neg_samples))  # Randomly choose negative samples

    return center_words, target_words, outer_words

In [98]:
# Step 4: Train the Word2Vec model on the tokenized titles
vocab_size = len(sp)  # Vocabulary size
embed_size = 100  # Size of embedding vector
learning_rate = 0.001  # Learning rate for the optimizer
epochs = 5  # Number of epochs to train

# Initialize model and optimizer
model = SkipGramModel(vocab_size, embed_size)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Define loss function
def neg_log_likelihood(scores, neg_scores):
    return -torch.mean(torch.log(torch.sigmoid(scores))) - torch.mean(torch.log(torch.sigmoid(neg_scores)))

# Training loop
for epoch in range(epochs):
    total_loss = 0
    for title in tokenized_titles:
        # Prepare data in the format expected by the model
        # This involves generating pairs of centre words, context words, and negative samples
        center_words, target_words, outer_words = generate_pairs_and_neg_samples(title)

        
        # Convert lists to Tensors of type Long
        center_words = torch.tensor(center_words, dtype=torch.long)
        target_words = torch.tensor(target_words, dtype=torch.long)
        outer_words = torch.tensor(outer_words, dtype=torch.long)


        # Get embeddings for outer words
        outer_embeds = model.embed_u(outer_words).unsqueeze(2)

        # Adjust the dimensions of outer_embeds to match center_embeds
        outer_embeds = outer_embeds.view(center_embeds.size(0), -1, center_embeds.size(1))

        # Calculate score for negative words
        neg_scores = outer_embeds.bmm(center_embeds).squeeze().neg()

        # Forward pass
        scores, neg_scores = model(center_words, target_words, outer_words)
        


        # Calculate loss
        loss = neg_log_likelihood(scores, neg_scores)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(tokenized_titles)}')


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [76]:
# Step 5: Use the trained Word2Vec model to convert titles into vectors
title_vectors = []

for title in tokenized_titles:
    vector = torch.zeros(embed_size)
    for word in title:
        vector += model.embed_v(torch.tensor(word))
    title_vectors.append(vector / len(title)) 

[tensor([-9.6218e-02,  1.4108e+00, -2.1825e-01,  4.7657e-01, -1.6733e-01,
          4.8520e-01,  7.8704e-01,  7.4554e-02,  8.2022e-01, -1.0279e-01,
         -8.4406e-01,  3.9515e-01,  1.3748e+00, -1.6056e-01, -2.4219e-01,
         -1.0982e-01,  2.9845e-01, -4.3951e-01,  7.4778e-01,  1.9043e-01,
         -5.8463e-01,  6.1667e-01, -2.4364e-01, -4.8500e-01,  1.2618e-01,
         -8.6228e-02,  5.6947e-01,  1.9578e-01, -3.9204e-01,  5.5917e-01,
         -1.3669e-01, -2.4886e-01,  1.5489e-01, -4.1200e-02, -2.0100e-02,
         -2.6347e-03,  1.0666e-01,  4.6636e-02, -1.3855e+00,  1.2896e-01,
         -1.3029e-01, -2.5050e-01,  6.8087e-01, -8.2419e-01,  3.7983e-01,
          3.7711e-01, -4.0287e-01, -4.7058e-01,  1.0798e+00,  3.4207e-01,
         -3.7724e-01, -8.6643e-01,  7.4403e-01, -3.5332e-01,  1.1544e+00,
          7.8331e-02,  2.7582e-01, -4.2398e-01,  9.3174e-01,  4.1126e-01,
          9.6364e-03, -7.9264e-01,  3.8383e-01,  1.2901e-01, -9.1352e-02,
          5.0847e-02,  3.9277e-01, -9.

In [78]:

# Step 6: Implement a regression model
reg = LinearRegression()

# Step 7: Train the regression model on the vectors and corresponding upvote scores
X_train, X_test, y_train, y_test = train_test_split(title_vectors, upvotes, test_size=0.2, random_state=42)
reg.fit(X_train, y_train)

# Step 8: Evaluate the model and make predictions
y_pred = reg.predict(X_test)

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.