In [2]:
# Mount Google Drive at /content/drive; the dataset CSV and the GloVe file read
# later in this notebook both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [8]:
import pandas as pd

# Load the cleaned dataset from Drive.
df = pd.read_csv('/content/drive/MyDrive/cleaned_combined_dataset_with_sentiment.csv')

# Extract the titles as plain Python strings, one per dataframe row.
# Missing values are read as NaN, and astype(str) alone would turn each of them
# into the literal word "nan", which would then be counted as a real vocabulary
# token. Replacing them with '' first makes such rows tokenize to nothing while
# keeping `titles` aligned 1:1 with `df`.
titles = df['clean_title'].fillna('').astype(str).tolist()


In [11]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from collections import Counter

In [18]:
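# Prerequisites that must already exist in the kernel before this cell runs:
# titles = list of cleaned title strings
# word_to_idx = vocabulary dictionary (it is built by the vocabulary cell further down in this notebook)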

# Step 2: Convert titles to list of indices (with unk handling)
def text_to_indices(title, word_to_idx, unk_idx=1):
    """Translate a title into a list of vocabulary indices.

    The title is split on whitespace; each token is looked up in
    ``word_to_idx`` and tokens that are not present map to ``unk_idx``.
    """
    indices = []
    for token in title.split():
        indices.append(word_to_idx.get(token, unk_idx))
    return indices

# NOTE(review): `word_to_idx` is not assigned anywhere above this line in the
# notebook, so this statement relies on the vocabulary having been built already.
title_indices = list(map(lambda title: text_to_indices(title, word_to_idx), titles))

# Step 3: every index sequence is padded or cut to this many positions
max_len = 20

def pad_sequence(seq, max_len, pad_idx=0):
    """Return ``seq`` forced to exactly ``max_len`` items.

    Sequences that are already long enough are cut to their first ``max_len``
    items; shorter ones get ``pad_idx`` appended on the right.
    """
    if len(seq) >= max_len:
        return seq[:max_len]
    padding = [pad_idx] * (max_len - len(seq))
    return seq + padding

# Bring every index sequence to exactly max_len entries.
title_padded = list(map(lambda seq: pad_sequence(seq, max_len), title_indices))

# Step 4: pack the equal-length rows into a single int64 tensor
import torch
title_tensors = torch.tensor(title_padded, dtype=torch.long)

# Eyeball checks: the shape should be (num_samples, max_len) and the extreme
# index values should fall inside the vocabulary range.
print(title_tensors.shape)
smallest_index, largest_index = title_tensors.min(), title_tensors.max()
print(smallest_index, largest_index)


torch.Size([971806, 20])
tensor(0) tensor(164400)


In [19]:
# Minimum number of occurrences a word needs to be kept in the vocabulary.
min_freq = 1

# Count every whitespace-separated token across all titles.
counter = Counter(
    word
    for words in (title.split() for title in titles)
    for word in words
)

# Index 0 is reserved for padding and index 1 for unknown words; the remaining
# words follow in the order the counter first saw them.
vocab = ['<pad>', '<unk>']
vocab.extend(word for word, freq in counter.items() if freq >= min_freq)

word_to_idx = dict(zip(vocab, range(len(vocab))))
print("Vocab size:", len(vocab))



Vocab size: 164401


In [20]:
def text_to_indices(title, word_to_idx, unk_idx=1):
    """Convert a title string to vocabulary indices.

    Splits ``title`` on whitespace and looks each token up in ``word_to_idx``;
    a token without an entry is represented by ``unk_idx``.
    """
    return list(map(lambda token: word_to_idx.get(token, unk_idx), title.split()))

# One list of vocabulary indices per title, in the same order as `titles`.
title_indices = list(map(lambda title: text_to_indices(title, word_to_idx), titles))


In [21]:
def pad_sequence(seq, max_len=20, pad_idx=0):
    """Pad or truncate ``seq`` so that it has exactly ``max_len`` items.

    Short sequences are extended on the right with ``pad_idx``; sequences of
    ``max_len`` items or more are cut down to their first ``max_len`` items.
    """
    missing = max_len - len(seq)
    return (seq + [pad_idx] * missing) if missing > 0 else seq[:max_len]

# Apply pad_sequence with its default length and pad index to every title.
title_padded = list(map(pad_sequence, title_indices))


In [22]:
import torch
# Stack the padded rows into a single int64 tensor.
title_tensors = torch.tensor(title_padded, dtype=torch.long)
print(title_tensors.shape)

# The largest index should be <= len(vocab)-1
smallest_index, largest_index = title_tensors.min(), title_tensors.max()
print(smallest_index, largest_index)


torch.Size([971806, 20])
tensor(0) tensor(164400)


In [25]:
def load_glove(path, dim, vocab=None):
    """Read a GloVe-style text file into a ``{token: vector}`` dictionary.

    Each line is expected to hold a token followed by ``dim`` space-separated
    numbers. The line is split from the right, so the last ``dim`` fields are
    always taken as the vector and everything before them as the token. This
    keeps entries whose token itself contains spaces, which a plain
    ``line.split()`` would misread as having too many fields and drop.

    Args:
        path: location of the UTF-8 encoded embeddings file.
        dim: number of vector components per line.
        vocab: optional container of tokens (anything supporting ``in``, e.g.
            a set or a word->index dict). When given, only tokens found in it
            are stored, which keeps memory proportional to the vocabulary
            instead of the whole embeddings file. ``None`` keeps every token.

    Returns:
        dict mapping each kept token to a float32 NumPy array of length ``dim``.
        Lines with too few fields or with non-numeric components are skipped.
    """
    embeddings_index = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Split at most `dim` times from the right: vector fields are peeled
            # off the end, whatever remains at the front is the token.
            values = line.rstrip().rsplit(' ', dim)
            if len(values) != dim + 1:
                continue  # Skip malformed lines
            word = values[0]
            if vocab is not None and word not in vocab:
                continue  # Not needed by the caller; avoid storing the vector
            try:
                vector = np.asarray(values[1:], dtype='float32')
            except ValueError:
                continue  # A component is not a number: treat the line as malformed
            embeddings_index[word] = vector
    return embeddings_index


# Width of each vector; it has to agree with the embeddings file named below.
embedding_dim = 300
glove_path = '/content/drive/MyDrive/glove.840B.300d.txt'

# Parse the embeddings file into a {word: vector} dictionary.
glove = load_glove(path=glove_path, dim=embedding_dim)


In [26]:
vocab_size = len(word_to_idx)  # one embedding row per vocabulary entry

# Words without a GloVe vector keep a small random vector. Draw those from a
# dedicated, seeded generator so the matrix is identical on every run.
EMBEDDING_SEED = 42
embedding_rng = np.random.default_rng(EMBEDDING_SEED)

# Uniform values in [-0.05, 0.05), generated directly as float32 (the dtype of
# the final tensor) rather than as float64.
embedding_matrix_np = (
    embedding_rng.random((vocab_size, embedding_dim), dtype=np.float32) - 0.5
) * 0.1

# Overwrite the random row with the GloVe vector wherever one is available.
for word, idx in word_to_idx.items():
    vector = glove.get(word)
    if vector is not None:
        embedding_matrix_np[idx] = vector

# The padding token carries no content, so give it an all-zero vector instead
# of random noise.
pad_row = word_to_idx.get('<pad>')
if pad_row is not None:
    embedding_matrix_np[pad_row] = 0.0

# Convert to a PyTorch tensor (this copies, so the NumPy array stays independent).
embedding_matrix = torch.tensor(embedding_matrix_np, dtype=torch.float32)

In [27]:
import torch.nn as nn

class BiLSTMTextEncoder(nn.Module):
    """Encode padded token-index sequences into one fixed-size vector each.

    Steps: frozen pretrained embedding lookup -> bidirectional LSTM -> mean of
    the LSTM outputs over the non-padding timesteps.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector (the LSTM input size).
        hidden_dim: LSTM hidden size per direction; the output width is
            ``hidden_dim * 2``.
        pretrained_embeddings: tensor of shape (vocab_size, embedding_dim)
            whose values are copied into the embedding table.
        pad_idx: token index that marks padding. Positions holding it are left
            out of the average, so a short title is not diluted by the outputs
            produced for its padding. Pass ``None`` to average over every
            timestep, padding included (the previous behaviour).

    Note:
        Only the pooling ignores padding. The LSTM itself still runs over the
        padded positions, so the backward direction sees pad embeddings before
        it reaches the real tokens.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, pretrained_embeddings, pad_idx=0):
        super(BiLSTMTextEncoder, self).__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(pretrained_embeddings)
        self.embedding.weight.requires_grad = False  # Keep the pretrained vectors frozen
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)

    def forward(self, x):
        """Return a [batch_size, hidden_dim*2] tensor for index tensor ``x`` of shape [batch_size, seq_len]."""
        embedded = self.embedding(x)            # [batch_size, seq_len, embedding_dim]
        lstm_out, _ = self.bilstm(embedded)     # [batch_size, seq_len, hidden_dim*2]
        if self.pad_idx is None:
            return torch.mean(lstm_out, dim=1)  # Plain average over seq_len

        # 1.0 where a real token sits, 0.0 on padding: [batch_size, seq_len, 1]
        mask = (x != self.pad_idx).unsqueeze(-1).to(lstm_out.dtype)
        # Number of real tokens per row; clamped so an all-padding row divides
        # by 1 and comes out as a zero vector instead of NaN.
        token_counts = mask.sum(dim=1).clamp(min=1.0)          # [batch_size, 1]
        pooled = (lstm_out * mask).sum(dim=1) / token_counts   # Masked average
        return pooled                           # [batch_size, hidden_dim*2]


In [28]:
hidden_dim = 64  # LSTM hidden units per direction; the encoder outputs hidden_dim*2 values

# No training step is visible in this notebook, so the vectors the encoder
# produces depend on how its LSTM weights happen to be initialised. Seed torch
# before building the model so a re-run yields the same weights and therefore
# the same embeddings.
MODEL_SEED = 42
torch.manual_seed(MODEL_SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BiLSTMTextEncoder(vocab_size, embedding_dim, hidden_dim, embedding_matrix).to(device)

# Move the input tensor to the same device as the model
title_tensors = title_tensors.to(device)


In [29]:
batch_size = 32
outputs = []

# eval() puts the module in inference mode; no_grad() skips gradient tracking
# for everything computed inside the block.
model.eval()
with torch.no_grad():
    num_rows = len(title_tensors)
    for start in range(0, num_rows, batch_size):
        stop = start + batch_size
        encoded = model(title_tensors[start:stop])  # (rows_in_batch, hidden_dim*2)
        outputs.append(encoded.cpu())  # Collect results on the CPU for saving later


In [30]:
# Stitch the per-batch results back together along the sample axis.
all_outputs = torch.cat(outputs)  # dim defaults to 0

# Expected shape: (num_samples, hidden_dim*2)
print(all_outputs.shape)

torch.Size([971806, 128])


In [31]:
import pandas as pd

# Hand the encoded titles over to NumPy
title_embeddings = all_outputs.numpy()

# One row per title, one column per embedding dimension; the dataframe index
# is not written to the file.
df_embeddings = pd.DataFrame(data=title_embeddings)
output_csv_path = '/content/drive/MyDrive/title_bilstm_embeddings.csv'
df_embeddings.to_csv(output_csv_path, index=False)

print("Embeddings saved successfully!")


Embeddings saved successfully!


In [16]:
#dummy
import torch
import torch.nn as nn

# Your BiLSTMTextEncoder definition
class BiLSTMTextEncoder(nn.Module):
    """Turn padded token-index sequences into fixed-size vectors.

    The input indices are embedded with a frozen pretrained table, passed
    through a bidirectional LSTM, and the LSTM outputs are averaged over the
    timesteps that hold real tokens.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector (the LSTM input size).
        hidden_dim: LSTM hidden size per direction; the output width is
            ``hidden_dim * 2``.
        pretrained_embeddings: tensor of shape (vocab_size, embedding_dim)
            whose values are copied into the embedding table.
        pad_idx: token index that marks padding. Positions holding it are
            excluded from the average so padding does not dilute short
            sequences. Pass ``None`` to average over every timestep, padding
            included (the previous behaviour).

    Note:
        Only the pooling ignores padding; the LSTM still processes the padded
        positions.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, pretrained_embeddings, pad_idx=0):
        super(BiLSTMTextEncoder, self).__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(pretrained_embeddings)
        self.embedding.weight.requires_grad = False  # Pretrained vectors stay frozen
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)

    def forward(self, x):
        """Map index tensor ``x`` of shape [batch, seq_len] to a [batch, hidden_dim*2] tensor."""
        embedded = self.embedding(x)           # [batch, seq_len, embedding_dim]
        lstm_out, _ = self.bilstm(embedded)    # [batch, seq_len, hidden_dim*2]
        if self.pad_idx is None:
            return torch.mean(lstm_out, dim=1)

        # Weight real tokens with 1.0 and padding with 0.0: [batch, seq_len, 1]
        mask = (x != self.pad_idx).unsqueeze(-1).to(lstm_out.dtype)
        # Count of real tokens per row, clamped so all-padding rows divide by 1
        # and yield a zero vector rather than NaN.
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        pooled = (lstm_out * mask).sum(dim=1) / token_counts
        return pooled

# NOTE(review): this smoke test rebinds vocab_size, embedding_dim, hidden_dim,
# embedding_matrix, device, model and batch_size, which the real pipeline cells
# in this notebook also assign.

# Sizes for the dummy run: vocabulary, embedding width (e.g., GloVe 300d), LSTM hidden units
vocab_size, embedding_dim, hidden_dim = 1000, 300, 64

# Random stand-in for a pretrained embedding table
embedding_matrix = torch.randn(vocab_size, embedding_dim)

# Build the encoder on the GPU when one is available, otherwise on the CPU,
# and switch it to eval mode.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BiLSTMTextEncoder(vocab_size, embedding_dim, hidden_dim, embedding_matrix).to(device)
model.eval()

# Random token indices shaped (batch_size, seq_len)
batch_size, seq_len = 4, 10
dummy_shape = (batch_size, seq_len)
dummy_input = torch.randint(0, vocab_size, dummy_shape).to(device)

# Single forward pass without gradient tracking
with torch.no_grad():
    dummy_output = model(dummy_input)

# Expected: (4, hidden_dim*2)
print("Dummy output shape:", dummy_output.shape)


Dummy output shape: torch.Size([4, 128])


In [17]:
# Quick diagnostics on the index tensor.
index_min, index_max = title_tensors.min(), title_tensors.max()
print(title_tensors.dtype)   # expected: torch.int64 (LongTensor)
print(index_min, index_max)  # expected: inside the vocabulary index range
print(title_tensors.shape)   # expected: (num_samples, max_seq_length)


torch.int64
tensor(0) tensor(164400)
torch.Size([971806, 20])
