In [1]:
# Dump the raw embeddings file so its "word v1 v2 ..." line layout is visible below.
with open('custom_embeddings/embeddings.txt', 'r') as embeddings_file:
    print(embeddings_file.read())

good 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
great 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
awesome 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
bad -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0
terrible -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0
awful -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0
kwyjibo 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import string
import os
from tqdm import tqdm
import torch.nn.functional as F

# Reproducibility: seed PyTorch's RNG and request deterministic cuDNN kernels.
# SEED is also passed to the train/validation split further down.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 1. Custom Vocabulary and Embedding Loader
class Vocabulary:
    """Word <-> index mapping backed by pre-trained vectors read from a text file.

    Every non-blank line of the file has the form ``word v1 v2 ... vN`` with
    whitespace-separated fields. Index 0 is always ``<pad>`` and index 1 is
    always ``<unk>``; both start as zero vectors with the same width as the
    vectors in the file.
    """

    # Width used for the special tokens only when the file holds no vectors and
    # no explicit ``embedding_dim`` was requested (the previously hard-coded value).
    DEFAULT_DIM = 20

    def __init__(self, embedding_file, embedding_dim=None):
        """Build the vocabulary from ``embedding_file``.

        Args:
            embedding_file: Path of the embeddings text file.
            embedding_dim: Expected vector width. ``None`` (default) infers the
                width from the first vector found in the file.

        Raises:
            ValueError: If a line has no vector values, a value is not a
                number, or a line's width differs from the expected width.
        """
        self.stoi = {}     # word -> row index
        self.itos = {}     # row index -> word
        self.vectors = []  # row index -> 1-D float tensor
        self.embedding_dim = embedding_dim
        self.load_embeddings(embedding_file)

    def load_embeddings(self, file_path):
        """Read ``file_path`` and register the special tokens plus every word in it.

        The file is parsed completely before anything is registered so that the
        ``<pad>``/``<unk>`` zero vectors can take the width found in the file
        while still occupying indices 0 and 1.
        """
        dim = getattr(self, 'embedding_dim', None)
        entries = []

        with open(file_path, 'r') as f:
            for line_no, line in enumerate(tqdm(f, desc="Loading embeddings"), start=1):
                # split() with no argument tolerates repeated spaces, tabs and
                # trailing newlines; an empty result means a blank line.
                parts = line.split()
                if not parts:
                    continue
                word, values = parts[0], parts[1:]
                if not values:
                    raise ValueError(
                        f"{file_path}:{line_no}: no vector values for word {word!r}"
                    )
                if dim is None:
                    dim = len(values)
                if len(values) != dim:
                    raise ValueError(
                        f"{file_path}:{line_no}: expected {dim} values for word "
                        f"{word!r}, found {len(values)}"
                    )
                vector = torch.tensor([float(x) for x in values], dtype=torch.float)
                entries.append((word, vector))

        if dim is None:
            dim = self.DEFAULT_DIM
        self.embedding_dim = dim

        # Special tokens first so their indices never depend on the file contents.
        self.add_word('<pad>', torch.zeros(dim))
        self.add_word('<unk>', torch.zeros(dim))
        for word, vector in entries:
            self.add_word(word, vector)

    def add_word(self, word, vector):
        """Register ``word`` with ``vector``.

        A word that is already known keeps its index and only has its vector
        replaced. This keeps ``<pad>``/``<unk>`` at indices 0/1 when the file
        itself contains rows for them and avoids orphaned duplicate rows.
        """
        existing_idx = self.stoi.get(word)
        if existing_idx is not None:
            self.vectors[existing_idx] = vector
            return
        idx = len(self.itos)
        self.stoi[word] = idx
        self.itos[idx] = word
        self.vectors.append(vector)

    def __len__(self):
        """Number of distinct entries, special tokens included."""
        return len(self.vectors)

# 2. Custom Dataset Class
class IMDBDataset(Dataset):
    """Map-style dataset turning ``{'text', 'label'}`` rows into fixed-length index tensors.

    Args:
        data: Indexable collection whose items expose ``'text'`` and ``'label'``.
        vocab: Object with a ``stoi`` mapping containing ``'<pad>'`` and ``'<unk>'``.
        max_length: Non-negative number of token indices every example is
            truncated or padded to.
    """

    # Translation table deleting ASCII punctuation; built once instead of per item.
    _STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)

    def __init__(self, data, vocab, max_length=100):
        self.data = data
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(indices, label)``: a long tensor of ``max_length`` ids and a float scalar."""
        # Fetch the row once; indexing the backing dataset can be expensive.
        row = self.data[idx]
        label = float(row['label'])

        # Simple tokenizer: lower-case, drop punctuation, split on whitespace.
        # Slicing here already enforces the upper bound on length.
        tokens = row['text'].lower().translate(self._STRIP_PUNCTUATION).split()[:self.max_length]

        stoi = self.vocab.stoi
        unk_idx = stoi['<unk>']
        indices = [stoi.get(token, unk_idx) for token in tokens]

        # Right-pad short examples; long ones were truncated by the slice above.
        missing = self.max_length - len(indices)
        if missing > 0:
            indices.extend([stoi['<pad>']] * missing)

        return torch.tensor(indices, dtype=torch.long), torch.tensor(label, dtype=torch.float)

# 3. Load Data and Vocabulary
# Load IMDB dataset
imdb = load_dataset('imdb')
# Hold out 20% of the official training split for validation; the split is seeded with SEED.
train_valid = imdb['train'].train_test_split(test_size=0.2, seed=SEED)
train_data = train_valid['train']
valid_data = train_valid['test']  # the held-out part of the split is used as the validation set
test_data = imdb['test']

# Load custom embeddings
vocab = Vocabulary('custom_embeddings/embeddings.txt')

# 4. Create DataLoaders
BATCH_SIZE = 64
MAX_LENGTH = 100  # passed to IMDBDataset as max_length: token count per example after truncation/padding

train_dataset = IMDBDataset(train_data, vocab, MAX_LENGTH)
valid_dataset = IMDBDataset(valid_data, vocab, MAX_LENGTH)
test_dataset = IMDBDataset(test_data, vocab, MAX_LENGTH)

def collate_fn(batch):
    """Stack per-example ``(indices, label)`` pairs into two batch tensors.

    The tensors are returned on the CPU. Moving them to the accelerator is left
    to the consumer (the training and evaluation loops already do this), so the
    function stays usable from DataLoader worker processes and with pinned
    memory.

    Args:
        batch: Sequence of ``(text_tensor, label_tensor)`` pairs of equal shapes.

    Returns:
        ``(texts, labels)`` with a new leading batch dimension.
    """
    texts, labels = zip(*batch)
    return torch.stack(texts), torch.stack(labels)

# Only the training loader shuffles; validation and test iterate in dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

# 5. Model Definition
class CNN(nn.Module):
    """Convolutional text classifier over frozen, pre-trained word embeddings.

    Each filter size gets its own convolution spanning the full embedding
    width; the outputs are max-pooled over time, concatenated, passed through
    dropout and projected by a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of every embedding vector.
        n_filters: Number of output channels per convolution.
        filter_sizes: Iterable of window heights (in tokens), one conv each.
        output_dim: Size of the final projection.
        dropout: Dropout probability applied to the pooled features.
        pad_idx: Index of the padding token.
        pretrained_vectors: Optional ``(vocab_size, embedding_dim)`` tensor, or a
            sequence of ``vocab_size`` 1-D tensors, used to initialise the
            embedding table. When omitted, the module-level ``vocab.vectors``
            is used, which is what this class always did before the parameter
            existed.

    Raises:
        ValueError: If the pre-trained vectors do not have shape
            ``(vocab_size, embedding_dim)``.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx,
                 pretrained_vectors=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, n_filters, (fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

        # Initialize with custom embeddings
        if pretrained_vectors is None:
            # Legacy default: fall back to the notebook-level vocabulary.
            pretrained_vectors = vocab.vectors
        if not torch.is_tensor(pretrained_vectors):
            pretrained_vectors = torch.stack(list(pretrained_vectors))
        # copy_ would silently broadcast a mis-shaped source, so check explicitly.
        if pretrained_vectors.shape != self.embedding.weight.shape:
            raise ValueError(
                f"pretrained vectors have shape {tuple(pretrained_vectors.shape)}, "
                f"expected {tuple(self.embedding.weight.shape)}"
            )
        self.embedding.weight.data.copy_(pretrained_vectors)
        self.embedding.weight.requires_grad = False  # Start frozen

    def forward(self, text):
        """Return logits for a ``(batch, seq_len)`` tensor of token indices.

        The result has shape ``(batch,)`` when ``output_dim == 1`` and
        ``(batch, output_dim)`` otherwise.
        """
        embedded = self.embedding(text).unsqueeze(1)  # Add channel dimension
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2) for feature_map in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        # Squeeze only the output dimension: a bare squeeze() would also drop the
        # batch dimension for a batch of one and break the loss shape check.
        return self.fc(cat).squeeze(1)

# Model parameters
# The embedding width is read from the loaded vectors so it cannot drift away
# from the contents of the embeddings file.
EMBEDDING_DIM = vocab.vectors[0].shape[0]
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = vocab.stoi['<pad>']

model = CNN(
    vocab_size=len(vocab),
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX
).to(device)

# 6. Training Setup
# The optimizer is given every parameter, including the initially frozen
# embedding table; parameters without a gradient are skipped, so the table only
# starts to move once requires_grad is switched on later.
optimizer = optim.Adam(model.parameters())
# Expects raw logits; the sigmoid is applied inside the loss.
criterion = nn.BCEWithLogitsLoss()

# 7. Training Loop
def train_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module mapping a batch of inputs to one logit per example.
        loader: Iterable of ``(texts, labels)`` batches.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss returning the mean over the batch (the default
            reduction of ``nn.BCEWithLogitsLoss``).

    Returns:
        ``(loss, accuracy)``: the per-example mean loss and the fraction of
        examples whose rounded sigmoid output equals the label.

    Raises:
        ValueError: If ``loader`` yields no examples.
    """
    model.train()
    # Follow the model's own device instead of relying on a module-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    correct = 0
    seen = 0

    for texts, labels in tqdm(loader, desc="Training"):
        if target_device is not None:
            texts, labels = texts.to(target_device), labels.to(target_device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so a short final batch does not
        # count as much as a full one.
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        preds = torch.round(torch.sigmoid(outputs))
        correct += (preds == labels).sum().item()
        seen += batch_size

    if seen == 0:
        raise ValueError("train_epoch received a loader with no examples")
    return total_loss / seen, correct / seen

def evaluate(model, loader, criterion):
    """Score ``model`` on ``loader`` without updating any parameters.

    Args:
        model: Module mapping a batch of inputs to one logit per example.
        loader: Iterable of ``(texts, labels)`` batches.
        criterion: Loss returning the mean over the batch (the default
            reduction of ``nn.BCEWithLogitsLoss``).

    Returns:
        ``(loss, accuracy)``: the per-example mean loss and the fraction of
        examples whose rounded sigmoid output equals the label.

    Raises:
        ValueError: If ``loader`` yields no examples.
    """
    model.eval()
    # Follow the model's own device instead of relying on a module-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    correct = 0
    seen = 0

    with torch.no_grad():
        for texts, labels in tqdm(loader, desc="Evaluating"):
            if target_device is not None:
                texts, labels = texts.to(target_device), labels.to(target_device)
            outputs = model(texts)
            loss = criterion(outputs, labels)

            # Weight each batch mean by its size so a short final batch does
            # not count as much as a full one.
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size
            preds = torch.round(torch.sigmoid(outputs))
            correct += (preds == labels).sum().item()
            seen += batch_size

    if seen == 0:
        raise ValueError("evaluate received a loader with no examples")
    return total_loss / seen, correct / seen

# Training execution
N_EPOCHS = 10
FREEZE_FOR = 5  # the embedding table is made trainable at the start of epoch index FREEZE_FOR
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    # Unfreeze embeddings after FREEZE_FOR epochs
    if epoch == FREEZE_FOR:
        model.embedding.weight.requires_grad = True
        print("\nUnfreezing embeddings!")
    
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_loader, criterion)
    
    # Checkpoint only when the validation loss improves on the best seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pt')
    
    print(f'\nEpoch {epoch+1:02}')
    print(f'Train Loss: {train_loss:.3f} | Acc: {train_acc*100:.2f}%')
    print(f'Valid Loss: {valid_loss:.3f} | Acc: {valid_acc*100:.2f}%')

# Final evaluation
# Restore the checkpoint with the lowest validation loss before scoring the test set.
# NOTE(review): torch.load is called without map_location; presumably fine because the
# checkpoint is written and read in the same session - confirm before reusing elsewhere.
model.load_state_dict(torch.load('best_model.pt'))
test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f'\nFinal Test Performance:')
print(f'Loss: {test_loss:.3f} | Acc: {test_acc*100:.2f}%')

# Save trained embeddings
def save_embeddings(model, vocab, path):
    """Write the model's embedding table to ``path``, one ``word v1 v2 ...`` line per row.

    Rows are written in index order for indices ``0 .. len(vocab) - 1``, so the
    special-token rows are included.

    Args:
        model: Object exposing ``embedding.weight`` as a 2-D tensor.
        vocab: Object supporting ``len()`` and an ``itos`` index -> word mapping.
        path: Destination file. Missing parent directories are created; a bare
            filename is written to the current directory.
    """
    directory = os.path.dirname(path)
    if directory:
        # os.makedirs('') raises, so only create a directory when there is one.
        os.makedirs(directory, exist_ok=True)

    # Copy the whole table to host memory once instead of once per row.
    weights = model.embedding.weight.detach().cpu().numpy()

    with open(path, 'w') as f:
        for idx in tqdm(range(len(vocab)), desc="Saving embeddings"):
            word = vocab.itos[idx]
            line = f"{word} " + " ".join(map(str, weights[idx])) + "\n"
            f.write(line)

# Write the embedding table of the current model into the custom_embeddings directory.
save_embeddings(model, vocab, 'custom_embeddings/trained_embeddings.txt')

In [None]:
def predict_sentiment(model, vocab, sentence, device, max_length=100):
    """Classify one sentence with ``model``.

    The sentence is lower-cased, stripped of ASCII punctuation and split on
    whitespace; tokens are mapped through ``vocab.stoi`` (unknown words go to
    ``<unk>``) and the index list is padded with ``<pad>`` or truncated to
    ``max_length`` before being sent to ``device`` as a batch of one.

    Returns:
        ``(label, probability)`` where ``probability`` is the sigmoid of the
        model output and ``label`` is ``"Positive"`` when it exceeds 0.5,
        otherwise ``"Negative"``.
    """
    model.eval()

    # Normalise and tokenize the raw text.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = sentence.lower().translate(punctuation_remover)
    words = cleaned.split()[:max_length]

    # Look up every token, falling back to the unknown-word index.
    token_ids = []
    for word in words:
        token_ids.append(vocab.stoi.get(word, vocab.stoi['<unk>']))

    # Bring the sequence to exactly max_length entries.
    shortfall = max_length - len(token_ids)
    if shortfall > 0:
        token_ids.extend([vocab.stoi['<pad>']] * shortfall)
    else:
        token_ids = token_ids[:max_length]

    # Batch of one on the requested device.
    batch = torch.LongTensor(token_ids).unsqueeze(0).to(device)

    with torch.no_grad():
        probability = torch.sigmoid(model(batch)).item()

    label = "Positive" if probability > 0.5 else "Negative"
    return label, probability

In [None]:
# The value printed as "Confidence" is the second value returned by predict_sentiment,
# i.e. the sigmoid of the model output (above 0.5 is labelled "Positive"). It is the
# positive-class score, not the confidence in the printed label, so a low number goes
# with a "Negative" label.
# NOTE(review): examples 1 and 3 print the same score; presumably none of their words
# occur in the small embeddings vocabulary, so both reduce to <unk>/<pad> - confirm.

# Example 1: Direct question
question = "Is this movie worth watching?"
sentiment, confidence = predict_sentiment(model, vocab, question, device)
print(f"Question: {question}")
print(f"Sentiment: {sentiment} | Confidence: {confidence:.4f}")

# Example 2: Statement with question mark
statement = "Why would anyone like this terrible film?"
sentiment, confidence = predict_sentiment(model, vocab, statement, device)
print(f"\nStatement: {statement}")
print(f"Sentiment: {sentiment} | Confidence: {confidence:.4f}")

# Example 3: Neutral question
neutral = "What time does the movie start?"
sentiment, confidence = predict_sentiment(model, vocab, neutral, device)
print(f"\nNeutral: {neutral}")
print(f"Sentiment: {sentiment} | Confidence: {confidence:.4f}")

Question: Is this movie worth watching?
Sentiment: Positive | Confidence: 0.5336

Statement: Why would anyone like this terrible film?
Sentiment: Negative | Confidence: 0.1220

Neutral: What time does the movie start?
Sentiment: Positive | Confidence: 0.5336
