<a href="https://colab.research.google.com/github/a-mhamdi/nlp-unraveled/blob/main/05_text_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Analysis and Advanced Techniques
---

**Outline**
----
1. [Text classification](#text-classification)
1. [Named entity recognition](#ner)
1. [Sentiment analysis](#sentiment-analysis)
1. [Word embeddings](#word-embeddings)

## Text classification <a name="text-classification"></a>


## Named entity recognition (NER) <a name="ner"></a>


## Sentiment analysis <a name="sentiment-analysis"></a>


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np

In [2]:
# Toy sentiment corpus: eight short, hand-written review sentences.
texts = [
    "this movie was fantastic and I loved it",
    "terrible waste of time",
    "great performance by all actors",
    "I really hated this show",
    "amazing plot and great acting",
    "worst movie ever",
    "absolutely brilliant",
    "completely disappointed",
]
# One label per sentence, aligned with `texts` by position.
labels = [1, 0, 1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative

In [3]:
# Vocabulary processing
class Vocabulary:
    """Word-to-index vocabulary built from a list of raw texts.

    Texts are lower-cased and split on whitespace. Index 0 is reserved for
    the ``<PAD>`` token and index 1 for the ``<UNK>`` token; every retained
    word receives the next free index in order of first appearance.
    """

    def __init__(self, texts, min_count=1):
        """Build the ``word2idx`` and ``idx2word`` mappings.

        Args:
            texts: Iterable of strings to collect words from.
            min_count: Minimum number of occurrences a word needs in order
                to get its own index. The default of 1 keeps every word,
                which is what this class has always done.
        """
        # Create word to index mapping
        words = ' '.join(texts).lower().split()
        word_counts = Counter(words)
        # Add special tokens
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        # Add words that appear at least `min_count` times
        for word, count in word_counts.items():
            if count >= min_count and word not in self.word2idx:
                self.word2idx[word] = len(self.word2idx)
        # Reverse mapping, index -> word
        self.idx2word = {v: k for k, v in self.word2idx.items()}

    def text_to_indices(self, text, max_length=10):
        """Convert ``text`` to a list of exactly ``max_length`` word indices.

        Words missing from the vocabulary map to the ``<UNK>`` index. Short
        texts are right-padded with the ``<PAD>`` index; long texts are
        truncated to their first ``max_length`` words.

        Args:
            text: The string to encode.
            max_length: Length of the returned list; must be non-negative.

        Returns:
            A list of ``max_length`` integer indices.

        Raises:
            ValueError: If ``max_length`` is negative (a negative slice bound
                would otherwise silently drop words from the end instead).
        """
        if max_length < 0:
            raise ValueError(f"max_length must be non-negative, got {max_length}")
        unk_idx = self.word2idx['<UNK>']
        indices = [self.word2idx.get(word, unk_idx) for word in text.lower().split()]
        if len(indices) < max_length:
            indices += [self.word2idx['<PAD>']] * (max_length - len(indices))
        else:
            indices = indices[:max_length]
        return indices

In [4]:
# Create dataset class
class TextClassificationDataset(Dataset):
    """Dataset pairing each raw text with its label.

    Encoding is done lazily: a text is converted to indices by
    ``vocab.text_to_indices`` each time the item is requested.
    """

    def __init__(self, texts, labels, vocab):
        """Keep references to the texts, their labels and the vocabulary."""
        self.texts, self.labels, self.vocab = texts, labels, vocab

    def __len__(self):
        """Number of samples, taken from the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(indices_tensor, label_tensor)`` for sample ``idx``.

        The label is returned as a float32 tensor; the indices tensor is
        built from whatever ``vocab.text_to_indices`` returns for the text.
        """
        raw_text, raw_label = self.texts[idx], self.labels[idx]
        token_ids = self.vocab.text_to_indices(raw_text)
        target = torch.tensor(raw_label, dtype=torch.float32)
        return torch.tensor(token_ids), target

In [5]:
# Build the vocabulary from the training sentences, wrap sentences and labels
# in a Dataset, and serve them in shuffled mini-batches of two samples.
vocab = Vocabulary(texts)
dataset = TextClassificationDataset(texts=texts, labels=labels, vocab=vocab)
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)

In [6]:
# Define the model
class SimpleTextClassifier(nn.Module):
    """Binary text classifier: embedding -> LSTM -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Index 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one probability per input sequence.

        Args:
            x: Integer tensor of token indices; the LSTM is batch-first, so a
                batched input is expected as ``(batch, seq_len)``.

        Returns:
            Tensor of sigmoid outputs with the trailing size-1 feature
            dimension removed, i.e. shape ``(batch,)`` for batched input,
            including when ``batch == 1``.
        """
        embedded = self.embedding(x)
        # NOTE: the LSTM runs over every position of `x`, padded ones
        # included, so the final hidden state is taken after the padding.
        _, (hidden, _) = self.lstm(embedded)
        # Use the last layer's final hidden state
        out = self.sigmoid(self.fc(hidden[-1]))
        # Squeeze only the feature dimension. A bare squeeze() would also
        # drop the batch dimension for a batch of one, producing a 0-dim
        # tensor that no longer matches a (1,)-shaped target in BCELoss.
        return out.squeeze(-1)

In [7]:
# Initialize model, loss function, and optimizer
# Seed PyTorch's global RNG before the weights are created so that the
# initialisation (and later draws from the global RNG) repeat on a fresh
# Restart & Run All.
torch.manual_seed(42)
model = SimpleTextClassifier(
    vocab_size=len(vocab.word2idx),
    embedding_dim=50,
    hidden_dim=20
)
# Binary cross-entropy on probabilities; the model already applies a sigmoid.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [8]:
# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    # Make sure the model is in training mode: predict_sentiment() switches
    # it to eval mode, so re-running this cell afterwards would otherwise
    # keep training in eval mode.
    model.train()
    total_loss = 0
    for batch_texts, batch_labels in dataloader:
        # Forward pass
        outputs = model(batch_texts)
        loss = criterion(outputs, batch_labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate the per-batch loss for the epoch average below
        total_loss += loss.item()

    # Print the mean batch loss every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}')

Epoch [10/100], Loss: 0.0169
Epoch [20/100], Loss: 0.0021
Epoch [30/100], Loss: 0.0013
Epoch [40/100], Loss: 0.0009
Epoch [50/100], Loss: 0.0007
Epoch [60/100], Loss: 0.0005
Epoch [70/100], Loss: 0.0004
Epoch [80/100], Loss: 0.0004
Epoch [90/100], Loss: 0.0003
Epoch [100/100], Loss: 0.0003


In [9]:
# Test the model
def predict_sentiment(text):
    """Classify a single sentence with the notebook-level ``model``.

    The text is encoded with the notebook-level ``vocab``, wrapped in a
    batch of one and passed through the model in eval mode with gradient
    tracking disabled.

    Args:
        text: The sentence to classify.

    Returns:
        A ``(prediction, confidence)`` tuple: ``prediction`` is
        ``"Positive"`` when the model output exceeds 0.5 and ``"Negative"``
        otherwise; ``confidence`` is the larger of the output and its
        complement.
    """
    model.eval()
    with torch.no_grad():
        batch = torch.tensor([vocab.text_to_indices(text)])
        positive_prob = model(batch).item()
    if positive_prob > 0.5:
        sentiment = "Positive"
    else:
        sentiment = "Negative"
    return sentiment, max(positive_prob, 1 - positive_prob)

In [10]:
# Example predictions
# A few sentences that are not part of the training list above.
test_texts = [
    "this was really good",
    "I did not enjoy it at all",
    "amazing performance"
]

# Print each sentence with its predicted label and the confidence
# formatted to two decimals.
print("\nTest Predictions:")
for text in test_texts:
    prediction, confidence = predict_sentiment(text)
    print(f"Text: {text}")
    print(f"Prediction: {prediction} (confidence: {confidence:.2f})\n")


Test Predictions:
Text: this was really good
Prediction: Positive (confidence: 1.00)

Text: I did not enjoy it at all
Prediction: Positive (confidence: 1.00)

Text: amazing performance
Prediction: Positive (confidence: 1.00)



## Word embeddings <a name="word-embeddings"></a>