<a href="https://colab.research.google.com/github/aliikhwan99/Text_Embeddings/blob/main/Text_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Phoneme Embeddings

In [None]:
pip install phonemizer

# Morphology Embeddings

In [None]:
pip install morfessor


# Environment setup

In [None]:
# Install the Python packages used by the phoneme / morphology experiments.
!pip install phonemizer morfessor
# System package (presumably the speech backend phonemizer relies on -- confirm).
# -y answers apt's confirmation prompt so the cell cannot stall waiting for input.
!apt-get install -y espeak-ng


# Combined Embedding Layer

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

# Vocabulary and label sizes read by the model classes in this notebook.
# Example values (replace with actual numbers from your dataset).
num_phonemes = 100  # Rows in the phoneme embedding table; adjust to the number of unique phonemes
num_morphs = 100    # Rows in the morph embedding table; adjust to the number of morph units
num_labels = 3      # E.g., 3 classes: Standard Malay, Sabah dialect, Code-switch

class CustomBertWithPhonemeMorph(nn.Module):
    """BERT sentence classifier augmented with phoneme and morph embeddings.

    The [CLS] vector from multilingual BERT is summed with the pooled phoneme
    and morph embeddings found at sequence position 0, then passed through
    dropout and a linear classification head.

    The module-level ``num_phonemes``, ``num_morphs`` and ``num_labels`` are
    read at construction time.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        # Take the width from the loaded checkpoint instead of hard-coding 768,
        # so the extra embeddings always stay addable to the [CLS] vector.
        hidden_size = self.bert.config.hidden_size
        self.phoneme_emb = nn.Embedding(num_phonemes, hidden_size)
        self.morph_emb = nn.Embedding(num_morphs, hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, phoneme_ids, morph_ids, attention_mask=None):
        """Return classification logits, one row per batch element.

        Args:
            input_ids: token ids passed to BERT.
            phoneme_ids: integer phoneme ids; dim 2 (units per position) is averaged.
            morph_ids: integer morph ids; dim 2 (units per position) is averaged.
            attention_mask: optional mask forwarded to BERT.
        """
        # Embedding lookup appends an emb_dim axis; averaging over dim 2 pools
        # the phoneme (resp. morph) units of each position into one vector.
        phoneme_vec = self.phoneme_emb(phoneme_ids).mean(dim=2)
        morph_vec = self.morph_emb(morph_ids).mean(dim=2)

        # BERT output: keep the [CLS] token representation.
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        cls_emb = outputs.last_hidden_state[:, 0, :]

        # NOTE: only position 0 of the pooled phoneme/morph features is added;
        # features at every other sequence position do not reach the classifier.
        combined = cls_emb + phoneme_vec[:, 0, :] + morph_vec[:, 0, :]
        logits = self.classifier(self.dropout(combined))
        return logits


# -----TEST----------

# Step 1: Dummy Test Dataset

In [None]:
# Simulated text samples: one sentence per class, in the same order as `labels` below.
texts = [
    "Saya pergi ke pasar",                          # Standard Malay
    "Aku mau pigi kedai bah",                       # Sabah dialect
    "Saya want to buy makanan from kedai"           # Code-switched Malay-English
]

# Fake labels for classification: 0 = Malay, 1 = Dialect, 2 = Code-Switch
# (labels[i] is the class of texts[i]).
labels = [0, 1, 2]


#  Step 2: Tokenizer

In [None]:
from transformers import BertTokenizer

# Load the tokenizer for the same checkpoint name the models load.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Encode all sentences as one padded batch of PyTorch tensors; the result is
# later indexed with 'input_ids' and 'attention_mask'.
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")


# Step 3: Fake Phoneme & Morph IDs

In [None]:
import torch

batch_size, seq_len = inputs['input_ids'].shape

# Randomly simulate phoneme and morph IDs (normally derived from actual phoneme/morph analyzers).
# The upper bounds come from num_phonemes / num_morphs so every simulated id is
# a valid row of the model's embedding tables even when those sizes change.
phoneme_ids = torch.randint(0, num_phonemes, (batch_size, seq_len, 5))  # 5 phoneme units per position
morph_ids = torch.randint(0, num_morphs, (batch_size, seq_len, 3))      # 3 morph units per position


# Step 4: Define Models

In [None]:
class BaselineModel(nn.Module):
    """Plain BERT classifier: a linear head on the [CLS] representation.

    Serves as the comparison point for the phoneme+morph model, so the head
    is sized from the same module-level ``num_labels``.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        # Input width follows the loaded checkpoint; output width follows
        # num_labels instead of a hard-coded 3.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return classification logits, one row per batch element."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        cls_emb = outputs.last_hidden_state[:, 0, :]  # [CLS] token representation
        return self.classifier(cls_emb)

# Instantiate both classifiers; each constructor loads its own copy of the
# pretrained BERT weights.
baseline_model = BaselineModel()
custom_model = CustomBertWithPhonemeMorph()  # From earlier cell


# Step 5: Forward Pass & Comparison

In [None]:
# Switch both models to inference mode. A freshly constructed nn.Module starts
# in training mode, so without this the custom model's nn.Dropout stays active
# and its logits change from run to run.
baseline_model.eval()
custom_model.eval()

# Forward pass with baseline
with torch.no_grad():
    baseline_logits = baseline_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])

# Forward pass with phoneme+morph model
with torch.no_grad():
    advanced_logits = custom_model(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        phoneme_ids=phoneme_ids,
        morph_ids=morph_ids
    )


# Step 6: Output Predictions

In [None]:
import torch.nn.functional as F

# argmax over the raw logits selects the same class as argmax over
# softmax(logits) because softmax preserves ordering, so the extra
# normalization pass is skipped.
baseline_preds = baseline_logits.argmax(dim=1)
advanced_preds = advanced_logits.argmax(dim=1)

print("Ground Truth:", labels)
print("Baseline Predictions:", baseline_preds.tolist())
print("Phoneme+Morph Predictions:", advanced_preds.tolist())


# Test 2

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from sklearn.metrics import classification_report
import numpy as np


# phoneme and morph embeddings

In [None]:
# Sizes for the second experiment; these rebind the module-level names that the
# model classes read when they are constructed.
num_phonemes = 50
num_morphs = 100
num_labels = 2  # Changed from 3: now only Kelantanese and Code-switch


In [None]:
class CustomBertWithPhonemeMorph(nn.Module):
    """Multilingual BERT classifier with additive phoneme and morph features.

    Embedding-table and output sizes come from the module-level
    ``num_phonemes``, ``num_morphs`` and ``num_labels`` at construction time.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        # Use the checkpoint's own hidden width rather than a literal 768 so
        # the three summed vectors are guaranteed to have matching sizes.
        width = self.bert.config.hidden_size
        self.phoneme_emb = nn.Embedding(num_phonemes, width)
        self.morph_emb = nn.Embedding(num_morphs, width)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(width, num_labels)

    def forward(self, input_ids, phoneme_ids, morph_ids, attention_mask=None):
        """Return logits computed from [CLS] plus position-0 phoneme/morph features."""
        # Average the embeddings of the units stored along dim 2.
        phoneme_vec = self.phoneme_emb(phoneme_ids).mean(dim=2)
        morph_vec = self.morph_emb(morph_ids).mean(dim=2)
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        cls_emb = outputs.last_hidden_state[:, 0, :]
        # NOTE: only sequence position 0 of the pooled features is used here;
        # the other positions are discarded.
        combined = cls_emb + phoneme_vec[:, 0, :] + morph_vec[:, 0, :]
        logits = self.classifier(self.dropout(combined))
        return logits


In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

texts = [
    "Kawe nok gi kedai jap",              # Kelantanese
    "Demo makan gapo pagi tadi?",
    "Ore tu dok gi kerja lagi",
    "Mu buat gapo situ?",
    "Bakpo demo lambat sangat?",
    "I tengah buat kerja rumah sekarang",      # Code-switch
    "You nak makan sini or tapau?",
    "Boss suruh I settle that document cepat",
    "Dia tengah belajar untuk test tomorrow",
    "You boleh start dulu, I datang later"
]

original_labels = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]

# Normalize labels to 0-based class ids: 1 -> 0 (Kelantanese), 2 -> 1 (Code-switch).
# An explicit table raises KeyError on an unexpected label instead of silently
# filing every non-1 value under "Code-switch".
label_map = {1: 0, 2: 1}
labels = [label_map[label] for label in original_labels]

# Tokenize input text
encodings = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
batch_size, seq_len = input_ids.shape

# Simulated phoneme and morph IDs (random stand-ins; 5 units per position)
phoneme_ids = torch.randint(0, num_phonemes, (batch_size, seq_len, 5))
morph_ids = torch.randint(0, num_morphs, (batch_size, seq_len, 5))


In [None]:
# Build the phoneme+morph classifier and put it in inference mode
# (disables dropout).
model = CustomBertWithPhonemeMorph()
model.eval()

# Single forward pass over the whole batch without tracking gradients.
with torch.no_grad():
    logits = model(
        input_ids,
        phoneme_ids,
        morph_ids,
        attention_mask=attention_mask,
    )
    predictions = logits.argmax(dim=1)


In [None]:
# Display names for class ids 0 and 1, in that order.
class_names = ["Kelantanese", "Code-switch"]
predicted_labels = predictions.tolist()

print("Predictions:", predicted_labels)
print("Ground Truth:", labels)

print("\nClassification Report:")
print(classification_report(labels, predicted_labels, target_names=class_names))


# Standard Bert text embeddings

In [None]:
class BaselineBertClassifier(nn.Module):
    """Text-only baseline: dropout plus a linear head on BERT's [CLS] vector.

    The number of output classes is the module-level ``num_labels``.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        self.dropout = nn.Dropout(0.1)
        # Head input width follows the loaded checkpoint instead of a literal 768.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return classification logits, one row per batch element."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        cls_emb = outputs.last_hidden_state[:, 0, :]  # [CLS] token representation
        logits = self.classifier(self.dropout(cls_emb))
        return logits

# Run baseline: same batch as the phoneme+morph model, but text input only.
baseline_model = BaselineBertClassifier()
baseline_model.eval()

with torch.no_grad():
    baseline_logits = baseline_model(input_ids, attention_mask=attention_mask)
    baseline_preds = baseline_logits.argmax(dim=1)

baseline_pred_list = baseline_preds.tolist()
print("\nBaseline Predictions:", baseline_pred_list)
print(classification_report(labels, baseline_pred_list, target_names=["Kelantanese", "Code-switch"]))


# TEST 3

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from torch.utils.data import Dataset

# 1. Define texts and labels
# Ten sentences: the first five are Kelantanese, the last five code-switched;
# labels[i] is the class of texts[i].
texts = [
    "Kawe nok gi kedai jap",  # Kelantanese
    "Demo makan gapo pagi tadi?",
    "Ore tu dok gi kerja lagi",
    "Mu buat gapo situ?",
    "Bakpo demo lambat sangat?",
    "I tengah buat kerja rumah sekarang",  # Code-switch
    "You nak makan sini or tapau?",
    "Boss suruh I settle that document cepat",
    "Dia tengah belajar untuk test tomorrow",
    "You boleh start dulu, I datang later"
]
labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]  # 0 = Kelantanese, 1 = Code-switch

# 2. Phoneme and Morph Vocabularies
# Id 0 is reserved for padding / out-of-vocabulary items: the mapping code
# falls back to 0 for unknown symbols and fills unused slots with zeros, so
# real entries start at 1 and can no longer be confused with either.
PAD_TOKEN = "<pad>"
phoneme_symbols = ["k", "a", "w", "e", "n", "o", "g", "i", "d", "m", "p", "b", "s", "y", "u", "r", "t", "h", "l", "j"]
morph_units = ["ka", "we", "nok", "gi", "kedai", "jap", "you", "makan", "rumah", "buat", "kerja", "settle", "cepat", "test", "tomorrow"]
phoneme_vocab = {PAD_TOKEN: 0, **{ph: idx for idx, ph in enumerate(phoneme_symbols, start=1)}}
morph_vocab = {PAD_TOKEN: 0, **{m: idx for idx, m in enumerate(morph_units, start=1)}}

# Save dummy vector files for phoneme and morph
def save_dummy_vec(path, vocab, dim=768):
    """Write one placeholder embedding line per token in *vocab*.

    Each line has the form ``<token> 0.01 0.01 ...`` with *dim* values.

    Args:
        path: destination file; it is overwritten.
        vocab: iterable of tokens (iterating a dict yields its keys).
        dim: number of placeholder values written per token.
    """
    # The vector text is identical for every token, so build it once.
    vec = " ".join(["0.01"] * dim)  # dummy small values
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(f"{token} {vec}\n" for token in vocab)

# Write the placeholder vector files into the current working directory;
# existing files with these names are overwritten.
save_dummy_vec("phoneme.vec", phoneme_vocab)
save_dummy_vec("morph.vec", morph_vocab)

# 3. Tokenization and Mapping
def tokenize_and_map(texts, tokenizer, phoneme_vocab, morph_vocab, max_len=20):
    """Tokenize *texts* and build per-word phoneme and morph id tensors.

    Words are obtained by whitespace splitting. Each character of a word is
    looked up in *phoneme_vocab* and the whole lowercased word in
    *morph_vocab*; anything missing maps to id 0, which is also the padding
    value.

    NOTE: phoneme/morph rows are indexed by whitespace word position, while
    ``input_ids`` come from *tokenizer*. The two line up position-by-position
    only if the tokenizer emits exactly one id per word and no extra tokens.

    Args:
        texts: iterable of sentences.
        tokenizer: callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``
            whose result is indexed with 'input_ids' and 'attention_mask'.
        phoneme_vocab: mapping from character to id.
        morph_vocab: mapping from lowercased word to id.
        max_len: number of word rows kept per sentence, also passed to the
            tokenizer as ``max_length``.

    Returns:
        Tuple ``(input_ids, attention_masks, phoneme_ids, morph_ids)`` of
        stacked tensors. ``phoneme_ids`` has shape
        ``(len(texts), max_len, longest_word_length)`` and ``morph_ids`` has
        shape ``(len(texts), max_len, 1)``.
    """
    words_per_text = [text.split() for text in texts]

    # First pass: gather ids. Characters are lowercased before lookup so that
    # capitalised words ("Kawe") hit the same entries as the morph lookup,
    # which already lowercases.
    phoneme_lists = [
        [[phoneme_vocab.get(ch, 0) for ch in word.lower()] for word in words]
        for words in words_per_text
    ]
    morph_lists = [
        [morph_vocab.get(word.lower(), 0) for word in words]
        for words in words_per_text
    ]

    # Width of the phoneme axis: the longest word anywhere, at least 1.
    max_word_phonemes = max(
        (len(ids) for phonemes in phoneme_lists for ids in phonemes),
        default=1,
    )
    max_word_phonemes = max(max_word_phonemes, 1)

    input_ids, attention_masks = [], []
    phoneme_ids, morph_ids = [], []

    # Second pass: tokenize and copy the ids into zero-padded tensors.
    for text, phonemes, morphs in zip(texts, phoneme_lists, morph_lists):
        encoding = tokenizer(text, max_length=max_len, padding='max_length', truncation=True, return_tensors='pt')
        input_ids.append(encoding['input_ids'][0])
        attention_masks.append(encoding['attention_mask'][0])

        phoneme_tensor = torch.zeros(max_len, max_word_phonemes, dtype=torch.long)
        morph_tensor = torch.zeros(max_len, 1, dtype=torch.long)

        # Words beyond max_len are dropped.
        for i, (ph, mo) in enumerate(zip(phonemes[:max_len], morphs[:max_len])):
            phoneme_tensor[i, :len(ph)] = torch.tensor(ph, dtype=torch.long)
            morph_tensor[i, 0] = mo

        phoneme_ids.append(phoneme_tensor)
        morph_ids.append(morph_tensor)

    return (
        torch.stack(input_ids),
        torch.stack(attention_masks),
        torch.stack(phoneme_ids),
        torch.stack(morph_ids),
    )

# Initialize Tokenizer (same checkpoint name the encoder below is built with)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize and Map: produces the four stacked tensors consumed by the model,
# using the default max_len of tokenize_and_map.
input_ids, attention_masks, phoneme_ids, morph_ids = tokenize_and_map(texts, tokenizer, phoneme_vocab, morph_vocab)

# 4. MultiModalEncoder Model
class MultiModalEncoder(nn.Module):
    """Classifier over BERT [CLS] plus mean-pooled phoneme and morph embeddings.

    Args:
        bert_model_name: checkpoint name passed to ``BertModel.from_pretrained``.
        phoneme_vocab_size: number of rows in the phoneme embedding table.
        morph_vocab_size: number of rows in the morph embedding table.
        phoneme_dim: phoneme embedding width.
        morph_dim: morph embedding width.
        hidden_dim: width of the fused projection layer.
        num_labels: number of output classes.
    """

    def __init__(self, bert_model_name, phoneme_vocab_size, morph_vocab_size,
                 phoneme_dim=16, morph_dim=8, hidden_dim=128, num_labels=2):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.bert_hidden_size = self.bert.config.hidden_size

        # Embed phonemes and morph features
        self.phoneme_emb = nn.Embedding(phoneme_vocab_size, phoneme_dim)
        self.morph_emb = nn.Embedding(morph_vocab_size, morph_dim)

        # Fuse BERT output + pooled phoneme + pooled morph features, then classify.
        fused_dim = self.bert_hidden_size + phoneme_dim + morph_dim
        self.linear = nn.Linear(fused_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask, phoneme_ids, morph_ids):
        """Return logits of shape ``(batch, num_labels)``.

        ``phoneme_ids`` is expected as (B, T, P) and ``morph_ids`` as
        (B, T, 1); the trailing axis of ``morph_ids`` is squeezed away.
        """
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_cls = bert_output.last_hidden_state[:, 0, :]  # [CLS] token representation

        # Pool phoneme features: mean over each word's phoneme slots.
        phoneme_embs = self.phoneme_emb(phoneme_ids)  # (B, T, P, D)
        phoneme_mean = phoneme_embs.mean(dim=2)       # (B, T, D)

        morph_embs = self.morph_emb(morph_ids.squeeze(-1))  # (B, T, D)

        # Mean over the T axis gives one (B, D) vector per modality.
        # NOTE: no mask is applied, so zero-padded slots are averaged in too.
        phoneme_feat = phoneme_mean.mean(dim=1)
        morph_feat = morph_embs.mean(dim=1)

        # Concatenate all features
        concat = torch.cat([bert_cls, phoneme_feat, morph_feat], dim=1)
        x = torch.relu(self.linear(concat))
        return self.classifier(x)

# 5. Initialize the Model
# Embedding tables are sized from the vocabularies built earlier.
phoneme_vocab_size, morph_vocab_size = len(phoneme_vocab), len(morph_vocab)

model = MultiModalEncoder(
    bert_model_name='bert-base-multilingual-cased',
    phoneme_vocab_size=phoneme_vocab_size,
    morph_vocab_size=morph_vocab_size,
)
model.eval()  # for inference

# 6. Inference
with torch.no_grad():
    outputs = model(input_ids, attention_masks, phoneme_ids, morph_ids)
    predictions = outputs.argmax(dim=1)
    print("Predictions:", predictions)

# 7. Probability Calculation
probs = outputs.softmax(dim=1)
print("Probabilities:", probs)

# 8. Dataset Class
class MultimodalTextDataset(Dataset):
    """Dataset of pre-encoded sentences with phoneme/morph ids and a label.

    All texts are encoded once in the constructor through
    ``tokenize_and_map``; indexing only slices the cached tensors.
    """

    def __init__(self, texts, labels, tokenizer, phoneme_vocab, morph_vocab, max_len=20):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.phoneme_vocab = phoneme_vocab
        self.morph_vocab = morph_vocab
        self.max_len = max_len

        encoded = tokenize_and_map(texts, tokenizer, phoneme_vocab, morph_vocab, max_len=max_len)
        self.input_ids, self.attention_masks, self.phoneme_ids, self.morph_ids = encoded

    def __len__(self):
        """Number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return (input_ids, attention_mask, phoneme_ids, morph_ids, label) for *idx*."""
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        feature_tensors = (
            self.input_ids,
            self.attention_masks,
            self.phoneme_ids,
            self.morph_ids,
        )
        return (*(tensor[idx] for tensor in feature_tensors), label)



In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Class display names, ordered by class id (0, 1).
class_names = ['Kelantanese', 'Code-switch']

# Move predictions and ground truth into numpy arrays for scikit-learn.
labels_np = torch.tensor(labels).cpu().numpy()
predictions_np = predictions.cpu().numpy()

# Per-class precision / recall / F1.
report = classification_report(labels_np, predictions_np, target_names=class_names)
print("Classification Report:\n", report)

# Confusion matrix, drawn as an annotated heatmap.
cm = confusion_matrix(labels_np, predictions_np)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


# Translation Task

In [None]:
!pip install --quiet sacrebleu


In [None]:
!pip install sacrebleu


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import random
from sklearn.model_selection import train_test_split
from collections import Counter
import sacrebleu
import re


In [None]:
# Sample parallel dataset: each pair is (source sentence, English translation).
# The first five sources are Kelantanese; the last five are Malay or
# code-switched Malay-English.
data_pairs = [
    ("Demo nok gi mano?", "Where are you going?"),
    ("Kito make nasi tadi.", "We ate rice earlier."),
    ("Mu buat gapo situ?", "What are you doing there?"),
    ("Tok sir la gi sanung.", "Don't go there."),
    ("Bakpo mu diam jah?", "Why are you so quiet?"),
    ("I saw dia dekat pasar.", "I saw her at the market."),
    ("He already balik rumah.", "He already went home."),
    ("Dia suka makan nasi lemak.", "She loves eating nasi lemak."),
    ("Jom kita pergi tengok movie.", "Let's go watch a movie."),
    ("They selalu datang lambat.", "They always come late.")
]

# Split to train/test (20% held out; fixed seed so the split is reproducible)
train_data, test_data = train_test_split(data_pairs, test_size=0.2, random_state=42)


In [None]:
def tokenize(text):
    """Lowercase *text*, remove punctuation, and split on whitespace."""
    text = re.sub(r"[^\w\s]", "", text.lower())
    return text.strip().split()

class Vocab:
    """Word-level vocabulary with ``<pad>``, ``<sos>``, ``<eos>``, ``<unk>`` at ids 0-3.

    Attributes:
        itos: list mapping id -> token.
        stoi: dict mapping token -> id.
    """

    def __init__(self, texts, min_freq=1):
        """Build the vocabulary from *texts*, keeping tokens seen at least *min_freq* times."""
        counter = Counter(token for text in texts for token in tokenize(text))
        self.itos = ["<pad>", "<sos>", "<eos>", "<unk>"] + [word for word, freq in counter.items() if freq >= min_freq]
        self.stoi = {word: idx for idx, word in enumerate(self.itos)}

    def encode(self, text, max_len=20):
        """Return exactly *max_len* ids: ``<sos>`` + tokens + ``<eos>``, then padding.

        Unknown tokens map to ``<unk>``. When the sentence is too long the
        word tokens are cut, never the closing ``<eos>``.
        """
        unk = self.stoi["<unk>"]
        body = [self.stoi.get(t, unk) for t in tokenize(text)]
        # Truncate the body rather than the framed sequence, so a long
        # sentence still ends with <eos>.
        body = body[:max(max_len - 2, 0)]
        token_ids = [self.stoi["<sos>"]] + body + [self.stoi["<eos>"]]
        padding = [self.stoi["<pad>"]] * (max_len - len(token_ids))
        # The final slice only matters for max_len < 2, where even the two
        # framing tokens do not fit.
        return (token_ids + padding)[:max_len]

    def decode(self, ids):
        """Join the tokens for *ids*, skipping ``<sos>``, ``<eos>`` and ``<pad>``."""
        return " ".join([self.itos[i] for i in ids if self.itos[i] not in ["<sos>", "<eos>", "<pad>"]])

# Build one vocabulary per side from the training pairs only, so words that
# appear only in the test pairs encode as <unk>.
src_vocab = Vocab([src for src, _ in train_data])
tgt_vocab = Vocab([tgt for _, tgt in train_data])


In [None]:
class TranslationDataset(Dataset):
    """Sentence pairs encoded on access with a source and a target vocabulary."""

    def __init__(self, pairs, src_vocab, tgt_vocab):
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(src_ids, tgt_ids)`` as long tensors for pair *idx*."""
        source_text, target_text = self.pairs[idx]
        source_ids = self.src_vocab.encode(source_text)
        target_ids = self.tgt_vocab.encode(target_text)
        source_tensor = torch.tensor(source_ids, dtype=torch.long)
        target_tensor = torch.tensor(target_ids, dtype=torch.long)
        return source_tensor, target_tensor

def collate_fn(batch):
    """Stack a list of ``(src, tgt)`` tensor pairs into two batch tensors."""
    src_items, tgt_items = map(list, zip(*batch))
    return torch.stack(src_items), torch.stack(tgt_items)

# Wrap each split in a dataset, then in a loader. Training uses mini-batches of
# two pairs; evaluation goes one pair at a time.
train_dataset = TranslationDataset(train_data, src_vocab, tgt_vocab)
test_dataset = TranslationDataset(test_data, src_vocab, tgt_vocab)
train_loader = DataLoader(train_dataset, batch_size=2, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn)


In [None]:
class SimpleTransformer(nn.Module):
    """Minimal encoder-decoder Transformer for token-id sequences.

    Args:
        src_vocab_size: number of source token ids.
        tgt_vocab_size: number of target token ids (also the output width).
        d_model: embedding / model width.
        nhead: number of attention heads.
        num_layers: number of encoder layers and of decoder layers.

    NOTE: token embeddings are fed to the Transformer without any positional
    encoding, so word order is visible to the model only through the causal
    target mask.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=128, nhead=4, num_layers=2):
        super().__init__()
        self.src_embed = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embed = nn.Embedding(tgt_vocab_size, d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_layers, num_layers)
        self.out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        """Return logits of shape ``(batch, tgt_len, tgt_vocab_size)``.

        Args:
            src: ``(batch, src_len)`` source token ids.
            tgt: ``(batch, tgt_len)`` target-side input token ids.
        """
        # Causal mask: position t may attend only to positions <= t. Without
        # it the decoder can read the very tokens it is trained to predict.
        tgt_len = tgt.size(1)
        causal_mask = torch.triu(
            torch.full((tgt_len, tgt_len), float("-inf"), device=tgt.device),
            diagonal=1,
        )

        # nn.Transformer defaults to sequence-first layout: (S, N, E).
        src = self.src_embed(src).permute(1, 0, 2)
        tgt = self.tgt_embed(tgt).permute(1, 0, 2)
        out = self.transformer(src, tgt, tgt_mask=causal_mask)
        return self.out(out).permute(1, 0, 2)


# -----------

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleTransformer(len(src_vocab.itos), len(tgt_vocab.itos)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# The loss is computed over *target* tokens, so the padding id to ignore must
# come from the target vocabulary.
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab.stoi["<pad>"])

for epoch in range(3):
    model.train()
    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)
        # Teacher forcing: feed tokens 0..T-2 and predict tokens 1..T-1.
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        logits = model(src, tgt_input)
        # Flatten (batch, time) so every position is one classification example.
        loss = criterion(logits.reshape(-1, logits.shape[-1]), tgt_output.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Reports the loss of the last mini-batch of the epoch.
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


In [None]:
import torch
import torch.nn.functional as F
import sacrebleu  # This replaces torchtext for BLEU score computation

def generate_translation(model, src, src_vocab, tgt_vocab, max_len=50, beam_width=3, device=None):
    """Decode one source sequence with beam search.

    Args:
        model: module called as ``model(src, tgt_ids)`` that returns logits of
            shape ``(batch, seq_len, vocab_size)``.
        src: source id tensor passed unchanged to *model*.
        src_vocab: unused; kept for call compatibility.
        tgt_vocab: object whose ``stoi`` maps "<sos>" and "<eos>" to ids.
        max_len: maximum number of decoding steps.
        beam_width: number of hypotheses kept after each step.
        device: device for the generated sequence; defaults to ``src.device``.

    Returns:
        1-D tensor of token ids for the best hypothesis, starting with
        ``<sos>`` and ending with ``<eos>`` when the hypothesis finished.
    """
    model.eval()
    if device is None:
        device = src.device

    sos_token = tgt_vocab.stoi["<sos>"]
    eos_token = tgt_vocab.stoi["<eos>"]

    # Each beam is (sequence, cumulative negative log-probability); lower is better.
    beams = [(torch.tensor([sos_token], device=device), 0.0)]

    with torch.no_grad():
        for _ in range(max_len):
            candidates = []
            for seq, score in beams:
                # A hypothesis that already emitted <eos> is complete: carry it
                # over unchanged instead of appending more tokens after <eos>.
                if seq[-1].item() == eos_token:
                    candidates.append((seq, score))
                    continue

                logits = model(src, seq.unsqueeze(0))
                # log_softmax is numerically safer than log(softmax(...)).
                log_probs = F.log_softmax(logits[:, -1, :], dim=-1)

                k = min(beam_width, log_probs.size(-1))
                top_log_probs, top_indices = log_probs.topk(k, dim=-1)

                for log_prob, idx in zip(top_log_probs[0], top_indices[0]):
                    new_seq = torch.cat([seq, idx.unsqueeze(0)])
                    candidates.append((new_seq, score - log_prob.item()))

            # Keep the best `beam_width` hypotheses by cumulative score.
            beams = sorted(candidates, key=lambda beam: beam[1])[:beam_width]

            if all(seq[-1].item() == eos_token for seq, _ in beams):
                break

    # Return the best beam
    return beams[0][0]

def evaluate_translation(model, test_loader, src_vocab, tgt_vocab, device):
    """Beam-search decode every test example, print the pairs and corpus BLEU.

    Args:
        model: translation model passed to ``generate_translation``.
        test_loader: iterable of ``(src, tgt)`` id tensors; only row 0 of each
            ``tgt`` batch is used as the reference, so a batch size of 1 is
            assumed.
        src_vocab: forwarded to ``generate_translation``.
        tgt_vocab: vocabulary whose ``itos`` maps ids to tokens.
        device: device the batches are moved to.
    """
    model.eval()
    special_tokens = {"<pad>", "<sos>", "<eos>"}

    def to_text(ids):
        # Drop framing/padding tokens so BLEU is computed over real words only.
        tokens = (tgt_vocab.itos[i] for i in ids)
        return " ".join(tok for tok in tokens if tok not in special_tokens)

    predictions, references = [], []

    with torch.no_grad():
        for src, tgt in test_loader:
            src = src.to(device)
            tgt = tgt.to(device)

            # Generate translation output using beam search
            output_sequence = generate_translation(model, src, src_vocab, tgt_vocab, device=device)

            predictions.append(to_text(output_sequence.tolist()))
            references.append(to_text(tgt[0].tolist()))

    # sacrebleu expects a list of reference *streams*, each stream parallel to
    # the hypotheses; with one reference per sentence that is [references].
    bleu = sacrebleu.corpus_bleu(predictions, [references])

    # Print prediction and reference pairs
    for i, (pred, ref) in enumerate(zip(predictions, references)):
        print(f"\n[{i+1}]")
        print(f"Prediction: {pred}")
        print(f"Reference : {ref}")

    # Output the BLEU score
    print(f"\nCorpus BLEU score: {bleu.score:.2f}")


In [None]:
# Decode the held-out pairs and print each prediction/reference plus corpus BLEU.
evaluate_translation(model, test_loader, src_vocab, tgt_vocab, device)


In [None]:
def evaluate_translation(model, test_loader, src_vocab, tgt_vocab, device):
    """Greedy-decode every test example, print the pairs and corpus BLEU.

    Args:
        model: module called as ``model(src, tgt_ids)`` that returns logits of
            shape ``(batch, seq_len, vocab_size)``.
        test_loader: iterable of ``(src, tgt)`` id tensors; decoding assumes a
            batch size of 1 (``next_token.item()`` and ``tgt[0]`` are used).
        src_vocab: unused; kept for call compatibility.
        tgt_vocab: vocabulary providing ``stoi`` and ``decode``.
        device: device the batches are moved to.
    """
    model.eval()
    sos_id = tgt_vocab.stoi["<sos>"]
    eos_id = tgt_vocab.stoi["<eos>"]
    predictions, references = [], []

    with torch.no_grad():
        for src, tgt in test_loader:
            src = src.to(device)
            tgt = tgt.to(device)

            # Start from <sos> and append the most likely token each step.
            tgt_input = torch.full((1, 1), sos_id, dtype=torch.long, device=device)
            output_sentence = []

            for _ in range(20):  # max length
                logits = model(src, tgt_input)
                next_token = logits[:, -1, :].argmax(-1).unsqueeze(1)
                tgt_input = torch.cat([tgt_input, next_token], dim=1)

                if next_token.item() == eos_id:
                    break
                output_sentence.append(next_token.item())

            # decode() strips <sos>/<eos>/<pad>, so prediction and reference
            # are compared on real words only.
            predictions.append(tgt_vocab.decode(output_sentence))
            references.append(tgt_vocab.decode(tgt[0].tolist()))

    # sacrebleu expects a list of reference *streams*, each stream parallel to
    # the hypotheses; with one reference per sentence that is [references].
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    for i, (pred, ref) in enumerate(zip(predictions, references)):
        print(f"\n[{i+1}]")
        print(f"Prediction: {pred}")
        print(f"Reference : {ref}")

    print(f"\nCorpus BLEU score: {bleu.score:.2f}")


# ------HYBRID EMBEDDINGS

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class PhonemeMorphologyHybridModel(nn.Module):
    """LSTM encoder-decoder over concatenated word, phoneme, morphology and BERT features.

    Args:
        vocab_size: size of every embedding table and of the output layer.
        phoneme_emb_dim: width of the phoneme embeddings.
        morphology_emb_dim: width of the morphology embeddings.
        embedding_dim: width of the word embeddings.
        hidden_dim: LSTM hidden size (encoder and decoder).
        bert_model_name: checkpoint name passed to ``BertModel.from_pretrained``.

    NOTE: the phoneme and morphology tables are indexed with the same ``src``
    ids as the word table, and those ids are also fed to BERT as ``input_ids``.
    """

    def __init__(self, vocab_size, phoneme_emb_dim, morphology_emb_dim, embedding_dim, hidden_dim, bert_model_name="bert-base-uncased"):
        super().__init__()

        # Word embedding layer (e.g., GloVe, FastText)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Phoneme embedding layer
        self.phoneme_embeddings = nn.Embedding(vocab_size, phoneme_emb_dim)

        # Morphology-aware embedding layer
        self.morphology_embeddings = nn.Embedding(vocab_size, morphology_emb_dim)

        # BERT model for contextual embeddings
        self.bert = BertModel.from_pretrained(bert_model_name)
        # Read the width from the checkpoint instead of assuming 768.
        bert_dim = self.bert.config.hidden_size

        # Target-side embedding: turns target token ids into the float vectors
        # the decoder LSTM needs (an LSTM cannot consume raw integer ids).
        self.tgt_embeddings = nn.Embedding(vocab_size, hidden_dim)

        # LSTM encoder-decoder
        self.encoder = nn.LSTM(input_size=embedding_dim + phoneme_emb_dim + morphology_emb_dim + bert_dim,  # Word + Phoneme + Morphology + BERT embeddings
                               hidden_size=hidden_dim,
                               num_layers=2,
                               batch_first=True)

        self.decoder = nn.LSTM(input_size=hidden_dim,
                               hidden_size=hidden_dim,
                               num_layers=2,
                               batch_first=True)

        # Output layer
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, src, tgt):
        """Return logits of shape ``(batch, tgt_len, vocab_size)``.

        Args:
            src: ``(batch, src_len)`` integer token ids.
            tgt: either ``(batch, tgt_len)`` integer token ids, which are
                embedded here, or an already-embedded float tensor of shape
                ``(batch, tgt_len, hidden_dim)``, which is used as is.
        """
        word_embeddings = self.word_embeddings(src)
        phoneme_embeddings = self.phoneme_embeddings(src)
        morphology_embeddings = self.morphology_embeddings(src)

        # Contextual embeddings: first element of the BERT output.
        bert_inputs = self.bert(input_ids=src)[0]

        # Concatenate all four feature sets along the feature axis.
        combined_embeddings = torch.cat((word_embeddings, phoneme_embeddings, morphology_embeddings, bert_inputs), dim=-1)

        # Encode; the final states initialise the decoder.
        encoder_output, (hidden, cell) = self.encoder(combined_embeddings)

        if not torch.is_floating_point(tgt):
            tgt = self.tgt_embeddings(tgt)
        decoder_output, _ = self.decoder(tgt, (hidden, cell))

        # Output predictions
        output = self.fc_out(decoder_output)
        return output


# Example of how to use the model
# NOTE: this rebinds the module-level name `model`, replacing whichever model
# an earlier cell stored under that name.
vocab_size = 10000  # Adjust based on your data
phoneme_emb_dim = 50  # Dimensionality of phoneme embeddings
morphology_emb_dim = 100  # Dimensionality of morphology-aware embeddings
embedding_dim = 300  # For GloVe or other pre-trained embeddings
hidden_dim = 512

# bert_model_name is left at its default ("bert-base-uncased").
model = PhonemeMorphologyHybridModel(vocab_size, phoneme_emb_dim, morphology_emb_dim, embedding_dim, hidden_dim)


In [None]:
def evaluate_translation(model, test_loader, device=None):
    """Run *model* over *test_loader* and print a corpus BLEU score.

    Args:
        model: module called as ``model(src_indices, tgt_indices)`` that
            returns logits of shape ``(batch, tgt_len, vocab_size)``.
        test_loader: iterable of batches, each either ``(src, tgt)`` or
            ``(src, phoneme, morphology, tgt)`` index tensors.
        device: device the batches are moved to; defaults to CUDA when
            available, otherwise CPU.

    Token ids are turned into words with the module-level ``tgt_vocab``, which
    may be an object with an ``itos`` list or a plain ``{token: id}`` dict.

    NOTE(review): the model is fed the reference target itself and its argmax
    is scored against that same target, so this is a teacher-forced score,
    not free-running decoding -- confirm this is intended.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()

    if hasattr(tgt_vocab, "itos"):
        id_to_token = dict(enumerate(tgt_vocab.itos))
    else:
        id_to_token = {idx: token for token, idx in tgt_vocab.items()}

    def to_text(ids):
        return " ".join(id_to_token.get(i, "<unk>") for i in ids)

    predictions = []
    references = []

    with torch.no_grad():
        for batch in test_loader:
            if len(batch) == 2:
                src_indices, tgt_indices = batch
            else:
                # Phoneme and morphology ids are unpacked but not used: the
                # model's forward takes only (src, tgt).
                src_indices, _phoneme_indices, _morphology_indices, tgt_indices = batch

            src_indices = src_indices.to(device)
            tgt_indices = tgt_indices.to(device)

            # Forward pass, then the most likely token at each position.
            output = model(src_indices, tgt_indices)
            predicted_indices = output.argmax(dim=-1)

            # Handle every sentence of the batch separately.
            for pred_row, ref_row in zip(predicted_indices.tolist(), tgt_indices.tolist()):
                predictions.append(to_text(pred_row))
                references.append(to_text(ref_row))

    # sacrebleu takes a list of reference streams parallel to the hypotheses
    # and reports the score on a 0-100 scale.
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    print(f"Corpus BLEU score: {bleu.score:.2f}%")



In [None]:
class TranslationDataset(torch.utils.data.Dataset):
    """Four parallel sequences (source, phoneme, morphology, target) indexed together."""

    def __init__(self, src_data, phoneme_data, morphology_data, tgt_data):
        self.src_data = src_data
        self.phoneme_data = phoneme_data
        self.morphology_data = morphology_data
        self.tgt_data = tgt_data

    def __len__(self):
        """Length of the source sequence."""
        return len(self.src_data)

    def __getitem__(self, idx):
        """Return ``(src, phoneme, morphology, tgt)`` for position *idx*."""
        return (
            self.src_data[idx],
            self.phoneme_data[idx],
            self.morphology_data[idx],
            self.tgt_data[idx],
        )


In [None]:
# Assuming you have the data loaded as numpy arrays or lists
# NOTE(review): phoneme_data and morphology_data are not defined in any cell
# shown here, and src_data / tgt_data are only defined in a later cell --
# confirm they exist before running this cell.
test_dataset = TranslationDataset(src_data, phoneme_data, morphology_data, tgt_data)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)


# ------TESTING 5

In [None]:
# Example vocab dictionaries with an '<unk>' token for unknown words.
# Ids are assigned by position in each token list.
# NOTE: this rebinds src_vocab / tgt_vocab to plain {token: id} dicts.
src_vocab = {
    token: idx
    for idx, token in enumerate(
        ['<sos>', '<eos>', 'i', 'saw', 'her', 'at', 'the', 'market', '<unk>']
    )
}

tgt_vocab = {
    token: idx
    for idx, token in enumerate(
        ['<sos>', '<eos>', 'yo', 'fui', 'a', 'la', 'tienda', '<unk>']
    )
}

# Example source and target data (already split into words)
src_data = [
    ['i', 'saw', 'her', 'at', 'the', 'market'],
    ['are', 'you', 'going', 'to', 'the', 'store'],
]

tgt_data = [
    ['yo', 'fui', 'a', 'la', 'tienda'],
    ['estás', 'yendo', 'a', 'la', 'tienda'],
]

# Convert sentences to indices; words missing from a vocab fall back to '<unk>'.
src_unk, tgt_unk = src_vocab['<unk>'], tgt_vocab['<unk>']

src_data_indices = []
for sentence in src_data:
    src_data_indices.append([src_vocab.get(word, src_unk) for word in sentence])

tgt_data_indices = []
for sentence in tgt_data:
    tgt_data_indices.append([tgt_vocab.get(word, tgt_unk) for word in sentence])

print("Source Data Indices:", src_data_indices)
print("Target Data Indices:", tgt_data_indices)


In [None]:
# Word -> index vocabularies. The original draft ended each dict with a bare
# `...` placeholder, which is a syntax error inside a dict literal; the dicts
# are now complete and end with an '<unk>' entry.
src_vocab = {'<sos>': 0, '<eos>': 1, 'i': 2, 'saw': 3, 'her': 4, 'at': 5, 'the': 6, 'market': 7, '<unk>': 8}
tgt_vocab = {'<sos>': 0, '<eos>': 1, 'yo': 2, 'fui': 3, 'a': 4, 'la': 5, 'tienda': 6, '<unk>': 7}

# Convert sentences to indices; words missing from a vocab map to '<unk>'
# instead of raising KeyError.
src_data_indices = [[src_vocab.get(word, src_vocab['<unk>']) for word in sentence] for sentence in src_data]
tgt_data_indices = [[tgt_vocab.get(word, tgt_vocab['<unk>']) for word in sentence] for sentence in tgt_data]


In [None]:
# Define the TranslationDataset class
class TranslationDataset(torch.utils.data.Dataset):
    """Index-aligned view over source, phoneme, morphology and target data."""

    def __init__(self, src_data, phoneme_data, morphology_data, tgt_data):
        self.src_data = src_data
        self.phoneme_data = phoneme_data
        self.morphology_data = morphology_data
        self.tgt_data = tgt_data

    def __len__(self):
        """Number of examples, taken from the source data."""
        return len(self.src_data)

    def __getitem__(self, idx):
        """Return the *idx*-th element of each of the four sequences, in constructor order."""
        columns = (self.src_data, self.phoneme_data, self.morphology_data, self.tgt_data)
        src_idx, phoneme_idx, morphology_idx, tgt_idx = (column[idx] for column in columns)
        return src_idx, phoneme_idx, morphology_idx, tgt_idx

# Assuming you have the data loaded and converted to indices
# NOTE(review): phoneme_data and morphology_data are not defined in any cell
# shown here -- confirm they exist. The index lists have different lengths per
# sentence, so batching them presumably needs a padding collate_fn -- verify.
test_dataset = TranslationDataset(src_data_indices, phoneme_data, morphology_data, tgt_data_indices)

# Create the DataLoader
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)


In [None]:
# Assuming your model is already defined and loaded.
# Use the GPU only when one is present so the cell also runs on CPU-only
# runtimes, and move the model to the device the batches are sent to.
eval_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(eval_device)
evaluate_translation(model, test_loader, device=eval_device)


In [None]:
def forward(self, src, tgt):
    """Encode ``src`` and run the decoder on ``tgt`` from the encoder's final state.

    Word and phoneme embeddings of ``src`` are concatenated along the last
    dimension and fed to ``self.encoder``, which must return
    ``(output, (hidden, cell))``. The last layer's hidden and cell states
    are then handed to ``self.decoder`` together with ``tgt``.

    Args:
        src: Index tensor looked up in both embedding tables.
        tgt: Decoder input, passed through to ``self.decoder`` unchanged.

    Returns:
        The first element of the tuple returned by ``self.decoder``.
    """
    # Word-level embeddings
    word_embeddings = self.word_embeddings(src)

    # NOTE(review): the phoneme table is indexed with the same ids as the
    # word table; confirm this is intended rather than separate phoneme ids.
    phoneme_embeddings = self.phoneme_embeddings(src)

    # Combine word and phoneme embeddings along the feature dimension
    combined_embeddings = torch.cat([word_embeddings, phoneme_embeddings], dim=-1)

    # Only the final recurrent state is needed; the per-step outputs are unused
    _, (hidden, cell) = self.encoder(combined_embeddings)

    # Assumes an nn.LSTM-style state whose leading dimension indexes layers.
    # Taking the last entry gives a 2D state for any batch size and any number
    # of stacked layers, and does not depend on whether the encoder is
    # batch-first. For a single-layer encoder this equals the previous
    # squeeze(0) / view(batch_size, hidden_size) result.
    # NOTE(review): for a bidirectional encoder this selects only the last
    # direction; confirm how the decoder should be initialised in that case.
    hidden = hidden[-1]
    cell = cell[-1]

    # Pass the hidden and cell states to the decoder
    decoder_output, _ = self.decoder(tgt, (hidden, cell))

    # Output predictions
    return decoder_output


In [None]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# Custom Dataset for Translation
class TranslationDataset(torch.utils.data.Dataset):
    """Pairs each source sequence with the target sequence at the same position.

    The dataset length is the length of ``src_data``.
    """

    def __init__(self, src_data, tgt_data):
        self.src_data, self.tgt_data = src_data, tgt_data

    def __len__(self):
        return len(self.src_data)

    def __getitem__(self, idx):
        source = self.src_data[idx]
        target = self.tgt_data[idx]
        return source, target

# Custom collate function to pad sequences
def collate_fn(batch):
    """Pad a batch of (source, target) index sequences to rectangular tensors.

    Args:
        batch: Iterable of ``(src_sequence, tgt_sequence)`` pairs, each a
            sequence of integer token ids.

    Returns:
        ``(src_padded, tgt_padded)``: batch-first tensors in which shorter
        sequences are right-padded to the longest sequence in the batch.
    """
    src_batch, tgt_batch = zip(*batch)

    # Prefer a dedicated '<pad>' id when the vocabulary defines one, so that
    # padding is distinguishable from genuinely unknown tokens; otherwise
    # keep the previous behaviour of padding with the '<unk>' id.
    src_pad_id = src_vocab['<pad>'] if '<pad>' in src_vocab else src_vocab['<unk>']
    tgt_pad_id = tgt_vocab['<pad>'] if '<pad>' in tgt_vocab else tgt_vocab['<unk>']

    # Pad the source and target sequences
    src_padded = pad_sequence(
        [torch.tensor(seq) for seq in src_batch],
        batch_first=True,
        padding_value=src_pad_id,
    )
    tgt_padded = pad_sequence(
        [torch.tensor(seq) for seq in tgt_batch],
        batch_first=True,
        padding_value=tgt_pad_id,
    )

    return src_padded, tgt_padded

# Toy parallel corpus: English source sentences and Spanish target sentences
src_data = [
    ['i', 'saw', 'her', 'at', 'the', 'market'],
    ['are', 'you', 'going', 'to', 'the', 'store'],
]

tgt_data = [
    ['yo', 'fui', 'a', 'la', 'tienda'],
    ['estás', 'yendo', 'a', 'la', 'tienda'],
]

# Replace each token by its vocabulary id, using the '<unk>' id for tokens
# that the vocabulary does not contain
src_data_indices = [
    [src_vocab.get(token, src_vocab['<unk>']) for token in tokens]
    for tokens in src_data
]
tgt_data_indices = [
    [tgt_vocab.get(token, tgt_vocab['<unk>']) for token in tokens]
    for tokens in tgt_data
]

# Wrap the encoded pairs and batch them (two per batch, original order),
# padding each batch through collate_fn
dataset = TranslationDataset(src_data_indices, tgt_data_indices)
test_loader = DataLoader(
    dataset,
    batch_size=2,
    shuffle=False,
    collate_fn=collate_fn,
)

# Define the evaluate_translation function
def evaluate_translation(model, test_loader, device, loss_fn=None):
    """Compute the mean per-batch loss of ``model`` over ``test_loader``.

    Args:
        model: Module called as ``model(src_indices, tgt_indices)``.
        test_loader: Iterable yielding ``(src_indices, tgt_indices)`` tensor
            pairs.
        device: Device the model and every batch are moved to.
        loss_fn: Callable ``loss_fn(output, tgt_indices)`` returning a scalar
            tensor. When omitted, the module-level ``loss_function`` is used.

    Returns:
        The average loss per batch as a float, or ``nan`` when the loader
        yields no batches.
    """
    if loss_fn is None:
        # Keep working for callers that rely on the global loss function.
        loss_fn = loss_function

    model.eval()  # Set the model to evaluation mode
    model.to(device)

    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for src_indices, tgt_indices in test_loader:
            # Move input tensors to the same device as the model
            src_indices = src_indices.to(device)
            tgt_indices = tgt_indices.to(device)

            # NOTE(review): the output is passed to the loss unchanged;
            # confirm its shape matches what the loss expects.
            output = model(src_indices, tgt_indices)
            total_loss += loss_fn(output, tgt_indices).item()
            num_batches += 1

    # Batches are counted while iterating, so loaders without __len__ work
    # and an empty loader no longer triggers a division by zero.
    if num_batches == 0:
        print("No batches to evaluate.")
        return float("nan")

    average_loss = total_loss / num_batches
    # The reported value is the mean over batches, so label it accordingly.
    print(f"Average loss: {average_loss}")
    return average_loss

# With a model already built and loaded, evaluate it on the GPU when one is
# available and on the CPU otherwise.
device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
evaluate_translation(model=model, test_loader=test_loader, device=device)
