# Phoneme Embeddings

In [1]:
# Install phonemizer into the running kernel's environment (%pip targets the
# kernel's interpreter). Pinned to the version this notebook was run with; -q
# keeps the download log out of the saved notebook.
%pip install -q phonemizer==3.3.0

Collecting phonemizer
  Downloading phonemizer-3.3.0-py3-none-any.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting segments (from phonemizer)
  Downloading segments-2.3.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting dlinfo (from phonemizer)
  Downloading dlinfo-2.0.0-py3-none-any.whl.metadata (1.1 kB)
Collecting csvw>=1.5.6 (from segments->phonemizer)
  Downloading csvw-3.5.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting isodate (from csvw>=1.5.6->segments->phonemizer)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Collecting rfc3986<2 (from csvw>=1.5.6->segments->phonemizer)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting language-tags (from csvw>=1.5.6->segments->phonemizer)
  Downloading language_tags-1.2.0-py3-none-any.whl.metadata (2.1 kB)
Collecting rdflib (from csvw>=1.5.6->segments->phonemizer)
  Downloading 

# Morphology Embeddings

In [2]:
# Install Morfessor into the running kernel's environment (%pip targets the
# kernel's interpreter). Pinned to the version this notebook was run with; -q
# keeps the download log out of the saved notebook.
%pip install -q morfessor==2.0.6


Collecting morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Installing collected packages: morfessor
Successfully installed morfessor-2.0.6
Note: you may need to restart the kernel to use updated packages.


# Environment Setup

In [3]:
# Python dependencies: %pip (rather than !pip) installs into the interpreter
# the kernel is actually running; versions are pinned for reproducibility.
%pip install -q phonemizer==3.3.0 morfessor==2.0.6
# System dependency: the espeak-ng package. -y answers the confirmation prompt
# so the install cannot stall or abort when run non-interactively; -qq trims the log.
!apt-get install -y -qq espeak-ng





The following additional packages will be installed:
  espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
The following NEW packages will be installed:
  espeak-ng espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
0 upgraded, 5 newly installed, 0 to remove and 122 not upgraded.
Need to get 4,526 kB of archives.
After this operation, 11.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libpcaudio0 amd64 1.1-6build2 [8,956 B]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libsonic0 amd64 0.2.0-11build1 [10.3 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 espeak-ng-data amd64 1.50+dfsg-10ubuntu0.1 [3,956 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libespeak-ng1 amd64 1.50+dfsg-10ubuntu0.1 [207 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 espeak-ng amd64 1.50+dfsg-10ubuntu0.1 [343 kB]
Fetched 4,526 kB in 0s (12.7 MB/s)
Selecting previously u

# Combined Embedding Layer

In [4]:
import torch
import torch.nn as nn
from transformers import BertModel

# Example values (replace with actual numbers from your dataset)
num_phonemes = 100  # Adjust based on how many unique phonemes you have
num_morphs = 100    # Adjust based on your morph units
num_labels = 3      # E.g., 3 classes: Standard Malay, Sabah dialect, Code-switch

class CustomBertWithPhonemeMorph(nn.Module):
    """Sentence classifier that adds phoneme and morph features to mBERT's [CLS] vector.

    Every token position carries a fixed-size group of phoneme ids and a
    fixed-size group of morph ids. Each group is embedded and averaged into
    one vector per token; the per-token vectors are then averaged over the
    sequence (ignoring positions masked out by ``attention_mask``) and added
    to the [CLS] hidden state before dropout and the linear classifier.

    Args:
        phoneme_vocab_size: Number of rows in the phoneme embedding table.
            Defaults to the module-level ``num_phonemes``.
        morph_vocab_size: Number of rows in the morph embedding table.
            Defaults to the module-level ``num_morphs``.
        num_classes: Number of output classes. Defaults to the module-level
            ``num_labels``.
        dropout: Dropout probability applied before the classifier.
    """

    def __init__(self, phoneme_vocab_size=None, morph_vocab_size=None,
                 num_classes=None, dropout=0.1):
        super().__init__()
        # Resolve defaults at call time so editing the module-level constants
        # and re-instantiating behaves exactly like the zero-argument original.
        if phoneme_vocab_size is None:
            phoneme_vocab_size = num_phonemes
        if morph_vocab_size is None:
            morph_vocab_size = num_morphs
        if num_classes is None:
            num_classes = num_labels

        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        # Read the width from the loaded encoder instead of hard-coding 768,
        # so the extra embeddings always match the [CLS] vector they are added to.
        hidden_size = self.bert.config.hidden_size
        self.phoneme_emb = nn.Embedding(phoneme_vocab_size, hidden_size)
        self.morph_emb = nn.Embedding(morph_vocab_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_classes)

    @staticmethod
    def _masked_mean(token_vecs, attention_mask):
        """Average ``[batch, seq_len, dim]`` vectors over the sequence axis.

        Positions where ``attention_mask`` is 0 are excluded from the average.
        With no mask, every position counts equally.

        Raises:
            ValueError: If the mask shape is not ``[batch, seq_len]``.
        """
        if attention_mask is None:
            return token_vecs.mean(dim=1)
        if tuple(attention_mask.shape) != tuple(token_vecs.shape[:2]):
            raise ValueError(
                "attention_mask shape "
                f"{tuple(attention_mask.shape)} does not match feature shape "
                f"{tuple(token_vecs.shape[:2])} (expected [batch, seq_len])"
            )
        weights = attention_mask.unsqueeze(-1).to(token_vecs.dtype)
        summed = (token_vecs * weights).sum(dim=1)
        # clamp guards against division by zero for a fully masked row
        counts = weights.sum(dim=1).clamp(min=1.0)
        return summed / counts

    def forward(self, input_ids, phoneme_ids, morph_ids, attention_mask=None):
        """Return classification logits of shape ``[batch, num_classes]``.

        Args:
            input_ids: Token ids passed to BERT, ``[batch, seq_len]``.
            phoneme_ids: Phoneme ids, ``[batch, seq_len, phonemes_per_token]``.
            morph_ids: Morph ids, ``[batch, seq_len, morphs_per_token]``.
            attention_mask: Optional ``[batch, seq_len]`` mask (1 = real token),
                forwarded to BERT and used to exclude padding when pooling the
                phoneme and morph features.
        """
        # [batch, seq_len, units, dim] -> [batch, seq_len, dim]
        phoneme_tok = self.phoneme_emb(phoneme_ids).mean(dim=2)  # mean over phoneme units
        morph_tok = self.morph_emb(morph_ids).mean(dim=2)        # mean over morph units

        # BERT output
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        cls_emb = outputs.last_hidden_state[:, 0, :]  # [CLS] token representation

        # Pool over the whole sequence. Taking only position 0 would keep just
        # the features aligned with the [CLS] slot and drop every real token.
        phoneme_vec = self._masked_mean(phoneme_tok, attention_mask)
        morph_vec = self._masked_mean(morph_tok, attention_mask)

        # Combine embeddings: all three are [batch, dim]
        combined = cls_emb + phoneme_vec + morph_vec
        logits = self.classifier(self.dropout(combined))
        return logits


2025-04-21 16:25:29.362367: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745252729.584206      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745252729.649160      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Test

# Step 1: Dummy Test Dataset

In [5]:
# Toy corpus: one sentence per target class, each paired with its class id.
# Class ids: 0 = Malay, 1 = Dialect, 2 = Code-Switch
_samples = [
    ("Saya pergi ke pasar", 0),                   # Standard Malay
    ("Aku mau pigi kedai bah", 1),                # Sabah dialect
    ("Saya want to buy makanan from kedai", 2),   # Code-switched Malay-English
]

# Split the pairs into the parallel lists the rest of the notebook uses.
texts = [sentence for sentence, _class_id in _samples]
labels = [class_id for _sentence, class_id in _samples]


# Step 2: Tokenizer

In [6]:
from transformers import BertTokenizer

# Load the multilingual BERT tokenizer (same checkpoint name as the models).
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Encode the whole batch at once: pad to a common length, truncate over-long
# inputs, and return PyTorch tensors.
encode_options = dict(padding=True, truncation=True, return_tensors="pt")
inputs = tokenizer(texts, **encode_options)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

# Step 3: Fake Phoneme & Morph IDs

In [7]:
import torch

batch_size, seq_len = inputs['input_ids'].shape

# Randomly simulate phoneme and morph IDs (normally derived from actual phoneme/morph analyzers).
# A locally seeded generator makes the dummy ids identical on every run
# without altering PyTorch's global random state.
id_rng = torch.Generator().manual_seed(0)

# The upper bounds (exclusive) are the embedding-table sizes, so every id is a
# valid row index even if num_phonemes / num_morphs are changed.
phoneme_ids = torch.randint(0, num_phonemes, (batch_size, seq_len, 5), generator=id_rng)  # 5 phoneme units per word
morph_ids = torch.randint(0, num_morphs, (batch_size, seq_len, 3), generator=id_rng)      # 3 morphemes per word


# Step 4: Define Models

In [8]:
class BaselineModel(nn.Module):
    """Plain mBERT sentence classifier used as the comparison baseline.

    A single linear layer maps the [CLS] hidden state to class logits.

    Args:
        num_labels: Number of output classes (default 3, the value the
            notebook previously hard-coded).
    """

    def __init__(self, num_labels=3):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        # Take the input width from the loaded encoder rather than assuming 768.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return logits of shape ``[batch, num_labels]``.

        Args:
            input_ids: Token ids passed to BERT, ``[batch, seq_len]``.
            attention_mask: Optional mask forwarded to BERT unchanged.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        cls_emb = outputs.last_hidden_state[:, 0, :]  # [CLS] token representation
        return self.classifier(cls_emb)

# Build both classifiers, baseline first: the plain mBERT model defined above
# and the phoneme/morph-augmented model from the earlier cell.
baseline_model, custom_model = BaselineModel(), CustomBertWithPhonemeMorph()


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

# Step 5: Forward Pass & Comparison

In [9]:
# Switch both wrappers to evaluation mode before inference. A freshly built
# nn.Module starts in training mode, so the nn.Dropout layer in the custom
# model's head would otherwise randomly zero features and make its logits
# differ from run to run.
baseline_model.eval()
custom_model.eval()

# Inference only: no_grad skips building the autograd graph.
with torch.no_grad():
    # Forward pass with baseline
    baseline_logits = baseline_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])

    # Forward pass with phoneme+morph model
    advanced_logits = custom_model(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        phoneme_ids=phoneme_ids,
        morph_ids=morph_ids
    )


# Step 6: Output Predictions

In [10]:
import torch.nn.functional as F

# Normalise each row of logits into class probabilities, then take the index
# of the most probable class as the prediction.
baseline_preds = F.softmax(baseline_logits, dim=1).argmax(dim=1)
advanced_preds = F.softmax(advanced_logits, dim=1).argmax(dim=1)

# Print the reference labels followed by each model's predictions.
report = (
    ("Ground Truth:", labels),
    ("Baseline Predictions:", baseline_preds.tolist()),
    ("Phoneme+Morph Predictions:", advanced_preds.tolist()),
)
for caption, values in report:
    print(caption, values)


Ground Truth: [0, 1, 2]
Baseline Predictions: [0, 0, 0]
Phoneme+Morph Predictions: [0, 0, 0]
