In [None]:
import pandas as pd
import regex as re
import json
from tqdm import tqdm

In [None]:
# Load the lyrics spreadsheet; the preview below shows the columns
# movie, telugu_lyrics, lyricist and year (plus an unnamed index column).
# NOTE(review): rows 0 and 4 of the preview look identical — check for duplicate songs.
data = pd.read_excel('telugu_lyrics.xlsx')
data.head()

Unnamed: 0,movie,telugu_lyrics,lyricist,year
0,doosukeltha,2010 summerలో రేణిగుంట స్టేషన్\nplatform 2 లో ...,Ramajogayya Sastri,2013
1,doosukeltha,ఉగ్గు పాల రొజుల్లోనే సిగ్గు శరం వదిలేసాడే\nనిక...,Ramajogayya Sastri,2013
2,doosukeltha,"\n\nModati sari\nSingers: Rahul Sipligunj, Sud...",Sreemani,2013
3,doosukeltha,సూదిమందే గుచ్చినావే గుండెల్లో సుందరాంగి\nప్రేమ...,Sreemani,2013
4,doosukeltha,2010 summerలో రేణిగుంట స్టేషన్\nplatform 2 లో ...,Ramajogayya Sastri,2013


In [None]:
# Keep only the lyrics column, coerced to Python strings.
# NOTE(review): astype(str) presumably turns missing cells into the literal
# text 'nan' — confirm the column has no empty rows.
lyrics = data['telugu_lyrics'].astype(str)

### Understanding Telugu Text

In [None]:
# utf-8 encoding on Sample text
# Take the first 100 Python characters of the first song and list the value of
# every byte of their UTF-8 encoding.
text = lyrics[0][:100]
enc_text = list(text.encode('utf-8'))
print(text, '\n\nUtf-8 Encoded:', enc_text, end='\n\n')

# enc_text holds one element per byte, so this compares character count with byte count.
print('Length increased from', len(text), '->', len(enc_text))

2010 summerలో రేణిగుంట స్టేషన్
platform 2 లో మెరిసిందో అందాల సెన్సేషన్
ఒక లుక్ ఏసా తను చూసేలా టక్కున 

Utf-8 Encoded: [50, 48, 49, 48, 32, 115, 117, 109, 109, 101, 114, 224, 176, 178, 224, 177, 139, 32, 224, 176, 176, 224, 177, 135, 224, 176, 163, 224, 176, 191, 224, 176, 151, 224, 177, 129, 224, 176, 130, 224, 176, 159, 32, 224, 176, 184, 224, 177, 141, 224, 176, 159, 224, 177, 135, 224, 176, 183, 224, 176, 168, 224, 177, 141, 10, 112, 108, 97, 116, 102, 111, 114, 109, 32, 50, 32, 224, 176, 178, 224, 177, 139, 32, 224, 176, 174, 224, 177, 134, 224, 176, 176, 224, 176, 191, 224, 176, 184, 224, 176, 191, 224, 176, 130, 224, 176, 166, 224, 177, 139, 32, 224, 176, 133, 224, 176, 130, 224, 176, 166, 224, 176, 190, 224, 176, 178, 32, 224, 176, 184, 224, 177, 134, 224, 176, 168, 224, 177, 141, 224, 176, 184, 224, 177, 135, 224, 176, 183, 224, 176, 168, 224, 177, 141, 10, 224, 176, 146, 224, 176, 149, 32, 224, 176, 178, 224, 177, 129, 224, 176, 149, 224, 177, 141, 194, 160, 224, 176, 143, 224

In [None]:
# single character analysis
# text[11] is one Python character; the output below shows it encodes to 3 bytes.
print('1 python char:',text[11])
print('1 python char, in utf-8:', list(text[11].encode('utf-8')), end='\n\n')

# text[11:13] is two Python characters that render as one visible Telugu
# character; the output below shows 6 bytes.
print('1 telugu char:',text[11:13])
print('1 telugu char, in utf-8:', list(text[11:13].encode('utf-8')))

1 python char: ల
1 python char, in utf-8: [224, 176, 178]

1 telugu char: లో
1 telugu char, in utf-8: [224, 176, 178, 224, 177, 139]


* In this sample, 1 Telugu character (లో) = 2 Python chars (Unicode code points) = 6 UTF-8 bytes

In [None]:
# Dataset Length (characters & utf-8 numbers)
# Join and encode the corpus once instead of rebuilding it for every print.
corpus = ''.join(lyrics)
n_chars = len(corpus)
# len() of a bytes object already counts bytes; no need to copy it into a list.
n_bytes = len(corpus.encode('utf-8'))

print("Total Python Chars in Dataset:", n_chars)
print("Total Utf-8 Chars in Dataset:", n_bytes, end='\n\n')

print(f"Seq length scaled by {n_bytes/n_chars:.2f}X")

Total Python Chars in Dataset: 635860
Total Utf-8 Chars in Dataset: 1590212

Seq length scaled by 2.50X


### Character Level Tokenization (Python char)

In [None]:
class CharLevelTokenizer:
    """Tokenizer that maps every distinct Python character to one integer id.

    Ids are assigned by ``fit`` in sorted character order. The vocabulary can
    be exported with ``get_vocab`` / ``save_vocab`` and restored with
    ``set_vocab``.
    """

    def __init__(self):
        self.char_to_idx = {}  # Character to index mapping
        self.idx_to_char = {}  # Index to character mapping
        self.vocab_size = 0

    def fit(self, texts: list):
        """Build the vocabulary from an iterable of strings.

        Args:
            texts: Strings whose characters make up the vocabulary. They are
                concatenated, so any iterable accepted by ``str.join`` works.
        """
        # Get unique characters from all texts; sorting makes ids deterministic
        unique_chars = sorted(set(''.join(texts)))

        # Create mappings
        self.char_to_idx = {char: idx for idx, char in enumerate(unique_chars)}
        self.idx_to_char = {idx: char for idx, char in enumerate(unique_chars)}
        self.vocab_size = len(unique_chars)

    def encode(self, text):
        """Convert text to a list of token indices.

        Raises:
            KeyError: If ``text`` contains a character that is not in the vocabulary.
        """
        return [self.char_to_idx[char] for char in text]

    def decode(self, indices):
        """Convert an iterable of token indices back to text.

        Raises:
            KeyError: If an index is not in the vocabulary.
        """
        return ''.join(self.idx_to_char[idx] for idx in indices)

    def get_vocab(self) -> dict:
        """Return the vocabulary as a dict with keys
        'char_to_idx', 'idx_to_char' and 'vocab_size'."""
        return {
            'char_to_idx': self.char_to_idx,
            'idx_to_char': self.idx_to_char,
            'vocab_size': self.vocab_size
        }

    def save_vocab(self, filename: str):
        """Save the vocabulary as <filename>.json (UTF-8).

        Note that JSON stores the integer keys of 'idx_to_char' as strings;
        ``set_vocab`` converts them back.
        """
        vocab = self.get_vocab()

        with open(f"{filename}.json", 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII characters readable in the file,
            # matching BPETokenizer.save_vocab; json.load yields the same data either way.
            json.dump(vocab, f, ensure_ascii=False, indent=2)

    def set_vocab(self, vocab: dict):
        """Set up the tokenizer with a given vocabulary.

        Args:
            vocab: Dict shaped like the result of ``get_vocab``. It may come
                straight from ``json.load``: index keys given as strings are
                converted back to ``int`` so that ``decode`` keeps working.
        """
        self.char_to_idx = dict(vocab['char_to_idx'])
        # JSON object keys are always strings; restore the integer ids.
        self.idx_to_char = {int(idx): char for idx, char in vocab['idx_to_char'].items()}
        self.vocab_size = vocab['vocab_size']

In [None]:
# Build the character vocabulary from every song, then report how many
# distinct Python characters were found.
tokenizer = CharLevelTokenizer()
tokenizer.fit(lyrics)

print('Vocab size:', tokenizer.vocab_size)

Vocab size: 165


In [None]:
# Tokenizing our Dataset
enc_data = tokenizer.encode(''.join(lyrics))

print('Sample:', enc_data[:100], end = '\n\n')
print('Total Tokens Length:', len(enc_data))

Sample: [14, 12, 13, 12, 1, 69, 71, 63, 63, 55, 68, 126, 146, 1, 125, 143, 112, 136, 100, 138, 83, 108, 1, 131, 148, 108, 143, 130, 117, 148, 0, 66, 62, 51, 70, 56, 65, 68, 63, 1, 14, 1, 126, 146, 1, 123, 142, 125, 136, 131, 136, 83, 115, 146, 1, 85, 83, 115, 135, 126, 1, 131, 142, 117, 148, 131, 143, 130, 117, 148, 0, 95, 98, 1, 126, 138, 98, 148, 80, 93, 131, 135, 80, 113, 117, 138, 1, 103, 139, 131, 143, 126, 135, 1, 108, 98, 148, 98, 138, 117]

Total Tokens Length: 635860


# Byte Pair Encoding Tokenization

In [None]:
import json
import re
from tqdm import tqdm

class BPETokenizer:
    """Character-level byte-pair-encoding (BPE) tokenizer.

    Training starts from the unique Python characters of the corpus and
    repeatedly merges the most frequent adjacent token pair into a new token
    until ``vocab_size`` tokens exist or no pair is left.

    Attributes:
        vocab_size: Target vocabulary size requested by the caller.
        token_to_idx: Maps token string -> integer id.
        idx_to_token: Maps integer id -> token string.
        pattern: Compiled alternation of all tokens, longest first; ``None``
            until ``fit`` or ``load_vocab`` has run.
    """

    def __init__(self, vocab_size=1000):
        self.vocab_size = vocab_size
        self.token_to_idx = {}
        self.idx_to_token = {}
        self.pattern = None

    def _get_stats(self, ids):
        """Return a dict mapping each adjacent pair of ids to its occurrence count."""
        counts = {}
        # zip(ids, ids[1:]) yields the pairs in the same left-to-right order as an index loop,
        # so dict insertion order (and therefore tie-breaking in fit) is unchanged.
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def _merge(self, ids, pair, idx):
        """Return a copy of ``ids`` with every non-overlapping, left-to-right
        occurrence of ``pair`` replaced by the single id ``idx``."""
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
                newids.append(idx)
                i += 2  # skip both members of the merged pair
            else:
                newids.append(ids[i])
                i += 1
        return newids

    def _build_pattern(self):
        """Compile the longest-match-first regex that ``encode`` uses."""
        # Sort tokens by length in descending order to match longest tokens first
        tokens_by_length = sorted(self.token_to_idx.keys(), key=len, reverse=True)
        if not tokens_by_length:
            # An empty alternation would match the empty string at every
            # position; use a pattern that never matches instead.
            return re.compile(r"(?!)")
        return re.compile("|".join(map(re.escape, tokens_by_length)))

    def fit(self, texts, verbose = False):
        """Learn the BPE vocabulary from an iterable of strings.

        Args:
            texts: Strings to train on. They are concatenated without a
                separator, so merges can span the boundary between two texts.
            verbose: When true, print the initial and final vocabulary sizes.

        ``self.vocab_size`` keeps the requested target even if training stops
        early because no pairs are left.
        """
        corpus = ''.join(texts)  # join once; used for both the alphabet and the id sequence

        unique_chars = sorted(set(corpus))
        self.idx_to_token = {idx: char for idx, char in enumerate(unique_chars)}
        self.token_to_idx = {char: idx for idx, char in enumerate(unique_chars)}
        current_vocab_size = len(unique_chars)

        if verbose:
            print(f"Initial vocabulary size: {current_vocab_size}")

        # Convert text to token ids
        ids = [self.token_to_idx[char] for char in corpus]

        # Negative when the alphabet alone already exceeds the target: no merges then
        num_merges = self.vocab_size - current_vocab_size

        for _ in tqdm(range(num_merges), desc="Training BPE"):
            stats = self._get_stats(ids)
            if not stats:
                break

            # Find most frequent pair (first one encountered wins ties)
            pair = max(stats.items(), key=lambda x: x[1])[0]

            # Replace all occurrences of pair with new token
            ids = self._merge(ids, pair, current_vocab_size)

            # Update vocabulary
            merged_token = self.idx_to_token[pair[0]] + self.idx_to_token[pair[1]]
            self.idx_to_token[current_vocab_size] = merged_token
            current_vocab_size += 1

        self.token_to_idx = {token: idx for idx, token in self.idx_to_token.items()}

        if verbose:
            print(f"Final vocabulary size: {current_vocab_size}")

        self.pattern = self._build_pattern()

    def encode(self, text):
        """Convert text to a list of token ids.

        Tokenization is greedy: at each position the longest vocabulary token
        that matches is taken (the learned merge order is not replayed).
        Characters that are not part of any vocabulary token are skipped
        silently, so ``decode(encode(text))`` can be shorter than ``text``.

        Raises:
            ValueError: If neither ``fit`` nor ``load_vocab`` has been called.
        """
        if self.pattern is None:
            raise ValueError("Tokenizer must be fitted before encoding")

        # Find all tokens in text using regex pattern
        tokens = self.pattern.findall(text)

        return [self.token_to_idx[token] for token in tokens]

    def decode(self, ids):
        """Concatenate the token strings of ``ids``.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        return ''.join(self.idx_to_token[idx] for idx in ids)

    def _get_vocab(self):
        """Return the vocabulary as a dict with keys
        'token_to_idx', 'idx_to_token' and 'vocab_size'."""
        return {
            'token_to_idx': self.token_to_idx,
            'idx_to_token': self.idx_to_token,
            'vocab_size': self.vocab_size
        }

    def save_vocab(self, filename: str):
        """Save the vocabulary as <filename>.json (UTF-8, non-ASCII kept as-is)."""
        vocab = self._get_vocab()
        with open(f"{filename}.json", 'w', encoding='utf-8') as f:
            json.dump(vocab, f, ensure_ascii=False, indent=2)

    def load_vocab(self, filename: str):
        """Load a vocabulary written by ``save_vocab`` from <filename>.json
        and rebuild the tokenization pattern."""
        with open(f"{filename}.json", 'r', encoding='utf-8') as f:
            vocab = json.load(f)
        self.token_to_idx = vocab['token_to_idx']
        # JSON object keys are always strings; restore integer ids so that
        # decode() accepts the ids produced by encode().
        self.idx_to_token = {int(idx): token for idx, token in vocab['idx_to_token'].items()}
        self.vocab_size = vocab['vocab_size']

        # Recreate regex pattern
        self.pattern = self._build_pattern()

In [None]:
# Train BPE with a target of 512 tokens.
# NOTE: this rebinds `tokenizer`, replacing the CharLevelTokenizer instance from the earlier cell.
tokenizer = BPETokenizer(vocab_size=512)
tokenizer.fit(lyrics)

# vocab_size is the target passed to the constructor above.
print('\nVocab size:', tokenizer.vocab_size)

Training BPE: 100%|██████████| 347/347 [02:27<00:00,  2.35it/s]


Vocab size: 512





In [None]:
# Re-tokenize the whole corpus with the BPE tokenizer
# (overwrites the character-level enc_data from the earlier cell).
enc_data = tokenizer.encode(''.join(lyrics))

print('Sample:', enc_data[:100], end = '\n\n')
print('Total Tokens Length:', len(enc_data))

Sample: [14, 12, 13, 12, 1, 69, 71, 63, 63, 333, 253, 215, 112, 136, 223, 181, 245, 148, 108, 143, 130, 171, 0, 66, 62, 51, 70, 56, 65, 68, 63, 1, 14, 1, 253, 413, 183, 233, 175, 172, 358, 176, 245, 142, 171, 368, 130, 171, 0, 437, 1, 375, 200, 80, 93, 290, 80, 113, 226, 274, 368, 359, 108, 252, 138, 195, 131, 183, 174, 264, 290, 0, 200, 182, 120, 251, 131, 142, 207, 427, 344, 410, 175, 125, 253, 412, 279, 119, 226, 317, 187, 427, 290, 0, 105, 138, 224, 502, 314, 252]

Total Tokens Length: 346606


In [None]:
# Compression in seq length
# Ratio of BPE token count to Python character count; a value below 1 means
# the BPE sequence is shorter than the character-level one.
print(f"Seq length scaled by {len(enc_data)/len(''.join(lyrics)):.2f}X")

Seq length scaled by 0.55X


In [None]:
# Writes the vocabulary to bpe_tokenizer.json (save_vocab appends the .json extension).
tokenizer.save_vocab('bpe_tokenizer')