In [1]:
import logging
import sentencepiece as spm
logger = logging.getLogger('sentencepiece')
logger.setLevel(logging.CRITICAL) 
logger.propagate = False


## Cleaning the data

In [2]:
import re
import unicodedata

def clean_hindi_text(text):

    # Removed non-essential punctuation
    text = re.sub(r"[^\w\s\u0900-\u097F]", "", text) 
    text = re.sub(r"[a-zA-Z]", "", text)

    return text

# Example usage
hindi_text = "हिंदी में 123 यह एक उदाहरण है! इसे साफ, किया जाना चाहिए।"
cleaned_text = clean_hindi_text(hindi_text)
print(cleaned_text) 


हिंदी में 123 यह एक उदाहरण है इसे साफ किया जाना चाहिए।


## Question 1.
Perform the Unicode correction as discussed in class. Essentially, consonants with a
halant character should be counted as 1, while those without that should be counted as 2. You
may transliterate the corpus to ISO15919 format or ITRANS before and/or after performing the
correction.
Output the code to transform a sentence as a list of characters. This will be tested against random
inputs.

## Reading the corpus

In [3]:
file_path = 'corpus.txt'  
try:
    with open(file_path, 'r',encoding='utf-8') as file:
        data = file.read()
        lines = data

except FileNotFoundError:
    print(f"The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

with open(file_path,'r',encoding='utf-8') as f:
    line1 = f.readlines()

In [4]:
data = clean_hindi_text(data)

In [5]:
consonants = {
    "क": "k", "ख": "kh", "ग": "g", "घ": "gh", "ङ": "N",
    "च": "ch", "छ": "chh", "ज": "j", "झ": "jh", "ञ": "~N",
    "ट": "T", "ठ": "Th", "ड": "D", "ढ": "Dh", "ण": "N",
    "त": "t", "थ": "th", "द": "d", "ध": "dh", "न": "n",
    "प": "p", "फ": "ph", "ब": "b", "भ": "bh", "म": "m",
    "य": "y", "र": "r", "ल": "l", "व": "v", "श": "sh",
    "ष": "Sh", "स": "s", "ह": "h", "ळ": "L", "क्ष": "kSh",
    "ज्ञ": "j~n", "ड़": "R", "य़": "y", "ज़": "z", "ब़": "b", "क़": "q", "ख़": "Kh", "ग़": "G",
    "ड़": "R", "ढ़": "Rh", "फ़": "f", "श़": "sh", "ऋ": "Ri", "ॠ": "Ri", "ॡ": "Li", "ऌ": "Li", "ऴ": "L", "ॐ": "OM",
    "ऽ": "'"
         ""
}
hindi_to_english_transliteration = {
    # Vowels
    "अ": "a", "आ": "aa", "इ": "i", "ई": "ii", "उ": "u",
    "ऊ": "uu", "ऋ": "RRi", "ॠ": "RRi", "ऌ": "LLi", "ॡ": "LLi",
    "ए": "e", "ऐ": "ai", "ओ": "o", "औ": "au", "अं": "am", "अः": "ah",
}
matras = {"ा": "आ", "ि": "इ", "ी": "ई", "ु": "उ", "ू": "ऊ", "ृ": "ऋ", "ॄ": "ॠ", "ॢ": "ऌ", "ॣ": "ॡ", "े": "ए", "ै": "ऐ",
          "ो": "ओ", "ौ": "औ", "ं": "अं", "ः": "अः", "ँ": "अँ"}
reverse_matras = {"अ": "", "आ": "ा", "इ": "ि", "ई": "ी", "उ": "ु", "ऊ": "ू", "ऋ": "ृ", "ॠ": "ॄ", "ऌ": "ॢ", "ॡ": "ॣ",
                  "ए": "े", "ऐ": "ै", "ओ": "ो", "औ": "ौ", "अं": "ं", "अः": "ः", "अँ": "ँ"}
halanta = "्"
halanta_char = []
second_matras = {"अं": "ं", "अः": "ः", "अँ": "ँ"}
reverse_matras_second = {"ं": "अं", "ः": "अः", "ँ": "अँ"}

In [6]:
def character_conversion(line):
    output_characters = []
    for lineIndex in range(len(line)):
        if line[lineIndex] in consonants and lineIndex + 1 < len(line) and line[lineIndex + 1] != '्':
            if line[lineIndex + 1] in matras:
                output_characters.append(line[lineIndex])
                output_characters.append(matras[line[lineIndex + 1]])
                lineIndex += 1
                if lineIndex+1<len(line) and line[lineIndex + 1] in matras:
                    output_characters.append(matras[line[lineIndex + 1]])
                    lineIndex += 1
            elif line[lineIndex + 1] == "़":
                output_characters.append(line[lineIndex] + "़")
                if lineIndex + 2 < len(line) and line[lineIndex + 2] in second_matras:
                    output_characters.append(matras[line[lineIndex + 2]])
                    lineIndex += 2
                else:
                    output_characters.append('अ')
            else:
                output_characters.append(line[lineIndex])
                output_characters.append('अ')
        elif line[lineIndex] in consonants and lineIndex + 1 < len(line) and line[lineIndex + 1] == '्':
            val = line[lineIndex]
            val += "्"
            output_characters.append(val)
            lineIndex += 1
        elif line[lineIndex] in hindi_to_english_transliteration:
            output_characters.append(line[lineIndex])
        else:
            continue 
    return output_characters

In [7]:
def syllable_convert(syllable_fragment):
    current_segment = ""
    output_segments = []
    i = 0
    while i < len(syllable_fragment):
        if syllable_fragment[i] in consonants:
            current_segment = current_segment + syllable_fragment[i]
        elif syllable_fragment[i] in halanta_char:
            current_segment = current_segment + syllable_fragment[i]
        else:
            if current_segment != "":
                if syllable_fragment[i] in reverse_matras:
                    current_segment = current_segment + reverse_matras[syllable_fragment[i]]
                if i + 1 < len(syllable_fragment) and syllable_fragment[i + 1] in second_matras:
                    current_segment = current_segment + second_matras[syllable_fragment[i + 1]]
                    i = i + 1
            else:
                current_segment = syllable_fragment[i]
            output_segments.append(current_segment)
            current_segment = ""
        i += 1
    return [segment for segment in output_segments if segment] 

In [8]:
# def addHal(output_characters):
#     for i in range(len(output_characters)):  
#         char = output_characters[i]
#     if char in consonants:
#         output_characters[i] += halanta 
#     return output_characters

In [9]:
val = character_conversion('बारिश के मौसम में, लोग गरम चाय का आनंद लेते हैं।')
print(syllable_convert(val))
print(val)

['बा', 'रि', 'श', 'के', 'मौ', 'स', 'म', 'में', 'लो', 'ग', 'ग', 'र', 'म', 'चा', 'य', 'का', 'आ', 'नं', 'द', 'ले', 'ते', 'हैं']
['ब', 'आ', 'र', 'इ', 'श', 'अ', 'क', 'ए', 'म', 'औ', 'स', 'अ', 'म', 'अ', 'म', 'ए', 'अं', 'ल', 'ओ', 'ग', 'अ', 'ग', 'अ', 'र', 'अ', 'म', 'अ', 'च', 'आ', 'य', 'अ', 'क', 'आ', 'आ', 'न', 'अं', 'द', 'अ', 'ल', 'ए', 'त', 'ए', 'ह', 'ऐ', 'अं']


## Question 2.
Find all characters and syllables. Store a list of them in descending order of their
frequencies.
Find the top-20 frequent uni-gram and bi-gram frequencies of characters and syllables.

In [10]:

def print_top_k_frequencies(frequencies, k):
    "Prints the top 'k' items from a frequency dictionary." 
    sorted_freqs = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
    for token, frequency in sorted_freqs[:k]:
        print(f"{token}: {frequency}")


In [11]:
def process_line(line):
    """Processes a single line for syllable and character frequencies."""
    characters = character_conversion(line)
    syllables = syllable_convert(characters)
    return characters, syllables 

def update_frequencies(tokens, unigram_dict, bigram_dict):
    """Updates frequency dictionaries based on a list of tokens."""
    for token in tokens:
        unigram_dict[token] = unigram_dict.get(token, 0) + 1  # Update unigram count 

    for i in range(len(tokens) - 1):
        bigram = " ".join((tokens[i], tokens[i + 1])) 
        bigram_dict[bigram] = bigram_dict.get(bigram, 0) + 1  # Update bigram count

unigram_freq = {}
unigram_freq_char = {}
bigram_freq = {}
bigram_freq_char = {}

for line in line1: 
    characters, syllables = process_line(line)

    update_frequencies(characters, unigram_freq_char, bigram_freq_char)
    update_frequencies(syllables, unigram_freq, bigram_freq)

print("Syllable Unigram Frequency:")
print_top_k_frequencies(unigram_freq, 20)

print("Syllable Bigram Frequency:")
print_top_k_frequencies(bigram_freq, 20)

print("Character Unigram Frequency:")
print_top_k_frequencies(unigram_freq_char, 20)

print("Character Bigram Frequency:")
print_top_k_frequencies(bigram_freq_char, 20)


Syllable Unigram Frequency:
र: 1173238
क: 616759
स: 518703
न: 515812
प: 405371
के: 399507
त: 349748
ल: 333294
ने: 325953
म: 324879
का: 300879
ए: 288027
या: 284755
ह: 279939
य: 277095
में: 260017
अ: 253390
ब: 250789
की: 236633
ग: 231766
Syllable Bigram Frequency:
क र: 164160
औ र: 115615
प र: 100331
प् र: 87696
इ स: 83122
ए क: 59498
लि ए: 54068
के लि: 50111
न हीं: 47406
अ प: 45206
र ने: 44705
त क: 44025
का र: 42594
र का: 41672
कि या: 37119
स के: 36660
ता है: 36647
ने के: 35325
त र: 35025
न के: 33096
Character Unigram Frequency:
अ: 7153466
आ: 2966631
ए: 2303828
क: 2059393
र: 1885117
ई: 1445429
इ: 1425788
न: 1250679
ह: 1130493
स: 1115612
अं: 1111292
म: 1022292
ओ: 891901
ल: 862536
त: 852911
य: 752206
प: 664385
व: 604495
उ: 586141
द: 546709
Character Bigram Frequency:
र अ: 1173240
अ क: 921468
अ र: 786891
क अ: 616852
अ ह: 562925
अ न: 551313
स अ: 518739
न अ: 515816
अ म: 467030
क ए: 407129
प अ: 405636
त अ: 353263
अ स: 348242
आ र: 343947
ए अं: 341741
ल अ: 333296
अ त: 332006
न ए: 328953
म अ: 3248

# Problem-4 

Run the Unigram, BPE (vocabulary sizes, V = 1k, 2k), mBERT (max length = 1k,
2k), IndicBERT (max length = 1k, 2k), and White-space tokenizers on the entire corpus. You
may use Sentence Piece or similar libraries for this purpose.
Find the unigram frequencies of tokens and bi-gram frequencies of tokens, syllables, and characters
for each of the tokenizers.

In [12]:
data = data[:20000]

In [13]:

def write_to_file_and_print_top_k(model_name, vocab, k, frequencies):
    token_types = ['unigram', 'bigram', 'syllable', 'char']  #To define order

    for i, token_freqs in enumerate(frequencies):
        token_type = token_types[i]
        file_name = f"{model_name}_{token_type}_{vocab}.txt"
        with open(file_name, 'w', encoding='utf-8') as f:
            sorted_counts = sorted(token_freqs.items(), key=lambda item: item[1], reverse=True)
            for token, count in sorted_counts:
                f.write(f"{token}: {count}\n")

        print(f"\nTop {k} for {token_type.capitalize()}\n")
        for token, count in sorted_counts[:k]:
            print(f"{token}: {count}")


        

def count_frequencies(tokenizer, text_data):
    encoded_tokens = tokenizer.encode(text_data, out_type=str)

    # Unigrams and Bigrams
    unigram_frequencies = Counter(encoded_tokens)
    bigram_frequencies = Counter(zip(encoded_tokens, encoded_tokens[1:]))

    # Syllables and Characters
    syllables = []
    characters = []
    for token in encoded_tokens:
        syllables.extend(syllable_convert(token)) 
        characters.extend(character_conversion(token))  

    bigram_syllables = zip(syllables, syllables[1:])
    bigram_characters = zip(characters, characters[1:])

    syllable_frequencies = Counter(bigram_syllables)
    character_frequencies = Counter(bigram_characters)

    return unigram_frequencies, bigram_frequencies, syllable_frequencies, character_frequencies


### 


In [14]:
import sentencepiece as spm
from collections import Counter
import tensorflow as tf
import time

def train_and_load_tokenizer(file_path, model_prefix, model_type, vocab_size):
    
    spm.SentencePieceTrainer.Train(
        input=file_path,
        model_prefix=model_prefix,
        model_type=model_type,
        vocab_size=vocab_size
    )

    model_file = f"{model_prefix}.model"
    return spm.SentencePieceProcessor(model_file)

def train_calculate_and_output(file_path, model_prefix, model_type, vocab_size, output_model_name):
    tokenizer = train_and_load_tokenizer(file_path, model_prefix, model_type, vocab_size)
    frequencies = count_frequencies(tokenizer, data)
    freq_list = list(frequencies)  
    write_to_file_and_print_top_k(output_model_name, vocab_size, 10, freq_list)
    return tokenizer




2024-02-17 23:23:17.653368: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Unigram for 1000 vocab size

In [15]:
# Unigram with Vocabulary 1000
unigram_1000 = train_calculate_and_output(file_path, 'unigram_model', 'unigram', 1000, 'unigram')


Top 10 for Unigram

▁के: 162
न: 155
।: 138
र: 138
▁है: 135
▁में: 135
ा: 129
ल: 126
ी: 108
▁की: 103

Top 10 for Bigram

('▁है', '।'): 52
('▁के', '▁लिए'): 26
('▁है', '▁कि'): 17
('▁ज', 'न'): 17
('▁हैं', '।'): 14
('ता', '▁है'): 12
('न', 'ि'): 12
('▁गया', '▁है'): 11
('▁कहा', '▁कि'): 11
('न', 'तंत्र'): 11

Top 10 for Syllable

('▁', 'क'): 577
('क', '▁'): 542
('ं', '▁'): 413
('▁', 'ह'): 297
('▁', 'म'): 226
('▁', '▁'): 207
('न', '▁'): 200
('▁', 'स'): 190
('त', '▁'): 165
('स', '▁'): 157

Top 10 for Char

('क', 'ए'): 204
('आ', 'क'): 188
('ह', 'ऐ'): 178
('न', 'ए'): 169
('क', 'अ'): 158
('अ', 'ह'): 153
('य', 'आ'): 150
('अ', 'क'): 148
('ए', 'अं'): 142
('म', 'ए'): 140


sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: corpus.txt
  input_format: 
  model_prefix: unigram_model
  model_type: UNIGRAM
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_leve

## Unigram for 2000 vocab size

In [16]:
unigram_2000 = train_calculate_and_output(file_path, 'unigram_model', 'unigram', 2000, 'unigram')

=0 size=10084 obj=9.75948 num_tokens=1136993 num_tokens/piece=112.752
unigram_model_trainer.cc(580) LOG(INFO) EM sub_iter=1 size=10084 obj=9.7122 num_tokens=1137021 num_tokens/piece=112.755
unigram_model_trainer.cc(580) LOG(INFO) EM sub_iter=0 size=7563 obj=9.9979 num_tokens=1195057 num_tokens/piece=158.014
unigram_model_trainer.cc(580) LOG(INFO) EM sub_iter=1 size=7563 obj=9.94147 num_tokens=1195156 num_tokens/piece=158.027
unigram_model_trainer.cc(580) LOG(INFO) EM sub_iter=0 size=5672 obj=10.269 num_tokens=1256218 num_tokens/piece=221.477
unigram_model_trainer.cc(580) LOG(INFO) EM sub_iter=1 size=5672 obj=10.2032 num_tokens=1256276 num_tokens/piece=221.487
unigram_model_trainer.cc(580) LOG(INFO) EM sub_iter=0 size=4254 obj=10.57 num_tokens=1322515 num_tokens/piece=310.887
unigram_model_trainer.cc(580) LOG(INFO) EM sub_iter=1 size=4254 obj=10.4928 num_tokens=1322548 num_tokens/piece=310.895
unigram_model_trainer.cc(580) LOG(INFO) EM sub_iter=0 size=3190 obj=10.9027 num_tokens=1393553


Top 10 for Unigram

▁के: 155
।: 137
▁है: 135
▁में: 135
▁की: 99
ी: 82
ल: 76
▁को: 74
न: 67
▁का: 65

Top 10 for Bigram

('▁है', '।'): 52
('▁के', '▁लिए'): 26
('▁है', '▁कि'): 17
('▁हैं', '।'): 14
('▁गया', '▁है'): 11
('▁कहा', '▁कि'): 11
('▁जन', 'तंत्र'): 11
('▁के', '▁साथ'): 10
('।', '▁इस'): 9
('ो', 'ज'): 9

Top 10 for Syllable

('▁', 'क'): 574
('क', '▁'): 530
('ं', '▁'): 411
('▁', 'ह'): 306
('▁', 'म'): 234
('▁', '▁'): 207
('▁', 'स'): 195
('न', '▁'): 184
('स', '▁'): 160
('त', '▁'): 157

Top 10 for Char

('क', 'ए'): 204
('आ', 'क'): 182
('ह', 'ऐ'): 178
('अ', 'क'): 176
('क', 'अ'): 172
('न', 'ए'): 169
('अ', 'ह'): 168
('र', 'अ'): 163
('प', 'अ'): 153
('ए', 'अं'): 153


rd_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0
  differential_privacy_clipping_threshold: 0
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_prefix: 1
  remove_extra_whitespaces: 1
  escape_whitespaces: 1
  normalization_rule_tsv: 
}
denormalizer_spec {}
trainer_interface.cc(351) LOG(INFO) SentenceIterator is not specified. Using MultiFileSentenceIterator.
trainer_interface.cc(183) LOG(INFO) Loading corpus: corpus.txt
trainer_interface.cc(407) LOG(INFO) Loaded all 298383 sentences
trainer_interface.cc(414) LOG(INFO) Skipped 456 too long sentences.
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <unk>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <s>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: </s>
trainer_interface.cc(428) LOG(INFO) Normalizing sentences...
tra

## BPE for 1000 vocab size

In [17]:
# BPE with Vocabulary 1000
bpe_1000 = train_calculate_and_output(file_path, 'bpe_model', 'bpe', 1000, 'bpe')

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: corpus.txt
  input_format: 
  model_prefix: bpe_model
  model_type: BPE
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0
  d


Top 10 for Unigram

▁के: 162
।: 137
▁में: 135
▁है: 134
▁की: 103
क: 75
▁को: 75
त: 66
▁आ: 65
न: 65

Top 10 for Bigram

('▁है', '।'): 52
('▁के', '▁लिए'): 26
('▁है', '▁कि'): 17
('▁हैं', '।'): 14
('▁गया', '▁है'): 11
('▁कहा', '▁कि'): 11
('▁जन', 'त'): 11
('त', 'ंत्र'): 11
('।', '▁इस'): 10
('▁के', '▁साथ'): 10

Top 10 for Syllable

('▁', 'क'): 570
('क', '▁'): 548
('ं', '▁'): 412
('▁', 'ह'): 304
('▁', 'म'): 216
('न', '▁'): 208
('▁', '▁'): 207
('▁', 'स'): 194
('त', '▁'): 174
('स', '▁'): 167

Top 10 for Char

('क', 'ए'): 198
('आ', 'क'): 190
('ह', 'ऐ'): 178
('न', 'ए'): 172
('क', 'अ'): 159
('अ', 'क'): 152
('य', 'आ'): 148
('ए', 'अं'): 144
('म', 'ए'): 141
('आ', 'ह'): 135


## BPE for 2000 vocab size

In [18]:
# BPE with Vocabulary 2000
bpe_2000 = train_calculate_and_output(file_path, 'bpe_model', 'bpe', 2000, 'bpe')

ize=100 all=14682 active=9489 piece=▁सं
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=43241 min_freq=4149
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=35408 size=120 all=16900 active=3171 piece=▁किया
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=28955 size=140 all=18442 active=4713 piece=न्ह
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=24411 size=160 all=20405 active=6676 piece=▁राज
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=21882 size=180 all=22297 active=8568 piece=▁सर
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=19406 size=200 all=24144 active=10415 piece=▁चु
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=19305 min_freq=2919
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=17837 size=220 all=26375 active=3387 piece=गर
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=16215 size=240 all=27984 active=4996 piece=▁जी
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=14802 size=260 all=29833 active=6845 piece=रो
bpe_model_trainer.


Top 10 for Unigram

▁के: 155
।: 137
▁में: 135
▁है: 134
▁की: 100
▁को: 74
▁का: 64
▁और: 64
▁से: 62
▁ने: 52

Top 10 for Bigram

('▁है', '।'): 52
('▁के', '▁लिए'): 26
('▁है', '▁कि'): 17
('▁हैं', '।'): 14
('▁गया', '▁है'): 11
('▁कहा', '▁कि'): 11
('▁जन', 'तंत्र'): 11
('▁के', '▁साथ'): 10
('।', '▁इस'): 9
('ता', '▁है'): 8

Top 10 for Syllable

('▁', 'क'): 572
('क', '▁'): 539
('ं', '▁'): 410
('▁', 'ह'): 302
('▁', 'म'): 230
('▁', '▁'): 207
('न', '▁'): 198
('▁', 'स'): 194
('त', '▁'): 168
('स', '▁'): 168

Top 10 for Char

('क', 'ए'): 204
('अ', 'क'): 180
('आ', 'क'): 179
('ह', 'ऐ'): 178
('क', 'अ'): 172
('न', 'ए'): 172
('अ', 'ह'): 172
('प', 'अ'): 154
('ए', 'अं'): 153
('र', 'अ'): 152


33 size=760 all=65843 active=6737 piece=▁आम
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=4313 size=780 all=66568 active=7462 piece=▁आने
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=4158 size=800 all=67465 active=8359 piece=▁बंद
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=4156 min_freq=659
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=4025 size=820 all=68540 active=4417 piece=डे
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3923 size=840 all=69446 active=5323 piece=▁इसमें
trainer_interface.cc(686) LOG(INFO) Saving model: bpe_model.model
trainer_interface.cc(698) LOG(INFO) Saving vocabs: bpe_model.vocab
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: corpus.txt
  input_format: 
  model_prefix: bpe_model
  model_type: BPE
  vocab_size: 2000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_se

## mBERT and IndicBERT

In [19]:
from transformers import BertTokenizer, BertModel
import tensorflow as tf
from collections import Counter
import time

def count_frequencies_bert(tokens):
    # Count unigram frequencies
    unigram_freqs= Counter(tokens)
    
    bigrams = [(tokens[i], tokens[i + 1]) for i in range(len(tokens) - 1)]
    bigram_freqs = Counter(bigrams)

    syllables = []
    characters = []
    for tok in tokens:
        syllables.extend(syllable_convert(tok))
        characters.extend(character_conversion(tok))
        
    
    bigram_syllables = [(syllables[i], syllables[i + 1]) for i in range(len(syllables) - 1)]
    bigram_characters = [(characters[i], characters[i + 1]) for i in range(len(characters) - 1)]
    
    syllable_freqs= Counter(bigram_syllables)
    char_freqs= Counter(bigram_characters)

    return unigram_freqs, bigram_freqs, syllable_freqs, char_freqs

def trained_mBERT_tokenizer():
    # Load mBERT tokenizer
    return BertTokenizer.from_pretrained('bert-base-multilingual-cased')

  torch.utils._pytree._register_pytree_node(


## mBERT for 1k vocab size

In [20]:

mbert_tokenizer = trained_mBERT_tokenizer() 

# Tokenization with mBERT 
text_encoding = mbert_tokenizer.encode_plus(
    data, 
    max_length=1000, 
    truncation=True, 
    padding=True
)
mbert_tokens = mbert_tokenizer.convert_ids_to_tokens(text_encoding['input_ids'])


token_frequencies = count_frequencies_bert(mbert_tokens)
unigram_counts, bigram_counts, syllable_counts, char_counts = token_frequencies 


frequency_list = [unigram_counts, bigram_counts, syllable_counts, char_counts]


write_to_file_and_print_top_k(
    model_name="mBERT", 
    vocab="1000", 
    k=10, 
    frequencies=frequency_list
)


Top 10 for Unigram

के: 22
में: 21
।: 19
है: 15
##प: 15
आ: 13
##र: 13
ब: 13
को: 13
##ी: 12

Top 10 for Bigram

('आ', '##प'): 7
('ख', '##ोज'): 7
('है', '।'): 6
('है', 'कि'): 4
('##ोज', 'शब्द'): 4
('##ों', 'की'): 3
('##्', '##प'): 3
('को', 'ब'): 3
('##रल', '##ैंड'): 3
('वि', '##के'): 3

Top 10 for Syllable

('#', '#'): 580
('#', '्'): 52
('#', 'ि'): 39
('क', '#'): 35
('्', '#'): 34
('#', 'क'): 32
('#', 'े'): 25
('ं', '#'): 23
('म', 'ं'): 21
('ि', '#'): 20

Top 10 for Char

('आ', 'क'): 31
('क', 'ए'): 31
('अ', 'क'): 25
('र', 'अ'): 25
('ह', 'ऐ'): 25
('क', 'अ'): 23
('अ', 'न'): 23
('य', 'आ'): 22
('म', 'ए'): 22
('ए', 'अं'): 22


## mBERT for 2k vocab

In [21]:

mbert_tokenizer = trained_mBERT_tokenizer() 

# Tokenization with mBERT 
text_encoding = mbert_tokenizer.encode_plus(
    data, 
    max_length=2000, 
    truncation=True, 
    padding=True
)
mbert_tokens = mbert_tokenizer.convert_ids_to_tokens(text_encoding['input_ids'])


token_frequencies = count_frequencies_bert(mbert_tokens)
unigram_counts, bigram_counts, syllable_counts, char_counts = token_frequencies 


frequency_list = [unigram_counts, bigram_counts, syllable_counts, char_counts]


write_to_file_and_print_top_k(
    model_name="mBERT", 
    vocab="2000",  #'vocab' might mean tokenization limit here
    k=10, 
    frequencies=frequency_list
)


Top 10 for Unigram

।: 37
में: 36
के: 34
है: 29
##र: 26
की: 25
ब: 25
##ी: 24
##ा: 23
ने: 22

Top 10 for Bigram

('है', '।'): 12
('आ', '##प'): 7
('ख', '##ोज'): 7
('हैं', '।'): 6
('है', 'कि'): 5
('के', 'लिए'): 5
('त', '##ु'): 5
('स', '##्व'): 4
('##ोज', 'शब्द'): 4
('म', '##ु'): 4

Top 10 for Syllable

('#', '#'): 1161
('#', '्'): 87
('क', '#'): 68
('#', 'ि'): 67
('#', 'क'): 58
('्', '#'): 53
('#', 'ा'): 49
('ं', '#'): 41
('म', 'ं'): 37
('#', 'े'): 36

Top 10 for Char

('आ', 'क'): 50
('न', 'ए'): 47
('क', 'ए'): 45
('अ', 'क'): 44
('ह', 'ऐ'): 44
('य', 'आ'): 42
('क', 'अ'): 40
('र', 'अ'): 40
('म', 'ए'): 40
('ए', 'अं'): 39


## IndicBERT

In [22]:
import transformers
import collections

def load_indic_bert_tokenizer(): 
    "Loads a pre-trained IndicBERT tokenizer"
    return transformers.AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

def process_with_indicbert(data, max_length):
    """Processes data with IndicBERT and saves token frequencies.

    Args:
       data (str): Input text data. 
       max_length (int): Maximum length for tokenization. 
    """

    tokenizer = load_indic_bert_tokenizer()
    tokens = tokenizer(data, max_length=max_length, truncation=True)
    tokens = tokenizer.convert_ids_to_tokens(tokens['input_ids'])

    frequencies = count_frequencies_bert(tokens)  
    frequency_list = list(frequencies) 
    write_to_file_and_print_top_k(
        model_name="IndicBERT",
        vocab=str(max_length),  # Using max_length to represent vocab here
        k=10, 
        frequencies=frequency_list
    )


### IndicBERT for 1000 vocab size

In [23]:
process_with_indicbert(data, 1000)


Top 10 for Unigram

▁क: 71
▁ह: 38
▁म: 26
।: 24
य: 24
क: 23
▁: 21
त: 19
न: 16
▁और: 16

Top 10 for Bigram

('▁ह', '।'): 12
('▁क', '▁'): 6
('न', '▁क'): 6
('त', '▁ह'): 6
('▁ल', 'ए'): 5
('▁आप', 'क'): 4
('▁द', 'य'): 4
('य', '▁ह'): 4
('▁ह', '▁क'): 4
('ष', 'त'): 4

Top 10 for Syllable

('▁', '▁'): 445
('।', '▁'): 24
('▁', 'आ'): 22
('▁', 'ए'): 21
('▁', '।'): 20
('आ', '▁'): 19
('इ', '▁'): 19
('▁', 'अ'): 19
('अ', '▁'): 18
('▁', 'इ'): 17

Top 10 for Char

('क', 'अ'): 49
('प', 'अ'): 45
('र', 'अ'): 44
('अ', 'क'): 43
('अ', 'प'): 40
('अ', 'र'): 39
('स', 'अ'): 35
('ज', 'अ'): 30
('अ', 'स'): 27
('ब', 'अ'): 25


### IndicBERT for 2000 vocab size

In [24]:
process_with_indicbert(data, 2000)


Top 10 for Unigram

▁क: 132
▁ह: 77
य: 55
▁म: 51
।: 48
▁न: 44
क: 42
▁: 40
न: 38
त: 37

Top 10 for Bigram

('▁ह', '।'): 26
('त', '▁ह'): 11
('▁ल', 'ए'): 10
('▁न', 'ह'): 10
('य', '▁क'): 8
('।', '▁इस'): 8
('▁रह', '▁ह'): 8
('▁क', '▁ल'): 8
('▁क', '▁'): 7
('▁द', 'य'): 6

Top 10 for Syllable

('▁', '▁'): 918
('।', '▁'): 48
('▁', 'आ'): 41
('▁', '।'): 41
('▁', 'इ'): 38
('▁', 'ए'): 38
('इ', '▁'): 37
('आ', '▁'): 36
('ए', '▁'): 35
('▁', 'अ'): 34

Top 10 for Char

('क', 'अ'): 99
('अ', 'क'): 84
('प', 'अ'): 83
('र', 'अ'): 77
('स', 'अ'): 71
('अ', 'प'): 70
('अ', 'र'): 68
('अ', 'स'): 58
('व', 'अ'): 56
('ज', 'अ'): 55


## WhiteSpace tokenizer

In [25]:
def whitespace_tokenize(text):
    stripped_text = text.strip()
    if not stripped_text:  
        return []

    return stripped_text.split()  

tokens = whitespace_tokenize(data)
# print(tokens)
unigram_freqs, bigram_freqs, syllable_freqs, char_freqs = count_frequencies_bert(tokens)
freq_arr = [unigram_freqs, bigram_freqs, syllable_freqs, char_freqs]
write_to_file_and_print_top_k(model_name="WhiteSpace", vocab="", k=10, frequencies=freq_arr)


Top 10 for Unigram

के: 150
में: 135
की: 94
है: 81
को: 72
और: 64
से: 56
का: 55
है।: 52
ने: 49

Top 10 for Bigram

('के', 'लिए'): 26
('है', 'कि'): 17
('कहा', 'कि'): 11
('के', 'साथ'): 9
('ने', 'कहा'): 8
('हैं', 'और'): 7
('बताया', 'कि'): 7
('के', 'बाद'): 7
('गया', 'है।'): 7
('करने', 'के'): 7

Top 10 for Syllable

('म', 'ं'): 146
('ं', 'क'): 101
('ह', 'ं'): 70
('क', 'स'): 58
('ह', '।'): 53
('क', 'ल'): 52
('त', 'ह'): 39
('प', 'र'): 36
('र', 'क'): 36
('क', 'क'): 35

Top 10 for Char

('र', 'अ'): 227
('क', 'अ'): 224
('अ', 'न'): 217
('अ', 'क'): 210
('क', 'ए'): 206
('स', 'अ'): 202
('अ', 'ह'): 201
('अ', 'र'): 192
('प', 'अ'): 184
('ए', 'अं'): 182


## Question 5
Assume that the set of tokens from Question 3 is the ground truth set. For each
tokenizer in Question 4, find the precision, recall and F-score for the 25 sentences.

In [26]:
sentences = ""
sentences_file = 'text.txt' 

with open(sentences_file, 'r', encoding='utf-8') as file:
    for line in file:
        if line !="\n":
            temp = line.strip()
            sentences = sentences+temp[3:]
print(sentences)


अलीगढ़ मुस्लिम विश्वविद्यालय में कई शिक्षकों ने मौन जुलूस निकाला।एचजीएस धालीवाल आईपीएस दिल्ली में ही पदस्थ हैं।बलदेव और सूबेदार मेजर ब्रह्मू ने कहा कि सीताराम भारद्वाज हमारे पड़ोसी हैं।हर साल मई में भी थोड़ी बहुत बारिश हो जाती थी लेकिन इस बार बादल दूर-दूर ही रहे। इसका असर किसानाें पर साफ दिख रहा है।ऐसे में सरकार की प्राथमिकता रहेगी कि यह जमीन बोर्ड-निगमों को ही बेची जाए ताकि वे यहां अपनी गतिविधियों को बढ़ावा दे सकें।इस सूरत में जो भी नीति प्रमाणित गुणवत्ता वाले शैक्षणिक संस्थानों के अभाव से जूझने का प्रयास न करे वह अप्रासंगिक बनने को अभिशप्त रहेगी।राष्ट्रीय सुरक्षा से जुड़े मुद्दों पर सरकार के साथ एकजुट होने के लिए विपक्ष से अपील करते हुए, ।कंपनियां अब यूसी सेवा की उपयोगिता समझने लगी हैं इसलिए अब इसमें निवेश भी कर रही हैं।31 ओवर के बाद भारत 243/3, जीत के लिए 65 रन चाहिए आपात्काल मे धेेैर्य , अभ्युदय मे क्षमा , सदन मे वाक्पटुता , युद्ध के समय बहादुरी , यशमे अभिरूचि , ज्ञान का व्यसन ये सब चीजे महापुरूषोंमे नैसर्गिक रूपसे पायी जाती हैं । उपराज्यपाल ने कमीशन ऑफ एनक्वायरी एक्ट का हवाला देते

## Functions for Precision, Recall and F1-Score

In [27]:
import re

def remove_leading_underscore(token):
    return re.sub(r"^_", "", token)  # Remove a single leading underscore

def clean_and_create_set(tokens):
    """Removes leading underscores from tokens and creates a set."""
    cleaned_tokens = [remove_leading_underscore(token) for token in tokens]
    return set(cleaned_tokens)
  
def calculate_metrics(true_tokens, pred_tokens):

#     true_set = set(true_tokens)
#     pred_set = set(pred_tokens)
    
    true_set = clean_and_create_set(true_tokens)
    pred_set = clean_and_create_set(pred_tokens)


    true_positives = len(true_set & pred_set)  

    precision = true_positives / (len(pred_set) or 1)  
    recall = true_positives / (len(true_set) or 1)   # Avoid division by zero

    f1_score = (2 * precision * recall) / (precision + recall or 1)  

    return precision, recall, f1_score

def calculate_and_print_metrics(true_tokens, pred_tokens):
    """Calculates precision, recall, F1-score, and prints the results."""
    precision, recall, f1 = calculate_metrics(true_tokens, pred_tokens)
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

In [28]:
file_path = 'truth.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    tokens = file.read().split(',')

true_tokens = [word.strip() for word in tokens]

print("List of True Tokens:", true_tokens)

List of True Tokens: ['अलीगढ़ मुस्लिम विश्वविद्यालय में', 'कई', 'शिक्षकों ने', 'मौन', 'जुलूस', 'निकाला।\n\n\nएचजीएस धालीवाल आईपीएस', 'दिल्ली में ही', 'पदस्थ हैं।\n\n\nबलदेव', 'और', 'सूबेदार मेजर ब्रह्मू ने', 'कहा कि', 'सीताराम भारद्वाज', 'हमारे', 'पड़ोसी हैं।\n\n\nहर साल', 'मई में', 'भी', 'थोड़ी बहुत', 'बारिश', 'हो जाती थी', 'लेकिन', 'इस बार', 'बादल', 'दूर-दूर', 'ही रहे।', 'इसका', 'असर', 'किसानाें पर', 'साफ', 'दिख रहा है।\n\n\n ऐसे में', 'सरकार की', 'प्राथमिकता', 'रहेगी', 'कि', 'यह', 'जमीन', 'बोर्ड-निगमों को ही', 'बेची', 'जाए', 'ताकि', 'वे', 'यहां', 'अपनी', 'गतिविधियों को', 'बढ़ावा', 'दे सकें।\n\n\nइस सूरत में', 'जो भी', 'नीति', 'प्रमाणित गुणवत्ता', 'वाले', 'शैक्षणिक संस्थानों के', 'अभाव से', 'जूझने का', 'प्रयास', 'न', 'करे', 'वह', 'अप्रासंगिक', 'बनने को', 'अभिशप्त', 'रहेगी।\n\n\nराष्ट्रीय सुरक्षा से', 'जुड़े', 'मुद्दों पर', 'सरकार', 'के साथ', 'एकजुट', 'होने के लिए', 'विपक्ष से', 'अपील', 'करते हुए', '।\n\n\nकंपनियां', 'अब', 'यूसी सेवा की', 'उपयोगिता', 'समझने लगी हैं', 'इसलिए', 'अब', 'इ

### Precision, Recall and F1-Score for Unigram

In [29]:
pred_tokens = unigram_1000.encode(sentences, out_type=str)
calculate_and_print_metrics(true_tokens, pred_tokens) 


Precision: 0.01
Recall: 0.02
F1-Score: 0.01


In [30]:

# Unigram Evaluation
pred_tokens = unigram_2000.encode(sentences, out_type=str)
calculate_and_print_metrics(true_tokens, pred_tokens) 


Precision: 0.00
Recall: 0.01
F1-Score: 0.01


### Precision, Recall and F1-Score for BPE

In [31]:
# BPE Evaluation 
pred_tokens = bpe_1000.encode(sentences, out_type=str)
calculate_and_print_metrics(true_tokens, pred_tokens) 

Precision: 0.01
Recall: 0.03
F1-Score: 0.02


In [32]:
# BPE Evaluation 
pred_tokens = bpe_2000.encode(sentences, out_type=str)
calculate_and_print_metrics(true_tokens, pred_tokens) 

Precision: 0.01
Recall: 0.01
F1-Score: 0.01


### Precision, Recall and F1-Score for mBERT

In [33]:
tokenizer = trained_mBERT_tokenizer()
mBert_encodings = tokenizer.encode_plus(sentences, max_length=1000, truncation=True, padding=True)
pred_tokens = tokenizer.convert_ids_to_tokens(mBert_encodings['input_ids'])

calculate_and_print_metrics(true_tokens, pred_tokens) 

Precision: 0.09
Recall: 0.16
F1-Score: 0.11


### Precision, Recall and F1-Score for IndicBERT

In [34]:
indicbert_tokenizer = load_indic_bert_tokenizer()
ibert_tokens = indicbert_tokenizer(sentences, max_length=1000, truncation=True)
# pred_tokens = tokenizer.convert_ids_to_tokens(ibert_tokens['input_ids'])
pred_text = indicbert_tokenizer.decode(ibert_tokens['input_ids'], skip_special_tokens=True)
print(pred_tokens)

calculate_and_print_metrics(true_tokens, pred_tokens) 


['[CLS]', 'अ', '##ली', '##गढ', '##़', 'मुस्लिम', 'विश्वविद्यालय', 'में', 'कई', 'श', '##िक', '##्ष', '##कों', 'ने', 'म', '##ौ', '##न', 'जुल', '##ूस', 'न', '##िका', '##ला', '।', 'ए', '##च', '##जी', '##ए', '##स', 'ध', '##ाली', '##वाल', 'आई', '##पी', '##ए', '##स', 'दिल्ली', 'में', 'ही', 'पद', '##स', '##्थ', 'हैं', '।', 'ब', '##ल', '##देव', 'और', 'स', '##ू', '##बे', '##दार', 'मे', '##ज', '##र', 'ब', '##्रह', '##्म', '##ू', 'ने', 'कहा', 'कि', 'सी', '##ता', '##रा', '##म', 'भ', '##ार', '##द', '##्व', '##ाज', 'हम', '##ारे', 'प', '##ड़', '##ो', '##सी', 'हैं', '।', 'हर', 'साल', 'मई', 'में', 'भी', 'थ', '##ोड़', '##ी', 'बहुत', 'बार', '##िश', 'हो', 'जाती', 'थी', 'लेकिन', 'इस', 'बार', 'बाद', '##ल', 'दूर', '-', 'दूर', 'ही', 'रहे', '।', 'इसका', 'अ', '##सर', 'कि', '##सा', '##ना', '##ें', 'पर', 'स', '##ा', '##फ', 'द', '##ि', '##ख', 'रहा', 'है', '।', 'ऐसे', 'में', 'सरकार', 'की', 'प्राथमिक', '##ता', 'रहे', '##गी', 'कि', 'यह', 'ज', '##मीन', 'ब', '##ोर्ड', '-', 'न', '##ि', '##गम', '##ों', 'को', 'ही', 'ब', '#

### Precision, Recall and F1-Score for WhiteSpace Tokenizer

In [35]:
pred_tokens = whitespace_tokenize(sentences)
print(pred_tokens)

calculate_and_print_metrics(true_tokens, pred_tokens) 

['अलीगढ़', 'मुस्लिम', 'विश्वविद्यालय', 'में', 'कई', 'शिक्षकों', 'ने', 'मौन', 'जुलूस', 'निकाला।एचजीएस', 'धालीवाल', 'आईपीएस', 'दिल्ली', 'में', 'ही', 'पदस्थ', 'हैं।बलदेव', 'और', 'सूबेदार', 'मेजर', 'ब्रह्मू', 'ने', 'कहा', 'कि', 'सीताराम', 'भारद्वाज', 'हमारे', 'पड़ोसी', 'हैं।हर', 'साल', 'मई', 'में', 'भी', 'थोड़ी', 'बहुत', 'बारिश', 'हो', 'जाती', 'थी', 'लेकिन', 'इस', 'बार', 'बादल', 'दूर-दूर', 'ही', 'रहे।', 'इसका', 'असर', 'किसानाें', 'पर', 'साफ', 'दिख', 'रहा', 'है।ऐसे', 'में', 'सरकार', 'की', 'प्राथमिकता', 'रहेगी', 'कि', 'यह', 'जमीन', 'बोर्ड-निगमों', 'को', 'ही', 'बेची', 'जाए', 'ताकि', 'वे', 'यहां', 'अपनी', 'गतिविधियों', 'को', 'बढ़ावा', 'दे', 'सकें।इस', 'सूरत', 'में', 'जो', 'भी', 'नीति', 'प्रमाणित', 'गुणवत्ता', 'वाले', 'शैक्षणिक', 'संस्थानों', 'के', 'अभाव', 'से', 'जूझने', 'का', 'प्रयास', 'न', 'करे', 'वह', 'अप्रासंगिक', 'बनने', 'को', 'अभिशप्त', 'रहेगी।राष्ट्रीय', 'सुरक्षा', 'से', 'जुड़े', 'मुद्दों', 'पर', 'सरकार', 'के', 'साथ', 'एकजुट', 'होने', 'के', 'लिए', 'विपक्ष', 'से', 'अपील', 'करते', 'हुए,', 

# Question 6

In this assignment i got the best Precision, Recall and F1-Score with respect to the ground truth(which was determined by me).

