In [1]:
import pandas as pd
import gc
import numpy as np
import json
#import nltk
import demoji
from tqdm import tqdm

import re
import unicodedata

from bnunicodenormalizer import Normalizer 
bnorm = Normalizer()

In [2]:
from bnlp import NLTKTokenizer

bnltk = NLTKTokenizer()

In [3]:
# Build the inventory of characters that occur in the training transcripts.
# filter_valid_word() checks tokens against `unique_characters_list`.
train = pd.read_csv("train.csv")
all_characters = "".join(train["sentence"])
unique_characters = {*all_characters}
unique_characters_list = [*unique_characters]

In [4]:
#source
#https://tiefenauer.github.io/blog/wiki-n-gram-lm/
import string

# Regex for matching zero width joiner variations.
STANDARDIZE_ZW = re.compile(r'(?<=\u09b0)[\u200c\u200d]+(?=\u09cd\u09af)')

# Regex for removing standardized zero width joiner, except in edge cases.
DELETE_ZW = re.compile(r'(?<!\u09b0)[\u200c\u200d](?!\u09cd\u09af)')

def normalize_word(token, normalize_nukta=False):
    """
    Normalize one raw token for the language-model corpus.

    Processing order:
      1. punctuation is removed (remove_punctuation);
      2. digit runs are collapsed with replace_numeric(by_single_digit=False):
         a token that collapses to exactly '#' becomes '<num>', any other token
         that contained digits becomes '<num_ext>';
      3. a purely alphabetic ASCII token becomes '<olan>';
      4. filter_valid_word() replaces tokens with unknown characters by '<edge>';
      5. U+098C is replaced by U+09EF, the token is NFC-normalized and optional
         zero-width characters are removed.

    `normalize_nukta` is accepted but not used by this function.

    Returns the normalized token with surrounding whitespace stripped, or ''
    when the validity filter yields None.
    """
    without_punct = remove_punctuation(token)
    collapsed = replace_numeric(without_punct, by_single_digit=False)

    if collapsed == '#':
        # the token consisted of a single run of digits
        candidate = '<num>'
    elif collapsed != without_punct:
        # digits mixed with other characters (or several digit runs)
        candidate = '<num_ext>'
    else:
        candidate = collapsed

    if candidate.isalpha() and candidate.isascii():
        candidate = '<olan>'

    candidate = filter_valid_word(candidate)
    if candidate is None:
        return ''

    candidate = candidate.replace(u"\u098c", u"\u09ef")
    candidate = unicodedata.normalize("NFC", candidate)
    candidate = removeOptionalZW(candidate)

    if candidate is None:
        return ''
    return candidate.strip()

def removeOptionalZW(text):
    """
    Strip optional ZWNJ (U+200C) / ZWJ (U+200D) characters from Bangla text.

    Every run matched by STANDARDIZE_ZW is first replaced by a single U+200D;
    afterwards every character matched by DELETE_ZW is removed.
    """
    return DELETE_ZW.sub('', STANDARDIZE_ZW.sub('\u200D', text))

def remove_smileys(text):
    """Return `text` with ASCII emoticons such as ':)', ':-D' or ';-)' deleted."""
    emoticon_re = re.compile(r'(:-?\)|:-?D|;-?\)|:-?P|:-?\(|:-?\/|:-?[oO]|:-?\||:-?\\|:-?S)')
    return emoticon_re.sub('', text)

def remove_punctuation(text, punctiation_extended=string.punctuation + """"।–—৷’”„“‚‘…॥"""):
    """
    Return `text` without any character contained in `punctiation_extended`.

    The default covers ASCII punctuation plus a set of Bangla/typographic marks.
    """
    kept_chars = [ch for ch in text if ch not in punctiation_extended]
    return ''.join(kept_chars)

def replace_numeric(text, numeric_pattern=re.compile('[0-9]+|[\u09E6-\u09EF]+'), repl='#', by_single_digit=True):
    """
    Replace ASCII (0-9) and Bangla (U+09E6-U+09EF) digits in `text` by `repl`.

    Args:
        text: input string.
        numeric_pattern: pattern used when `by_single_digit` is False; by default
            it matches a run of ASCII digits or a run of Bangla digits.
        repl: replacement string (default '#').
        by_single_digit: if True, every single digit is replaced individually
            (`numeric_pattern` is not used); if False, every match of
            `numeric_pattern` is replaced once.

    Returns:
        The string with digits replaced.
    """
    if by_single_digit:
        # This branch used to hard-code '#', silently ignoring a caller-supplied `repl`.
        return re.sub(r'[0-9\u09E6-\u09EF]', repl, text)
    return re.sub(numeric_pattern, repl, text)

#def contains_numeric(text):
#    return any(char.isdigit() for char in text)

def filter_valid_word(text):
    """
    Return `text` if it is one of the placeholder tokens or consists only of
    characters found in `unique_characters_list`; otherwise return '<edge>'.
    """
    if text in ('<num>', '<num_ext>', '<olan>'):
        return text
    for char in text:
        if char not in unique_characters_list:
            return '<edge>'
    return text

In [5]:
# Matches a run of ZWNJ/ZWJ (U+200C/U+200D) that sits between U+09B0 and U+09CD U+09AF.
STANDARDIZE_ZW = re.compile(r'(?<=\u09b0)[\u200c\u200d]+(?=\u09cd\u09af)')

# Matches a single ZWNJ/ZWJ that is neither preceded by U+09B0 nor followed by U+09CD U+09AF.
DELETE_ZW = re.compile(r'(?<!\u09b0)[\u200c\u200d](?!\u09cd\u09af)')

# Matches the punctuation marks that get removed.
PUNC = re.compile(r'([\?\.।;:,!"\'])')


def removeOptionalZW(text):
    """
    Strip optional ZWNJ/ZWJ characters from Bangla text.

    Runs matched by STANDARDIZE_ZW collapse to one U+200D; everything matched by
    DELETE_ZW is then deleted.
    """
    standardized = STANDARDIZE_ZW.sub('\u200D', text)
    return DELETE_ZW.sub('', standardized)


def removePunc(text):
    """Delete every punctuation mark matched by PUNC from `text`."""
    return PUNC.sub(r"", text)

def normalizeUnicode(text, normalize_nukta=True):
    """
    Normalize a Bangla string.

    When `normalize_nukta` is true, each whitespace-separated word is first passed
    through `bnorm`; words for which it yields None are dropped, the rest are
    re-joined with single spaces and U+2047 is replaced by '-'.

    In every case U+098C is replaced by U+09EF, the text is put into Unicode
    Normalization Form C (NFC), optional zero-width joiners are removed and
    punctuation is stripped.
    """
    if normalize_nukta:
        normalized_words = (bnorm(word)['normalized'] for word in text.split())
        text = " ".join(w for w in normalized_words if w is not None)
        text = text.replace("\u2047", "-")

    composed = unicodedata.normalize("NFC", text.replace(u"\u098c", u"\u09ef"))
    return removePunc(removeOptionalZW(composed))

## Saving single lists

In [29]:
# Corpus name -> sentence collection; _save_list() writes one text file per entry.
txt_overview = {
    'kaggle_trains_tokenized_processed': kaggle_trains_tokenized_processed,
    'ekstep_sentences_tokenized_normalized': ekstep_sentences_tokenized_normalized,
    'ai4bharat_data_tokenized_normalized': ai4bharat_data_tokenized_normalized,
    'lm_train_tokenized_processed': lm_train_tokenized_processed,
    'openslr_tokenized_processed': openslr_tokenized_processed,
    # Currently excluded:
    # 'bn_cc_train_tokenized_processed': bn_cc_train_tokenized_processed,
    # 'data_v2': data_v2,
    # 'bangla_nmt': bangla_nmt,
    'saman': saman,
    'llm_kaggle': llm_kaggle,
    'ocsar_kaggle': ocsar_kaggle,
}

In [32]:
txt_overview['saman'][:2]

['গ্রামবাংলার দৃশ্যপট বদলানোর ক্ষেত্রে জমিদারদের ব্যর্থতা সম্পর্কে পন্ডিতদের মধ্যে কোনো দ্বিমত নেই ।',
 'বাড়ি ফিরতে পেরে তিনি খুবই আনন্দিত ।']

In [30]:
def _save_list(txt_overview, out_dir='/media/benedikt/T7/text_corpi_v2'):
    """
    Write every corpus in `txt_overview` to `<out_dir>/<name>.txt`.

    Each sentence is stripped, passed through
    normalizeUnicode(..., normalize_nukta=False) and written on its own line.

    Args:
        txt_overview: mapping of corpus name -> iterable of sentence strings.
        out_dir: target directory; defaults to the previously hard-coded location.
    """
    for name, sentences in txt_overview.items():
        # Explicit UTF-8 so writing Bangla text does not depend on the platform default encoding.
        with open(f'{out_dir}/{name}.txt', 'w', encoding='utf-8') as f:
            for sentence in tqdm(sentences):
                sentence = normalizeUnicode(sentence.strip(), normalize_nukta=False)
                f.write(f"{sentence}\n")

In [33]:
_save_list(txt_overview)

100%|███████████████████████████████| 461997/461997 [00:02<00:00, 179098.09it/s]
100%|███████████████████████████| 60601869/60601869 [08:14<00:00, 122447.79it/s]
100%|███████████████████████████| 30166470/30166470 [04:26<00:00, 113015.04it/s]
100%|███████████████████████████████| 152866/152866 [00:00<00:00, 268504.17it/s]
100%|███████████████████████████████| 108189/108189 [00:00<00:00, 338464.42it/s]
100%|█████████████████████████████| 7715371/7715371 [00:47<00:00, 164051.14it/s]
100%|████████████████████████████| 10882324/10882324 [04:54<00:00, 36950.24it/s]
100%|███████████████████████████| 23312139/23312139 [03:17<00:00, 117927.23it/s]


## Saving large list

In [4]:
## no real improvement, maybe bad data quality or domain off
#i = 0
#bn_cc_train = []
#with open('language_model/bn_cc100.txt', 'r') as file:
#    for line in file:
#       #print(line, end='')  # Each line already ends with a newline character, so specify end='' to avoid printing double newlines
#        bn_cc_train.append(line.strip())
#        i+=1
#        if i % 10000000 == 0:
#            print(f"{i} samples are processed")

In [35]:
def _lm_model(all_sentences, out_path='/media/benedikt/T7/text_full_v2.txt'):
    """
    Write the language-model training corpus: one normalized sentence per line.

    Args:
        all_sentences: iterable of sentence strings.
        out_path: output text file; defaults to the previously hard-coded v2 path.
    """
    # Explicit UTF-8 so writing Bangla text does not depend on the platform default encoding.
    with open(out_path, 'w', encoding='utf-8') as f:
        for sentence in tqdm(all_sentences):
            sentence = normalizeUnicode(sentence.strip(), normalize_nukta=False)
            f.write(f"{sentence}\n")

In [36]:
_lm_model(all_sentences)

100%|█████████████████████████| 113213667/113213667 [18:34<00:00, 101595.23it/s]


In [6]:
from transformers import pipeline
from bnunicodenormalizer import Normalizer
from tqdm import tqdm

In [17]:
# Names (file stems) of the per-corpus text files that are merged into the LM corpus.
source_overview = [
    'kaggle_trains_tokenized_processed',
    'ekstep_sentences_tokenized_normalized',
    'ai4bharat_data_tokenized_normalized',
    'lm_train_tokenized_processed',
    'openslr_tokenized_processed',
    # Excluded: 'bn_cc_train_tokenized_processed', 'data_v2', 'bangla_nmt', 'saman'
    'llm_kaggle',
    # Excluded: 'ocsar_kaggle'  #v4
]

In [18]:
def _read_processed(sourcename):
    i = 0
    sentencelist = []
    with open(f'/media/benedikt/T7/text_corpi_v2/{sourcename}.txt', 'r') as file:
        for line in file:
            sentencelist.append(line.strip())
            #if i % 1000 == 0:
            #    break
    return sentencelist

In [19]:
# Concatenate the sentences of all selected sources into one flat list.
all_sentence_list = []

for source in source_overview:
    print(source)
    temp = _read_processed(source)
    all_sentence_list.extend(temp)

kaggle_trains_tokenized_processed
ekstep_sentences_tokenized_normalized
ai4bharat_data_tokenized_normalized
lm_train_tokenized_processed
openslr_tokenized_processed
saman
llm_kaggle


In [24]:
oscar_list = _read_processed('ocsar_kaggle')

In [20]:
len(all_sentence_list)

110089086

In [21]:
# Drop exact duplicate sentences. Going through a set does not keep the original order.
all_sentence_list = list(set(all_sentence_list))
len(all_sentence_list)

97928703

In [33]:
def _lm_model(all_sentences, out_path='/media/benedikt/T7/text_full_v4.txt'):
    """
    Write the language-model training corpus: one normalized sentence per line.

    Args:
        all_sentences: iterable of sentence strings.
        out_path: output text file; defaults to the previously hard-coded v4 path.
    """
    # Explicit UTF-8 so writing Bangla text does not depend on the platform default encoding.
    with open(out_path, 'w', encoding='utf-8') as f:
        for sentence in tqdm(all_sentences):
            sentence = normalizeUnicode(sentence.strip(), normalize_nukta=False)
            f.write(f"{sentence}\n")

In [34]:
_lm_model(all_sentence_list)

100%|████████████████████████████| 97928703/97928703 [22:41<00:00, 71931.22it/s]


In [None]:
#v2
#kaggle_trains_tokenized_processed
#ekstep_sentences_tokenized_normalized
#ai4bharat_data_tokenized_normalized
#lm_train_tokenized_processed
#openslr_tokenized_processed
#saman
#llm_kaggle
#ocsar_kaggle

#v3
#kaggle_trains_tokenized_processed
#ekstep_sentences_tokenized_normalized
#ai4bharat_data_tokenized_normalized
#lm_train_tokenized_processed
#openslr_tokenized_processed
#saman
#llm_kaggle

## Building the language model (KenLM n-gram)

In [82]:
!kenlm/build/bin/lmplz --help

Builds unpruned language models with modified Kneser-Ney smoothing.

Please cite:
@inproceedings{Heafield-estimate,
  author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},
  title = {Scalable Modified {Kneser-Ney} Language Model Estimation},
  year = {2013},
  month = {8},
  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},
  address = {Sofia, Bulgaria},
  url = {http://kheafield.com/professional/edinburgh/estimate\_paper.pdf},
}

Provide the corpus on stdin.  The ARPA file will be written to stdout.  Order of
the model (-o) is the only mandatory option.  As this is an on-disk program,
setting the temporary file location (-T) and sorting memory (-S) is recommended.

Memory sizes are specified like GNU sort: a number followed by a unit character.
Valid units are % for percentage of memory (supported platforms only) and (in
increasing powers of 1024): b, K, M, G, T, P, E, Z, Y.  Default is K (*1024).

In [127]:
#!kenlm/build/bin/lmplz -o 5 --prune 1 1 1 1 1 -S 60% < "/media/benedikt/T7/text_90m.txt" > "/media/benedikt/T7/5gram.arpa"

In [2]:
!head -20 /media/benedikt/T7/5gram.arpa

\data\
ngram 1=8147238
ngram 2=165829289
ngram 3=605263100
ngram 4=946321989
ngram 5=1067225196

\1-grams:
-8.263013	<unk>	0
0	<s>	-1.8262564
-2.1241102	</s>	0
-3.2409623	কিন্তু	-0.82435185
-3.247854	সে	-0.920128
-4.239222	রাজি	-0.9719501
-3.0399382	না	-0.9857863
-4.718111	হওয়াতে	-0.4159398
-5.0647545	সহযোগীদের	-0.46949044
-3.0452187	নিয়ে	-1.110584
-5.8365865	সিফাতকে	-0.32493073
-3.1894114	অনেক	-1.0336629


In [3]:
def _arpa_has_eos(arpa_path):
    """Return True if the \\1-grams: section of the ARPA file already lists '</s>'."""
    in_unigrams = False
    with open(arpa_path, "r", encoding="utf-8") as arpa:
        for line in arpa:
            entry = line.strip()
            if entry == "\\1-grams:":
                in_unigrams = True
            elif in_unigrams:
                if entry.startswith("\\"):
                    # reached the next section header; the unigram list is over
                    break
                fields = entry.split()
                if len(fields) >= 2 and fields[1] == "</s>":
                    return True
    return False


arpa_in = "/media/benedikt/T7/5gram.arpa"
arpa_out = "/media/benedikt/T7/5gram_correct.arpa"

# Only insert an end-of-sentence unigram when the model does not have one yet.
# Inserting it unconditionally duplicates '</s>' (with log-prob 0) and inflates
# the unigram count whenever lmplz already emitted that token.
has_added_eos = _arpa_has_eos(arpa_in)

with open(arpa_in, "r", encoding="utf-8") as read_file, open(arpa_out, "w", encoding="utf-8") as write_file:
    for line in read_file:
        if not has_added_eos and "ngram 1=" in line:
            count = line.strip().split("=")[-1]
            # Rebuild the header instead of str.replace(), which would also
            # rewrite the "1" of "ngram 1=" when the count itself is "1".
            write_file.write(f"ngram 1={int(count) + 1}\n")
        elif not has_added_eos and "<s>" in line:
            write_file.write(line)
            write_file.write(line.replace("<s>", "</s>"))
            has_added_eos = True
        else:
            write_file.write(line)

In [4]:
!kenlm/build/bin/build_binary -S 60% /media/benedikt/T7/5gram_correct.arpa 5gram.binary

Reading /media/benedikt/T7/5gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
SUCCESS


In [None]:
#!kenlm/build/bin/build_binary -S 60% /media/benedikt/T7/5gram_correct.arpa /media/benedikt/T7/5gram.binary

In [5]:
from pyctcdecode import BeamSearchDecoderCTC
from pyctcdecode import build_ctcdecoder

from transformers import pipeline
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor,Wav2Vec2ProcessorWithLM
from transformers import Wav2Vec2ForCTC
from pyctcdecode import build_ctcdecoder
from bnunicodenormalizer import Normalizer 
import librosa
from jiwer import wer
from transformers import Wav2Vec2ProcessorWithLM,pipeline


# Processor of the acoustic model; its tokenizer vocabulary provides the decoder labels.
base_processor = Wav2Vec2Processor.from_pretrained("best_models/best_v20_processor")

# Vocabulary ordered by token id.
vocab_dict = dict(sorted(base_processor.tokenizer.get_vocab().items(), key=lambda item: item[1]))

# CTC beam-search decoder backed by the (text) ARPA language model.
decoder = build_ctcdecoder(
    labels=[*vocab_dict],
    kenlm_model_path='/media/benedikt/T7/5gram_correct.arpa',
    #alpha=0.2, #alpha 0.2 and beta 0.5 ->
    #beta=0.2
)

# Bundle feature extractor, tokenizer and LM decoder, then persist the bundle.
processor = Wav2Vec2ProcessorWithLM(
    feature_extractor=base_processor.feature_extractor,
    tokenizer=base_processor.tokenizer,
    decoder=decoder,
)
processor.save_pretrained("/media/benedikt/T7/lms/new_model_arpa")

Loading the LM will be faster if you build a binary file.
Reading /media/benedikt/T7/5gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
Unigrams and labels don't seem to agree.


In [6]:
from pyctcdecode import BeamSearchDecoderCTC
from pyctcdecode import build_ctcdecoder

from transformers import pipeline
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor,Wav2Vec2ProcessorWithLM
from transformers import Wav2Vec2ForCTC
from pyctcdecode import build_ctcdecoder
from bnunicodenormalizer import Normalizer 
import librosa
from jiwer import wer
from transformers import Wav2Vec2ProcessorWithLM,pipeline


# Processor of the acoustic model; its tokenizer vocabulary provides the decoder labels.
base_processor = Wav2Vec2Processor.from_pretrained("best_models/best_v20_processor")

# Vocabulary ordered by token id.
vocab_dict = dict(sorted(base_processor.tokenizer.get_vocab().items(), key=lambda item: item[1]))

# CTC beam-search decoder backed by the binary KenLM model.
decoder = build_ctcdecoder(
    labels=[*vocab_dict],
    kenlm_model_path='5gram.binary',
    #alpha=0.2, #alpha 0.2 and beta 0.5 ->
    #beta=0.2
)

# Bundle feature extractor, tokenizer and LM decoder, then persist the bundle.
processor = Wav2Vec2ProcessorWithLM(
    feature_extractor=base_processor.feature_extractor,
    tokenizer=base_processor.tokenizer,
    decoder=decoder,
)
processor.save_pretrained("/media/benedikt/T7/lms/new_model_bin_mixed")

Unigrams not provided and cannot be automatically determined from LM file (only arpa format). Decoding accuracy might be reduced.
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
No known unigrams provided, decoding results might be a lot worse.


In [4]:
from datasets import load_dataset

bn_wiki = load_dataset("wikipedia", language="bn", date="20230920", beam_runner='DirectRunner')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/367M [00:00<?, ?B/s]



  0%|          | 0/1 [00:00<?, ?parquet files/s]

  0%|          | 0/1 [00:00<?, ?shards/s]

In [14]:
#from unidecode import unidecode

In [29]:
bnorm = Normalizer()

In [2]:
# Collect the first tab-separated column (the word) of every lexicon line.
lexicon_list = []
i = 0
# Explicit UTF-8 so reading Bangla text does not depend on the platform default encoding.
with open('language_model/lexicon.lst', 'r', encoding='utf-8') as file:
    for i, line in enumerate(file, start=1):
        # rstrip keeps the trailing newline out of the word when a line has no tab
        lexicon_list.append(line.rstrip('\n').split('\t')[0])
        if i % 100000 == 0:
            print(f"{i} samples are processed")

100000 samples are processed


In [58]:
lexicon_list

['আধিকারিকও',
 'শতরঞ্চি',
 'মোজাম্মেলের',
 'অন্তরার',
 'মাস্টারশেফ',
 'পৌরবাসীর',
 'মেজবাহউদ্দিন',
 'প্ল্যানিং',
 'নির্দলীয়',
 'রঙ্গরাজন',
 'খুকুমণি',
 'দোহারনবাবগঞ্জ',
 'পাহাৰৰ',
 'কদাকার',
 'নাস্তিক্যবাদী',
 'স্বর্ণযুগে',
 'মাঘে',
 'মেজারমেন্ট',
 'সিদুঁর',
 'তবলছড়ি',
 'অর্ধেন্দুকুমার',
 'মেহেদীবাগানের',
 'পরিবেদনা',
 'চেয়ারম্যানও',
 'শিল্পকলায়',
 'ধানসহ',
 'এনবিসিসি',
 'বেহেশতে',
 'ডিব্রুইনের',
 'এম্পায়ার',
 'ছত্রিশচল্লিশ',
 'কুকুরগুলিকে',
 'সুখকর',
 'দুর্বৃত্তদের',
 'ছাত্র।',
 'স্পৃষ্টে',
 'বিঘার',
 'রাতভোর',
 'শেষাবধি',
 'পুশকিন',
 'সাদাকাতুল',
 'জেনিংসের',
 'মেধাতালিকা',
 'আচরণের',
 'আম্বিয়া',
 'খাতিরযত্ন',
 'পদ্মপাতা',
 'শুনিয়েই',
 'ব্যাঙ্গালুরু',
 'অ্যানাদার',
 'দেনে',
 'আবেগভরে',
 'সাঙ্গীতিক',
 'আইইউটি',
 'কৃষিজমির',
 'ফুটা',
 'নবাবপুরের',
 'শুভদিন',
 'রমনীর',
 'এজাহারনামীয়',
 'কুবায়',
 'রেকর্ডিংয়ে',
 'বিড়লার',
 'নৌকাধানের',
 'নালিতাবাড়ি',
 'চিত্রনাট্যকার',
 'রাজপুত,',
 'কৃষিবিজ্ঞানী',
 'জন্মকালীন',
 'চেজের',
 'মাওয়া',
 'রথীন্দ্র',
 'শুয়েছিলেন',
 '১২৮টি',
 'ভোটযুদ্ধে',
 

In [104]:
# Load the unigram list (one entry per line) stored next to the saved LM processor.
uni_list = []
i = 0
# Explicit UTF-8 so reading Bangla text does not depend on the platform default encoding.
with open('/media/benedikt/T7/lms/v21_prune_ver/new_model_bin_mixed/language_model/unigrams.txt', 'r', encoding='utf-8') as file:
    for i, line in enumerate(file, start=1):
        uni_list.append(line.strip())
        if i % 1000000 == 0:
            print(f"{i} samples are processed")

1000000 samples are processed
2000000 samples are processed
3000000 samples are processed


In [126]:
uni_list[-10000:]

['☑–',
 '☑️',
 '☑️#',
 '☕',
 '☕☕',
 '☕☕আমি',
 '☕☕হাই',
 '☘',
 '☘️',
 '☛',
 '☞',
 '☞#',
 '☞এখন',
 '☞জাতিসংঘ',
 '☞দোকানদার',
 '☞বঙ্গভঙ্গ',
 '☞বেগম',
 '☞ভদ্র',
 '☞শ্যায়েখ',
 '☞সিটি',
 '☞সিম',
 '☞সুয়েজ',
 '☞১ম',
 '☞২য়',
 '☞☞',
 '☠',
 '☠️সাথেhksyshsh',
 '☢',
 '☢☢',
 '☢☢☢',
 '☪',
 '☯',
 '☯☯',
 '☰',
 '☴☴',
 '☴☴☴',
 '☴♕',
 '☴❀❀',
 '☷',
 '☸️☸️☸️',
 '☹',
 '☹️',
 '☹️☹️',
 '☺',
 '☺এখানে',
 '☺সাধারণত',
 '☺☺',
 '☺☺☺',
 '☺️',
 '☻',
 '☻ঘোড়ার',
 '☻নামকরণঃ',
 '☻প্রবক্তাঃ',
 '☻বলদের',
 '☻বায়ুপ্রবাহ',
 '☻বুৎপত্তিগত',
 '☻ব্যুৎপত্তিগত',
 '☻ভূপৃষ্ঠের',
 '☻মস্তক',
 '☻সংজ্ঞাঃ',
 '☼',
 '☼►',
 '☼☼',
 '☾',
 '♀',
 '♀️',
 '♂',
 '♂♀',
 '♄',
 '♇',
 '♈',
 '♊',
 '♋',
 '♌',
 '♍',
 '♐',
 '♔',
 '♕',
 '♕anisa',
 '♕আব্দুল্লাহ',
 '♕আসাদুল্লাহ',
 '♕ইফতেখার',
 '♕জামিনুল',
 '♕মোঃআশরাফ',
 '♕☴',
 '♗',
 '♘',
 '♙',
 '♚',
 '♛',
 '♝',
 '♞',
 '♟',
 '♠',
 '♠#',
 '♠অবদমিত',
 '♠উত্তরে',
 '♠পশ্চিমে',
 '♠পাবলিক',
 '♠পূর্বে',
 '♠বঙ্গবন্ধু',
 '♠♠',
 '♠♠♠',
 '♡',
 '♡♡',
 '♣',
 '♣অমর',
 '♣আপনার',
 '♣আমি',
 '♣আশা',
 '♣কিভাবে',
 '♣খাস',
 '♣

In [59]:
import random
# Fixed seed so the 1,000,000-entry sample drawn from the unigram list is reproducible.
random.seed(42)
uni_sample = random.sample(uni_list, 1000000)

In [70]:
# Normalize every unigram, including the per-word bnorm pass (normalize_nukta=True).
norm_list = [normalizeUnicode(x, normalize_nukta=True) for x in uni_list]

In [71]:
# Drop entries that became empty after normalization.
norm_list_v2 = [x for x in norm_list if x != '']

In [72]:
# Unique normalized entries; built via a set, so the resulting order is arbitrary.
norm_list_set = list(set(norm_list_v2))

In [73]:
print(len(norm_list_v2))
print(len(norm_list_set))

7974788
7497262


In [76]:
len(norm_list_set) / len(norm_list)

0.8959504145761987

In [79]:
norm_list_set

['ক্লিক১৯',
 'সেনেগালেই',
 'বলিয়ু',
 'মোসকি',
 'অলেগ',
 'জমজমটি',
 'ব্যাসকে',
 'ফলতই',
 'ফোনেনা',
 'গিয়েছিললিংকনের',
 'পাকিস্তানপক্ষের',
 'মুসয়াবকে',
 'আয়াত২৫',
 'চড়ীশ্বরের',
 'আছিম',
 'কুতুবগঞ্জ',
 'পাকালিয়া',
 'শাহজানান',
 'মেয়েটাসেই',
 'রিভিউধর্মী',
 'বলে—পড়',
 'প্রহরীদিগের',
 'ভেলকিবজিতে',
 'উসিলাও',
 'কাষ্ঠবিড়ালের',
 'কাইজ্যাকেলেংকারী',
 'সংগৃহীতনির্ভুল',
 'বিএইচএএল',
 'ফ্রেফতারও',
 'ভ্রাতৃঘাতক',
 'পদান্তরিত',
 'অনকুশ',
 'সুন্দরকি',
 'অতিবাদকে',
 'কথকচূড়ামণি',
 'সেনাপ্রধানেরও',
 'যীশাইও',
 'হৃৎপিণ্ডসহ',
 'হোল্ডেনরাইড',
 'আলবাকী',
 'হুব্বা',
 'ইসিককুলের',
 'কাচ্চিসড়ক',
 'থায়োমাইড',
 'বাক্সংযমের',
 'শাসনপ্রণালিকে',
 'মধুনিয়া',
 'প্রাণপাথরের',
 'হবুমাস্টার',
 'বিজয়াশান',
 '১৮৮৬টি',
 'শিল্পসিটি',
 'প্যাপিরাইটি',
 'গঙ্গাধরণকে',
 'এন্টিলিউকট্রিন',
 'পপরিস্থিতি',
 'সুরিদাস',
 'চড়বেননা',
 'আইরের',
 'তাজবাগের',
 '১৬৫৬২১৩',
 'পিঁপড়েরের',
 'মুর্শিদাবাদকেন্দ্রিক',
 'ভালনারাবল',
 'উচ্চাকাঙ্ক্ষাটিও',
 'এন্ডেয়ারমেন্ট',
 'চুলহএই',
 '২৩০৪১৮',
 'আজারমাহের',
 'দূলিতলে',
 'মোর্সেলিন'

# Language model (n-gram LM, versioned build)

In [1]:
from tqdm import tqdm
from transformers import pipeline
from bnunicodenormalizer import Normalizer
from tqdm import tqdm
import random
random.seed(42)

current_version = 'v18'

In [10]:
# Names (file stems) of the per-corpus text files loaded for this LM version.
source_overview = [
    'kaggle_trains_tokenized_processed',
    'ekstep_sentences_tokenized_normalized',
    'ai4bharat_data_tokenized_normalized',
    'lm_train_tokenized_processed',
    'openslr_tokenized_processed',
    'llm_kaggle',
    'newspaper',
    'bible',
]

In [11]:
def _read_processed(sourcename):
    i = 0
    sentencelist = []
    with open(f'/media/benedikt/T7/text_corpi_v2/{sourcename}.txt', 'r') as file:
        for line in file:
            sentencelist.append(line.strip())
            #if i % 1000 == 0:
            #    break
    return sentencelist

In [12]:
# Source name -> list of its sentences, kept per source so single corpora can be subsampled later.
all_sentence_list = dict()

for source in source_overview:
    print(source)
    all_sentence_list.update({source: _read_processed(source)})

kaggle_trains_tokenized_processed
ekstep_sentences_tokenized_normalized
ai4bharat_data_tokenized_normalized
lm_train_tokenized_processed
openslr_tokenized_processed
llm_kaggle
newspaper
bible


In [13]:
len(all_sentence_list['llm_kaggle'])

10882324

In [15]:
kaggle_lm_small = random.sample(all_sentence_list['llm_kaggle'], 3_000_000)

In [16]:
len(all_sentence_list['newspaper'])

40140614

In [17]:
newspaper_small = random.sample(all_sentence_list['newspaper'], 10_000_000)

In [18]:
# Merge the fully used corpora with the down-sampled newspaper and Kaggle LM subsets.
new_all_sentences = [
    *all_sentence_list['kaggle_trains_tokenized_processed'],
    *all_sentence_list['ekstep_sentences_tokenized_normalized'],
    *all_sentence_list['ai4bharat_data_tokenized_normalized'],
    *all_sentence_list['lm_train_tokenized_processed'],
    *all_sentence_list['openslr_tokenized_processed'],
    *newspaper_small,
    *kaggle_lm_small,
    *all_sentence_list['bible'],
]

In [19]:
len(new_all_sentences)

104524992

In [20]:
# Drop exact duplicate sentences. Going through a set does not keep the original order.
new_all_sentences = list(set(new_all_sentences))
len(new_all_sentences)

98796267

In [21]:
def _lm_model_dw(all_sentences, out_path=None):
    """
    Write the language-model training corpus: one normalized sentence per line.

    Args:
        all_sentences: iterable of sentence strings.
        out_path: output text file. When None (default) the path is derived from
            the module-level `current_version`, as before.
    """
    if out_path is None:
        out_path = f'/media/benedikt/T7/text_full_{current_version}_base.txt'

    # Explicit UTF-8 so writing Bangla text does not depend on the platform default encoding.
    with open(out_path, 'w', encoding='utf-8') as f:
        for sentence in tqdm(all_sentences):
            sentence = normalizeUnicode(sentence.strip(), normalize_nukta=False)
            f.write(f"{sentence}\n")

In [22]:
_lm_model_dw(new_all_sentences)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 98796267/98796267 [14:23<00:00, 114419.44it/s]


In [23]:
import os
# makedirs(..., exist_ok=True) keeps this cell re-runnable and also creates missing parent directories.
os.makedirs(f"/media/benedikt/T7 Shield/llms/{current_version}", exist_ok=True)

In [2]:
!kenlm/build/bin/lmplz -o 5 --prune 0 0 0 2 2 -S 60% --discount_fallback < "/media/benedikt/T7/text_full_{current_version}_base.txt" > "/media/benedikt/T7 Shield/llms/{current_version}/5gram.arpa"

=== 1/5 Counting and sorting n-grams ===
Reading /media/benedikt/T7/text_full_v18_base.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 1447276556 types 8191038
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:98292456 2:7891704320 3:14796946432 4:23675113472 5:34526208000
Statistics:
1 8191038 D1=0.726861 D2=1.03416 D3+=1.30096
2 158007309 D1=0.751129 D2=1.08441 D3+=1.34914
3 564499091 D1=0.833052 D2=1.18366 D3+=1.37207
4 54602021/871497640 D1=0.898842 D2=1.31814 D3+=1.44294
5 36815061/972493915 D1=0.82901 D2=1.46179 D3+=1.78006
Memory estimate for binary LM:
type       MB
probing 18621 assuming -p 1.5
probing 23099 assuming -r models -p 1.5
trie    10875 without quantization
trie     6420 assuming -q 8 -b 8 quantization 
trie     8895 assuming -a 22 array pointer compression
trie     4440 assuming

In [3]:
def _arpa_has_eos(arpa_path):
    """Return True if the \\1-grams: section of the ARPA file already lists '</s>'."""
    in_unigrams = False
    with open(arpa_path, "r", encoding="utf-8") as arpa:
        for line in arpa:
            entry = line.strip()
            if entry == "\\1-grams:":
                in_unigrams = True
            elif in_unigrams:
                if entry.startswith("\\"):
                    # reached the next section header; the unigram list is over
                    break
                fields = entry.split()
                if len(fields) >= 2 and fields[1] == "</s>":
                    return True
    return False


arpa_in = f"/media/benedikt/T7 Shield/llms/{current_version}/5gram.arpa"
arpa_out = f"/media/benedikt/T7 Shield/llms/{current_version}/5gram_correct.arpa"

# Only insert an end-of-sentence unigram when the model does not have one yet.
# Inserting it unconditionally duplicates '</s>' (with log-prob 0) and inflates
# the unigram count whenever lmplz already emitted that token.
has_added_eos = _arpa_has_eos(arpa_in)

with open(arpa_in, "r", encoding="utf-8") as read_file, open(arpa_out, "w", encoding="utf-8") as write_file:
    for line in read_file:
        if not has_added_eos and "ngram 1=" in line:
            count = line.strip().split("=")[-1]
            # Rebuild the header instead of str.replace(), which would also
            # rewrite the "1" of "ngram 1=" when the count itself is "1".
            write_file.write(f"ngram 1={int(count) + 1}\n")
        elif not has_added_eos and "<s>" in line:
            write_file.write(line)
            write_file.write(line.replace("<s>", "</s>"))
            has_added_eos = True
        else:
            write_file.write(line)

In [4]:
!kenlm/build/bin/build_binary -S 60% "/media/benedikt/T7 Shield/llms/{current_version}/5gram_correct.arpa" "/media/benedikt/T7 Shield/llms/{current_version}/5gram.binary"

Reading /media/benedikt/T7 Shield/llms/v18/5gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
SUCCESS


In [5]:
from pyctcdecode import BeamSearchDecoderCTC
from pyctcdecode import build_ctcdecoder

from transformers import pipeline
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor,Wav2Vec2ProcessorWithLM
from transformers import Wav2Vec2ForCTC
from pyctcdecode import build_ctcdecoder
from bnunicodenormalizer import Normalizer 
import librosa
from jiwer import wer
from transformers import Wav2Vec2ProcessorWithLM,pipeline


# Processor of the acoustic model; its tokenizer vocabulary provides the decoder labels.
base_processor = Wav2Vec2Processor.from_pretrained("best_models/best_v20_processor")

# Vocabulary ordered by token id.
vocab_dict = dict(sorted(base_processor.tokenizer.get_vocab().items(), key=lambda item: item[1]))

# CTC beam-search decoder backed by the (text) ARPA language model of this version.
decoder = build_ctcdecoder(
    labels=[*vocab_dict],
    kenlm_model_path=f'/media/benedikt/T7 Shield/llms/{current_version}/5gram_correct.arpa',
    #alpha=0.2, #alpha 0.2 and beta 0.5 ->
    #beta=0.2
)

# Bundle feature extractor, tokenizer and LM decoder, then persist the bundle.
processor = Wav2Vec2ProcessorWithLM(
    feature_extractor=base_processor.feature_extractor,
    tokenizer=base_processor.tokenizer,
    decoder=decoder,
)
processor.save_pretrained(f"/media/benedikt/T7 Shield/llms/{current_version}/new_model_arpa")

Loading the LM will be faster if you build a binary file.
Reading /media/benedikt/T7 Shield/llms/v18/5gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
Unigrams and labels don't seem to agree.


In [6]:
from pyctcdecode import BeamSearchDecoderCTC
from pyctcdecode import build_ctcdecoder

from transformers import pipeline
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor,Wav2Vec2ProcessorWithLM
from transformers import Wav2Vec2ForCTC
from pyctcdecode import build_ctcdecoder
from bnunicodenormalizer import Normalizer 
import librosa
from jiwer import wer
from transformers import Wav2Vec2ProcessorWithLM,pipeline


# Processor of the acoustic model; its tokenizer vocabulary provides the decoder labels.
base_processor = Wav2Vec2Processor.from_pretrained("best_models/best_v20_processor")

# Vocabulary ordered by token id.
vocab_dict = dict(sorted(base_processor.tokenizer.get_vocab().items(), key=lambda item: item[1]))

# CTC beam-search decoder backed by the binary KenLM model of this version.
decoder = build_ctcdecoder(
    labels=[*vocab_dict],
    kenlm_model_path=f'/media/benedikt/T7 Shield/llms/{current_version}/5gram.binary',
    #alpha=0.2, #alpha 0.2 and beta 0.5 ->
    #beta=0.2
)

# Bundle feature extractor, tokenizer and LM decoder, then persist the bundle.
processor = Wav2Vec2ProcessorWithLM(
    feature_extractor=base_processor.feature_extractor,
    tokenizer=base_processor.tokenizer,
    decoder=decoder,
)
processor.save_pretrained(f"/media/benedikt/T7 Shield/llms/{current_version}/new_model_bin_mixed")

Unigrams not provided and cannot be automatically determined from LM file (only arpa format). Decoding accuracy might be reduced.
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
No known unigrams provided, decoding results might be a lot worse.
