In [29]:
from datasets import load_dataset
import nltk
import utils
import markov_chains

# Language modeling

In [2]:
# Toy corpus: three sentences, each already wrapped in explicit
# <s> ... </s> sentence-boundary markers.
corpus=[
    "<s> I am Sam </s>",
    "<s> Sam I am </s>",
    "<s> I do not like green eggs and ham </s>"
]

In [3]:
# Unigram (n=1) table for the toy corpus, tokenized with utils.split_tokenize.
# Per the displayed result: one row per sentence, one column per token.
bag_of_words = utils.bag_of_ngrams(corpus=corpus, n=1, tokenize_function=utils.split_tokenize)


1grams: 100%|██████████| 3/3 [00:00<00:00, 20068.44it/s]
bag of 1grams: 3it [00:00, 1660.67it/s]


In [4]:
bag_of_words

Unnamed: 0,<s>,I,</s>,am,Sam,do,not,like,green,eggs,and,ham
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Bigram (n=2) table for the toy corpus, tokenized with utils.split_tokenize.
# Per the displayed result: one row per sentence, one column per "w1 w2" pair.
bigram = utils.bag_of_ngrams(corpus=corpus, n=2, tokenize_function=utils.split_tokenize)

2grams: 100%|██████████| 3/3 [00:00<00:00, 17898.88it/s]
bag of 2grams: 3it [00:00, 1308.00it/s]


In [6]:
bigram

Unnamed: 0,<s> I,I am,am Sam,Sam </s>,<s> Sam,Sam I,am </s>,I do,do not,not like,like green,green eggs,eggs and,and ham,ham </s>
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
def ngrams_compute_pw1_knowing_w2(w2, w1, bag_of_words, bigram):
    """Print the bigram estimate p(w2|w1).

    Despite the function's name, the value computed is the probability of
    ``w2`` given the preceding word ``w1``::

        p(w2|w1) = sum(bigram["w1 w2"]) / sum(bag_of_words[w1])

    A bigram that has no column in ``bigram`` (i.e. it was never observed)
    is treated as having a total of 0, so its probability is reported as 0
    instead of raising ``KeyError``. A zero denominator is likewise reported
    as probability 0 rather than producing NaN/inf.

    Args:
        w2: Second word of the bigram (the word being predicted).
        w1: First word of the bigram (the word conditioned on).
        bag_of_words: Column-indexable table whose ``w1`` column is summed to
            get the denominator (presumably the DataFrame returned by
            ``utils.bag_of_ngrams(n=1)`` -- confirm against that helper).
        bigram: Column-indexable table whose ``"w1 w2"`` column is summed to
            get the numerator (presumably from ``utils.bag_of_ngrams(n=2)``).

    Returns:
        None. The result is printed as ``p(w2|w1)=<value>`` with 4 decimals.

    Raises:
        KeyError: If ``w1`` is not a column of ``bag_of_words`` (assuming a
            mapping/DataFrame-like table) -- the context word is unknown, so
            no estimate can be made.
    """
    bigram_key = f"{w1} {w2}"
    # Unseen bigram -> no column in the table -> total of zero.
    bigram_total = bigram[bigram_key].sum() if bigram_key in bigram else 0.0
    context_total = bag_of_words[w1].sum()
    # Guard the division so an all-zero context column yields 0 instead of NaN.
    p = bigram_total / context_total if context_total else 0.0
    print(f"p({w2}|{w1})={p:.4f}")

In [8]:
# (w1, w2) pairs: each one is printed as p(w2|w1), in this order.
bigram_queries = [
    ("<s>", "I"),
    ("Sam", "</s>"),
    ("green", "eggs"),
    ("am", "Sam"),
    ("I", "am"),
    ("I", "do"),
]
for context_word, following_word in bigram_queries:
    ngrams_compute_pw1_knowing_w2(
        w2=following_word,
        w1=context_word,
        bigram=bigram,
        bag_of_words=bag_of_words,
    )

p(I|<s>)=0.6667
p(</s>|Sam)=0.5000
p(eggs|green)=1.0000
p(Sam|am)=0.5000
p(am|I)=0.6667
p(do|I)=0.3333


In [9]:
# split_tokenize returns two values: the tokenized sentences (displayed below)
# and words_freq, whose keys are later used as the Markov chain's states.
tokenized_corpus, words_freq = utils.split_tokenize(corpus=corpus)
tokenized_corpus

[['<s>', 'I', 'am', 'Sam', '</s>'],
 ['<s>', 'Sam', 'I', 'am', '</s>'],
 ['<s>', 'I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham', '</s>']]

In [10]:
# One Markov state per distinct token of the toy corpus (boundary markers included).
states = list(words_freq.keys())
markov_model = markov_chains.MarkovChains(states=states)

In [11]:
markov_model.states

['</s>',
 '<s>',
 'I',
 'Sam',
 'am',
 'and',
 'do',
 'eggs',
 'green',
 'ham',
 'like',
 'not']

In [12]:
# Fit on the tokenized sentences, then show the resulting transition matrix.
# In the displayed result the first row (state '</s>', which never has a
# successor in the corpus) is uniform: 1/12 for each of the 12 states.
markov_model.fit(tokenized_corpus=tokenized_corpus)
markov_model.transition_matrix

array([[0.08333334, 0.08333334, 0.08333334, 0.08333334, 0.08333334,
        0.08333334, 0.08333334, 0.08333334, 0.08333334, 0.08333334,
        0.08333334, 0.08333334],
       [0.        , 0.        , 0.6666667 , 0.33333334, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.6666667 ,
        0.        , 0.33333334, 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.5       , 0.        , 0.5       , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.5       , 0.        , 0.        , 0.5       , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        ,

In [13]:
# Sanity check: each row of the transition matrix should sum to 1
# (up to float32 rounding, as the displayed 0.99999994 shows).
markov_model.transition_matrix.sum(axis=1)

array([0.99999994, 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        ], dtype=float32)

In [14]:
print("Most probabale next states:")
for current_state in markov_model.states:
    # predict_next_state yields the predicted successor and an array of
    # probabilities (presumably the transition row for current_state); its
    # maximum is reported as the probability of the prediction.
    next_state, probas = markov_model.predict_next_state(current_state=current_state)
    # Conditional-probability notation is p(next | current): the predicted
    # state goes left of the bar, the state conditioned on goes right. The
    # label previously had the two swapped, contradicting the n-gram cell
    # above, which prints the same quantity as p(w2|w1).
    print(f"Current state: [{current_state}]".ljust(23)
        + f"| next state: [{next_state}]".ljust(22)
        + f"| with proba: p({next_state}|{current_state})={probas.max():.4f}")

Most probabale next states:
Current state: [</s>]  | next state: [</s>]  | with proba: p(</s>|</s>)=0.0833
Current state: [<s>]   | next state: [I]     | with proba: p(<s>|I)=0.6667
Current state: [I]     | next state: [am]    | with proba: p(I|am)=0.6667
Current state: [Sam]   | next state: [</s>]  | with proba: p(Sam|</s>)=0.5000
Current state: [am]    | next state: [</s>]  | with proba: p(am|</s>)=0.5000
Current state: [and]   | next state: [ham]   | with proba: p(and|ham)=1.0000
Current state: [do]    | next state: [not]   | with proba: p(do|not)=1.0000
Current state: [eggs]  | next state: [and]   | with proba: p(eggs|and)=1.0000
Current state: [green] | next state: [eggs]  | with proba: p(green|eggs)=1.0000
Current state: [ham]   | next state: [</s>]  | with proba: p(ham|</s>)=1.0000
Current state: [like]  | next state: [green] | with proba: p(like|green)=1.0000
Current state: [not]   | next state: [like]  | with proba: p(not|like)=1.0000


In [15]:
print("Generate sequences:")
# (label, start state) pairs, generated in this order; None means no explicit
# start state is passed to the model.
for label, start_state in (("Seq1", None), ("Seq2", "<s>"), ("Seq3", "I")):
    generated_tokens = markov_model.generate(start=start_state, length=10)
    print(f"{label}:", " ".join(generated_tokens))

Generate sequences:
Seq1: <s> I am </s> </s> </s> </s> </s> </s> </s>
Seq2: <s> I am </s> </s> </s> </s> </s> </s> </s>
Seq3: I am </s> </s> </s> </s> </s> </s> </s> </s>


In [16]:
print("Evaluation:")

# Each entry: (task name, well-formed sentence, degraded variant of it).
data = [
    ("machine translation", "<s> I am", "<s> am I"),
    ("spell correction", "<s> I do not like", "<s> I do not lik"),
]

for task, reference, variant in data:
    # Score both sentences with the fitted chain, reference first.
    reference_score, variant_score = [
        markov_model.score_sequence(sequence=sentence.split())
        for sentence in (reference, variant)
    ]
    padded_task = task.ljust(20)
    print(f"{padded_task}: p({reference})={reference_score:.4f} > p({variant})={variant_score:.4f}")

Evaluation:
machine translation : p(<s> I am)=0.4444 > p(<s> am I)=0.0000
spell correction    : p(<s> I do not like)=0.2222 > p(<s> I do not lik)=0.0000


In [17]:
tokenized_corpus

[['<s>', 'I', 'am', 'Sam', '</s>'],
 ['<s>', 'Sam', 'I', 'am', '</s>'],
 ['<s>', 'I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham', '</s>']]

Using Wolof corpus

In [18]:
# Load the "train" split of the galsenai/french-wolof-translation dataset via
# datasets.load_dataset; the displayed summary lists its columns and row count.
raw_data = load_dataset("galsenai/french-wolof-translation")["train"]
raw_data

Dataset({
    features: ['french', 'wolof', 'sources'],
    num_rows: 17777
})

In [19]:
# Keep only the 'wolof' column and preview the first ten sentences
# (some end with a trailing newline, as the preview shows).
wolof_corpus = raw_data["wolof"]
wolof_corpus[:10]

['Bataaxal bii jëwriñu lu ajju ci mbiru bitim réew bu Ekuwatër moo ko wara wóoral, te dafa wara mengoo ak yenni càkuteef yi.\n',
 '"Amuñu woon benn jot ngir rawal sunu bopp.\n',
 'Ekost sa dëkk la te bëggna nu nga toog fi."\n',
 'Xibaari Jotna : Espaañ joxe na juróom-ñett-fukki milyaar ak ñeent ci xaalisu Seefa ngir dimbalee ko Senegaal.\n',
 'ñaata at nga am',
 'Abu Usmaan Si, Mamadu Yoro Jàllo ak Usmaan Njaay ñoo faatu, ci doxu nemmeeku koom-koom gi bu Maki Sàll bi.\n',
 'Ni ñu ko tàmm a waxe fii : boo xamatul foo jëm, dellul fa nga jóge woon, baax na lool ñu dellu fa ñu jóge woon ngir bégal way-jëfandiku yi.\n',
 'Zambie réew la mu bokk ci ONU, Union Africaine, ak Southern African Development Community (SADC).',
 'tooy',
 'Ay teemeeri nit faatu ca donu bu Nias, ci tefesu Sumatra.\n']

In [20]:
# NOTE: this rebinds words_freq, replacing the value computed earlier for the
# toy corpus; cells that need the toy vocabulary must be run before this one.
# Judging by the preview of tokenized_wolof_corpus, simple_tokenize lowercases
# the text and drops punctuation -- confirm against utils.
tokenized_wolof_corpus, words_freq = utils.simple_tokenize(corpus=wolof_corpus)

In [21]:
tokenized_wolof_corpus[:10]

[['bataaxal',
  'bii',
  'jëwriñu',
  'lu',
  'ajju',
  'ci',
  'mbiru',
  'bitim',
  'réew',
  'bu',
  'ekuwatër',
  'moo',
  'ko',
  'wara',
  'wóoral',
  'te',
  'dafa',
  'wara',
  'mengoo',
  'ak',
  'yenni',
  'càkuteef',
  'yi'],
 ['amuñu', 'woon', 'benn', 'jot', 'ngir', 'rawal', 'sunu', 'bopp'],
 ['ekost', 'sa', 'dëkk', 'la', 'te', 'bëggna', 'nu', 'nga', 'toog', 'fi'],
 ['xibaari',
  'jotna',
  'espaañ',
  'joxe',
  'na',
  'juróom',
  'ñett',
  'fukki',
  'milyaar',
  'ak',
  'ñeent',
  'ci',
  'xaalisu',
  'seefa',
  'ngir',
  'dimbalee',
  'ko',
  'senegaal'],
 ['ñaata', 'at', 'nga', 'am'],
 ['abu',
  'usmaan',
  'si',
  'mamadu',
  'yoro',
  'jàllo',
  'ak',
  'usmaan',
  'njaay',
  'ñoo',
  'faatu',
  'ci',
  'doxu',
  'nemmeeku',
  'koom',
  'koom',
  'gi',
  'bu',
  'maki',
  'sàll',
  'bi'],
 ['ni',
  'ñu',
  'ko',
  'tàmm',
  'a',
  'waxe',
  'fii',
  'boo',
  'xamatul',
  'foo',
  'jëm',
  'dellul',
  'fa',
  'nga',
  'jóge',
  'woon',
  'baax',
  'na',
  'lool',
  'ñ

In [22]:
# Rebinds markov_model to a new, not-yet-fitted chain whose states come from
# the Wolof words_freq; the model fitted on the toy corpus is no longer reachable.
markov_model = markov_chains.MarkovChains(states=list(words_freq))

In [30]:
# Number of states and the first ten of them.
# TODO: use a better tokenization function -- the states include bare numbers
# and glued tokens such as '000ngir' (see the displayed result).
len(markov_model.states), markov_model.states[:10]

(25466,
 ['0', '00', '000', '0006779', '000ngir', '007', '01', '012914', '02', '0230'])

In [None]:
# tf_idf = utils.tfidf(corpus=wolof_corpus)

tfidf: 2900it [00:06, 457.03it/s]


KeyboardInterrupt: 

In [26]:
# Print each Wolof sentence containing the odd token, next to the French
# sentence found at the same position in the dataset.
needle = "0006779"
for wolof_sentence, french_sentence in zip(wolof_corpus, raw_data["french"]):
    if needle not in wolof_sentence.lower():
        continue
    print(wolof_sentence, french_sentence)

Indi nañu lépp lu ci war ci li weesu wuute ak sàrt bii teew te jóge ci Kilifay nguur gi wala yi yor wàllu galag, te ñu mën see téye yii nekk ca : bu limat 0006779/MEF/DGID/BLEC bu ñaar-fukki fan ci weeru ut 2004 ; xibaar yi, bataaxal yi ak tontu waa nguur gi te jóge ca kër jëwriñ ja yor wàllu koom ak koppaaral ak barab bu mag bay doxal mbirum galag ak këyit yi ci aju.
 Sont rapportées toutes dispositions réglementaires antérieures contraires à la présente loi émanant des autorités administratives ou fiscales, notamment celles contenues dans: la circulaire n° 0006779/MEF/DGID/BLEC du 20 août 2004; les circulaires, notes, lettres et réponses administratives émanant du Ministère de l’Economie et des finances et de la Direction générale des Impôts et des Domaines.

