# Examples with statistical language models

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import typing
import json

In [3]:
import pymongo
from sklearn.datasets import fetch_20newsgroups

In [4]:
from collections import defaultdict

In [5]:
import nltk
from nltk.tokenize import sent_tokenize

## Simple statistical model using skip-gram

In [64]:
class N2Model(object):
    """Skip-bigram count model for a single label.

    Attributes:
        label: name of the class (genre) the model describes.
        index: nested mapping ``w1 -> w2 -> count`` of the skip-bigrams seen.
        n: mapping ``w1 -> number of skip-bigrams whose first word is w1``.
        N: sum of the values of ``n``.

    All read-only queries use ``dict.get`` so that they work both with the
    defaultdicts created here and with the plain dicts left behind by
    :meth:`save`, and so that looking a word up never inserts it.
    """

    def __init__(self, label: str):
        self.label = label
        self.index = defaultdict(lambda: defaultdict(int))
        self.n = defaultdict(int)
        self.N = 0

    @staticmethod
    def skip(sequence, s=2):
        """Return the ordered pairs ``(sequence[i], sequence[j])`` with ``i < j < i + s``.

        With ``s=2`` only adjacent items are paired; larger values of ``s``
        also pair items that are up to ``s - 1`` positions apart.
        """
        size = len(sequence)
        return [
            (left, sequence[j])
            for i, left in enumerate(sequence)
            for j in range(i + 1, min(i + s, size))
        ]

    @staticmethod
    def tokenize(text):
        """Yield one token list per sentence of ``text``.

        Sentences are lower-cased before word tokenization and wrapped
        between the markers ``'#S'`` and ``'#E'``.
        """
        for sent in sent_tokenize(text):
            yield ['#S'] + nltk.word_tokenize(sent.lower()) + ['#E']

    def add(self, sequence):
        """Count every ``(a, b)`` pair of ``sequence`` and keep ``N`` in sync."""
        for a, b in sequence:
            self.n[a] = self.n.get(a, 0) + 1
            followers = self.index.get(a)
            if followers is None:
                # Also reached after save(), when self.index is a plain dict.
                followers = self.index[a] = defaultdict(int)
            followers[b] = followers.get(b, 0) + 1
            self.N += 1

    def fit(self, texts, s=3):
        """Count the skip-bigrams (window ``s``) of every text; ``None`` items are skipped."""
        for text in texts:
            if text is not None:
                for tokens in N2Model.tokenize(text):
                    self.add(N2Model.skip(tokens, s=s))
        self.N = sum(self.n.values())

    def frequency_filter(self, min_bgram=50, min_unigram=100):
        """Drop pairs counted fewer than ``min_bgram`` times and first-word
        counts lower than ``min_unigram``, then refresh ``N``."""
        for k in list(self.index):
            kept = {z: c for z, c in self.index[k].items() if c >= min_bgram}
            self.index[k] = defaultdict(int, kept)
        self.n = defaultdict(
            int, {z: c for z, c in self.n.items() if c >= min_unigram})
        self.N = sum(self.n.values())

    def p_word(self, word):
        """Return ``n[word] / N``, or a small fallback value for unseen words.

        The fallback is ``1 / (N + len(n))``; the denominator is clamped to 1
        so that an empty model does not raise ``ZeroDivisionError``.
        """
        count = self.n.get(word, 0)
        if count > 0:
            return count / self.N
        return 1 / max(self.N + len(self.n), 1)

    def p_gram(self, w1, w2):
        """Return ``index[w1][w2] / n[w1]`` when both counts are positive.

        Otherwise back off to ``p_word(w1) * p_word(w2)``.
        """
        pair_count = self.index.get(w1, {}).get(w2, 0)
        first_count = self.n.get(w1, 0)
        if pair_count > 0 and first_count > 0:
            return pair_count / first_count
        return self.p_word(w1) * self.p_word(w2)

    def p_text(self, text, s=3):
        """Return, for each sentence of ``text``, the product of ``p_gram``
        over its skip-bigrams (window ``s``).

        NOTE(review): the raw product of many small factors can underflow to
        0.0 on long sentences.
        """
        probs = []
        for tokens in N2Model.tokenize(text):
            factors = [self.p_gram(a, b) for a, b in N2Model.skip(tokens, s=s)]
            probs.append(np.prod(np.array(factors)))
        return probs

    def save(self):
        """Replace ``index`` and ``n`` with plain-dict copies, in place.

        Nothing is written by this method; it only converts the counters
        (the original code marks this as the step before serialization).
        """
        self.index = {w: dict(followers) for w, followers in self.index.items()}
        self.n = dict(self.n)
        # do serialize

class N2Classifier(object):
    """Collection of per-label :class:`N2Model` objects plus a global model.

    Attributes:
        models: mapping ``label -> N2Model``.
        G: model holding the counts of all the models summed together; it is
            (re)built by :meth:`global_model`.
    """

    def __init__(self, models: typing.Iterable[N2Model]):
        self.models = {}
        for model in models:
            # save() turns the counters into plain dicts (see N2Model.save).
            model.save()
            self.models[model.label] = model
        self.G = N2Model(label='global')

    def frequency_filter(self, min_bgram=50, min_unigram=100):
        """Apply ``N2Model.frequency_filter`` to every model.

        The global model is not refreshed; call :meth:`global_model` again
        afterwards if it has to reflect the filtered counts.
        """
        for model in self.models.values():
            model.frequency_filter(min_bgram=min_bgram, min_unigram=min_unigram)

    def global_model(self):
        """Rebuild ``G`` as the sum of the counts of all the models.

        ``G`` is recreated on every call, so calling the method more than
        once does not count the same data twice.
        """
        self.G = N2Model(label='global')
        for model in self.models.values():
            for k, followers in model.index.items():
                # .get: model.n may be a plain dict that lacks k.
                self.G.n[k] += model.n.get(k, 0)
                for z, c in followers.items():
                    self.G.index[k][z] += c
        self.G.N = sum(self.G.n.values())

    def _ensure_global(self):
        """Build the global model if it is still empty while models exist."""
        if self.G.N == 0 and self.models:
            self.global_model()

    def save(self, file):
        """Write ``{label: model.index}`` to ``file`` as JSON.

        Only the bigram index is stored; :meth:`load` recomputes ``n`` from it.
        """
        data = {label: model.index for label, model in self.models.items()}
        with open(file, 'w', encoding='utf-8') as out:
            json.dump(data, out)

    def load(self, file):
        """Read a file written by :meth:`save` and add its models to ``models``.

        For each label a fresh ``N2Model`` is filled with the stored bigram
        counts; ``n`` and ``N`` are derived by summing them. The global model
        is reset because it no longer matches the loaded models.
        """
        with open(file, 'r', encoding='utf-8') as infile:
            data = json.load(infile)
        for label, index_data in data.items():
            model = N2Model(label=label)
            for k, v in index_data.items():
                for z, c in v.items():
                    model.index[k][z] = c
                    model.n[k] += c
            model.N = sum(model.n.values())
            self.models[label] = model
        self.G = N2Model(label='global')

    def p_word(self, label, word):
        """Return ``p_word(word)`` of the model registered under ``label``."""
        return self.models[label].p_word(word)

    def p_gram(self, label, w1, w2):
        """Return ``p_gram(w1, w2)`` of the model registered under ``label``."""
        return self.models[label].p_gram(w1, w2)

    def p_text(self, label, text, s=3):
        """Return the per-sentence probabilities of ``text`` for ``label``.

        ``s`` is the skip window forwarded to ``N2Model.p_text``; its default
        matches the default of that method.
        """
        return self.models[label].p_text(text, s=s)

    def kl_gram(self, label, w1, w2):
        """Return ``p_k * log(p_k / p)``, where ``p_k`` is the bigram
        probability under ``label`` and ``p`` the one under the global model."""
        self._ensure_global()
        p_k = self.models[label].p_gram(w1, w2)
        p = self.G.p_gram(w1, w2)
        return p_k * np.log(p_k / p)

    def kl_text(self, label, text, s=3):
        """Return, for each sentence of ``text``, ``exp`` of the sum of
        ``kl_gram`` over its skip-bigrams (window ``s``)."""
        scores = []
        for tokens in N2Model.tokenize(text):
            sequence = N2Model.skip(tokens, s=s)
            data = np.array([self.kl_gram(label, a, b) for a, b in sequence])
            scores.append(np.exp(data.sum()))
        return scores

## Example: 20News classification

### Create and index corpus

In [8]:
# Fetch the train and test splits of 20 Newsgroups with the same options:
# same cache directory, and headers, footers and quotes removed.
load_dir = '/Users/alfio/Dati/sklearn/'
fetch_options = dict(data_home=load_dir, remove=('headers', 'footers', 'quotes'))
data_train = fetch_20newsgroups(subset='train', **fetch_options)
data_test = fetch_20newsgroups(subset='test', **fetch_options)

In [9]:
# Group the training texts by the name of their target class.
train_corpus = defaultdict(list)
for text, target_id in zip(data_train.data, data_train.target):
    train_corpus[data_train.target_names[target_id]].append(text)

## Fit & save

# One model per genre, each fitted on the texts of that genre with window 5.
models = {genre: N2Model(label=genre) for genre in train_corpus.keys()}
for genre, data in tqdm(train_corpus.items()):
    model = models[genre]
    model.fit(data, s=5)

# Wrap the fitted models in a classifier and write it to disk.
K = N2Classifier(models.values())

outfile = '../../data/lm-news.json'
K.save(outfile)

## Load models

In [65]:
# Start from an empty classifier and fill it from the JSON file
# written by the "Fit & save" step (same path).
M = N2Classifier(models=[])
infile = '../../data/lm-news.json'
M.load(infile)

In [66]:
# Rank the genres for one test document by language-model probability.
item = 3
text = data_test.data[item]
target = data_test.target_names[data_test.target[item]]
print(text[:400], '\n')
print(target, '\n')
genres = list(M.models.keys())
# p_text returns one value per sentence: summarize them with the median.
probs = np.array([np.median(np.array(M.p_text(genre, text))) for genre in genres])
ranking = sorted(range(len(genres)), key=lambda j: -probs[j])
for j in ranking:
    print(genres[j], probs[j])


They were attacking the Iraqis to drive them out of Kuwait,
a country whose citizens have close blood and business ties
to Saudi citizens.  And me thinks if the US had not helped out
the Iraqis would have swallowed Saudi Arabia, too (or at 
least the eastern oilfields).  And no Muslim country was doing
much of anything to help liberate Kuwait and protect Saudi
Arabia; indeed, in some masses of ci 

talk.politics.mideast 

misc.forsale 2.749735285342311e-43
rec.sport.hockey 4.0624280278472776e-44
comp.sys.ibm.pc.hardware 2.0542005543863144e-44
comp.sys.mac.hardware 1.5193340887785927e-44
sci.electronics 1.0489064404015975e-46
comp.os.ms-windows.misc 1.880976165149902e-47
comp.graphics 4.03500509226384e-48
rec.sport.baseball 9.491050801605009e-50
rec.motorcycles 2.671238106774861e-50
rec.autos 3.9374126682999835e-51
talk.religion.misc 1.1846131032069708e-51
comp.windows.x 1.0745001086149284e-51
sci.space 3.1602076619281703e-52
talk.politics.misc 6.61422621267017e-54
sci.med 4.6082167298

## Issue of generic, noisy text: fixing by specificity
We create a `global` model that computes the probability of a bigram over the whole corpus. Then, we compute a score for each bigram by means of the KL divergence, that is:

$$
KL(P_k(w_i \mid w_{i-1})) = P_k(w_i \mid w_{i-1}) \log \left ( \frac{P_k(w_i \mid w_{i-1})}{P(w_i \mid w_{i-1})} \right ),
$$

where $P_k(w_i \mid w_{i-1})$ denotes the probability estimated by the model $k$ and $P(w_i \mid w_{i-1})$ the global probability.

Then we compute a classification score for a sequence of bigrams $(w_1, w_2), \dots, (w_{n-1}, w_n)$ by:

$$
\sigma_k = \exp \left ( \sum\limits_{i=2}^{n} KL(P_k(w_i \mid w_{i-1})) \right )
$$

In [15]:
# Sum the counts of all the per-genre models into the global model M.G.
M.global_model()

In [16]:
print(M.models['talk.politics.mideast'].p_gram('saudi', 'arabia'))
print(M.models['comp.sys.mac.hardware'].p_gram('saudi', 'arabia'))

0.2
1.073592525892085e-11


In [17]:
a = M.kl_gram('talk.politics.mideast', 'saudi', 'arabia')
print(f'{a:.20f}')
b = M.kl_gram('comp.sys.mac.hardware', 'saudi', 'arabia')
print(f'{b:.20f}')

-0.01600854153470727612
-0.00000000025474235976


In [18]:
# Rank the genres for the same test document, this time by KL score.
item = 3
text = data_test.data[item]
target = data_test.target_names[data_test.target[item]]
print(text[:400], '\n')
print(target, '\n')
genres = list(M.models.keys())
# kl_text returns one score per sentence: summarize them with the median.
probs = np.array([np.median(np.array(M.kl_text(genre, text))) for genre in genres])
ranking = sorted(range(len(genres)), key=lambda j: -probs[j])
for j in ranking:
    print(genres[j], probs[j])


They were attacking the Iraqis to drive them out of Kuwait,
a country whose citizens have close blood and business ties
to Saudi citizens.  And me thinks if the US had not helped out
the Iraqis would have swallowed Saudi Arabia, too (or at 
least the eastern oilfields).  And no Muslim country was doing
much of anything to help liberate Kuwait and protect Saudi
Arabia; indeed, in some masses of ci 

talk.politics.mideast 

talk.politics.misc 1.7367846296428642
comp.sys.mac.hardware 1.5411875183357442
rec.sport.hockey 1.5115550883854625
talk.politics.mideast 1.4782089143499884
alt.atheism 1.4255910788582544
sci.crypt 1.4088569219390155
rec.sport.baseball 1.391599781355192
talk.religion.misc 1.3819814254276068
rec.autos 1.3561091577058808
comp.sys.ibm.pc.hardware 1.3487476200188737
soc.religion.christian 1.344990505175291
sci.electronics 1.238146537297835
comp.windows.x 1.2155094783893277
rec.motorcycles 1.1960712374649423
sci.med 1.179093846456174
sci.space 1.1309613073853133
misc.forsa

## Evaluate the two models

In [19]:
# Predict one genre per document with both scores (best sentence per genre).
lm_pred = []
kl_pred = []
targets = []
docs = []
skipped = []  # ids of the documents that could not be scored
genres = list(M.models.keys())
for t, text in tqdm(list(enumerate(data_test.data[:100]))):
    try:
        lm_probs = np.array([np.max(np.array(M.p_text(genre, text))) for genre in genres])
        kl_probs = np.array([np.max(np.array(M.kl_text(genre, text))) for genre in genres])
    except ValueError:
        # np.max raises ValueError on an empty array, i.e. when the text
        # yields no sentence: record the document instead of dropping it silently.
        skipped.append(t)
        continue
    # argmax returns the first index of the maximum, like taking the head
    # of a stable descending sort.
    lm_pred.append(genres[int(np.argmax(lm_probs))])
    kl_pred.append(genres[int(np.argmax(kl_probs))])
    targets.append(data_test.target_names[data_test.target[t]])
    docs.append(t)
if skipped:
    print(f'skipped {len(skipped)} documents: {skipped}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




In [20]:
from sklearn.metrics import classification_report

In [21]:
print(classification_report(targets, lm_pred, zero_division=0))

                          precision    recall  f1-score   support

             alt.atheism       0.00      0.00      0.00         5
           comp.graphics       0.00      0.00      0.00         8
 comp.os.ms-windows.misc       0.00      0.00      0.00         5
comp.sys.ibm.pc.hardware       0.83      0.71      0.77         7
   comp.sys.mac.hardware       0.40      0.40      0.40         5
          comp.windows.x       0.29      0.40      0.33         5
            misc.forsale       0.33      1.00      0.50         4
               rec.autos       0.17      0.20      0.18         5
         rec.motorcycles       0.22      0.29      0.25         7
      rec.sport.baseball       0.25      0.33      0.29         6
        rec.sport.hockey       1.00      0.25      0.40         4
               sci.crypt       0.50      0.67      0.57         3
         sci.electronics       0.00      0.00      0.00         1
                 sci.med       0.33      0.50      0.40         4
         

In [22]:
print(classification_report(targets, kl_pred, zero_division=0))

                          precision    recall  f1-score   support

             alt.atheism       0.00      0.00      0.00         5
           comp.graphics       0.00      0.00      0.00         8
 comp.os.ms-windows.misc       0.00      0.00      0.00         5
comp.sys.ibm.pc.hardware       0.17      0.29      0.21         7
   comp.sys.mac.hardware       0.00      0.00      0.00         5
          comp.windows.x       0.20      0.20      0.20         5
            misc.forsale       0.00      0.00      0.00         4
               rec.autos       0.00      0.00      0.00         5
         rec.motorcycles       0.33      0.14      0.20         7
      rec.sport.baseball       0.33      0.50      0.40         6
        rec.sport.hockey       0.00      0.00      0.00         4
               sci.crypt       0.00      0.00      0.00         3
         sci.electronics       0.14      1.00      0.25         1
                 sci.med       0.00      0.00      0.00         4
         

In [23]:
kl_pred[:10]

['sci.crypt',
 'comp.graphics',
 'misc.forsale',
 'misc.forsale',
 'rec.sport.baseball',
 'comp.os.ms-windows.misc',
 'rec.sport.hockey',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.ibm.pc.hardware',
 'talk.politics.mideast']

In [24]:
targets[:10]

['rec.autos',
 'comp.windows.x',
 'alt.atheism',
 'talk.politics.mideast',
 'talk.religion.misc',
 'sci.med',
 'soc.religion.christian',
 'soc.religion.christian',
 'comp.windows.x',
 'comp.graphics']

In [25]:
data_test.data[docs[4]]

"\nI've just spent two solid months arguing that no such thing as an\nobjective moral system exists."

In [27]:
# Print, for every skip-bigram of the document, its probability under two genres.
compared = ('rec.sport.baseball', 'talk.religion.misc')
for tokens in N2Model.tokenize(data_test.data[docs[4]]):
    for a, b in N2Model.skip(tokens, s=5):
        print(a, b, *[M.p_gram(label, a, b) for label in compared])

#S i 0.03939831914779723 0.03509384984025559
#S 've 0.001338258123226808 0.0010483226837060702
#S just 0.0035865317702478455 0.0027456070287539937
#S spent 5.353032492907232e-05 0.00014976038338658146
i 've 0.007965686274509803 0.010369068541300527
i just 0.006127450980392157 0.0036906854130052723
i spent 7.862102948513678e-07 0.0008787346221441124
i two 0.0004084967320261438 0.0005272407732864675
've just 0.008064516129032258 0.0035087719298245615
've spent 3.982437768037974e-08 7.443486319272112e-08
've two 8.32119891532145e-07 5.080179412903217e-07
've solid 7.964875536075948e-08 1.4886972638544225e-08
just spent 1.0164851238580796e-07 1.6323434910684456e-07
just two 0.001579778830963665 1.1140744326542142e-06
just solid 2.0329702477161592e-07 3.264686982136892e-08
just months 1.0699843409032417e-07 4.897030473205337e-08
spent two 6.37511207222208e-08 0.025
spent solid 6.102122386509799e-09 2.0893996685676104e-09
spent months 3.211643361320947e-09 3.1340995028514152e-09
spent arguin

### As multi label

In [None]:
# Keep the four best genres per document instead of a single one.
lm_pred, kl_pred, targets = [], [], []
genres = list(M.models.keys())
for t, text in tqdm(list(enumerate(data_test.data[:100]))):
    # Median over the sentences of the document, one value per genre.
    lm_probs = np.array([np.median(np.array(M.p_text(genre, text))) for genre in genres])
    kl_probs = np.array([np.median(np.array(M.kl_text(genre, text))) for genre in genres])
    lm_target = sorted(range(len(genres)), key=lambda j: -lm_probs[j])[:4]
    kl_target = sorted(range(len(genres)), key=lambda j: -kl_probs[j])[:4]
    lm_pred.append([genres[j] for j in lm_target])
    kl_pred.append([genres[j] for j in kl_target])
    targets.append(data_test.target_names[data_test.target[t]])

In [None]:
for t in kl_pred[:10]:
    print(", ".join(t))

In [None]:
targets[:10]

## Filter specific words only
In this example, we keep in the index only the words that are specifically relevant for each model.

In [54]:
def select_words(label):
    """Return the words of model ``label`` whose KL term against the global model is positive.

    For each word with a positive count in ``M.models[label].n`` the term is
    ``p_m * log(p_m / p_g)``, with ``p_m`` the count normalized over the
    selected words and ``p_g`` the global count divided by ``M.G.N``.
    """
    counts = M.models[label].n
    vocabulary = [word for word in counts.keys() if counts[word] > 0]
    local_counts = [counts[word] for word in vocabulary]
    total = sum(local_counts)
    p_m = np.array(local_counts) / total
    p_g = np.array([M.G.n[word] for word in vocabulary]) / M.G.N
    kl_words = p_m * np.log(p_m / p_g)
    return [word for word, score in zip(vocabulary, kl_words) if score > 0]

In [55]:
# Rebuild every model keeping only the bigrams whose two words are both
# selected by select_words for that label.
new_models = []
for label in M.models.keys():
    W = set(select_words(label))
    nm = N2Model(label=label)
    for first, followers in M.models[label].index.items():
        if first not in W:
            continue
        for second, count in followers.items():
            if second in W:
                nm.index[first][second] = count
                nm.n[first] += count
    nm.N = sum(nm.n.values())
    new_models.append(nm)

In [56]:
K = N2Classifier(new_models)
# K.kl_text (used in the evaluation below) compares each model with K.G,
# which is empty until global_model() is called.
K.global_model()

In [57]:
# Same inspection as before, now with the filtered models in K.
compared = ('rec.sport.baseball', 'talk.religion.misc')
for tokens in N2Model.tokenize(data_test.data[docs[4]]):
    for a, b in N2Model.skip(tokens, s=5):
        print(a, b, *[K.p_gram(label, a, b) for label in compared])

#S i 0.051150184168462016 0.04068287037037037
#S 've 0.0017374383209396067 0.0012152777777777778
#S just 0.004656334700118146 0.00318287037037037
#S spent 6.949753283758427e-05 0.00017361111111111112
i 've 0.010233534505379166 0.011743630573248409
i just 0.007871949619522435 0.004179936305732484
i spent 1.2546236625094159e-06 0.0009952229299363057
i two 0.0005247966413014956 5.786018183629562e-08
've just 0.010471204188481676 0.0040650406504065045
've spent 6.287932813941182e-08 1.0480101095281288e-07
've two 1.245010697160354e-06 2.8331219609332643e-09
've solid 1.0899083544164714e-07 2.8331219609332643e-09
just spent 1.5703371477748394e-07 2.2664283669470102e-07
just two 0.0020964360587002098 6.126914159416653e-09
just solid 2.7219177228097213e-07 6.126914159416653e-09
just months 1.0124514988739007e-08 6.126914159416653e-09
spent two 9.777570920107492e-08 4.1460321379511186e-10
spent solid 8.559489694370195e-09 4.1460321379511186e-10
spent months 3.1838097448864805e-10 4.14603213795

In [58]:
print(K.p_text('rec.sport.baseball', data_test.data[docs[4]]))
print(K.p_text('talk.religion.misc', data_test.data[docs[4]]))

[1.0058711905924107e-288]
[2.685894332951408e-217]


In [59]:
# Predict one genre per document with the filtered models in K.
lm_pred = []
kl_pred = []
targets = []
docs = []
skipped = []  # ids of the documents that could not be scored
genres = list(K.models.keys())
for t, text in tqdm(list(enumerate(data_test.data[:100]))):
    try:
        lm_probs = np.array([np.max(np.array(K.p_text(genre, text))) for genre in genres])
        kl_probs = np.array([np.max(np.array(K.kl_text(genre, text))) for genre in genres])
    except ValueError:
        # np.max raises ValueError on an empty array, i.e. when the text
        # yields no sentence: record the document instead of dropping it silently.
        skipped.append(t)
        continue
    # argmax returns the first index of the maximum, like taking the head
    # of a stable descending sort.
    lm_pred.append(genres[int(np.argmax(lm_probs))])
    kl_pred.append(genres[int(np.argmax(kl_probs))])
    targets.append(data_test.target_names[data_test.target[t]])
    docs.append(t)
if skipped:
    print(f'skipped {len(skipped)} documents: {skipped}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))






In [60]:
print(classification_report(targets, lm_pred, zero_division=0))

                          precision    recall  f1-score   support

             alt.atheism       0.00      0.00      0.00         5
           comp.graphics       0.33      0.12      0.18         8
 comp.os.ms-windows.misc       0.00      0.00      0.00         5
comp.sys.ibm.pc.hardware       1.00      0.14      0.25         7
   comp.sys.mac.hardware       0.14      0.20      0.17         5
          comp.windows.x       0.00      0.00      0.00         5
            misc.forsale       0.21      0.75      0.33         4
               rec.autos       0.20      0.20      0.20         5
         rec.motorcycles       0.25      0.29      0.27         7
      rec.sport.baseball       0.25      0.33      0.29         6
        rec.sport.hockey       0.00      0.00      0.00         4
               sci.crypt       0.33      0.67      0.44         3
         sci.electronics       0.00      0.00      0.00         1
                 sci.med       1.00      0.25      0.40         4
         

In [62]:
lm_pred[:10]

['sci.electronics',
 'sci.electronics',
 'soc.religion.christian',
 'talk.politics.guns',
 'alt.atheism',
 'sci.med',
 'soc.religion.christian',
 'talk.politics.misc',
 'talk.politics.misc',
 'talk.politics.guns']

In [63]:
targets[:10]

['rec.autos',
 'comp.windows.x',
 'alt.atheism',
 'talk.politics.mideast',
 'talk.religion.misc',
 'sci.med',
 'soc.religion.christian',
 'soc.religion.christian',
 'comp.windows.x',
 'comp.graphics']

## Word2word relations

In [None]:
words = [x for x, y in M.G.n.items() if y > 50]
len(words)

In [None]:
wv = np.zeros((len(words), len(words)))

In [None]:
# Row i of wv holds the global bigram probability of every word given words[i].
data = list(enumerate(words))
for row, first in tqdm(data):
    wv[row, :] = [M.G.p_gram(first, second) for _, second in data]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
S = cosine_similarity(wv[words.index('history')].reshape(1, -1), wv)

In [None]:
similar = [words[x] for x, y in sorted(enumerate(S[0]), key=lambda k: -k[1])]

In [None]:
similar[:10]

### Mutual information

In [None]:
# mi[i, j] = p(w1, w2) * log(p(w1, w2) / (p(w1) * p(w2))) on the global counts.
mi = np.zeros((len(words), len(words)))
N = sum([sum(v.values()) for v in M.G.index.values()])
data = list(enumerate(words))
for i, w1 in tqdm(data):
    # .get does not insert missing keys into the defaultdict index.
    followers = M.G.index.get(w1, {})
    p_w1 = M.G.p_word(w1)
    for j, w2 in data:
        count = followers.get(w2, 0)
        if count == 0:
            # Unseen pair: keep 0 (the limit of p * log p for p -> 0)
            # instead of computing 0 * log(0), which is nan.
            continue
        p_pair = count / N
        mi[i, j] = p_pair * np.log(p_pair / (p_w1 * M.G.p_word(w2)))

In [None]:
# Compare the rows of the mutual-information matrix computed above
# (the original cell reused the probability matrix wv).
S = cosine_similarity(mi[words.index('history')].reshape(1, -1), mi)

In [None]:
similar = [words[x] for x, y in sorted(enumerate(S[0]), key=lambda k: -k[1])]

In [None]:
similar[:10]

In [None]:
M.G.index['arabia']