# Examples with statistical language models

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import typing
import json

In [3]:
import pymongo

In [4]:
from collections import defaultdict

In [5]:
import nltk
from nltk.tokenize import sent_tokenize

## Simple statistical model using skip-gram

In [6]:
class N2Model(object):
    """Skip-bigram count model for one labelled corpus.

    The model stores how often a token ``w2`` appears within a small window
    after a token ``w1`` and turns those counts into conditional
    probabilities, backing off to smoothed unigram estimates for pairs it
    has never seen.
    """

    def __init__(self, label: str):
        """Create an empty model tagged with *label*."""
        self.label = label
        # index[w1][w2]: number of skip-grams (w1, w2) added so far.
        self.index = defaultdict(lambda: defaultdict(int))
        # n[w1]: number of skip-grams whose first element is w1.
        self.n = defaultdict(int)
        # N: cached sum of n's values; refreshed by fit() and
        # frequency_filter(), NOT by add().
        self.N = 0

    @staticmethod
    def skip(sequence, s=2):
        """Return the ordered pairs ``(sequence[i], sequence[j])`` with
        ``i < j < i + s``.

        ``s=2`` yields plain adjacent bigrams; ``s=3`` additionally pairs
        each token with the one two positions ahead, and so on.
        """
        length = len(sequence)
        return [
            (sequence[i], sequence[j])
            for i in range(length)
            for j in range(i + 1, min(i + s, length))
        ]

    @staticmethod
    def tokenize(text):
        """Yield one token list per sentence of *text*.

        Sentences are split with ``sent_tokenize``, lower-cased, tokenized
        with ``nltk.word_tokenize`` and wrapped in the ``'#S'`` / ``'#E'``
        boundary markers.
        """
        for sent in sent_tokenize(text):
            yield ['#S'] + nltk.word_tokenize(sent.lower()) + ['#E']

    def add(self, sequence):
        """Count every ``(a, b)`` pair in *sequence*.

        ``self.N`` is not updated here; ``fit`` refreshes it once all pairs
        have been added.
        """
        for a, b in sequence:
            self.n[a] += 1
            self.index[a][b] += 1

    def fit(self, texts, s=3):
        """Accumulate skip-gram counts from *texts* (``None`` items are
        skipped) using window *s*, then refresh ``self.N``."""
        for text in texts:
            if text is None:
                continue
            for tokens in N2Model.tokenize(text):
                self.add(N2Model.skip(tokens, s=s))
        self.N = sum(self.n.values())

    def frequency_filter(self, min_bgram=50, min_unigram=100):
        """Drop pair counts below *min_bgram* and first-token counts below
        *min_unigram*, then refresh ``self.N``.

        The two tables are filtered independently of each other.
        """
        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over items().
        for first, followers in self.index.items():
            self.index[first] = defaultdict(
                int,
                {w: c for w, c in followers.items() if c >= min_bgram},
            )
        self.n = defaultdict(
            int,
            {w: c for w, c in self.n.items() if c >= min_unigram},
        )
        self.N = sum(self.n.values())

    def p_word(self, word):
        """Return the unigram estimate for *word*.

        Known words get ``n[word] / N``. Unknown words get the fixed
        back-off ``1 / (N + len(n) + 1)``.
        """
        # .get() instead of [] so that a lookup never inserts a zero entry:
        # inserting would grow len(self.n) and make the back-off value
        # depend on which words were queried before.
        count = self.n.get(word, 0)
        if count > 0:
            return count / self.N
        # The "+ 1" reserves a slot for the unknown word itself and keeps
        # the denominator non-zero on an empty model.
        return 1 / (self.N + len(self.n) + 1)

    def p_gram(self, w1, w2):
        """Return ``index[w1][w2] / n[w1]`` when both counts are positive,
        otherwise the product ``p_word(w1) * p_word(w2)``."""
        pair_count = self.index.get(w1, {}).get(w2, 0)
        first_count = self.n.get(w1, 0)
        if pair_count > 0 and first_count > 0:
            return pair_count / first_count
        return self.p_word(w1) * self.p_word(w2)

    def p_text(self, text, s=3):
        """Return a list with one score per sentence of *text*: the product
        of ``p_gram`` over the sentence's skip-grams (window *s*).

        NOTE(review): this is a plain product of probabilities, so long
        sentences will presumably underflow towards 0.0.
        """
        probs = []
        for tokens in N2Model.tokenize(text):
            pairs = N2Model.skip(tokens, s=s)
            probs.append(np.prod(np.array([self.p_gram(a, b) for a, b in pairs])))
        return probs

    def save(self):
        """Return a plain-dict snapshot of the model.

        The snapshot has the keys ``'label'``, ``'index'`` and ``'n'`` and
        contains only ``dict``/``str``/``int`` values. The model itself is
        left untouched: replacing ``self.index`` / ``self.n`` with plain
        dicts would make every later ``add``/``fit`` on an unseen token
        raise ``KeyError``.
        """
        return {
            'label': self.label,
            'index': {w: dict(followers) for w, followers in self.index.items()},
            'n': dict(self.n),
        }

class N2Classifier(object):
    """A set of labelled ``N2Model`` instances plus a pooled 'global' model
    used as the reference distribution for the KL-style scores."""

    def __init__(self, models: typing.Iterable["N2Model"]):
        """Register *models* under their ``label``.

        ``save()`` is invoked on every model before it is registered. The
        global model ``self.G`` starts empty; call ``global_model()`` to
        populate it.
        """
        self.models = {}
        for model in models:
            model.save()
            self.models[model.label] = model
        self.G = N2Model(label='global')

    def frequency_filter(self, min_bgram=50, min_unigram=100):
        """Apply ``N2Model.frequency_filter`` with the same thresholds to
        every registered model."""
        for model in self.models.values():
            model.frequency_filter(min_bgram=min_bgram, min_unigram=min_unigram)

    def global_model(self):
        """(Re)build ``self.G`` as the sum of all registered models' counts.

        The previous contents of ``self.G`` are discarded first, so calling
        this more than once no longer double-counts.
        """
        self.G.index.clear()
        self.G.n.clear()
        for model in self.models.values():
            for first, followers in model.index.items():
                # .get(): a model's unigram table may lack a key that its
                # index still has (e.g. after frequency filtering).
                self.G.n[first] += model.n.get(first, 0)
                for second, count in followers.items():
                    self.G.index[first][second] += count
        self.G.N = sum(self.G.n.values())

    def save(self, file):
        """Write ``{label: model.index}`` for every model to *file* as JSON.

        Only the pair counts are stored; ``load`` rebuilds the unigram
        counts from them.
        """
        data = {label: model.index for label, model in self.models.items()}
        with open(file, 'w', encoding='utf-8') as out:
            json.dump(data, out)

    def load(self, file):
        """Read a file written by ``save`` and (re)create one model per
        label, replacing any registered model with the same label.

        Each model's ``n[w1]`` is rebuilt as the sum of its ``index[w1]``
        counts. The global model is NOT rebuilt here; call
        ``global_model()`` afterwards if the ``kl_*`` scores are needed.
        """
        with open(file, 'r', encoding='utf-8') as infile:
            data = json.load(infile)
        for label, index_data in data.items():
            model = N2Model(label=label)
            for first, followers in index_data.items():
                for second, count in followers.items():
                    model.index[first][second] = count
                    model.n[first] += count
            model.N = sum(model.n.values())
            self.models[label] = model

    def p_word(self, label, word):
        """Return ``p_word(word)`` of the model registered as *label*."""
        return self.models[label].p_word(word)

    def p_gram(self, label, w1, w2):
        """Return ``p_gram(w1, w2)`` of the model registered as *label*."""
        return self.models[label].p_gram(w1, w2)

    def p_text(self, label, text, s=3):
        """Return the per-sentence scores of *text* under model *label*,
        using skip window *s* (default 3, the model's own default)."""
        return self.models[label].p_text(text, s=s)

    def kl_gram(self, label, w1, w2):
        """Return ``p_k * log(p_k / p)`` where ``p_k`` is the pair's
        probability under model *label* and ``p`` under the global model."""
        p_k = self.models[label].p_gram(w1, w2)
        p = self.G.p_gram(w1, w2)
        return p_k * np.log(p_k / p)

    def kl_text(self, label, text, s=3):
        """Return one score per sentence of *text*: ``exp`` of the summed
        ``kl_gram`` values over the sentence's skip-grams (window *s*)."""
        scores = []
        for tokens in N2Model.tokenize(text):
            pairs = N2Model.skip(tokens, s=s)
            terms = np.array([self.kl_gram(label, a, b) for a, b in pairs])
            scores.append(np.exp(terms.sum()))
        return scores

## Example: model a movie genre

### Create and index corpus

In [7]:
# Handle to the `lines` collection of the `movie-dialogs` database.
# MongoClient() is called without arguments, i.e. with pymongo's default
# connection settings.
db = pymongo.MongoClient()['movie-dialogs']['lines']

In [8]:
# Group the dialog lines by movie genre: corpus[genre] -> list of line texts.
# A line belonging to a movie with several genres is filed under each of them.
corpus = defaultdict(list)
for record in db.find():
    try:
        record_genres = record['character']['movie']['genres']
        record_text = record['text']
    except KeyError:
        # Records lacking the genre path or the text are skipped entirely.
        continue
    for genre in record_genres:
        corpus[genre].append(record_text)

In [10]:
# Show which genres were collected, then dump the raw lines of one of them.
# Both values are printed explicitly: a bare expression that is not the last
# statement of a cell is evaluated and silently discarded.
print(list(corpus))
# .get() avoids creating an empty 'western' entry when the genre is absent.
print(corpus.get('western', []))



#### How skip-gram works

# Walk through the first 'war' line: print each sentence's tokens followed by
# the skip-grams generated from them with a window of 3.
for text in corpus['war'][:1]:
    # tokenize is a static method of N2Model; there is no module-level
    # function of that name.
    for tokens in N2Model.tokenize(text):
        print(tokens)
        print(N2Model.skip(tokens, s=3), '\n')

## Fit & save

In [None]:
# One empty model per genre, then fit each on that genre's lines (window 3).
models = {genre: N2Model(label=genre) for genre in corpus}
for genre, data in tqdm(corpus.items()):
    models[genre].fit(data, s=3)

In [None]:
# Bundle the fitted per-genre models into one classifier, keyed by label.
# The constructor calls save() on every model it receives.
K = N2Classifier(models.values())

In [None]:
# Persist every model's bigram index as one JSON file ({label: index}).
outfile = '../../data/lm-movies.json'
K.save(outfile)

## Load models

In [11]:
# Start from an empty classifier and rebuild the per-genre models from the
# JSON file. load() restores the per-label models only; the global model
# stays empty until global_model() is called.
M = N2Classifier(models=[])
infile = '../../data/lm-movies.json'
M.load(infile)

In [13]:
# Score the first sentence of `text` under every genre model and list the
# genres from most to least probable.
text = 'Now you tell us.'
genres = list(M.models)
probs = np.array([M.p_text(genre, text)[0] for genre in genres], dtype=float)
ranking = sorted(zip(genres, probs), key=lambda pair: -pair[1])
for genre, p in ranking:
    print(genre, p)

war 8.05346836622751e-23
family 6.801429802448922e-23
crime 5.71792747116729e-23
action 5.717204289590075e-23
thriller 4.525927918451387e-23
adventure 3.5140864933681964e-23
western 3.463039357966921e-23
fantasy 2.6379193763693064e-23
mystery 2.4620661137802137e-23
drama 2.4204735489242364e-23
sci-fi 2.307068597942761e-23
romance 1.22373434232393e-23
comedy 1.105700146937e-23
short 2.8772230753664023e-25
documentary 2.789153644625565e-25
horror 5.229351974845286e-26
animation 4.1396721715201735e-26
musical 3.3623472865785005e-26
history 2.2447557841519672e-26
biography 1.8496867302830833e-26
music 1.2089811560404186e-26
film-noir 1.7728052087511597e-27
sport 3.430904087198093e-28
 4.7949698497096386e-42
adult 6.323247328092056e-53


In [20]:
# Raw count of the skip-gram ('wound', 'kill') in the 'family' model.
# index is a nested defaultdict, so an unseen pair yields 0 (and the lookup
# itself inserts the missing keys).
M.models['family'].index['wound']['kill']

0