# Statistical language models

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [2]:
import nltk

In [3]:
from langmodels.corpora.moviedialog import MovieDialogCollection

In [4]:
# Database and collection names handed to MovieDialogCollection below.
db_name = 'movie-dialogs'
collection = 'lines'
# Query stages (these look like MongoDB aggregation stages -- TODO confirm
# against MovieDialogCollection):
# keep only the lines whose character belongs to movie 'm42',
m = {'$match': {'character.movie.id': 'm42'}}
# expose only `id` and `text`, hiding `_id`,
p = {'$project': {'_id': 0, 'id': 1, 'text': 1}}
# and order the result by ascending `id`.
s = {'$sort': {'id': 1}}
# `p` and `s` are reused later for the LDA corpus, so keep them as separate names.
pipeline = [m, p, s]

In [5]:
# Corpus restricted by `pipeline`; it is iterated as (id, text) pairs and
# provides `tokenize` in the cells below.
m42 = MovieDialogCollection(db_name, collection, 
                              use_pos=False, pipeline=pipeline)

In [6]:
# Two near-paraphrases used to compare plain n-gram overlap with skip-gram overlap.
s1 = 'The train for Marseilles leaves at five'
s2 = "A train to Marseilles and Lyon leaves five o'clock"

In [7]:
# Count the bigrams of s1 that also occur in s2.
# The s2 bigrams are built once into a set; previously s2 was re-tokenized
# and its bigrams regenerated for every single bigram of s1.
s2_bigrams = set(nltk.ngrams(m42.tokenize(s2), 2))
sum(1 for x in nltk.ngrams(m42.tokenize(s1), 2) if x in s2_bigrams)

0

In [8]:
# Count the trigrams of s1 that also occur in s2.
# The s2 trigrams are built once into a set; previously s2 was re-tokenized
# and its trigrams regenerated for every single trigram of s1.
s2_trigrams = set(nltk.ngrams(m42.tokenize(s2), 3))
sum(1 for x in nltk.ngrams(m42.tokenize(s1), 3) if x in s2_trigrams)

0

## Skip grams

In [9]:
def skip(sequence, n=2, s=2):
    """Return the distinct skip-grams of ``sequence`` in first-seen order.

    A skip-gram is built from the item at position ``i`` followed by the
    ``n - 1`` consecutive items that start ``z`` positions after its
    immediate successor, for every gap ``z`` in ``range(s)``.  With ``s=1``
    this yields the ordinary contiguous n-grams.

    Args:
        sequence: Indexable, sliceable sequence of hashable items
            (e.g. a list or tuple of tokens).
        n: Length of each skip-gram.
        s: Number of gap sizes to try; gaps ``0 .. s - 1`` are used.

    Returns:
        A list of ``n``-tuples, each distinct skip-gram appearing once.
    """
    k_grams = []
    # Tuples already emitted.  The former check compared a list against the
    # stored tuples, which never matches, so duplicates were never filtered.
    seen = set()
    for i, head in enumerate(sequence):
        for z in range(s):
            start = i + z + 1
            gram = (head, *sequence[start:start + n - 1])
            # Windows that run past the end of the sequence come out short.
            if len(gram) == n and gram not in seen:
                seen.add(gram)
                k_grams.append(gram)
    return k_grams

In [10]:
# Tokenize both example sentences with the corpus tokenizer.
t1, t2 = (m42.tokenize(sentence) for sentence in (s1, s2))

In [11]:
# Show the tokens produced for s1.
print(t1)

['the', 'train', 'for', 'marseilles', 'leaves', 'at', 'five']


In [12]:
# Skip-bigrams of t1 allowing gaps of 0, 1 or 2 tokens.
print(skip(t1, n=2, s=3))

[('the', 'train'), ('the', 'for'), ('the', 'marseilles'), ('train', 'for'), ('train', 'marseilles'), ('train', 'leaves'), ('for', 'marseilles'), ('for', 'leaves'), ('for', 'at'), ('marseilles', 'leaves'), ('marseilles', 'at'), ('marseilles', 'five'), ('leaves', 'at'), ('leaves', 'five'), ('at', 'five')]


In [13]:
# Print the skip-bigrams of t1 that also occur in t2.
# The t2 skip-grams are computed once; previously skip(t2, ...) was re-run
# for every skip-gram of t1.
t2_skipgrams = set(skip(t2, n=2, s=3))
for e in [x for x in skip(t1, n=2, s=3) if x in t2_skipgrams]:
    print(e)

('train', 'marseilles')
('marseilles', 'leaves')
('leaves', 'five')


## Contextual models

In [14]:
from collections import defaultdict

In [15]:
# context[first][second] counts how often the skip-bigram (first, second)
# is produced across the lines of the corpus.
context = defaultdict(lambda: defaultdict(int))
for _line_id, text in m42:
    line_tokens = m42.tokenize(text)
    for first, second in skip(line_tokens, n=2, s=3):
        context[first][second] += 1

In [16]:
# Rows are first words, columns are second words; pairs never seen get 0.
C = pd.DataFrame(context).T.fillna(0)

In [17]:
# Preview the first rows of the co-occurrence matrix.
C.head()

Unnamed: 0,france,welcomes,you,any,violation,to,of,all,can,casablanca,...,arising,entertainment,faithful,though,published,apprehended,occur,diplomatist,norwegian,ma'am
unoccupied,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
france,0.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
welcomes,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
you,0.0,0.0,25.0,6.0,0.0,53.0,16.0,2.0,7.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
to,1.0,0.0,51.0,0.0,0.0,12.0,9.0,2.0,0.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Keep only words that appear in both dimensions

In [18]:
# Labels that occur on only one axis of C: column-only labels first,
# then row-only labels.
row_labels = set(C.index)
column_labels = set(C.columns)
words = [label for label in C.columns if label not in row_labels]
words.extend(label for label in C.index if label not in column_labels)

In [19]:
# How many labels are present on only one axis of C.
len(words)

118

In [20]:
# Remove the one-sided labels from both axes in a single call; each label
# exists on one axis only, hence errors='ignore'.
C.drop(index=words, columns=words, inplace=True, errors='ignore')

In [21]:
# Shape of C once labels found on only one axis have been dropped.
C.shape

(1244, 1244)

In [22]:
# Preview the first rows of the filtered matrix.
C.head()

Unnamed: 0,france,welcomes,you,any,violation,to,of,all,can,casablanca,...,says,arising,entertainment,faithful,though,published,apprehended,occur,norwegian,ma'am
unoccupied,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
france,0.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
welcomes,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
you,0.0,0.0,25.0,6.0,0.0,53.0,16.0,2.0,7.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
to,1.0,0.0,51.0,0.0,0.0,12.0,9.0,2.0,0.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# Pairwise cosine similarity between the rows (word context vectors) of C.
sigma = cosine_similarity(C, C)

In [25]:
# Positional row number of the word 'train' in C.
j = C.index.tolist().index('train')

In [26]:
# Print the five row labels with the highest similarity to row j.
# Ties keep ascending row order because the sort is stable.
neighbours = sorted(enumerate(sigma[j]), key=lambda pair: pair[1], reverse=True)
for row, _score in neighbours[:5]:
    print(C.index[row])

train
soon
immediately
living
sick


## Matrix factorization

In [27]:
from sklearn.decomposition import TruncatedSVD

In [28]:
# Rank-100 truncated SVD used to compress the co-occurrence rows.
# NOTE(review): no random_state is passed; if the solver is randomized the
# neighbours printed below may vary between runs -- confirm.
S = TruncatedSVD(n_components=100)

In [29]:
# Fit the SVD on C and project each row onto the fitted components.
Cs = S.fit_transform(C)

In [30]:
# One row per word, one column per SVD component.
Cs.shape

(1244, 100)

In [31]:
# Pairwise cosine similarity between the reduced word vectors
# (rebinds `sigma` from the full-matrix similarities).
sigma = cosine_similarity(Cs, Cs)

In [32]:
# Positional row number of the word 'train' in C.
j = C.index.tolist().index('train')

In [33]:
# Print the five row labels with the highest similarity to row j.
# Ties keep ascending row order because the sort is stable.
neighbours = sorted(enumerate(sigma[j]), key=lambda pair: pair[1], reverse=True)
for row, _score in neighbours[:5]:
    print(C.index[row])

train
abdul
soon
knowledge
immediately


## LDA

In [34]:
from sklearn.decomposition import LatentDirichletAllocation

In [35]:
# Select lines whose movie genres match 'war' (same stage syntax as the
# earlier pipeline; `p` and `s` are the projection and sort stages defined there).
r = {'$match': {'character.movie.genres': 'war'}}
# Corpus for topic modelling. The keyword flags suggest stop-word removal,
# a noun-only part-of-speech filter and lemmatisation -- TODO confirm
# against MovieDialogCollection.
tm = MovieDialogCollection(db_name, collection, 
                           use_pos=False,
                           drop_stopwords=True,
                           pipeline=[r, p, s],
                           pos_filter=['NOUN'],
                           lemma=True
                          )

In [65]:
# LDA model with 50 topics.
# NOTE(review): no random_state is passed, so the fitted topics (and the
# meaning of a given topic number) may differ between runs -- confirm.
lda = LatentDirichletAllocation(n_components=50)

In [37]:
# I[doc][token] is the number of times `token` occurs in document `doc`.
I = defaultdict(lambda: defaultdict(int))
for doc, tokens in tm.get_tokens():
    for token in tokens:
        I[doc][token] += 1
# `docs[i]` must name the document behind row i of the matrix built from I.
# Take it from the keys of I, in insertion order, rather than appending one
# entry per yielded item: a document that yields no tokens, or an id yielded
# twice, adds no new key to I, so a separately appended list would drift out
# of step with the matrix rows.
docs = list(I)

In [38]:
# Document-term count matrix: one row per document, one column per token,
# with 0 where a token does not occur in a document.
X = pd.DataFrame(I).T.fillna(0)

In [40]:
# (number of documents, number of distinct tokens)
X.shape

(7073, 4244)

### Topic distribution over documents

In [66]:
# Fit LDA on the count matrix; theta holds one row per document and one
# column per topic.
theta = lda.fit_transform(X)

In [67]:
# (number of documents, number of topics)
theta.shape

(7073, 50)

### Word distribution over topics

In [68]:
# Scale every topic row of components_ so that it sums to 1.
topic_totals = lda.components_.sum(axis=1, keepdims=True)
phi = lda.components_ / topic_totals

In [69]:
# (number of topics, number of distinct tokens)
phi.shape

(50, 4244)

### Get most relevant documents per topic

In [73]:
# Look-up from document id to its text, and the topic to inspect.
documents = {doc_id: text for doc_id, text in tm}
topic = 1

In [74]:
# Print the five documents with the largest weight on `topic`:
# row number and weight first, then the text of the document.
ranked_documents = sorted(enumerate(theta[:, topic]),
                          key=lambda pair: pair[1], reverse=True)
for row, weight in ranked_documents[:5]:
    print(row, weight)
    print(documents[docs[row]], '\n')

4192 0.859999999999999
What do you do in the joint besides pimp? 

6874 0.8366666666666466
She'll never forgive me! 

4222 0.8203732644422244
Let's get his clothes off quick. 

4999 0.8039999999999722
I sent telegrams, I guess the military traffic held them up. 

7042 0.8039999999999508
You will remember the name? Von Scherbach? VON SCHER-BACH! 



### Get most relevant words per topic

In [75]:
# Print the five tokens with the largest probability under `topic`:
# column number and probability first, then the token itself.
ranked_words = sorted(enumerate(phi[topic, :]),
                      key=lambda pair: pair[1], reverse=True)
for column, weight in ranked_words[:5]:
    print(column, weight)
    print(X.columns[column], '\n')

37 0.347526097853741
sir 

40 0.05967234120950544
lord 

271 0.04346697422513307
thank 

431 0.02282598214856176
lady 

771 0.022410885193540186
tree 

