# Word Sequences
Author: Pierre Nugues

# Imports

In [None]:
import math
import regex as re
import sys

## Reading a corpus

In [None]:
# Path of the corpus, relative to the notebook's working directory.
file_name = '../../corpus/Selma.txt'
# Use a context manager so the file handle is closed as soon as it is read.
# NOTE(review): the corpus is assumed to be UTF-8 encoded; an explicit
# encoding avoids depending on the platform default -- confirm.
with open(file_name, encoding='utf-8') as corpus_file:
    text = corpus_file.read().strip()
# Preview the first 50 characters.
text[:50]

## The tokenizer

In [None]:
def tokenize(text):
    """Return the list of words found in *text*.

    A word is a maximal run of Unicode letters; everything else (digits,
    punctuation, whitespace) acts as a separator and is dropped.  The
    Unicode property class used in the pattern needs the ``regex`` module,
    which this file imports under the name ``re``.
    """
    # Raw string: the backslash must reach the regex engine untouched.
    words = re.findall(r'\p{L}+', text)
    return words

## Unigrams

A function to count the words

In [None]:
def count_unigrams(words):
    """Count how many times each item occurs in *words*.

    Returns a dict mapping every distinct item to its number of
    occurrences, with keys in order of first appearance.
    """
    counts = {}
    for token in words:
        # A token seen for the first time starts from zero.
        counts[token] = counts.get(token, 0) + 1
    return counts

We analyze Selma Lagerlöf

In [None]:
# Tokenize the lowercased corpus and count every distinct word.
words = tokenize(text.lower())
frequency = count_unigrams(words)
# Show the 15 most frequent words, most frequent first.
for word in sorted(frequency, key=frequency.get, reverse=True)[:15]:
    print(word, '\t', frequency[word])

## Bigrams

We can extend the counts to pairs of words

In [None]:
def count_bigrams(words):
    """Count the pairs of adjacent items in the sequence *words*.

    Returns a dict mapping each ``(first, second)`` tuple to its number
    of occurrences.  Fewer than two items yield an empty dict.
    """
    counts = {}
    # Pair every item with its successor.
    for pair in zip(words, words[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

In [None]:
# Tokenize the lowercased corpus and count the pairs of adjacent words.
words = tokenize(text.lower())
frequency_bigrams = count_bigrams(words)
# Show the 15 most frequent bigrams, most frequent first.
for bigram in sorted(frequency_bigrams, key=frequency_bigrams.get, reverse=True)[:15]:
    print(bigram, '\t', frequency_bigrams[bigram])

## Trigrams

In [None]:
def count_trigrams(words):
    """Count the triples of adjacent items in the sequence *words*.

    Returns a dict mapping each 3-tuple to its number of occurrences.
    Fewer than three items yield an empty dict.
    """
    counts = {}
    # Align the sequence with itself shifted by one and by two positions.
    for triple in zip(words, words[1:], words[2:]):
        counts[triple] = counts.get(triple, 0) + 1
    return counts

In [None]:
# Tokenize the lowercased corpus and count the triples of adjacent words.
words = tokenize(text.lower())
frequency_trigrams = count_trigrams(words)
# Show the 15 most frequent trigrams, most frequent first.
for trigram in sorted(frequency_trigrams, key=frequency_trigrams.get, reverse=True)[:15]:
    print(trigram, '\t', frequency_trigrams[trigram])

## N-grams

In [None]:
def count_ngrams(words, n):
    """Count the runs of *n* adjacent items in the sequence *words*.

    Args:
        words: A sequence (it must support ``len`` and slicing).
        n: Length of the n-grams; must be at least 1.

    Returns:
        A dict mapping each n-tuple to its number of occurrences.  It is
        empty when *words* holds fewer than *n* items.

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    if n < 1:
        # Without this check, n == 0 would count len(words) + 1 empty
        # tuples and a negative n would slice meaningless windows.
        raise ValueError(f'n must be a positive integer, got {n!r}')
    frequencies = {}
    for idx in range(len(words) - n + 1):
        ngram = tuple(words[idx:idx + n])
        frequencies[ngram] = frequencies.get(ngram, 0) + 1
    return frequencies

In [None]:
# Length of the n-grams; passed as n to count_ngrams in the following cell.
N = 10

In [None]:
# Tokenize the lowercased corpus and count the runs of N adjacent words.
words = tokenize(text.lower())
frequency_ngrams = count_ngrams(words, N)
# Show the 15 most frequent N-grams, most frequent first.
for ngram in sorted(frequency_ngrams, key=frequency_ngrams.get, reverse=True)[:15]:
    print(ngram, '\t', frequency_ngrams[ngram])

## Cooccurrence measures

All the cooccurrence measures below need the unigram and bigram counts:

In [None]:
# Unigram and bigram counts of `words`; the mutual information,
# likelihood ratio and t-score cells below all read these two dicts.
frequency = count_unigrams(words)
frequency_bigrams = count_bigrams(words)

### Mutual information

In [None]:
def mutual_info(words, freq_unigrams, freq_bigrams):
    """Compute the mutual information of every bigram.

    With N = len(words), the score of the bigram (w1, w2) is

        log2(N * N / (N - 1) * C(w1, w2) / (C(w1) * C(w2)))

    where C(.) is a count; N - 1 is the number of bigram positions in a
    sequence of N words.

    Args:
        words: The sequence of words the counts were computed from.
        freq_unigrams: Dict mapping each word to its count.
        freq_bigrams: Dict mapping each (w1, w2) tuple to its count.

    Returns:
        A dict mapping each bigram of *freq_bigrams* to its score.
    """
    if not freq_bigrams:
        # Nothing to score; this also avoids dividing by zero when
        # *words* holds a single word.
        return {}
    n_words = len(words)
    factor = n_words * n_words / (n_words - 1)
    # math.log2 is more accurate than math.log(x, 2).
    return {
        bigram: math.log2(factor * count /
                          (freq_unigrams[bigram[0]] *
                           freq_unigrams[bigram[1]]))
        for bigram, count in freq_bigrams.items()
    }

In [None]:
mi = mutual_info(words, frequency, frequency_bigrams)

Mutual information is highly biased toward low-frequency words

In [None]:
# Minimum number of occurrences a bigram needs to be kept.
cutoff = 5
# Mutual information overrates rare pairs, so the filter has to apply to the
# bigram frequency.  Filtering on the score itself leaves the top of the
# ranking unchanged.
filtered_mi = {bigram: score for bigram, score in mi.items()
               if frequency_bigrams[bigram] >= cutoff}

In [None]:
# Print the 15 retained bigrams with the highest mutual information:
# bigram, count of each word, count of the bigram, score.
for bigram in sorted(filtered_mi, key=filtered_mi.get, reverse=True)[:15]:
    print(bigram, '\t',
          frequency[bigram[0]], '\t',
          frequency[bigram[1]], '\t',
          frequency_bigrams[bigram], '\t',
          filtered_mi[bigram])

### Likelihood ratio

In [None]:
def likelihood_ratio(words, freq_unigrams, freq_bigrams):
    """Compute a log-likelihood ratio score for every bigram.

    For a bigram (w1, w2) with counts c1, c2 and c12, and N = len(words),
    the function compares the hypothesis that w2 is independent of w1
    (p = c2 / N) with the hypothesis that it is not
    (p1 = c12 / c1 after w1, p2 = (c2 - c12) / (N - c1) elsewhere).

    Bigrams for which p, p1 or p2 is not strictly between 0 and 1, or for
    which every word of *words* is w1, are skipped: the logarithms used
    by ``log_f`` are undefined there.

    Args:
        words: The sequence of words the counts were computed from.
        freq_unigrams: Dict mapping each word to its count.
        freq_bigrams: Dict mapping each (w1, w2) tuple to its count.

    Returns:
        A dict mapping each scored bigram to its likelihood ratio.
    """
    lr = {}
    n_words = len(words)
    for bigram, c12 in freq_bigrams.items():
        c1 = freq_unigrams[bigram[0]]
        c2 = freq_unigrams[bigram[1]]
        # Number of positions not occupied by the first word.
        rest = n_words - c1
        if rest == 0:
            # p2 would be a division by zero.
            continue
        p = c2 / n_words
        p1 = c12 / c1
        p2 = (c2 - c12) / rest
        # log_f takes log(p) and log(1 - p): a probability of 0 or 1
        # raises a math domain error, so skip such bigrams.
        if not (0.0 < p < 1.0 and 0.0 < p1 < 1.0 and 0.0 < p2 < 1.0):
            continue
        lr[bigram] = 2.0 * (
            log_f(c12, c1, p1) +
            log_f(c2 - c12, rest, p2) -
            log_f(c12, c1, p) -
            log_f(c2 - c12, rest, p))
    return lr


def log_f(k, N, p):
    """Return ``k * log(p) + (N - k) * log(1 - p)``.

    This is the natural logarithm of ``p**k * (1 - p)**(N - k)``.
    *p* must be strictly between 0 and 1, otherwise ``math.log`` raises
    a ValueError.
    """
    return k * math.log(p) + (N - k) * math.log(1 - p)

In [None]:
lr = likelihood_ratio(words, frequency, frequency_bigrams)

# Rank by the likelihood ratio itself.  The ranking used to call ts.get, but
# ts is only defined in the t-scores section further down.
for bigram in sorted(lr, key=lr.get, reverse=True)[:15]:
    print(bigram, "\t", frequency[bigram[0]], "\t", frequency[bigram[1]], "\t",
          frequency_bigrams[bigram], '\t', lr[bigram])

### T-scores

In [None]:
def t_scores(words, freq_unigrams, freq_bigrams):
    """Compute the t-score of every bigram.

    For a bigram (w1, w2) with counts c1, c2 and c12, the score is
    ``(c12 - c1 * c2 / len(words)) / sqrt(c12)``.

    Returns a dict mapping each bigram of *freq_bigrams* to its score.
    """
    return {
        pair: (observed -
               freq_unigrams[pair[0]] * freq_unigrams[pair[1]] / len(words))
        / math.sqrt(observed)
        for pair, observed in freq_bigrams.items()
    }

In [None]:
ts = t_scores(words, frequency, frequency_bigrams)

# Print the 15 bigrams with the highest t-scores:
# bigram, count of each word, count of the bigram, score.
for bigram in sorted(ts, key=ts.get, reverse=True)[:15]:
    print(bigram, "\t",
          frequency[bigram[0]], "\t",
          frequency[bigram[1]], "\t",
          frequency_bigrams[bigram], '\t',
          ts[bigram])