In [None]:
%matplotlib inline
import matplotlib
import pandas as pd
import nltk

# N-grams

In [None]:
# Normalise case first so that "This" and "this" end up as the same token.
some_text = "This is a first sentence. Now comes the next sentence."
some_text = some_text.lower()
words = nltk.word_tokenize(some_text)

In [None]:
# Materialise the bigram generator so the pairs can be inspected and reused.
bgs = [pair for pair in nltk.bigrams(words)]

In [None]:
# Same idea for trigrams, using the general n-gram helper with n=3.
tgs = [gram for gram in nltk.ngrams(words, n=3)]

We might not want bigrams that have the last word of one sentence and the first word of the next.

In [None]:
# Split the text into sentences so that n-grams can be built one sentence at a time.
sents = nltk.sent_tokenize(some_text)

In [None]:
# Build bigrams sentence by sentence so that no pair straddles a sentence boundary.
new_bgs = [
    bigram
    for sentence in sents
    for bigram in nltk.bigrams(nltk.word_tokenize(sentence))
]

# Getting some data from wikipedia

There's a nice [library](https://pypi.org/project/Wikipedia-API/) that makes grabbing pages from Wikipedia pretty easy.

You always start by creating this `wiki_wiki` object.

In [None]:
import wikipediaapi
# Client object through which pages are requested below; configured for the
# English-language wiki with the WIKI extract format.
# NOTE(review): recent Wikipedia-API releases appear to require a user-agent
# argument here — confirm against the installed version.
wiki_wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)

Let's grab some pages related to the learning sciences.

In [None]:
# Titles of the Wikipedia pages that make up the learning-sciences corpus.
# Titles must be spelled exactly as on Wikipedia; the last entry previously
# read "Educational technolog", which does not name the intended page.
pages = [
    "Learning Sciences",
    "Educational psychology",
    "Learning",
    "Informal learning",
    "Design-based research",
    "The Journal of the Learning Sciences",
    "Janet L. Kolodner",
    "Computer-supported collaborative learning",
    "Educational technology"
]

In [None]:
import re
def underscorize(pagename):
    """Return `pagename` with every space character replaced by an underscore."""
    # A literal space needs no regex; plain string replacement does the same job.
    return pagename.replace(" ", "_")

In [None]:
# Download every page and keep its lower-cased text, keyed by the underscore
# form of the title.
page_dict = {}
for page in pages:
    pagename = underscorize(page)
    print(pagename)
    p_wiki = wiki_wiki.page(pagename)
    # A misspelled or missing title would otherwise go unnoticed and add
    # nothing useful to the corpus, so report it and move on.
    if not p_wiki.exists():
        print("  WARNING: no Wikipedia page found for " + repr(page) + "; skipping")
        continue
    page_dict[pagename] = p_wiki.text.lower()

## Collocations in this LS Corpus
We are interested in finding **collocations**. Collocations are pairs of words, or phrases, that have limited compositionality. This means that the meaning of the phrase cannot be determined simply from the meaning of the parts. The whole thing is, from a meaning point of view, a unit.

In [None]:
# Count bigrams over the whole corpus, one sentence at a time so that no
# bigram spans a sentence boundary.
bigram_fdist = nltk.FreqDist(
    bigram
    for text in page_dict.values()
    for sentence in nltk.sent_tokenize(text)
    for bigram in nltk.bigrams(nltk.word_tokenize(sentence))
)

In [None]:
# Show the 25 most frequent bigrams; nothing has been filtered yet, so stop
# words and punctuation tokens are still included.
bigram_fdist.most_common(25)

In [None]:
# Build the stop list: the entries of the stop-word file (one per line) plus
# every single punctuation character, lower-case letter and digit.
# The `with` block closes the file handle, which the bare open() never did.
with open("lists/stop-words_english_5_en.txt") as f:
    stop_list = f.read().split("\n")
stop_list += list('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’')
stop_list += list("abcdefghijklmnopqrstuvwxyz0123456789")

In [None]:
# Take the 2000 most frequent bigrams and keep only those in which neither
# word is on the stop list.
most_common_bigrams = bigram_fdist.most_common(2000)
most_common_pruned = [
    entry
    for entry in most_common_bigrams
    if entry[0][0] not in stop_list and entry[0][1] not in stop_list
]

In [None]:
# Show the first 25 bigrams that survived the stop-list filter.
most_common_pruned[:25]

In [None]:
# Put the (bigram, count) entries into a two-column table and show the first 10 rows.
tdf = pd.DataFrame(most_common_pruned, columns=["words", "count"])
tdf.head(10)

# Are some bigrams more common than would be expected? Which are the most unexpected?

In [None]:
# Frequency of every token in the corpus that is not on the stop list.
word_fdist = nltk.FreqDist(
    token
    for text in page_dict.values()
    for token in nltk.word_tokenize(text)
    if token not in stop_list
)

In [None]:
# Baseline: the share of all counted (non-stop-list) tokens that are "learning".
fraction_expected = word_fdist["learning"] / word_fdist.N()
# The original printed the undefined name `p_learning`, which raised NameError.
print("fraction_expected=" + str(fraction_expected))

# Observed: count of the bigram ("informal", "learning") relative to the
# count of "informal".
fraction_observed = bigram_fdist[("informal", "learning")] / word_fdist["informal"]
print("fraction_observed=" + str(fraction_observed))

So we would expect to see "learning" after "informal" about 3% of the time, but we actually see it over 70% of the time.

## t-test as a measure

In [None]:
from scipy.stats import ttest_1samp
def student_t(w1, w2, word_fdist, bigram_fdist):
    """Run a one-sample t-test of the bigram (w1, w2) against independence.

    The null-hypothesis mean is the product of the two words' relative
    frequencies in `word_fdist`. The sample is one observation per counted
    bigram in `bigram_fdist`: 1.0 where the bigram is (w1, w2), 0 otherwise.

    Parameters:
        w1, w2: the first and second word of the bigram.
        word_fdist: word frequency distribution; indexed by word, with `.N()`
            giving the total count.
        bigram_fdist: bigram frequency distribution; indexed by (w1, w2)
            tuples, with `.N()` giving the total count.

    Returns:
        The result object of `scipy.stats.ttest_1samp` (has `.statistic`).
    """
    # The third parameter used to be misspelled `word_fidst`, so the body
    # silently read the global `word_fdist` instead of the argument.
    total_words = word_fdist.N()
    mu = word_fdist[w1] * word_fdist[w2] / (total_words * total_words)
    hits = bigram_fdist[(w1, w2)]
    blist = hits * [1.0] + (bigram_fdist.N() - hits) * [0]
    return ttest_1samp(blist, mu)

In [None]:
# Try the test on the bigram examined above.
student_t("informal", "learning", word_fdist, bigram_fdist)

In [None]:
# Compute the t statistic for the 50 most frequent pruned bigrams and show
# the ten with the highest value.
t = [
    [w1, w2, count, round(student_t(w1, w2, word_fdist, bigram_fdist).statistic, 3)]
    for (w1, w2), count in most_common_pruned[:50]
]
df = pd.DataFrame(t, columns=["w1", "w2", "count", "t"])
df.sort_values(by="t", ascending=False)[:10]

# Some random exploration of word co-occurrences

In [None]:
# Tokenise each sentence of each page and drop stop-list tokens, keeping both
# the per-sentence token lists and one flat list of all surviving tokens.
pruned_ls_sents = [
    [w for w in nltk.word_tokenize(sentence) if w not in stop_list]
    for text in page_dict.values()
    for sentence in nltk.sent_tokenize(text)
]
pruned_ls_words = [w for tokens in pruned_ls_sents for w in tokens]

In [None]:
import copy
# Sentence-level co-occurrence counts for the 50 most frequent pruned words:
# mat[i][j] is the number of sentences containing both word i and word j
# (so the diagonal holds the number of sentences containing the word).
pword_fdist = nltk.FreqDist(pruned_ls_words)
most_common = [word for word, _count in pword_fdist.most_common(50)]
mat = [[0] * len(most_common) for _row in most_common]
for sent in pruned_ls_sents:
    # Work out once per sentence which of the top words it contains.
    present = [idx for idx, word in enumerate(most_common) if word in sent]
    for n1 in present:
        for n2 in present:
            mat[n1][n2] += 1

## Matplotlib digression

https://matplotlib.org/api/pyplot_summary.html

In [None]:
import matplotlib.pyplot as plt
# Only y values are given here.
plt.plot([1, 3, 4, 2])

In [None]:
# Same data, with the format string "bo" passed as a second argument.
plt.plot([1, 3, 4, 2], "bo")

In [None]:
# Bar chart: first list gives the bar positions, second list the bar heights.
plt.bar([1, 2, 3, 4], [1, 3, 4, 2])

## Back to co-occurrence

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Heatmap of the co-occurrence matrix, with the words as tick labels on both axes.
fig = plt.figure(figsize=(10, 10), dpi=80, facecolor='w', edgecolor='k')
n = len(most_common)
# One tick per matrix row/column; these names were used below but never
# defined, so the cell raised NameError.
x_tick_marks = list(range(n))
y_tick_marks = list(range(n))
plt.xticks(x_tick_marks, most_common, fontsize=8, rotation=90)
plt.yticks(y_tick_marks, most_common, fontsize=8)
# Put the x labels along the top edge instead of the bottom.
plt.tick_params("x", top=True, labeltop=True, bottom=False, labelbottom=False)
plt.imshow(mat, norm=matplotlib.colors.LogNorm(), interpolation='nearest', cmap='YlOrBr')

In [None]:
import itertools
# Every unordered pair among the 25 most frequent words.
possible_pairs = list(itertools.combinations(most_common[:25], 2))

In [None]:
import networkx as nx
# Draw the top words as a graph: one edge per word pair, weighted by the
# pair's sentence co-occurrence count, with edge width proportional to it.
fig = plt.figure(figsize=(10, 10), dpi=80, facecolor='w', edgecolor='k')
G2 = nx.Graph()
G2.add_weighted_edges_from(
    (first, second, mat[most_common.index(first)][most_common.index(second)])
    for first, second in possible_pairs
)
# Scale the raw counts down so the lines stay readable.
widths = [weight / 25 for weight in nx.get_edge_attributes(G2, "weight").values()]
pos = nx.spring_layout(G2, iterations=40, k=.1, weight="weight")
nx.draw(G2, pos, with_labels=True, width=widths, node_color="gold", edge_color="orange")