In [1]:
import numpy as np
import re
from collections import defaultdict

In [2]:
# English stopwords held in a set, used for the membership test in preprocess().
# NOTE(review): presumably requires the NLTK "stopwords" corpus to have been
# downloaded beforehand (e.g. via nltk.download) — confirm for fresh environments.
from nltk.corpus import stopwords
sw = set(stopwords.words("english"))

In [3]:
def preprocess(text, stop_words=None):
    """Tokenise ``text`` into punctuation-free, non-stopword words.

    The text is split on whitespace, the punctuation characters
    ``, . ' ( ) ; / * : " [ ]`` are removed from every token, and tokens
    that are empty after cleaning or that are stopwords are dropped.
    Original letter case is preserved in the returned words.

    Args:
        text: The string to tokenise.
        stop_words: Optional container of lower-case stopwords. When
            omitted, the notebook-level ``sw`` set is used.

    Returns:
        A list of cleaned words in their original order.
    """
    if stop_words is None:
        stop_words = sw
    words = []
    for token in text.split():
        # Raw string: the former non-raw pattern relied on invalid escape
        # sequences such as "\," and "\(" which newer Pythons warn about.
        cleaned = re.sub(r"""[,.'();/*:"\[\]]""", '', token)
        if not cleaned:
            continue
        # Check both forms: the raw token catches stopwords that contain an
        # apostrophe ("don't"), the cleaned token catches stopwords that
        # carry trailing punctuation ("the," / "it.").
        if token.lower() in stop_words or cleaned.lower() in stop_words:
            continue
        words.append(cleaned)
    return words

In [4]:
class Cluster:
    """Bounded bag of words that retains at most ``n`` distinct words.

    Every added occurrence increments the word's count. Whenever more than
    ``n`` distinct words are held, the words with the lowest counts are
    evicted until ``n`` remain.
    """

    def __init__(self, n):
        """Create an empty cluster that keeps at most ``n`` distinct words."""
        # word -> number of occurrences added while the word is retained
        self.words = defaultdict(int)
        # maximum number of distinct words retained
        self.n = n
        # sum of the counts of all currently retained words
        self.freq = 0

    def add_word(self, word):
        """Add one occurrence of ``word`` and return the list of evicted words."""
        return self.add_words([word])

    def add_words(self, lst):
        """Add one occurrence of every word in ``lst``.

        Returns:
            The list of words evicted to bring the cluster back to ``n``
            distinct words (empty when nothing was evicted).
        """
        for word in lst:
            self.words[word] += 1
            self.freq += 1
        return self.update_set()

    def update_set(self):
        """Trim the cluster to at most ``n`` distinct words.

        The lowest-count words are evicted first; among equal counts the
        word inserted earliest goes first. ``freq`` is reduced by the count
        of every evicted word.

        Returns:
            The evicted words, as whole strings, in eviction order.
        """
        excess = len(self.words) - self.n
        if excess <= 0:
            return []
        # One stable sort is enough: removing the head of the ordering does
        # not change the relative order of the remaining entries.
        by_count = sorted(self.words.items(), key=lambda kv: kv[1])
        evicted = []
        for word, count in by_count[:excess]:
            del self.words[word]
            self.freq -= count
            # append(), not "+=": extending a list with a str would add the
            # word's individual characters instead of the word itself.
            evicted.append(word)
        return evicted

    def get_words(self):
        """Return the currently retained words as a new set."""
        return set(self.words.keys())

In [5]:
# Sample input for the clustering demo: an excerpt from the opening of the
# "I Have a Dream" speech, stored as one string via backslash continuations.
text = "Five score years ago, a great American, in whose symbolic shadow we stand today, signed the Emancipation Proclamation. \
This momentous decree came as a great beacon light of hope to millions of Negro slaves who had been seared in the flames of \
withering injustice. It came as a joyous daybreak to end the long night of their captivity. \
But one hundred years later, the Negro still is not free; one hundred years later, the life of the Negro is still sadly \
crippled by the manacles of segregation and the chains of discrimination; one hundred years later, the Negro lives on a \
lonely island of poverty in the midst of a vast ocean of material prosperity; one hundred years later, the Negro is still \
languished in the corners of American society and finds himself in exile in his own land"

In [6]:
# Window length in words; also passed as the capacity of every new Cluster.
K = 6
# Minimum number of words a window must share with a cluster to join it.
DIFF_THRESH = 4

# word -> list of clusters registered for that word (filled by the clustering loop)
word_bag = defaultdict(list)
# every cluster created so far, in creation order
cluster_list = []

# stopword-free, punctuation-stripped tokens of the sample text
words = preprocess(text)

In [7]:
def best_cluster(window, bag=None, min_overlap=None):
    """Return the registered cluster that shares the most words with ``window``.

    Only clusters registered in ``bag`` under at least one window word are
    considered. A cluster qualifies when its overlap with the window is at
    least ``min_overlap``; among qualifying clusters the largest overlap
    wins, and on equal overlap the cluster encountered last wins.

    Args:
        window: Sequence of words forming the current window.
        bag: Mapping word -> list of clusters. Defaults to the notebook-level
            ``word_bag``.
        min_overlap: Minimum number of shared words. Defaults to the
            notebook-level ``DIFF_THRESH``.

    Returns:
        The best matching cluster, or ``None`` when no cluster qualifies.
    """
    if bag is None:
        bag = word_bag
    if min_overlap is None:
        min_overlap = DIFF_THRESH
    best = None
    window_words = set(window)
    for word in window:
        # .get() keeps the lookup read-only: indexing a defaultdict would
        # insert an empty list for every word that is merely looked up.
        for cluster in bag.get(word, ()):
            overlap = len(window_words.intersection(cluster.get_words()))
            if overlap >= min_overlap:
                best = cluster
                # raise the bar so that only an equal or better match replaces it
                min_overlap = overlap
    return best

In [8]:
# Walk the token list in consecutive, non-overlapping windows of K words.
# Slicing from the window start never indexes past the end of `words` and
# yields exactly K words for odd K as well; a trailing partial window
# (fewer than K words) is skipped.
for start in range(0, len(words) - K + 1, K):
    window = words[start:start + K]
    print(window)
    cluster = best_cluster(window)
    if cluster is None:  # no sufficiently similar cluster: start a new one
        cluster = Cluster(K)
        cluster_list.append(cluster)

    # Derive the evicted words from the cluster's contents before and after
    # the insertion, so stale index entries are dropped even for words that
    # are not part of the current window.
    words_before = cluster.get_words()
    cluster.add_words(window)
    words_after = cluster.get_words()
    evicted = (words_before | set(window)) - words_after

    for word in evicted:
        registered = word_bag[word]
        # A word evicted right after insertion was never registered;
        # guard so list.remove() cannot raise ValueError.
        if cluster in registered:
            registered.remove(cluster)
    for word in window:
        if word in words_after and cluster not in word_bag[word]:
            word_bag[word].append(cluster)

['Five', 'score', 'years', 'ago', 'great', 'American']
['whose', 'symbolic', 'shadow', 'stand', 'today', 'signed']
['Emancipation', 'Proclamation', 'momentous', 'decree', 'came', 'great']
['beacon', 'light', 'hope', 'millions', 'Negro', 'slaves']
['seared', 'flames', 'withering', 'injustice', 'came', 'joyous']
['daybreak', 'end', 'long', 'night', 'captivity', 'one']
['hundred', 'years', 'later', 'Negro', 'still', 'free']
['one', 'hundred', 'years', 'later', 'life', 'Negro']
['still', 'sadly', 'crippled', 'manacles', 'segregation', 'chains']
['discrimination', 'one', 'hundred', 'years', 'later', 'Negro']
['lives', 'lonely', 'island', 'poverty', 'midst', 'vast']
['ocean', 'material', 'prosperity', 'one', 'hundred', 'years']
['later', 'Negro', 'still', 'languished', 'corners', 'American']


In [9]:
# Report every cluster: its retained words and the sum of their counts (freq).
for cluster in cluster_list:
    print(cluster.get_words(), cluster.freq)

{'years', 'Five', 'score', 'great', 'American', 'ago'} 6
{'today', 'stand', 'signed', 'whose', 'symbolic', 'shadow'} 6
{'decree', 'momentous', 'great', 'came', 'Proclamation', 'Emancipation'} 6
{'hope', 'slaves', 'light', 'Negro', 'millions', 'beacon'} 6
{'came', 'injustice', 'flames', 'seared', 'joyous', 'withering'} 6
{'night', 'captivity', 'long', 'daybreak', 'end', 'one'} 6
{'years', 'Negro', 'discrimination', 'hundred', 'later', 'one'} 15
{'chains', 'still', 'manacles', 'segregation', 'sadly', 'crippled'} 6
{'lives', 'lonely', 'poverty', 'island', 'midst', 'vast'} 6
{'years', 'material', 'hundred', 'prosperity', 'ocean', 'one'} 6
{'still', 'Negro', 'languished', 'later', 'American', 'corners'} 6
