In [1]:
import numpy as np
import re
import math
import operator
from collections import defaultdict

In [2]:
K = 6
WINDOW_OVERLAP = 2 # words of overlap between two windows
DIFF_THRESH = WINDOW_OVERLAP + 1 # threshold for forming new cluster

word_bag = defaultdict(list) # maps word to the list of clusters it belongs
cluster_list = [] # list of all clusters

tokens = []

In [3]:
class Cluster:
    def __init__(self, n):
        self.words = defaultdict(int)
        self.n = n
        self.freq = 0
        
    def add_word(self, word):
        self.words[word] += 1
        self.freq += 1
        return self.update_set()
    
    def add_words(self, lst):
        for word in lst:
            self.words[word] += 1
            self.freq += 1
        return self.update_set()
    
    def update_set(self):
        """Maintains n words in the cluster"""
        lst = []
        while len(self.words) > self.n:
            srt = sorted(self.words.items(), key=lambda kv: kv[1])
            self.words.pop(srt[0][0])
            lst += (srt[0][0])
            self.freq -= srt[0][1]
        return lst
    
    def get_words(self):
        return set(self.words.keys())
    
    def get_topKwords(self, K):
        sorted_dict = sorted(self.words.items(), reverse=True, key=operator.itemgetter(1))
        K = min(K, len(sorted_dict))
        word_list = []
        total = 0
        for i in range(K):
            word_list += [sorted_dict[i][0]]
            total += sorted_dict[i][1]
        return (total, word_list)
    
    def get_word_freq(self, word):
        return self.words[word]

## Text Input and Processing and Window Tokenization

In [4]:
def get_text(filename=''):
    if filename:
        return open(filename, 'r', encoding='utf-8').read()
    else:
        return "Five score years ago, a great American, in whose symbolic shadow we stand today, signed the Emancipation Proclamation. \
This momentous decree came as a great beacon light of hope to millions of Negro slaves who had been seared in the flames of \
withering injustice. It came as a joyous daybreak to end the long night of their captivity. \
But one hundred years later, the Negro still is not free; one hundred years later, the life of the Negro is still sadly \
crippled by the manacles of segregation and the chains of discrimination; one hundred years later, the Negro lives on a \
lonely island of poverty in the midst of a vast ocean of material prosperity; one hundred years later, the Negro is still \
languished in the corners of American society and finds himself in exile in his own land"

In [5]:
#text = get_text()
#text = get_text('darwin.txt')
#text = get_text('mlk.txt')
text = get_text('jupyter_usage_hindrances.txt')

In [6]:
# PREPROCESSING
from nltk.corpus import stopwords
sw = set(stopwords.words("english"))

def preprocess(text):
    words = []
    for word in text.split():
        if word.lower() not in sw:
            cleaned = re.sub('[\,.\'\(\);\/\*:\"\[\]!]', '', word)
            if cleaned:
                words += [cleaned]
    return words

In [7]:
# SENTENCE + WINDOW TOKENIZER
from nltk.tokenize import PunktSentenceTokenizer
tokenizer = PunktSentenceTokenizer()
#tokenizer.sent_end_chars = ('.', '?', '!', '--')
for sentence in tokenizer.tokenize(text):
    words = preprocess(sentence)
    i = 0
    n = len(words)
    while i < n:
        remain = (n - i)
        if remain < (1.5*K):
            div = int(math.ceil(remain / 2))
            if div < (K/2):
                tokens += [' '.join(words[i:])]
            else:
                tokens += [' '.join(words[i:i+div])]
                tokens += [' '.join(words[i+div-WINDOW_OVERLAP:])]
            i = n
        else:
            tokens += [' '.join(words[i:i+K])]
            i += WINDOW_OVERLAP

## Clustering

In [8]:
def best_cluster(window):
    b = None
    thresh = DIFF_THRESH
    chk_set = set(window)
    for word in window:
        for cluster in word_bag[word]:
            # check overlap between window and cluster
            chk = len(chk_set.intersection(cluster.get_words()))
            if chk >= thresh:
                b = cluster
                thresh = chk
    return b

In [9]:
# FORMING CLUSTERS
for token in tokens:
    window = token.split()
    cluster = best_cluster(window)
    if not cluster: # if no best cluster found, create new cluster
        cluster = Cluster(2*K)
        cluster_list += [cluster]
    removed = set(cluster.add_words(window))
    for word in window:
        if word in removed:
            try:
                word_bag[word].remove(cluster)
            except ValueError:
                pass
        elif cluster not in word_bag[word]:
            word_bag[word] += [cluster]

In [11]:
# FINDING TOP CLUSTERS
sorted_clusters = sorted(cluster_list, reverse=True, key=lambda x: x.get_topKwords(6)[0])
for cluster in sorted_clusters[:15]:
    print(cluster.get_topKwords(6), cluster.freq)

(48, ['editor', 'text', 'version', 'control', 'development', 'vim']) 71
(46, ['version', 'control', 'code', 'better', 'git', 'editor']) 65
(38, ['able', 'notebook', 'nice', 'sure', 'would', 'load']) 51
(37, ['code', 'output', 'cells', 'content', 'easy', 'table']) 52
(36, ['easy', 'way', 'code', 'cells', 'Would', 'push']) 50
(34, ['##', '-', 'ENH', 'notebook', 'git', '&']) 49
(33, ['run', '-', 'code', 'create', 'notebooks', 'cells']) 50
(33, ['would', 'cells', 'use', 'multiple', 'copy', 'able']) 50
(32, ['terminal', 'session', 'start', 'ipython', 'notebook', 'particular']) 49
(31, ['editing', 'text', 'features', 'Lack', 'file', 'management']) 44
(31, ['notebooks', 'version', 'control', 'use', 'via', 'UI']) 47
(30, ['control', '-', 'code', 'version', 'make', 'files']) 47
(30, ['use', 'Jupyter', 'Notebook', 'work', 'console', 'extensively']) 45
(30, ['-', 'UBY', 'notebook', 'keyboard', '##1', 'shortcut']) 45
(30, ['viable', 'workaround', 'httpsgithubcomjupyteratom-notebook', 'httpsgithubc