# Clustering text

Now we're going to learn how to cluster text.

For this I'm going to first use a small database of questions
that students submitted to me in advance of a discussion of one of my papers.

## Read it in, tokenize it, get a vocabulary, yadda yadda

In [None]:
import re, nltk
import numpy as np

In [None]:
# Matches any character followed by an English contraction suffix
# ('ll, 're, 've, n't, 's, 'm, 'd), case-insensitively.
contraction_patterns = re.compile(r"(?i)(.)('ll|'re|'ve|n't|'s|'m|'d)\b")


def is_contraction(the_text):
    """Return the first contraction match found in the_text, or None if there is none."""
    match = contraction_patterns.search(the_text)
    return match
    
def alpha_only(ltext):
    """Lower-case and keep only the word-like tokens of ltext.

    A non-empty token is kept when it is purely alphabetic, starts with '<',
    or contains a contraction according to is_contraction.
    """
    kept = []
    for w in ltext:
        if len(w) == 0:
            continue
        if w.isalpha() or w[0] == '<' or is_contraction(w):
            kept.append(w.lower())
    return kept

# Read the whole corpus inside a context manager so the file handle is closed
# as soon as the text has been read.
with open('corpora/student_questions.txt') as corpus_file:
    raw_file = corpus_file.read()

# One entry per newline-terminated line; a final line without a trailing
# newline is not matched by this pattern.
question_list = re.findall(r"(.*?)\n", raw_file)

# question_corpus[k] is the filtered token list for question_list[k].
question_corpus = []
for question in question_list:
    question_corpus.append(alpha_only(nltk.word_tokenize(question)))

In [None]:
# Load the stop words (one per line) and close the file when done.
with open("lists/stop-words_english_5_en.txt") as stop_f:
    stop_list = stop_f.read().split("\n")
# Also treat punctuation marks, single letters and single digits as stop words.
stop_list += list('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’')
stop_list += list("abcdefghijklmnopqrstuvwxyz0123456789")
# A set gives constant-time membership tests when filtering the vocabulary.
stop_list = set(stop_list)

In [None]:
# Every distinct token that occurs anywhere in the corpus.
set_vocab = set()
for question in question_corpus:
    # Grow the set in place rather than building a new union set per question.
    set_vocab.update(question)

# The vocabulary without stop words. The result is a set, so sorting the
# words before building it would not change its contents.
pruned_vocab = {w for w in set_vocab if w not in stop_list}

In [None]:
import nltk
word_fdist = nltk.FreqDist()  # corpus frequencies: total occurrences of each word
doc_fdist = nltk.FreqDist()  # document frequencies: number of questions containing each word

# Give every vocabulary word an entry (in pruned_vocab iteration order) before
# counting, so the entries are created in the same order as a per-word scan.
for word in pruned_vocab:
    word_fdist[word] = 0
    doc_fdist[word] = 0

# One pass over the corpus instead of rescanning every question for every
# vocabulary word.
for question in question_corpus:
    for token in question:
        if token in pruned_vocab:
            word_fdist[token] += 1
    # Each distinct token counts once per question for the document frequency.
    for token in set(question):
        if token in pruned_vocab:
            doc_fdist[token] += 1

In [None]:
# The (at most) 500 most frequent vocabulary words, most frequent first;
# these words are the dimensions of the document vectors.
vocab_list = [word for word, _count in word_fdist.most_common(500)]
print(vocab_list)

## Compute the document vectors

In [None]:
import numpy as np
def norm_vec(vec):
    """Scale vec to unit Euclidean length.

    Args:
        vec: a one-dimensional sequence or array of real numbers.

    Returns:
        A float NumPy array: vec divided by its Euclidean norm, or the all-zero
        vector itself when the norm is zero (avoids dividing by zero).
    """
    # Convert up front so list input yields an ndarray in every branch,
    # including the zero-norm one.
    vec = np.asarray(vec, dtype=float)
    squared_norm = np.dot(vec, vec)
    if squared_norm == 0:
        return vec
    return vec / np.sqrt(squared_norm)
    
def pure_tf(tf, df, cf, N):
    """Raw term-frequency weight: return tf unchanged.

    df, cf and N are accepted so that all weighting functions share one
    signature; they are not used here.
    """
    raw_count = tf
    return raw_count

def tf(tf, df, cf, N):
    """Log-scaled term-frequency weight.

    Returns 0 when tf is 0, otherwise 1 + ln(tf). df, cf and N are not used.
    """
    if tf == 0:
        return 0
    return 1 + np.log(tf)

def tfidf(tf, df, cf, N):
    """Log-scaled term frequency divided by the document frequency.

    Returns 0 when either tf or df is 0, otherwise (1 + ln(tf)) / df.
    cf and N are not used.
    """
    if tf != 0 and df != 0:
        return (1 + np.log(tf)) / df
    return 0

def weight_factor2(tf, df, cf, N):
    """Weight a term by 1 + ln(tf), or 0 when tf is 0; df, cf and N are not used."""
    return 0 if tf == 0 else 1 + np.log(tf)

def weighted_word(the_text, word):
    """Return the weight of word in the token list the_text.

    The weight comes from the module-level tf function, which is handed the
    word's count in the_text, its document frequency, its corpus frequency and
    the number of questions in the corpus.
    """
    term_count = the_text.count(word)
    doc_count = doc_fdist[word]
    corpus_count = word_fdist[word]
    n_docs = len(question_corpus)
    return tf(term_count, doc_count, corpus_count, n_docs)

def compute_doc_vector(word_list):
    """Return the normalized weight vector of word_list, one entry per word in vocab_list."""
    weights = []
    for vocab_word in vocab_list:
        weights.append(weighted_word(word_list, vocab_word))
    return norm_vec(weights)

In [None]:
# One document vector per question, in corpus order.
question_vectors = [compute_doc_vector(question) for question in question_corpus]

### Put the vectors in a matrix

In [None]:
# Term-document matrix: one row per vocabulary word, one column per question.
td_matrix = np.zeros((len(vocab_list), len(question_vectors)))
for column, doc_vector in enumerate(question_vectors):
    td_matrix[:, column] = doc_vector

In [None]:
# Bare expression with no assignment; it does not affect X.
td_matrix.shape
# Transposed view: one row per question, one column per vocabulary word.
X = td_matrix.T

### Cluster them

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Ward-linkage agglomerative clustering of the rows of X into 5 clusters.
clustering = AgglomerativeClustering(
    linkage="ward",
    n_clusters=5,
).fit(X)

In [None]:
# Show the labels_ attribute of the fitted clustering model.
clustering.labels_

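Try something tricky, if necessary: remove from every question vector its component along the direction of the summed vector of all questions, re-normalize, and cluster again.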

In [None]:
def orthogonalize(vectors):
    """Remove the shared direction from every vector and re-normalize.

    The shared direction is the normalized sum of all the vectors. Each vector
    has its projection onto that direction subtracted and is then normalized
    with norm_vec.

    Args:
        vectors: a non-empty sequence of equal-length vectors.

    Returns:
        A list with one adjusted vector per input vector, in the same order.
    """
    dimension = len(vectors[0])
    common_direction = norm_vec(sum(vectors, np.zeros(dimension)))
    return [
        norm_vec(v - np.dot(v, common_direction) * common_direction)
        for v in vectors
    ]

In [None]:
# Unit vector along the sum of all question vectors.
total_v = norm_vec(sum(question_vectors, np.zeros(len(question_vectors[0]))))

In [None]:
# Question vectors after orthogonalize() has been applied to the whole set.
ortho_vectors = orthogonalize(question_vectors)

In [None]:
# Same Ward-linkage, 5-cluster setup, this time fitted on the orthogonalized vectors.
oclustering = AgglomerativeClustering(
    linkage="ward",
    n_clusters=5,
).fit(ortho_vectors)
oclustering.labels_

In [None]:
# Import from the public package path; the private
# sklearn.neighbors.nearest_centroid module path is not available in
# current scikit-learn releases.
from sklearn.neighbors import NearestCentroid


def get_centroids(X, clustering):
    """Return one centroid per cluster label.

    Args:
        X: the sample vectors, one per sample.
        clustering: a fitted clustering object whose labels_ attribute holds
            one label per sample in X.

    Returns:
        The centroids_ attribute of a NearestCentroid classifier fitted on X
        with the cluster labels as classes.
    """
    clf = NearestCentroid()
    clf.fit(X, clustering.labels_)
    return clf.centroids_

# Centroids are computed from the original question_vectors, grouped by the
# labels of the clustering fitted on the orthogonalized vectors.
centroids = get_centroids(question_vectors, oclustering)

In [None]:
def top_words_from_centroid(centroids, n, to_print=10, printit=True):
    """List the highest-weighted vocabulary words of centroid n.

    Args:
        centroids: indexable collection of centroid vectors whose positions
            line up with vocab_list.
        n: index of the centroid to inspect.
        to_print: maximum number of words to return; fewer are returned when
            the centroid has fewer dimensions.
        printit: when true, also print each word with its rounded weight.

    Returns:
        A list of [word, weight rounded to 3 decimals] pairs, highest weight first.
    """
    centroid = centroids[n]
    # argsort is ascending, so reverse it to rank the largest weights first.
    ranked = np.argsort(centroid)[::-1]
    result = []
    # Slicing (rather than indexing range(to_print)) avoids an IndexError when
    # to_print exceeds the number of dimensions; max() keeps a negative
    # to_print meaning "no words".
    for idx in ranked[:max(to_print, 0)]:
        word = vocab_list[idx]
        weight = round(centroid[idx], 3)
        if printit:
            print(word, weight)
        result.append([word, weight])
    return result

def top_words_from_centroids(centroids, to_print=10, printit=False):
    """Collect the top words of every centroid.

    Args:
        centroids: sequence of centroid vectors.
        to_print: number of top words to collect per centroid.
        printit: when true, print a "Cluster n" header, the words with their
            weights, and a blank separator for each centroid.

    Returns:
        A list with one top_words_from_centroid result per centroid, in order.
    """
    result = []
    for n in range(len(centroids)):
        if printit:
            print("Cluster ", n)
        # Call the helper exactly once per centroid so that, when printing is
        # on, each cluster's words are printed only once.
        result.append(top_words_from_centroid(centroids, n, to_print, printit=printit))
        if printit:
            print("\n")
    return result
        
class ListTable(list):
    """A list of rows that renders itself as one HTML table.

    Each element is a row, and each item of a row becomes one table cell.
    """

    def _repr_html_(self):
        """Return the table markup; cell values are inserted with str.format, unescaped."""
        cell_template = "<td align='left' style='border: .5px solid gray;'>{0}</td>"
        pieces = ["<table style= 'border: 1px solid black; display:inline-block'>"]
        for row in self:
            cells = "".join(cell_template.format(col) for col in row)
            pieces.append("<tr>" + cells + "</tr>")
        pieces.append("</table>")
        return "".join(pieces)
    
class MultiTable(list):
    """A list of tables that renders itself as a sequence of inline HTML tables.

    Each element is a table (a sequence of rows), and each item of a row
    becomes one table cell.
    """

    def _repr_html_(self):
        """Return the markup of all tables; cell values are inserted with str.format, unescaped."""
        table_open = "<table style= 'border: 1px solid black; display:inline-block; margin-right: 10px;'>"
        cell_template = "<td align='left' style='border: .5px solid gray;'>{0}</td>"
        pieces = []
        for table in self:
            pieces.append(table_open)
            for row in table:
                cells = "".join(cell_template.format(col) for col in row)
                pieces.append("<tr>" + cells + "</tr>")
            pieces.append("</table>")
        return "".join(pieces)

In [None]:
# Show the top words of every centroid as a MultiTable, one table per cluster,
# without printing them as text.
MultiTable(top_words_from_centroids(centroids, printit=False))

In [None]:
def print_questions(cluster_number):
    """Return [index, question text] pairs for the questions in one cluster.

    Despite the name, nothing is printed. Membership is taken from
    oclustering.labels_, and the text comes from question_list at the same index.
    """
    return [
        [index, question_list[index]]
        for index, label in enumerate(oclustering.labels_)
        if label == cluster_number
    ]

In [None]:
# Show the questions labelled as cluster 1 as a table of [index, question text] rows.
ListTable(print_questions(1))