# Seasons Clustering

In [None]:
import re, nltk
import numpy as np

## Read in the corpus and get the vocabulary, as usual

In [None]:
from seasons_module import load_seasons_corpus
# Load the corpus once; it is indexed by file name below.
seasons_corpus = load_seasons_corpus()
# Freeze the file-name order so every later per-document structure lines up with it.
fnames = list(seasons_corpus.keys())
# Element 0 of each corpus entry is used as that document's content.
docs = [seasons_corpus[name][0] for name in fnames]

In [None]:
# Collect every distinct token that occurs in any document.
set_vocab = set()
for doc in docs:
    # In-place update avoids rebuilding the whole set for each document.
    set_vocab.update(doc)

# Read the stop list (one entry per line); the context manager guarantees
# the file handle is closed even if reading fails.
with open("lists/seasons_stop_list.txt") as f:
    stop_list = set(f.read().split("\n"))

# Vocabulary with stop words removed. This is a set, so its iteration
# order is not guaranteed (sorting before building a set has no effect).
pruned_vocab = set_vocab - stop_list

In [None]:
import nltk
# Count the tokens of each document once, instead of rescanning every
# document (membership test + .count) for every vocabulary word.
doc_token_counts = [nltk.FreqDist(doc) for doc in docs]

word_fdist = nltk.FreqDist()  # the corpus frequencies: total occurrences of each word
doc_fdist = nltk.FreqDist()  # the document frequencies: number of documents containing each word
for word in pruned_vocab:
    # Initialise both entries first so every vocabulary word is present
    # in both distributions, in vocabulary iteration order.
    word_fdist[word] = 0
    doc_fdist[word] = 0
    for token_counts in doc_token_counts:
        occurrences = token_counts.get(word, 0)
        if occurrences:
            doc_fdist[word] += 1
            word_fdist[word] += occurrences

In [None]:
# Keep only the 500 words with the highest corpus frequency; their order
# here defines the dimensions of every document vector.
vocab_list = [word for word, _count in word_fdist.most_common(500)]

## Get the document vectors

In [None]:
import numpy as np
def norm_vec(vec):
    """Scale *vec* to unit Euclidean length.

    Args:
        vec: A one-dimensional sequence or array of numbers.

    Returns:
        A NumPy float array holding ``vec`` divided by its Euclidean norm.
        A vector whose magnitude is zero (including an empty one) is
        returned unscaled so that no division by zero takes place.
    """
    # Convert up front so list and array inputs produce the same return
    # type on both the zero and the non-zero path.
    arr = np.asarray(vec, dtype=float)
    squared_mag = np.dot(arr, arr)
    if squared_mag == 0:
        return arr
    return arr / np.sqrt(squared_mag)
    
def pure_tf(tf, df, cf, N):
    """Return the raw term frequency *tf* unchanged.

    ``df``, ``cf`` and ``N`` are accepted only so that this function has
    the same signature as the other weighting functions; they are ignored.
    """
    return tf

def tf(tf, df, cf, N):
    """Return the log-scaled term frequency ``1 + ln(tf)``.

    A term frequency of zero yields a weight of 0 (the logarithm is never
    evaluated in that case). ``df``, ``cf`` and ``N`` are ignored.
    """
    return 0 if tf == 0 else 1 + np.log(tf)

def tfidf(tf, df, cf, N):
    """Return the log-scaled term frequency divided by the document frequency.

    The weight is ``(1 + ln(tf)) / df``; note that the inverse document
    frequency used here is the plain ``1 / df`` and that ``N`` and ``cf``
    do not enter the computation. The weight is 0 when either ``tf`` or
    ``df`` is zero.
    """
    if tf != 0 and df != 0:
        return (1 + np.log(tf)) / df
    return 0

def weight_factor2(tf, df, cf, N):
    """Return ``1 + ln(tf)``, or 0 when *tf* is zero.

    ``df``, ``cf`` and ``N`` are accepted for signature compatibility with
    the other weighting functions and are not used.
    """
    if tf == 0:
        return 0
    log_scaled = 1 + np.log(tf)
    return log_scaled

def weighted_word(the_text, word):
    """Return the weight of *word* in *the_text* using the ``tf`` scheme.

    The raw count of ``word`` in ``the_text`` is passed to ``tf`` together
    with the word's document frequency, its corpus frequency and the
    number of documents in the corpus.
    """
    term_count = the_text.count(word)
    document_freq = doc_fdist[word]
    corpus_freq = word_fdist[word]
    n_docs = len(seasons_corpus.keys())
    return tf(term_count, document_freq, corpus_freq, n_docs)

def compute_doc_vector(word_list):
    """Return the normalised weight vector of a document.

    One weight is computed per entry of ``vocab_list`` (in that order) and
    the resulting vector is passed through ``norm_vec``.
    """
    weights = [weighted_word(word_list, term) for term in vocab_list]
    return norm_vec(weights)

In [None]:
# One normalised vector per document, in the same order as `docs`.
doc_vectors = [compute_doc_vector(doc) for doc in docs]

Build the term-by-document matrix

In [None]:
# Rows are vocabulary terms, columns are documents.
n_terms = len(vocab_list)
n_docs = len(fnames)
td_matrix = np.zeros((n_terms, n_docs))
for col, doc_vec in enumerate(doc_vectors):
    td_matrix[:, col] = doc_vec
# The clustering below is fitted on one row per document, so flip the axes.
X = td_matrix.T

In [None]:
# X is the transpose of the (terms x documents) matrix, so this shows
# (number of documents, number of vocabulary terms).
X.shape

## Do the clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Group the document rows of X into 7 clusters; the per-document cluster
# labels are read from `clustering.labels_` in the cells below.
clustering = AgglomerativeClustering(n_clusters=7).fit(X)

## Examine the results

In [None]:
# Cluster label assigned to each row of X (rows follow the order of `fnames`).
clustering.labels_

In [None]:
# Import from the public package path: the private module
# `sklearn.neighbors.nearest_centroid` is not importable in newer
# scikit-learn releases.
from sklearn.neighbors import NearestCentroid


def get_centroids(X, clustering):
    """Return one centroid per cluster label.

    Args:
        X: The data matrix the clustering was fitted on (one row per sample).
        clustering: A fitted clustering object exposing ``labels_``.

    Returns:
        The ``centroids_`` attribute of a ``NearestCentroid`` classifier
        fitted on ``X`` with the cluster labels as classes.
    """
    clf = NearestCentroid()
    clf.fit(X, clustering.labels_)
    return clf.centroids_


centroids = get_centroids(X, clustering)

In [None]:
def top_words_from_centroid(centroids, n, to_print=10, printit=True):
    """Return the highest-weighted vocabulary words of centroid *n*.

    Args:
        centroids: Indexable collection of centroid vectors whose
            components line up with ``vocab_list``.
        n: Index of the centroid to inspect.
        to_print: Number of top words to return.
        printit: When true, also print each word with its weight.

    Returns:
        A list of ``[word, weight]`` pairs, heaviest first, with the
        weight rounded to 3 decimal places.
    """
    centroid = centroids[n]
    # argsort is ascending; reverse it so the largest weights come first.
    ranked = np.argsort(centroid)[::-1]
    top = []
    for rank in range(to_print):
        idx = ranked[rank]
        word = vocab_list[idx]
        weight = round(centroid[idx], 3)
        if printit:
            print(word, weight)
        top.append([word, weight])
    return top

def top_words_from_centroids(centroids, to_print=10, printit=False):
    """Return the top words of every centroid.

    Args:
        centroids: Sized, indexable collection of centroid vectors.
        to_print: Number of top words to collect per centroid.
        printit: When true, print a "Cluster n" header, the words with
            their weights, and a blank separator for each centroid.

    Returns:
        A list with one entry per centroid; each entry is the list of
        ``[word, weight]`` pairs produced by ``top_words_from_centroid``.
    """
    result = []
    for n in range(len(centroids)):
        if printit:
            print("Cluster ", n)
        # Call the helper exactly once per cluster: it both prints (when
        # requested) and returns the words, so a second call would print
        # every word list twice.
        result.append(top_words_from_centroid(centroids, n, to_print, printit=printit))
        if printit:
            print("\n")
    return result
        
class ListTable(list):
    """A list of rows that renders as a single HTML table.

    Each element is an iterable of cell values; ``_repr_html_`` formats
    every value into its own table cell, one table row per element.
    """

    def _repr_html_(self):
        """Return the HTML markup for the table."""
        cell_template = "<td align='left' style='border: .5px solid gray;'>{0}</td>"
        parts = ["<table style= 'border: 1px solid black; display:inline-block'>"]
        for row in self:
            cells = "".join(cell_template.format(value) for value in row)
            parts.append("<tr>" + cells + "</tr>")
        parts.append("</table>")
        return "".join(parts)
    
class MultiTable(list):
    """A list of tables that renders as several inline HTML tables.

    Each element is itself a list of rows; ``_repr_html_`` emits one
    table per element, one table row per row and one cell per value.
    """

    def _repr_html_(self):
        """Return the concatenated HTML markup of all tables."""
        table_open = "<table style= 'border: 1px solid black; display:inline-block; margin-right: 10px;'>"
        cell_template = "<td align='left' style='border: .5px solid gray;'>{0}</td>"
        parts = []
        for table in self:
            parts.append(table_open)
            for row in table:
                cells = "".join(cell_template.format(value) for value in row)
                parts.append("<tr>" + cells + "</tr>")
            parts.append("</table>")
        return "".join(parts)

In [None]:
# Show one table per cluster listing its top-weighted words (default: 10 each).
MultiTable(top_words_from_centroids(centroids))

In [None]:
# Pair every file name with the cluster label assigned to its document.
res = [[fname, clustering.labels_[n]] for n, fname in enumerate(fnames)]
ListTable(res)

## Examine some metrics

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer 
from sklearn.cluster import KMeans
# Fit a 7-cluster KMeans model on X through the silhouette visualizer.
# NOTE(review): this is a separate KMeans model, not the
# AgglomerativeClustering result examined earlier, and no random_state is
# passed to KMeans — confirm both are intended.
visualizer = SilhouetteVisualizer(KMeans(n_clusters=7))
visualizer.fit(X)

In [None]:
# Show the top-weighted words for each centre in `visualizer.cluster_centers_`
# (presumably the centres of the KMeans model wrapped above — verify).
MultiTable(top_words_from_centroids(visualizer.cluster_centers_))