# Seasons in Pieces

## Read it in, as usual

In [None]:
from seasons_module import load_seasons_corpus
# Load the corpus (a mapping keyed by file name). Element 0 of each entry is
# taken as that transcript's document -- presumably its token list; confirm
# against seasons_module.
seasons_corpus = load_seasons_corpus()
fnames = [name for name in seasons_corpus.keys()]
docs = [seasons_corpus[name][0] for name in fnames]

## Break each transcript into a list of overlapping lists

In [None]:
def sliding_segment(tokenized_text, segment_size, step_size):
    """Cut a sequence into overlapping windows.

    Windows start at 0, step_size, 2 * step_size, ... and each holds up to
    segment_size items. Windowing stops after the first window that reaches
    the end of the sequence, so only the last window can be shorter.

    Args:
        tokenized_text: Sliceable sequence (e.g. a list of tokens).
        segment_size: Maximum number of items per window.
        step_size: Distance between consecutive window starts.

    Returns:
        A list of slices of tokenized_text; empty if the input is empty.
    """
    total = len(tokenized_text)
    windows = []
    for start in range(0, total, step_size):
        stop = start + segment_size
        windows.append(tokenized_text[start:stop])
        # This window already covers the tail, so later starts would only
        # produce redundant fragments.
        if stop >= total:
            break
    return windows

In [None]:
# Pool the windows of every transcript into one flat list
# (windows of up to 100 tokens, advancing 25 tokens at a time).
all_segments = []
for doc in docs:
    all_segments.extend(sliding_segment(doc, 100, 25))

## Build the vocabulary, treating each segment as a document

In [None]:
# Vocabulary: every distinct token that occurs in any segment.
set_vocab = set()
for seg in all_segments:
    set_vocab.update(seg)

# Read the stop list (one entry per line). The context manager makes sure the
# file handle is closed once the contents have been read.
with open("lists/stoplist3k2sw.txt") as f:
    stop_list = set(f.read().split("\n"))

# Vocabulary with the stop words removed.
pruned_vocab = {w for w in set_vocab if w not in stop_list}

In [None]:
import nltk
# Two tallies over the pruned vocabulary, with each segment acting as a document:
#   word_fdist[w] -- total occurrences of w summed over all segments
#                    (the corpus frequencies)
#   doc_fdist[w]  -- number of segments that contain w
#                    (the document frequencies)
word_fdist = nltk.FreqDist()
doc_fdist = nltk.FreqDist()

# Seed every vocabulary word with 0 so the entries are created in vocabulary
# order, independent of where the word first shows up in the segments.
for word in pruned_vocab:
    word_fdist[word] = 0
    doc_fdist[word] = 0

# Count each segment once, rather than rescanning every segment for every
# vocabulary word; the resulting tallies are the same.
for seg in all_segments:
    for word, count in nltk.FreqDist(seg).items():
        if word in pruned_vocab:
            doc_fdist[word] += 1
            word_fdist[word] += count

In [None]:
# Keep the 500 words with the highest corpus frequency, most frequent first.
vocab_list = [word for word, _freq in word_fdist.most_common(500)]

## Get the vector for each segment and put them in a matrix

In [None]:
import numpy as np
def norm_vec(vec):
    """Scale a vector to unit Euclidean length.

    Args:
        vec: 1-D sequence or array of numbers.

    Returns:
        numpy.ndarray of floats: vec divided by its Euclidean norm. An
        all-zero vector has no direction and is returned with its values
        unchanged. The result is an array in both cases, so callers always
        receive the same type whether or not the input was a plain list.
    """
    arr = np.asarray(vec, dtype=float)
    squared_norm = np.dot(arr, arr)
    if squared_norm == 0:
        return arr
    return arr / np.sqrt(squared_norm)
    
def pure_tf(tf, df, cf, N):
    """Raw term-frequency weighting: the weight is tf itself.

    df, cf and N are ignored; the signature matches the other weighting
    functions in this file.
    """
    weight = tf
    return weight

def tf(tf, df, cf, N):
    """Log-scaled term-frequency weighting.

    Returns 1 + ln(tf) for a non-zero tf and 0 when tf is 0.
    df, cf and N are ignored.
    """
    if tf != 0:
        return 1 + np.log(tf)
    return 0

def tfidf(tf, df, cf, N):
    """Log-scaled term frequency divided by document frequency.

    Returns (1 + ln(tf)) / df, or 0 when either tf or df is 0. Note that the
    division is by the raw df (no logarithm, no N); cf and N are ignored.
    """
    if tf != 0 and df != 0:
        return (1 + np.log(tf)) / df
    return 0

def weight_factor2(tf, df, cf, N):
    """Weighting that returns 1 + ln(tf) for non-zero tf, else 0.

    df, cf and N are ignored.
    """
    return 0 if tf == 0 else (1 + np.log(tf))

def weighted_word(the_text, word):
    """Weight of one word within one token list, using the tf() scheme.

    Args:
        the_text: Sequence of tokens (anything with a count() method).
        word: The vocabulary word to weight.

    Returns:
        The value of tf() for the word's count in the_text, together with its
        document frequency, corpus frequency and the number of documents.
    """
    return tf(
        the_text.count(word),
        doc_fdist[word],
        word_fdist[word],
        # doc_fdist is tallied per segment, so the document count has to be
        # the number of segments to stay consistent with it.
        len(all_segments),
    )

def compute_doc_vector(word_list):
    """Build the normalized weight vector for one token list.

    The vector has one component per entry of vocab_list, in that order; each
    component is weighted_word() for that word, and the whole vector is then
    passed through norm_vec().
    """
    weights = []
    for word in vocab_list:
        weights.append(weighted_word(word_list, word))
    return norm_vec(weights)

# Import from the public package: the sklearn.neighbors.nearest_centroid
# module path is private and no longer importable in newer scikit-learn.
from sklearn.neighbors import NearestCentroid

def get_centroids(X, clustering):
    """Compute one centroid per cluster for an already-fitted clustering.

    Args:
        X: Sample matrix, one row per sample.
        clustering: Fitted clustering object exposing labels_ (one cluster
            label per row of X).

    Returns:
        The centroids_ array of a NearestCentroid classifier fitted on X with
        the cluster labels as classes (one row per cluster).
    """
    clf = NearestCentroid()
    clf.fit(X, clustering.labels_)
    return clf.centroids_

def top_words_from_centroid(centroids, n, to_print=10, printit=True):
    """List the highest-weighted vocabulary words of one centroid.

    Args:
        centroids: Sequence of centroid vectors whose components line up with
            vocab_list.
        n: Index of the centroid to inspect.
        to_print: Maximum number of words to return. If it exceeds the number
            of components, all of them are returned instead of raising
            IndexError.
        printit: When true, also print each word and its rounded weight.

    Returns:
        A list of [word, weight] pairs, heaviest first, with weights rounded
        to 3 decimals.
    """
    centroid = centroids[n]
    # argsort is ascending, so reverse it to get the heaviest components first.
    ranked = list(np.argsort(centroid))
    ranked.reverse()
    result = []
    # Slicing (rather than indexing up to to_print) tolerates a to_print that
    # is larger than the vector; max() keeps a negative value meaning "none".
    for idx in ranked[:max(to_print, 0)]:
        word = vocab_list[idx]
        weight = round(centroid[idx], 3)
        if printit:
            print(word, weight)
        result.append([word, weight])
    return result

def top_words_from_centroids(centroids, to_print=10, printit=False):
    """Collect the top words of every centroid.

    Args:
        centroids: Sequence of centroid vectors.
        to_print: Number of words to keep per centroid.
        printit: When true, print a "Cluster n" header, the words and a blank
            separator for each centroid.

    Returns:
        A list with one entry per centroid, each being the list of
        [word, weight] pairs returned by top_words_from_centroid().
    """
    result = []
    for n in range(len(centroids)):
        if printit:
            print("Cluster ", n)
        # Call the helper exactly once per cluster so that, when printing is
        # enabled, each word list appears once in the output.
        result.append(top_words_from_centroid(centroids, n, to_print, printit=printit))
        if printit:
            print("\n")
    return result
        
class ListTable(list):
    """A list of rows that renders as a single HTML table.

    Each element is an iterable of cell values. _repr_html_ formats every
    cell with str.format and inserts it into the markup as-is (no escaping).
    """

    def _repr_html_(self):
        """Return the HTML markup for the table."""
        cell_template = "<td align='left' style='border: .5px solid gray;'>{0}</td>"
        rendered_rows = []
        for row in self:
            cells = "".join(cell_template.format(col) for col in row)
            rendered_rows.append("<tr>" + cells + "</tr>")
        return (
            "<table style= 'border: 1px solid black; display:inline-block'>"
            + "".join(rendered_rows)
            + "</table>"
        )
    
class MultiTable(list):
    """A list of tables that render side by side as inline-block HTML tables.

    Each element is a table (an iterable of rows, each row an iterable of
    cell values). Cell values are formatted with str.format and inserted into
    the markup as-is (no escaping).
    """

    def _repr_html_(self):
        """Return the concatenated HTML markup of all tables."""
        cell_template = "<td align='left' style='border: .5px solid gray;'>{0}</td>"
        pieces = []
        for table in self:
            body = "".join(
                "<tr>" + "".join(cell_template.format(col) for col in row) + "</tr>"
                for row in table
            )
            pieces.append(
                "<table style= 'border: 1px solid black; display:inline-block; margin-right: 10px;'>"
                + body
                + "</table>"
            )
        return "".join(pieces)

In [None]:
# One normalized weight vector per segment.
doc_vectors = [compute_doc_vector(seg) for seg in all_segments]

# Term-document matrix: one row per vocabulary word, one column per segment.
td_matrix = np.zeros((len(vocab_list), len(all_segments)))
for column, vector in enumerate(doc_vectors):
    td_matrix[:, column] = vector

# Transposed view with one row per segment, which is the layout passed to the
# clustering estimators.
X = td_matrix.T

In [None]:
# Sanity check: (number of segments, vocabulary size).
np.shape(X)

In [None]:
from sklearn.cluster import AgglomerativeClustering, KMeans
# Agglomerative clustering of the segment vectors into 7 clusters, with the
# estimator's default settings.
clustering = AgglomerativeClustering(n_clusters=7)
clustering = clustering.fit(X)

In [None]:
# One centroid per cluster, computed in the original vector space.
centroids = get_centroids(X=X, clustering=clustering)

In [None]:
# Show the top words of each cluster as side-by-side tables (no console printing).
MultiTable(
    top_words_from_centroids(centroids, printit=False)
)

In [None]:
def orthogonalize(vectors):
    """Remove the common direction shared by a set of vectors.

    The vectors are summed and the sum is normalized to give the common
    direction. Each vector then has its projection onto that direction
    subtracted, and the remainder is normalized again.

    Args:
        vectors: Non-empty sequence of equal-length vectors.

    Returns:
        A list with one re-normalized, orthogonalized vector per input vector.
    """
    # Built-in sum with an explicit zero start adds the vectors one by one, in order.
    common_dir = norm_vec(sum(vectors, np.zeros(len(vectors[0]))))
    return [
        norm_vec(v - np.dot(v, common_dir) * common_dir)
        for v in vectors
    ]

In [None]:
# Segment vectors with the corpus-wide common direction projected out.
ortho_vectors = orthogonalize(vectors=doc_vectors)

In [None]:
# Rebuild the term-document matrix from the orthogonalized vectors
# (one row per vocabulary word, one column per segment) ...
td_matrix = np.zeros((len(vocab_list), len(all_segments)))
for column, vector in enumerate(ortho_vectors):
    td_matrix[:, column] = vector

# ... and transpose it so that each row is a segment.
ortho_X = td_matrix.T

In [None]:
# Ward-linkage clustering of the orthogonalized segment vectors into 7 clusters.
# The distance is left at the estimator's default (euclidean, the only one Ward
# supports): the `affinity` keyword that used to spell it out has been removed
# from newer scikit-learn releases, so omitting it works on old and new versions.
clustering = AgglomerativeClustering(n_clusters=7, linkage="ward").fit(ortho_X)

In [None]:
# NOTE(review): the labels come from clustering ortho_X, but the centroids are
# computed from the original X. This may be deliberate (top words shown in the
# original weighting) -- confirm.
centroids = get_centroids(X=X, clustering=clustering)

In [None]:
# Top words per cluster for the clustering of the orthogonalized vectors.
MultiTable(
    top_words_from_centroids(centroids, printit=False)
)

In [None]:
# k-means with 7 clusters on the original segment vectors, for comparison.
# No random_state is set, so results can differ between runs.
kclustering = KMeans(n_clusters=7)
kclustering = kclustering.fit(X)

In [None]:
# Top words per k-means cluster, read directly from the fitted cluster centers.
kmeans_centers = kclustering.cluster_centers_
MultiTable(top_words_from_centroids(kmeans_centers, printit=False))

## Reapply to transcripts

### Cluster the segments

In [None]:
# Ward-linkage clustering of the original segment vectors into 7 clusters.
# The distance is left at the estimator's default (euclidean, the only one Ward
# supports): the `affinity` keyword that used to spell it out has been removed
# from newer scikit-learn releases, so omitting it works on old and new versions.
clustering = AgglomerativeClustering(n_clusters=7, linkage="ward").fit(X)

# Per-cluster centroids and their top words, shown as side-by-side tables.
centroids = get_centroids(X, clustering)
MultiTable(top_words_from_centroids(centroids, printit=False))

### Create a matrix of the centroids

In [None]:
# Normalize every centroid with norm_vec.
normalized_centroids = list(map(norm_vec, centroids))

In [None]:
# Stack the normalized centroids: one row per cluster, one column per vocabulary word.
n_clusters_found = len(normalized_centroids)
n_dimensions = len(normalized_centroids[0])
centroid_matrix = np.zeros((n_clusters_found, n_dimensions))
for row, unit_centroid in enumerate(normalized_centroids):
    centroid_matrix[row] = unit_centroid
centroid_matrix.shape

### Resegment the transcript of one student and compute vectors for the segments
Then put the segment vectors in a matrix.

In [None]:
# Window one student's transcript with the same settings as the corpus
# (windows of up to 100 tokens, step 25) and vectorize every window.
student_doc = seasons_corpus["angelapre"][0]
segmented_student = sliding_segment(student_doc, 100, 25)
segment_vectors = [compute_doc_vector(seg) for seg in segmented_student]

In [None]:
# One column per student segment, one row per vocabulary word.
segment_matrix = np.zeros((len(segment_vectors[0]), len(segment_vectors)))
for column, vector in enumerate(segment_vectors):
    segment_matrix[:, column] = vector
segment_matrix.shape

### Multiply the two matrices to code each segment (sort of)

In [None]:
# Entry (c, s) is the dot product of normalized centroid c with segment vector s,
# giving a (clusters x segments) array.
the_array = centroid_matrix.dot(segment_matrix)
the_array.shape

### Make a heatmap of the matrix

First, we name each cluster after its top three words.

In [None]:
# Label each cluster with its three heaviest words, joined by hyphens.
top_word_lists = top_words_from_centroids(centroids, printit=False)
labels = [
    "{}-{}-{}".format(clus[0][0], clus[1][0], clus[2][0])
    for clus in top_word_lists
]
labels

Now make the plot

In [None]:
import matplotlib
import matplotlib.cm as cm
# pyplot is a submodule that `import matplotlib` does not load by itself, so
# import it explicitly instead of relying on something else having done so.
import matplotlib.pyplot as plt

# Heatmap of the_array: one row per cluster, one column per student segment.
fig = plt.figure()
ax = fig.add_subplot(111)
dialogs = []  # not used in this cell; kept in case later cells refer to it
(nrows, ncols) = the_array.shape
cax = ax.imshow(the_array, cmap=cm.gist_yarg, aspect="auto", interpolation='nearest')

# x axis: major ticks centered on each segment, labelled from 1;
# minor ticks half a cell to the right, where the grid lines are drawn.
ind = np.arange(ncols)
ax.set_xticks(ind, minor=False)
ax.set_xticks(ind + .5, minor=True)
ax.get_xaxis().set_ticklabels(ind + 1, size="x-small")

# y axis: one major tick per cluster, labelled with the cluster's top-word name.
ind = np.arange(nrows)

ax.set_yticks(ind, minor=False)
ax.set_yticks(ind + .5, minor=True)
ax.get_yaxis().set_ticklabels(labels, size="small", rotation="horizontal")

# Dotted grid on the minor ticks only, i.e. between cells.
ax.grid(True, which='minor', linestyle=':')

fig.set_facecolor("white")