In [None]:
from pre_processing.get_books import get_preprocessed_data
from pre_processing.frequency_inverse import get_freq_inverse
from pre_processing.get_all_words import get_all_words

from analysis.k_means import k_means_analysis


from analysis.cosine_similarity import get_cosine_similarity
from sklearn.manifold import MDS
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift, estimate_bandwidth

from sklearn.cluster import SpectralClustering

from analysis.multidimensional_scaling import get_multi_scaling_positions
from analysis.multidimensional_scaling import get_LSA_scaling_positions
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import nltk

from gensim import corpora, models, similarities 

from sklearn.decomposition import NMF


# Forwarded to get_preprocessed_data(); also consulted when naming the clusters.
NUMBERS_ONLY = False

# Passed to get_freq_inverse() together with N_GRAMS (call order there is IF_MAX, IF_MIN, N_GRAMS).
# NOTE(review): presumably the min/max document-frequency cut-offs and the largest
# n-gram size of the term vectorizer — confirm in pre_processing.frequency_inverse.
IF_MIN = 0.1
IF_MAX = 0.9
N_GRAMS = 3

# Number of clusters requested from KMeans; the cluster-naming cell also reuses it
# as the number of top terms collected per cluster.
K_MEANS_N_CLUSTERS = 6

# True -> `pos` is the MDS layout, False -> `pos` is the LSA layout.
UseMDS=True

%matplotlib inline

[nltk_data] Downloading package stopwords to /home/ayoung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the books. Later cells read each book through its `word_list` and `meta` attributes.
list_of_books = get_preprocessed_data(NUMBERS_ONLY)
# Build the term matrix that every clustering / factorisation step below is fitted on,
# together with the term list and the vectorizer object that produced it.
# NOTE(review): presumably one TF-IDF row per book — confirm in pre_processing.frequency_inverse.
# Mind the argument order: IF_MAX is passed before IF_MIN.
frequency_term_matrix, terms, term_freq_vectorizer = get_freq_inverse(list_of_books,IF_MAX,IF_MIN,N_GRAMS)


In [None]:
from sklearn.linear_model import LassoLarsIC

In [None]:
# One space-joined string per book, in list_of_books order.
book_words_list = [" ".join(book.word_list) for book in list_of_books]

# Flat stream of lower-cased tokens over all books: each book is split into
# sentences first and every sentence into word tokens.
all_words_list = [
    token.lower()
    for book_text in book_words_list
    for sentence in nltk.sent_tokenize(book_text)
    for token in nltk.word_tokenize(sentence)
]


In [None]:
# Cluster the books on their term vectors. The seed is pinned (the NMF model in
# this notebook uses random_state=1 as well) so the cluster labels, the CSV
# written below and every plot coloured by cluster are reproducible after a
# kernel restart; an unseeded KMeans picks different initial centroids each run.
km = KMeans(n_clusters=K_MEANS_N_CLUSTERS, random_state=1)
km.fit(frequency_term_matrix)
clusters = km.labels_.tolist()

# Per-book metadata, collected in list_of_books order.
# NOTE(review): this assumes the rows of frequency_term_matrix follow the same
# order as list_of_books, otherwise labels and titles would be misaligned.
titles = [book.meta["title"] for book in list_of_books]
published = [book.meta["published"] for book in list_of_books]
authors = [book.meta["author"] for book in list_of_books]
period = [book.meta["period"] for book in list_of_books]

# One row per book, indexed and sorted by cluster id, persisted for later inspection.
books = {'titles': titles, 'cluster': clusters, 'published': published, 'authors': authors, 'period': period}
frame = pd.DataFrame(
    books,
    index=[clusters],
    columns=['titles', 'cluster', 'published', 'authors', 'period'],
).sort_index()
frame.to_csv("kmeansresults.csv")

In [None]:
# Lookup frame whose index and single 'words' column both hold every token,
# so a label lookup on a word returns that word.
vocab_frame = pd.DataFrame({'words': all_words_list}, index=all_words_list)

# cluster id -> list of representative words (populated by the cluster-naming cell).
cluster_names = {cluster_id: [] for cluster_id in range(K_MEANS_N_CLUSTERS)}

# One colour per cluster id. The original map stopped at id 4 although six
# clusters are fitted, so looking up cluster 5 would raise KeyError; the sixth
# entry continues the same ColorBrewer "Dark2" palette.
cluster_colors = {
    0: '#1b9e77',
    1: '#d95f02',
    2: '#7570b3',
    3: '#e7298a',
    4: '#66a61e',
    5: '#e6ab02',
}

In [None]:
# Term indices of every centroid, sorted by descending centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# Number of top terms recorded per cluster. The original slice reused the
# cluster count for this, which is kept here so the result is unchanged.
n_terms_per_cluster = K_MEANS_N_CLUSTERS

# The original code branched on NUMBERS_ONLY but both branches were identical,
# so a single loop is sufficient.
for i in range(K_MEANS_N_CLUSTERS):
    # Assigning (instead of appending) keeps the cell idempotent: re-running it
    # no longer duplicates the words already stored for the cluster.
    cluster_names[i] = [
        # `.ix` was removed from pandas; `.loc` is the label-based replacement.
        # The lookup takes the words of the (possibly multi-word) term and keeps
        # the 'words' value of the first matching row.
        vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0]
        for ind in order_centroids[i, :n_terms_per_cluster]
    ]

In [None]:
# Both layouts are always computed because the scatter-plot cells draw each of them.
posMDS = get_multi_scaling_positions(frequency_term_matrix, False)
posLSA = get_LSA_scaling_positions(frequency_term_matrix)

# `pos` is the layout consumed by the ward and mean-shift cells.
pos = posMDS if UseMDS else posLSA
print(pos)

In [None]:

# Scatter plot of the books at their MDS coordinates, one marker series per
# k-means cluster, each point annotated with the book title.
graph_dict = {'x': posMDS[:, 0], 'y': posMDS[:, 1], 'label': clusters, 'title': titles}
df = pd.DataFrame(graph_dict)

groups = df.groupby('label')

fig, ax = plt.subplots(figsize=(16, 15))  # set size
ax.margins(0.05)  # adds 5% padding to the autoscaling
ax.set_aspect('auto')

for name, group in groups:
    # The legend entry is the first five words recorded for the cluster. It is
    # passed as an explicit string so plot() never has to interpret a list label.
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            label=str(cluster_names[name][0:5]),
            mec='none')

# A single legend call after all series exist. The original called legend()
# twice per iteration, and the second call silently discarded `numpoints`.
ax.legend(numpoints=1, loc='center left', bbox_to_anchor=(0.85, 0.93))

# Add each book title at its x, y position. Columns are iterated directly
# because `DataFrame.ix` no longer exists in current pandas.
for x, y, title in zip(df['x'], df['y'], df['title']):
    ax.text(x, y, title, size=11)

fig.savefig('kmeans6topics.pdf')


In [None]:
# Scatter plot of the books at their LSA coordinates, one marker series per
# k-means cluster, each point annotated with the book title.
graph_dict = {'x': posLSA[:, 0], 'y': posLSA[:, 1], 'label': clusters, 'title': titles}
df = pd.DataFrame(graph_dict)

groups = df.groupby('label')
fig, ax = plt.subplots(figsize=(23, 13))  # set size
ax.margins(0.05)  # adds 5% padding to the autoscaling
ax.set_aspect('auto')

for name, group in groups:
    # The legend entry is the first five words recorded for the cluster. It is
    # passed as an explicit string so plot() never has to interpret a list label.
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            label=str(cluster_names[name][0:5]),
            mec='none')

# A single legend call after all series exist. The original called legend()
# twice per iteration, and the second call silently discarded `numpoints`.
ax.legend(numpoints=1, loc='center left', bbox_to_anchor=(1, 0.93))

# Add each book title at its x, y position. Columns are iterated directly
# because `DataFrame.ix` no longer exists in current pandas.
for x, y, title in zip(df['x'], df['y'], df['title']):
    ax.text(x, y, title, size=11)

# Show one sample row as the cell's output (label-based, as the old `.ix[2]` was).
df.loc[2]

In [None]:

# Ward hierarchical clustering of the selected layout. `pos` is indexed as a
# 2-D array elsewhere in this notebook, so ward() receives observation
# coordinates here rather than a pre-computed distance matrix.
linkage_matrix = ward(pos)

fig, ax = plt.subplots(figsize=(15, 20)) # set size
# Draw onto the axes created above and keep `ax` bound to those axes; the
# original rebound `ax` to the dict returned by dendrogram().
dendro = dendrogram(linkage_matrix, orientation="right", labels=titles, ax=ax)

# tick_params expects booleans. The strings 'off' used originally are truthy,
# so they do not reliably hide the ticks.
ax.tick_params(
    axis='x',           # changes apply to the x-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    labelbottom=False)  # tick labels along the bottom edge are off

plt.tight_layout() # show plot with tight layout

# Save the figure to disk.
plt.savefig('ward_clusters.png', dpi=200)

In [None]:
# Kernel bandwidth estimated from the selected layout.
bandwidth = estimate_bandwidth(pos, quantile=0.208)

# Fit mean-shift on the layout and keep the per-book labels and the cluster centres.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(pos)
labels, cluster_centers = ms.labels_, ms.cluster_centers_

# The number of clusters is whatever distinct labels mean-shift produced.
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)

In [None]:
from itertools import cycle

# Exactly one figure. The original opened figure 1 and then a second, sized
# figure, which left an extra empty figure behind.
plt.figure(figsize=(20, 10))

# Same colour sequence as before: the original string was 'bgrcmyk' repeated.
colors = cycle('bgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k          # boolean mask of the books in cluster k
    cluster_center = cluster_centers[k]
    plt.plot(pos[my_members, 0], pos[my_members, 1], col + '.', markersize=12)
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=9)
plt.title('Estimated number of clusters: %d' % n_clusters_)

# Annotate every point with its book title. `titles` is used directly rather
# than going through the `df` left over from an earlier plotting cell with the
# removed `DataFrame.ix` accessor.
for i, title in enumerate(titles):
    plt.text(pos[i, 0], pos[i, 1], title, size=11)

plt.show()

In [None]:
#LDA AND NMF

In [None]:
import string
def strip_proppers(text):
    """Return `text` rebuilt from only its all-lower-case tokens.

    The text is tokenized by sentence and then by word. Every token for which
    ``str.islower()`` is false is dropped, which removes capitalised words
    (proper names, but also sentence-initial words) and tokens without any
    cased character. The surviving tokens are joined with single spaces,
    except that a token starting with an apostrophe (e.g. ``'s``) or found in
    ``string.punctuation`` is glued to the previous token without a space.
    """
    lowercase_tokens = (
        token
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
        if token.islower()
    )
    pieces = []
    for token in lowercase_tokens:
        attach_to_previous = token.startswith("'") or token in string.punctuation
        pieces.append(token if attach_to_previous else " " + token)
    return "".join(pieces).strip()

In [None]:

#strip any proper nouns (NNP) or plural proper nouns (NNPS) from a text
from nltk.tag import pos_tag

def strip_proppers_POS(text):
    """Return the whitespace-separated words of `text` that are not tagged as proper nouns.

    The text is split on whitespace, tagged with NLTK's part-of-speech tagger,
    and every word tagged ``NNP`` or ``NNPS`` is left out. Unlike
    ``strip_proppers`` the result is a list of words, not a string.
    """
    proper_noun_tags = ('NNP', 'NNPS')
    return [word for word, tag in pos_tag(text.split()) if tag not in proper_noun_tags]

In [None]:
import re
from nltk.corpus import stopwords

# Make sure the NLTK stopword corpus is available locally (NLTK only logs a
# message when the package is already up to date).
nltk.download("stopwords")
# English stopword list; used further down to drop stopwords from the tokenized books.
english_stopwords = stopwords.words('english')

def tokenize(text):
    """Split `text` into tokens and keep the leading word-character run of each.

    The text is tokenized by sentence and then by word, so punctuation becomes
    its own token. A token is kept only if it *starts* with at least one
    ``\\w`` character (letter, digit or underscore); what is stored is that
    leading run, not the whole token (``well-known`` yields ``well``). Tokens
    starting with anything else, such as raw punctuation or ``'s``, are
    dropped. Purely numeric tokens are therefore kept, contrary to what the
    original comment claimed.

    Returns:
        list of str: the kept fragments, in document order.
    """
    # Raw string: the original '(\w+)' literal contained an invalid escape
    # sequence, which recent Python versions warn about at compile time.
    leading_word = re.compile(r'(\w+)')
    filtered_tokens = []
    for sent in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sent):
            m = leading_word.match(token)
            if m:
                filtered_tokens.append(m.group(1))
    return filtered_tokens

In [None]:


# 1. Keep only the all-lower-case tokens of every book (drops capitalised names).
preprocess = [strip_proppers(doc) for doc in book_words_list]
# 2. Split each cleaned book into word tokens.
tokenized_text = [tokenize(text) for text in preprocess]
# 3. Drop English stopwords. Membership is tested against a set built once,
#    instead of scanning the stopword list for every token of every book;
#    the resulting token lists are identical.
stopword_set = set(english_stopwords)
texts = [[word for word in text if word not in stopword_set] for text in tokenized_text]


In [None]:

# Map every distinct token of the cleaned books to an integer id.
dictionary = corpora.Dictionary(texts)

# Prune the vocabulary in place by document frequency: tokens must occur in at
# least 1 document and in no more than 80% of the documents (comparable to the
# min/max df step used when the term matrix was built).
dictionary.filter_extremes(no_below=1, no_above=0.8)

# Bag-of-words representation of each book, expressed with the pruned dictionary.
corpus = [dictionary.doc2bow(book_tokens) for book_tokens in texts]

In [None]:
# Fit a 6-topic LDA model on the bag-of-words corpus.
# NOTE(review): no random_state is passed, so topics may differ between runs.
lda = models.LdaModel(
    corpus,
    num_topics=6,
    id2word=dictionary,
    update_every=6,
    chunksize=10000,
    passes=100,
)

In [None]:
# Last expression of the cell, so the notebook renders the topics returned by the fitted LDA model.
lda.show_topics()


In [None]:
# Factorise the term matrix into 6 topics; the seed makes the run repeatable.
nmf = NMF(n_components=6, random_state=1).fit(frequency_term_matrix)

# scikit-learn removed `get_feature_names()` from its vectorizers in 1.2 in
# favour of `get_feature_names_out()`. Prefer the new method and fall back to
# the old one only on installations that do not have it yet.
# NOTE(review): assumes term_freq_vectorizer is a scikit-learn vectorizer — confirm in pre_processing.frequency_inverse.
if hasattr(term_freq_vectorizer, "get_feature_names_out"):
    feature_names = term_freq_vectorizer.get_feature_names_out()
else:
    feature_names = term_freq_vectorizer.get_feature_names()

# Print the highest-weighted terms of every topic, strongest first.
n_top_terms = 5
for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    top_term_ids = topic.argsort()[::-1][:n_top_terms]
    print(" ".join([feature_names[i] for i in top_term_ids]))
    print()