In [1]:
'''
1. Subdivide each document into sentences. 
2. Perform stop word removal and stemming, then convert the sentences into a sentence-term matrix. 
3. Weight terms in the matrix using TF-IDF and normalize each row to correct for sentence length. 
4.1 Use cosine similarity to compute a pairwise sentence similarity matrix, 
4.2 convert this to a dissimilarity matrix, then use k-means clustering to cluster the sentences into ten clusters.
'''

In [13]:
import nltk
import re
import string
nltk.download( 'stopwords' )

f = open('NorwegianWood.txt', 'r')
D1 = content = f.read()
f.close()

f = open('SnowCountry.txt', 'r')
D2 = content = f.read()
f.close()

f = open('Kokoro.txt', 'r')
D3 = content = f.read()
f.close()

f = open('TheRemainsOfTheDay.txt', 'r')
D4 = content = f.read()
f.close()

doc = [ ]
doc.append(D1)
doc.append(D2)
doc.append(D3)
doc.append(D4)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhanyina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
# 1. Subdivide each document into sentences. 
txt = []
for i in range( 0, len( doc ) ):
    temp = [x for x in map(str.strip, doc[i].split('.')) if x]
    for item in temp:
        txt.append(item)
len(txt)

80

In [27]:
# 2. Perform stop word removal and stemming, then convert the sentences into a sentence-term matrix. 
def porter_stem( txt ):
    """Porter stem terms in text block

    Args:
      txt (list of string): Text block as list of individual terms

    Returns:
      (list of string): Text block with terms Porter stemmed
    """

    porter = nltk.stem.porter.PorterStemmer()

    for i in range( 0, len( txt ) ):
        txt[ i ] = porter.stem( txt[ i ] )

    return txt


def remove_stop_word( txt ):
    """Remove all stop words from text blo
    Args:
      txt (list of string): Text block as list of individual terms

    Returns:
      (list of string): Text block with stop words removed
    """

    term_list = [ ]
    stop_word = nltk.corpus.stopwords.words( 'english' )

    for term in txt:
        term_list += ( [ ] if term in stop_word else [ term ] )

    return term_list


# Mainline

# Remove punctuation except hyphen

punc = string.punctuation.replace( '-', '' )
for i in range( 0, len( txt ) ):
    txt[ i ] = re.sub( '[' + punc + ']+', '', txt[ i ] )

# Lower-case and tokenize text

for i in range( 0, len( txt ) ):
    txt[ i ] = txt[ i ].lower().split()

# Stop word remove w/nltk stop word list, then Porter stem

for i in range( 0, len( txt ) ):
    txt[ i ] = remove_stop_word( txt[ i ] )
    txt[ i ] = porter_stem( txt[ i ] )

# Create list of all (unique) stemmed terms

term_list = set( txt[ 0 ] )
for i in range( 1, len( txt ) ):
    term_list = term_list.union( txt[ i ] )
term_list = sorted( term_list )

# Count occurrences of unique terms in each document

n = len( term_list )
freq = [ ]
for i in range( 0, len( txt ) ):
    freq.append( [ 0 ] * n )
    for term in txt[ i ]:
        pos = term_list.index( term )
        freq[ -1 ][ pos ] += 1

# Print transposed term-frequency list for easier viewing
print( '........................' )
for i in range( 0, len( term_list ) ):
    print( f'{term_list[ i ]: <{20}}', end='' )
    for j in range( 0, len( txt ) ):
        print( f'{freq[ j ][ i ]:4d} ', end='' )
    print( '' )

....................mice..lord..1984
747                    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
accord                 0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
across                 0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0  

august                 0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0 
avoid                  0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
away                   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    1    0    0    0    0    

central                0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
certain                0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
chaise-longu           0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

crew                   0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
crowd                  0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
dark                   0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    

express                0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0 
face                   0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
fact                   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

foot                   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0 
ford                   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
forese                 0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

greatest               0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0 
ground                 0    1    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
hall                   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

increasingli           0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
inde                   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0 
indulg                 0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

left                   0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
leg                    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0 
lent                   0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

met                    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
midst                  0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
might                  0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    

outlook                0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
own                    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
parent                 0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

pour                   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
preoccupi              0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
presum                 0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

receiv                 0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
refer                  0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
regularli              0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

see                    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    1    2    0    1    0 
seem                   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    1    0    1    0    0 
sen                    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

situat                 0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
six                    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
skull                  0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

suggest                0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    1    0    0    0    0    0    0    0    0 
suit                   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
summer                 0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

town                   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
train                  0    0    0    0    0    0    0    0    0    0    0    0    1    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
tri                    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

write                  0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
ye                     0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
year                   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    

In [41]:
# 3. Weight terms in the matrix using TF-IDF and normalize each row to correct for sentence length. 

import gensim
#  Convert term vectors into gensim dictionary

dict = gensim.corpora.Dictionary( txt )

corp = [ ]
for i in range( 0, len( txt ) ):
    corp.append( dict.doc2bow( txt[ i ] ) )

#  Create TFIDF vectors based on term vectors bag-of-word corpora

tfidf_model = gensim.models.TfidfModel( corp )

tfidf = [ ]
for i in range( 0, len( corp ) ):
    tfidf.append( tfidf_model[ corp[ i ] ] )

#  Create pairwise document similarity index

n = len( dict )
index = gensim.similarities.SparseMatrixSimilarity( tfidf_model[ corp ], num_features = n )

#  Print TFIDF vectors and pairwise similarity per document

for i in range( 0, len( tfidf ) ):
    s = 'Sentence ' + str( i + 1 ) + ' TFIDF:'

    for j in range( 0, len( tfidf[ i ] ) ):
        s = s + ' (' + dict.get( tfidf[ i ][ j ][ 0 ] ) + ','
        s = s + ( '%.3f' % tfidf[ i ][ j ][ 1 ] ) + ')'

    print(s)


Sentence 1 TFIDF: (747,0.314) (airport,0.264) (approach,0.264) (cloud,0.264) (cover,0.215) (dens,0.314) (hamburg,0.314) (huge,0.314) (plung,0.314) (seat,0.235) (strap,0.314) (thirty-seven,0.314)
Sentence 2 TFIDF: (airport,0.178) (air,0.211) (atop,0.211) (billboard,0.211) (bmw,0.211) (build,0.178) (cold,0.158) (crew,0.211) (drench,0.211) (earth,0.178) (everyth,0.211) (flag,0.211) (flemish,0.211) (gear,0.211) (gloomi,0.211) (ground,0.178) (landscap,0.211) (lent,0.211) (novemb,0.211) (rain,0.423) (squat,0.211)
Sentence 3 TFIDF: (so—germani,1.000)
Sentence 4 TFIDF: (cover,0.185) (ground,0.228) (beatl,0.271) (began,0.271) (ceil,0.271) (flow,0.271) (music,0.228) (norwegian,0.271) (orchestr,0.271) (plane,0.228) (soft,0.271) (speaker,0.271) (sweet,0.271) (version,0.271) (wood,0.271)
Sentence 5 TFIDF: (ever,0.305) (fail,0.362) (harder,0.362) (hit,0.362) (melodi,0.362) (never,0.271) (send,0.362) (shudder,0.362) (time,0.214)
Sentence 6 TFIDF: (seat,0.278) (bent,0.371) (face,0.313) (forward,0.371)

In [53]:
# 4.1 Use cosine similarity to compute a pairwise sentence similarity matrix,
sim_matrix_raw = []
for i in range( 0, len( corp ) ):
    print ('Sentence', ( i + 1 ), 'sim: [ ', end = '')

    sim = index[ tfidf_model[ corp[ i ] ] ]

    sim_matrix_raw.append(sim)
    
    for j in range( 0, len( sim ) ):
        print('%.3f ' % sim[ j ], end = '')

    print(']')

Sentence 1 sim: [ 1.000 0.047 0.000 0.040 0.000 0.065 0.089 0.000 0.000 0.000 0.000 0.054 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.040 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.045 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.061 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ]
Sentence 2 sim: [ 0.047 1.000 0.000 0.041 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.071 0.000 0.000 0.074 0.000 0.000 0.000 0.079 0.049 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ]


Sentence 23 sim: [ 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.076 0.127 0.149 0.000 0.000 0.000 0.184 0.109 0.000 0.000 0.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.044 0.000 0.000 0.000 0.000 0.084 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ]
Sentence 24 sim: [ 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.195 0.000 0.000 0.000 0.000 0.000 1.000 0.128 0.000 0.124 0.000 0.150 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.134 0.000 0.000 0.176 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.103 0.000 0.000 0.000 0.000 

Sentence 41 sim: [ 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.109 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.222 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ]
Sentence 42 sim: [ 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.058 0.346 0.186 0.000 0.000 0.000 0.117 0.000 0.000 1.000 0.000 0.000 0.157 0.000 0.000 0.000 0.000 0.000 0.000 0.103 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.071 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.136 0.000 0.000 0.000 0.000 0.000 

Sentence 59 sim: [ 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.103 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.070 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.076 0.089 0.000 0.100 0.000 1.000 0.000 0.204 0.081 0.168 0.076 0.041 0.000 0.000 0.000 0.000 0.089 0.063 0.000 0.000 0.000 0.000 0.051 0.040 0.000 0.000 0.000 ]
Sentence 60 sim: [ 0.000 0.000 0.000 0.000 0.000 0.000 0.080 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.113 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.092 0.090 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 1.000 0.000 0.000 0.000 0.000 0.000 0.054 0.000 0.000 0.000 0.000 0.000 0.139 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 

Sentence 77 sim: [ 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.045 0.000 0.000 0.000 0.000 0.037 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.040 0.000 0.043 0.000 0.021 0.000 0.069 0.000 0.038 0.045 0.000 0.026 0.000 0.130 0.000 0.000 0.000 0.077 1.000 0.000 0.086 0.000 ]
Sentence 78 sim: [ 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.052 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.070 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.058 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.067 0.037 0.103 0.000 0.000 0.000 0.000 0.056 0.000 0.000 0.000 0.000 0.041 0.000 1.000 0.000 0.000 

In [64]:
#  4.2 convert this to a dissimilarity matrix, then use k-means clustering to cluster the sentences into ten clusters.
import numpy as np
sim_matrix = np.asmatrix(sim_matrix_raw)
sim_matrix.shape
sim_matrix_raw

one_matrix = np.ones(6400).reshape((80, 80))

dis_matrix = np.subtract(one_matrix,sim_matrix)
dis_matrix

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans
# Partition data into three clusters
km = KMeans( n_clusters = 10, random_state = 101 )
km.fit( dis_matrix )
km.labels_

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=101, tol=0.0001, verbose=0)

In [7]:
#### Dr Healey's

In [1]:
import nltk
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

nltk.download( 'stopwords' )

txt = [
    'I was thirty-seven then, strapped in my seat as the huge 747 plunged through dense cloud cover on approach to the Hamburg airport. Cold November rains drenched the earth and lent everything the gloomy air of a Flemish landscape: the ground crew in rain gear, a flag atop a squat airport building, a BMW billboard. So Germany again. Once the plane was on the ground, soft music began to flow from the ceiling speakers: a sweet orchestral cover version of the Beatles\' "Norwegian Wood." The melody never failed to send a shudder through me, but this time it hit me harder than ever. I bent forward in my seat, face in hands to keep my skull from splitting open. Before long one of the German stewardesses approached and asked in English if I were sick. "No," I said, "just dizzy." "Are you sure?" "Yes, I\'m sure. Thanks." She smiled and left, and the music changed to a Billy Joel tune. I straightened up and looked out the plane window at the dark clouds hanging over the North Sea, thinking of what I had lost in the course of my life: times gone forever, friends who had died or disappeared, feelings I would never know again.',
    'The train came out of the long tunnel into the snow country. The earth lay white under the night sky. The train pulled up at a signal stop. A girl who had been sitting on the other side of the car came over and opened the window in front of Shimamura. The snowy cold poured in. Leaning far out the window, the girl called to the station master as though he were a great distance away. The station master walked slowly over the snow, a lantern in his hand. His face was buried to the nose in a muffler, and the flaps of his cap were turned down over his ears. It\'s that cold, is it, thought Shimamura. Low, barracklike buildings that might have been railway dormitories were scattered here and there up the frozen slope of the mountain. The white of the snow fell away into the darkness some distance before it reached them.',
    'I always called him "Sensei." I shall therefore refer to him simply as "Sensei," and not by his real name. It is not because I consider it more discreet, but it is because I find it more natural that I do so. Whenever the memory of him comes back to me now, I find that I think of him as "Sensei" still. And with pen in hand, I cannot bring myself to write of him in any other way. It was at Kamakura, during the summer holidays, that I first met Sensei. I was then a very young student. I went there at the insistence of a friend of mine, who had gone to Kamakura to swim. We were not together for long. It had taken me a few days to get together enough money to cover the necessary expenses, and it was only three days after my arrival that my friend received a telegram from home demanding his return. His mother, the telegram explained, was ill. My friend, however, did not believe this. For some time his parents had been trying to persuade him, much against his will, to marry a certain girl. According to our modern outlook, he was really too young to marry. Moreover, he was not in the least fond of the girl. It was in order to avoid an unpleasant situation that instead of going home, as he normally would have done, he had gone to the resort near Tokyo to spend his holidays. He showed me the telegram, and asked me what he should do. I did not know what to tell him. It was, however, clear that if his mother was truly ill, he should go home. And so he decided to leave after all. I, who had taken so much trouble to join my friend, was left alone. There were many days left before the beginning of term, and I was free either to stay in Kamakura or to go home. I decided to stay. My friend was from a wealthy family in the Central Provinces, and had no financial worries. But being a young student, his standard of living was much the same as my own. I was therefore not obliged, when I found myself alone, to change my lodgings. My inn was in a rather out-of-the-way district of Kamakura, and if one wished to indulge in such fashionable pastimes as playing billiards and eating ice cream, one had to walk a long way across rice fields. If one went by rickshaw, it cost twenty sen. Remote as the district was, however, many rich families had built their villas there. It was quite near the sea also, which was convenient for swimmers such as myself. I walked to the sea every day, between thatched cottages that were old and smoke-blackened. The beach was always crowded with men and women, and at times the sea, like a public bath, would be covered with a mass of black heads. I never ceased to wonder how so many city holiday-makers could squeeze themselves into so small a town. Alone in this noisy and happy crowd, I managed to enjoy myself, dozing on the beach or splashing about in the water. It was in the midst of this confusion that I found Sensei. In those days, there were two tea houses on the beach. For no particular reason, I had come to patronize one of them. Unlike those people with their great villas in the Hase area who had their own bathing huts, we in our part of the beach were obliged to make use of these tea houses which served also as communal changing rooms. In them the bathers would drink tea, rest, have their bathing suits rinsed, wash the salt from their bodies, and leave their hats and sunshades for safe-keeping. I owned no bathing suit to change into, but I was afraid of being robbed, and so I regularly left my things in the tea house before going into the water.',
    'It seems increasingly likely that I really will undertake the expedition that has been preoccupying my imagination now for some days. An expedition, I should say, which I will undertake alone, in the comfort of Mr Farraday\'s Ford; an expedition which, as I foresee it, will take me through much of the finest countryside of England to the West Country, and may keep me away from Darlington Hall for as much as five or six days. The idea of such a journey came about, I should point out, from a most kind suggestion put to me by Mr Farraday himself one afternoon almost a fortnight ago, when I had been dusting the portraits in the library. In fact, as I recall, I was up on the step-ladder dusting the portrait of Viscount Wetherby when my employer had entered carrying a few volumes which he presumably wished returned to the shelves. On seeing my person, he took the opportunity to inform me that he had just that moment finalized plans to return to the United States for a period of five weeks between August and September. Having made this announcement, my employer put his volumes down on a table, seated himself on the chaise-longue, and stretched out his legs. "You realize, Stevens, I don\'t expect you to be locked up here in this house all the time I\'m away. Why don\'t you take the car and drive off somewhere for a few days? You look like you could make good use of a break." Coming out of the blue as it did, I did not quite know how to reply to such a suggestion. I recall thanking him for his consideration, but quite probably I said nothing very definite for my employer went on: "I\'m serious, Stevens. I really think you should take a break. I\'ll foot the bill for the gas. You fellows, you\'re always locked up in these big houses helping out, how do you ever get to see around this beautiful country of yours?" This was not the first time my employer had raised such a question; indeed, it seems to be something which genuinely troubles him. On this occasion, in fact, a reply of sorts did occur to me as I stood up there on the ladder; a reply to the effect that those of our profession, although we did not see a great deal of the country in the sense of touring the countryside and visiting picturesque sites, did actually \'see\' more of England than most, placed as we were in houses where the greatest ladies and gentlemen of the land gathered. Of course, I could not have expressed this view to Mr Farraday without embarking upon what might have seemed a presumptuous speech. I thus contented myself by saying simply: "It has been my privilege to see the best of England over the years, sir, within these very walls."'
]

# Split text blocks into sentences

full_sent = [ ]
for i in range( 0, len( txt ) ):
    sent = re.sub( r'[\.!\?]"', '"', txt[ i ] )
    full_sent += re.split( '[\.!\?]', sent )
full_sent = [sent.strip() for sent in full_sent]

# Remove empty sentences

i = 0
while i < len( full_sent ):
    if len( full_sent[ i ] ) == 0:
        del full_sent[ i ]
    else:
        i += 1

# Remove punctuation

sent = [ ]

punc = string.punctuation.replace( '-', '' )
for i in range( 0, len( full_sent ) ):
    sent.append( re.sub( '[' + punc + ']+', '', full_sent[ i ] ) )

# Porter stem

porter = nltk.stem.porter.PorterStemmer()
stems = { }

for i in range( 0, len( sent ) ):
    tok = sent[ i ].split()
    for j in range( 0, len( tok ) ):
        if tok[ j ] not in stems:
            stems[ tok[ j ] ] = porter.stem( tok[ j ] )
        tok[ j ] = stems[ tok[ j ] ]

    sent[ i ] = ' '.join( tok )

# Remove empty sentences after stop word removal

i = 0
while i < len( sent ):
    if len( sent[ i ] ) == 0:
        del sent[ i ]
    else:
        i += 1

# Convert frequencies to TF-IDF values, get cosine similarity
# of all pairs of documents

tfidf = TfidfVectorizer( stop_words='english', max_df=0.8, max_features=1000 )
term_vec = tfidf.fit_transform( sent )
X = cosine_similarity( term_vec )

# Fit vectors to clusters

clust = KMeans( n_clusters=5, random_state=1 ).fit( X )
print( clust.labels_ )

for i in range( 0, len( set( clust.labels_ ) ) ):
    print( f'Cluster {i}:' )

    for j in range( 0, len( clust.labels_ ) ):
        if clust.labels_[ j ] == i:
            print( full_sent[ j ].replace( '"', '' ).strip() )

    print()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhanyina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[1 1 1 0 1 1 0 1 0 1 1 1 1 1 0 2 2 1 1 0 2 1 2 1 2 3 1 1 2 2 4 2 3 3 2 1 4
 3 1 3 1 1 1 3 2 1 1 1 3 1 0 1 0 2 0 1 0 1 0 1 1 1 1 1 2 0 0 4 0 1 1 0 4 0
 1]
Cluster 0:
Once the plane was on the ground, soft music began to flow from the ceiling speakers: a sweet orchestral cover version of the Beatles' Norwegian Wood The melody never failed to send a shudder through me, but this time it hit me harder than ever
No, I said, just dizzy Are you sure Yes, I'm sure
I straightened up and looked out the plane window at the dark clouds hanging over the North Sea, thinking of what I had lost in the course of my life: times gone forever, friends who had died or disappeared, feelings I would never know again
Leaning far out the window, the girl called to the station master as though he were a great distance away
The white of the snow fell away into the darkness some distance before it reached them
The beach was always crowded with men and women, and at times the sea, like a public bath, would be cover

In [2]:
import numpy as np
from scipy.spatial.distance import cdist

def elbow( X, max_clust=25 ):
    distort = [ ]
    inertia = [ ]

    map_distort = { }
    map_inertia = { }

    elbow_distort = 1
    elbow_inertia = 1

    K = range( 1, max_clust )
    for k in K:
        kmean_model = KMeans( n_clusters=k )
        kmean_model.fit( X )

        distort.append( sum( np.min( cdist( X, kmean_model.cluster_centers_, 'euclidean' ), axis=1 ) ) / X.shape[ 0 ] )
        inertia.append( kmean_model.inertia_ )

        map_distort[ k ] = sum( np.min( cdist( X, kmean_model.cluster_centers_, 'euclidean' ), axis=1 ) ) / X.shape[ 0 ]
        map_inertia[ k ] = kmean_model.inertia_

    prev_k = ''
    prev_v = 0
    prev_pct = 0
    for i,(k,v) in enumerate( map_distort.items() ):
        if prev_k == '':
            print( f'{k}: {v:.4f}' )
            prev_k = str( k )
            prev_v = v
            continue

        print( f'{k}: {v:.4f} ', end='' )

        diff_v = prev_v - v
        diff_v_pct = diff_v / prev_v * 100.0
        print( f'{diff_v:.4f}, {diff_v_pct:.2f}%' )

        if i > 2 and prev_pct - diff_v_pct < 0.5:
            elbow_distort = i + 1
            break

        prev_k = str( k )
        prev_v = v
        prev_pct = diff_v_pct

    print()

    prev_k = ''
    prev_v = 0
    prev_pct = 0
    for i,(k,v) in enumerate( map_inertia.items() ):
        if prev_k == '':
            print( f'{k}: {v:.4f}' )
            prev_k = str( k )
            prev_v = v
            continue

        print( f'{k}: {v:.4f} ', end='' )

        diff_v = prev_v - v
        diff_v_pct = diff_v / prev_v * 100.0
        print( f'{diff_v:.4f}, {diff_v_pct:.2f}%' )

        if i > 2 and prev_pct - diff_v_pct < 0.5:
            elbow_inertia = i + 1
            break

        prev_k = str( k )
        prev_v = v
        prev_pct = diff_v_pct

    return max( elbow_distort, elbow_inertia )

In [3]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

# Count raw term frequencies

# for LDA you only uses Count Vectorizer, NEVER TFIDF.. sklearn already does TFIDF inside LDA
count = CountVectorizer( stop_words='english' )
term_vec = count.fit_transform( sent )

n_topic = 10

# Build a string list of [ 'Topic 1', 'Topic 2', ..., 'Topic n' ]

col_nm = [ ]
for i in range( 1, n_topic + 1 ):
    col_nm += [ f'Topic {i}' ]

# Fit an LDA model to the term vectors, get cosine similarities

lda_model = LDA( n_components=n_topic )
concept = lda_model.fit_transform( term_vec )
X = cosine_similarity( concept )

# Print top 10 terms for each topic

feat = count.get_feature_names()
topic_list = [ ]
for i,topic in enumerate( lda_model.components_ ):
    top_n = [ feat[ i ] for i in topic.argsort()[ -10: ] ]
    top_feat = ' '.join( top_n )
    topic_list.append( f"topic_{'_'.join(top_n[ :3 ] ) }" )

    print( f'Topic {i}: {top_feat}' )
print()

# Cluster sentences and print clusters

clust = KMeans( n_clusters=5 ).fit( concept )

for i in range( 0, len( set( clust.labels_ ) ) ):
    print( f'Cluster {i}:' )
    for j in range( 0, len( clust.labels_ ) ):
        if clust.labels_[ j ] != i:
            continue
        print( full_sent[ j ] )

    print()

Topic 0: came seat snow howev train ask wa approach realli long
Topic 1: away bath walk sea window way chang wa hous tea
Topic 2: farraday quit countri away know break alon expedit day did
Topic 3: thank beach music came veri went thi wa cover time
Topic 4: open marri hand ill left girl telegram stay wa hi
Topic 5: water fact great england decid countri repli hous thi did
Topic 6: sensei come student kamakura home young day friend hi wa
Topic 7: im alon therefor build earth friend cold wa sure rain
Topic 8: recal portrait dust wish suit volum simpli veri leav wa
Topic 9: holiday shimamura becaus troubl alway employ sensei thi wa hi

Cluster 0:
I was thirty-seven then, strapped in my seat as the huge 747 plunged through dense cloud cover on approach to the Hamburg airport
Cold November rains drenched the earth and lent everything the gloomy air of a Flemish landscape: the ground crew in rain gear, a flag atop a squat airport building, a BMW billboard
So Germany again
Once the plane was 