In [1]:
import requests
import bs4
import numpy as np
import nltk
import matplotlib.pyplot as plt
import re
import scipy
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 
nltk.download('wordnet') 
nltk.download('stopwords')
nltk.download('omw-1.4')
lemmatizer=nltk.stem.WordNetLemmatizer() 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vera\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Vera\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Vera\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vera\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Vera\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [23]:
def remove_header_footer(text, name_book):
    header = "*** START OF THE PROJECT GUTENBERG EBOOK " + name_book.upper() + " ***"
    footer = "*** END OF THE PROJECT GUTENBERG EBOOK " + name_book.upper() + " ***"
    no_header_text = text[(text.find(header) + len(header)):]
    no_footer_text = no_header_text[:no_header_text.find(footer)]
    
    return no_footer_text

In [63]:
def tokenize_text(text):
    
    temp_tokenizedtext = nltk.word_tokenize(text)    
    mycrawled_nltktext = nltk.Text(temp_tokenizedtext)    
    
    return mycrawled_nltktext


def lower_case_text(text):
    mycrawled_lowercasetext = [] 

    for k in range(len(text)):        
        lowercaseword = text[k].lower()        
        mycrawled_lowercasetext.append(lowercaseword)    
        
    return mycrawled_lowercasetext

def tagtowordnet(postag):   
    wordnettag = -1   
    if postag[0] == 'N':        
        wordnettag = 'n'   
    elif postag[0] == 'V':        
        wordnettag = 'v'   
    elif postag[0] == 'J':        
        wordnettag = 'a'    
    elif postag[0] == 'R':        
        wordnettag = 'r'    
    return(wordnettag)

def lemmatizetext(nltktexttolemmatize):    
    # Tag the text with POS tags    
    taggedtext = nltk.pos_tag(nltktexttolemmatize)   
    # Lemmatize each word text    
    lemmatizedtext = []    
    for l in range(len(taggedtext)):       
        # Lemmatize a word using the WordNet converted POS tag       
        wordtolemmatize = taggedtext[l][0]        
        wordnettag = tagtowordnet(taggedtext[l][1])        
        if wordnettag != -1:            
            lemmatizedword = lemmatizer.lemmatize(wordtolemmatize,wordnettag)        
        else:            
            lemmatizedword=wordtolemmatize       
            # Store the lemmatized word        
        lemmatizedtext.append(lemmatizedword)
        
    return(lemmatizedtext) 


def make_vocabulary(text):
    myvocabulary = [] 
    myindices_in_vocabulary = []
    # Find the vocabulary of each document    
    # Get unique words and where they occur      
    uniqueresults = np.unique(text,return_inverse=True)   
     # Store the vocabulary and indices
    myvocabulary = uniqueresults[0]    
    myindices_in_vocabulary = uniqueresults[1]    
    
    return myvocabulary, myindices_in_vocabulary


In [106]:
def prune_text(mycrawled_lemmatizedtexts, unifiedvocabulary, myindices_in_unifiedvocabulary,  highest_totaloccurrences_indices, occur_counts):
    nltkstopwords=nltk.corpus.stopwords.words('english') 
    pruningdecisions = np.zeros((len(unifiedvocabulary),1)) 
    for k in range(len(unifiedvocabulary)):    
        # Rule 1: check the nltk stop word list    
        if (unifiedvocabulary[k] in nltkstopwords):        
            pruningdecisions[k] = 1    
            
        # Rule 2: if the word is in the top 1% of frequent words    
        if (k in highest_totaloccurrences_indices[\
                                                  0:int(np.floor(len(unifiedvocabulary)*0.01))]):        
            pruningdecisions[k] = 1    
        # Rule 3: if the word is too short    
        if len(unifiedvocabulary[k]) < 2:        
            pruningdecisions[k] = 1    
            
        # Rule 4: if the word is too long    
        if len(unifiedvocabulary[k])>20:        
            pruningdecisions[k] = 1    
            
        # Rule 5: if the word appears less than 4 times
        if occur_counts[k] < 4:
            pruningdecisions[k] = 1  
         
    oldtopruned = [] 
    tempind = -1 
    for k in range(len(unifiedvocabulary)):    
        if pruningdecisions[k] == 0:        
            tempind = tempind + 1        
            oldtopruned.append(tempind)    
        else:        
            oldtopruned.append(-1) 
    #%% Create pruned texts 
    mycrawled_prunedtexts = [] 
    myindices_in_prunedvocabulary = [] 
    for k in range(len(mycrawled_lemmatizedtexts)):       
        temp_newindices = []    
        temp_newdoc = []    
        for l in range(len(mycrawled_lemmatizedtexts[k])):        
            temp_oldindex = myindices_in_unifiedvocabulary[k][l]                    
            temp_newindex = oldtopruned[temp_oldindex]        
            if temp_newindex != -1:            
                temp_newindices.append(temp_newindex)            
                temp_newdoc.append(unifiedvocabulary[temp_oldindex])    
        mycrawled_prunedtexts.append(temp_newdoc)    
        myindices_in_prunedvocabulary.append(temp_newindices)
    
    remainingindices = np.squeeze(np.where(pruningdecisions == 0)[0]) 
    remainingvocabulary = unifiedvocabulary[remainingindices] 
    return mycrawled_prunedtexts, myindices_in_prunedvocabulary, remainingvocabulary

In [107]:
wizard_oz_html = requests.get('https://www.gutenberg.org/files/55/55-0.txt')
wizard_oz_text = str(bs4.BeautifulSoup(wizard_oz_html.content,'html.parser'))
wizard_oz_text = remove_header_footer(wizard_oz_text, 'THE WONDERFUL WIZARD OF OZ')

wizard_oz_paragraphs = re.split('\r\n\r\n[\r\n]*', wizard_oz_text) 

wizard_oz_texts = []
for par in wizard_oz_paragraphs:
    par = " ".join(par.split())
    wizard_oz_texts.append(par)
    
wizard_oz_tokenized = []
for par in wizard_oz_texts:
    par = tokenize_text(par)
    wizard_oz_tokenized.append(par)
    

wizard_oz_texts_lower = []
for par in wizard_oz_tokenized:
    par = lower_case_text(par)
    wizard_oz_texts_lower.append(par)
    

wizard_oz_texts_lemmatized = []
for par in wizard_oz_texts_lower:
    par = lemmatizetext(par)
    par = nltk.Text(par)
    wizard_oz_texts_lemmatized.append(par)
    
vocabularies = [] 
indices_in_vocabularies = []
# Find the vocabulary of each document 
for par in wizard_oz_texts_lemmatized:   
    # Get unique words and where they occur       
    uniqueresults = np.unique(par,return_inverse=True)    
    uniquewords = uniqueresults[0]    
    wordindices = uniqueresults[1]    
    # Store the vocabulary and indices of document words in it    
    vocabularies.append(uniquewords)    
    indices_in_vocabularies.append(wordindices) 
    
tempvocabulary = []  
for par in wizard_oz_texts_lemmatized:   
    tempvocabulary.extend(par) 
    # Find the unique elements among all vocabularies 
uniqueresults = np.unique(tempvocabulary,return_inverse=True) 
unifiedvocabulary = uniqueresults[0] 
wordindices = uniqueresults[1] 
# Translate previous indices to the unified vocabulary. 
# Must keep track where each vocabulary started in 
# the concatenated one. 
vocabularystart = 0 
indices_in_unifiedvocabulary = [] 
for k in range(len(wizard_oz_texts_lemmatized)):    
    # In order to shift word indices, we must temporarily    
    # change their data type to a Numpy array    
    tempindices = np.array(indices_in_vocabularies[k])    
    tempindices = tempindices + vocabularystart    
    tempindices = wordindices[tempindices]    
    indices_in_unifiedvocabulary.append(tempindices)    
    vocabularystart = vocabularystart + len(vocabularies[k])
    
unifiedvocabulary_totaloccurrencecounts = np.zeros((len(unifiedvocabulary),1)) 

for k in range(len(wizard_oz_texts_lemmatized)): 
    occurrencecounts = np.zeros((len(unifiedvocabulary),1))    
    for l in range(len(indices_in_unifiedvocabulary[k])):        
        occurrencecounts[indices_in_unifiedvocabulary[k][l]] = \
        occurrencecounts[indices_in_unifiedvocabulary[k][l]] + 1    
    unifiedvocabulary_totaloccurrencecounts = \
    unifiedvocabulary_totaloccurrencecounts + occurrencecounts  

highest_totaloccurrences_indices = np.argsort(\
                                               -1*unifiedvocabulary_totaloccurrencecounts,axis=0) 

wizard_oz_pruned_texts, vocabulary_pruned_indices, vocabulary_pruned = prune_text(wizard_oz_texts_lemmatized, unifiedvocabulary, 
                                                                                  indices_in_unifiedvocabulary, 
                                                                                  unifiedvocabulary_totaloccurrencecounts,
                                                                                  highest_totaloccurrences_indices)
                                                                                 

In [306]:
n_docs = len(wizard_oz_pruned_texts) 
n_vocab = len(vocabulary_pruned) 
# Matrix of term frequencies 
tfmatrix = scipy.sparse.lil_matrix((n_docs,n_vocab)) 
# Row vector of document frequencies 
dfvector = scipy.sparse.lil_matrix((1,n_vocab)) 
# Loop over documents 
for k in range(n_docs):    
    # Row vector of which words occurred in this document    
    temp_dfvector = scipy.sparse.lil_matrix((1,n_vocab))   
    temp_total_terms = len(np.unique(wizard_oz_pruned_texts[k]))
    # Loop over words    
    for l in range(len(wizard_oz_pruned_texts[k])):        
    # Add current word to term-frequency count and document-count        
        currentword = vocabulary_pruned_indices[k][l]        
        tfmatrix[k,currentword] = tfmatrix[k,currentword] + 1        
        temp_dfvector[0,currentword] = 1  
    # Length normalization frequence of raw counts
    if temp_total_terms == 0:
        temp_total_terms = 1
    tfmatrix[k,:] /= temp_total_terms
    # Add which words occurred in this document to overall document counts
    dfvector = dfvector + temp_dfvector
    # Use the count statistics to compute the tf-idf matrix 
tfidfmatrix = scipy.sparse.lil_matrix((n_docs,n_vocab)) 
# Let's use length-normalized frequency term count, and smoothed logarithmic idf 
idfvector = 1 + np.log((1 / (np.array(dfvector.todense())[0] + 1)) * n_docs)
for k in range(n_docs):    
    # Combine the tf and idf terms    
    tfidfmatrix[k,:] = tfmatrix[k,:] * idfvector

In [307]:
dimensiontotals = np.squeeze(np.array(\
                                          np.sum(tfidfmatrix,axis=0))) 
highesttotals = np.argsort(-1 * dimensiontotals) 
Xsmall = tfidfmatrix[:,highesttotals[0:500]] 
Xsmall = Xsmall.todense() 
# Normalize the documents to unit vector norm 
tempnorms = np.squeeze(np.array(np.sum(np.multiply(Xsmall,Xsmall),axis=1))) 
# If any documents have zero norm, avoid dividing them by zero 
tempnorms[tempnorms == 0] = 1 
Xsmall = scipy.sparse.diags(tempnorms ** -0.5).dot(Xsmall) 

import sklearn    
import sklearn.mixture 
# Create the mixture model object, and 
# choose the number of components and EM iterations 
mixturemodel = sklearn.mixture.GaussianMixture(n_components=10, \
                                             covariance_type='diag',max_iter=100,init_params='random') 
fittedmixture = mixturemodel.fit(Xsmall) 
sklearn_mixturemodel_means = fittedmixture.means_ 
sklearn_mixturemodel_weights = fittedmixture.weights_ 
sklearn_mixturemodel_covariances = fittedmixture.covariances_

# Find top 10 words with highest mean feature value for each cluster 
for k in range(10):    
    print(k, '\\\\')    
    highest_dimensionweight_indices = np.argsort( \
                                                  -np.squeeze(sklearn_mixturemodel_means[k,:]),axis=0)    
    highest_dimensionweight_indices = highesttotals[highest_dimensionweight_indices]    
    print(' '.join(vocabulary_pruned[highest_dimensionweight_indices[1:10]]), '\\\\') 

0 \\
move mouth mouse mourning mourn motionless mother mess moth \\
1 \\
move mouth mouse mourning mourn motionless mother mess moth \\
2 \\
move mouth mouse mourning mourn motionless mother mess moth \\
3 \\
mouth mouse mourning mourn motionless moth moss mortal morning—all \\
4 \\
move mouth mouse mourning mourn motionless mother mess moth \\
5 \\
reply replace platter scarcely scar rock rocky mother mess \\
6 \\
move mouth mouse mourning mourn motionless mother mess moth \\
7 \\
move mouth mouse mourning mourn motionless mother mess moth \\
8 \\
move mouth mouse mourning mourn motionless mother mess moth \\
9 \\
move mouth mouse mourning mourn motionless mother mess moth \\


array([0.09980762, 0.09985188, 0.10007271, 0.10031327, 0.09932124,
       0.10004263, 0.09994624, 0.10134318, 0.09760854, 0.10169269])

In [233]:
from numpy.matlib import repmat

In [234]:
X = tfidfmatrix 
# Normalize the documents to unit vector norm 
tempnorms = np.squeeze(np.array(np.sum(X.multiply(X),axis=1))) 
# If any documents have zero norm, avoid dividing them by zero 
tempnorms[tempnorms == 0] = 1 
X = scipy.sparse.diags(tempnorms ** -0.5).dot(X) 
n_data = np.shape(X)[0] 
n_dimensions = np.shape(X)[1]


In [235]:
def initialize_mixturemodel(X,n_components):    
    # Create lists of sparse matrices to hold the parameters    
    n_dimensions = np.shape(X)[1]    
    mixturemodel_means = scipy.sparse.lil_matrix((n_components,n_dimensions))    
    mixturemodel_weights = np.zeros((n_components))    
    mixturemodel_covariances = []    
    mixturemodel_inversecovariances = []    
    for k in range(n_components):        
        tempcovariance = scipy.sparse.lil_matrix((n_dimensions,n_dimensions))            
        mixturemodel_covariances.append(tempcovariance)        
        tempinvcovariance = scipy.sparse.lil_matrix((n_dimensions,n_dimensions))           
        mixturemodel_inversecovariances.append(tempinvcovariance)    
    # Initialize the parameters    
    for k in range(n_components):        
        mixturemodel_weights[k] = 1/n_components        
        # Pick a random data point as the initial mean        
        tempindex=scipy.stats.randint.rvs(low=0,high=n_components)        
        mixturemodel_means[k] = X[tempindex,:].toarray()          
        # Initialize the covariance matrix to be spherical        
        for l in range(n_dimensions):           
            mixturemodel_covariances[k][l,l] = 1            
            mixturemodel_inversecovariances[k][l,l] = 1    
    return(mixturemodel_weights,mixturemodel_means,mixturemodel_covariances,\
                                                                   mixturemodel_inversecovariances)


In [236]:
def run_estep(X,mixturemodel_means,mixturemodel_covariances, \
              mixturemodel_inversecovariances,mixturemodel_weights):    
    # For each component, compute terms that do not involve data    
    meanterms = np.zeros((n_components))    
    logdeterminants = np.zeros((n_components))    
    logconstantterms = np.zeros((n_components))    
    for k in range(n_components):        
        # Compute mu_k*inv(Sigma_k)*mu_k        
        meanterms[k] = (mixturemodel_means[k,:] * \
                      mixturemodel_inversecovariances[k] * mixturemodel_means[k,:].T)[0,0]       
        # Compute determinant of Sigma_k. For a diagonal matrix         
        # this is just the product of the main diagonal        
        logdeterminants[k] = np.sum(np.log(mixturemodel_covariances[k].diagonal(0)))        
        # Compute constant term beta_k * 1/(|Sigma_k|^1/2)        
        # Omit the (2pi)^d/2 as it cancels out        
        logconstantterms[k] = np.log(mixturemodel_weights[k]) - 0.5 * logdeterminants[k]    
    print('E-step part2 ')    
    # Compute terms that involve distances of data from components    
    xnorms = np.zeros((n_data,n_components))    
    xtimesmu = np.zeros((n_data,n_components))    
    for k in range(n_components):        
        print(k)        
        xnorms[:,k] = (X * mixturemodel_inversecovariances[k] * X.T).diagonal(0)        
        xtimesmu[:,k] = np.squeeze((X * mixturemodel_inversecovariances[k] * \
                                     mixturemodel_means[k,:].T).toarray())    
    xdists = xnorms + repmat(meanterms,n_data,1) - 2 * xtimesmu            
    # Substract maximal term before exponent (cancels out) to maintain computational precision    
    numeratorterms = logconstantterms - xdists / 2    
    numeratorterms -= repmat(np.max(numeratorterms,axis=1),n_components,1).T    
    numeratorterms = np.exp(numeratorterms)    
    mixturemodel_componentmemberships = numeratorterms / repmat(\
                                                                         np.sum(numeratorterms,axis=1),n_components,1).T    
    return(mixturemodel_componentmemberships)


In [237]:
def run_mstep_sumweights(mixturemodel_componentmemberships):    
    # Compute total weight per component    
    mixturemodel_weights = np.sum(mixturemodel_componentmemberships,axis=0)    
    return(mixturemodel_weights) 

def run_mstep_means(X,mixturemodel_componentmemberships,mixturemodel_weights):    
    # Update component means    
    mixturemodel_means = scipy.sparse.lil_matrix((n_components,n_dimensions))    
    for k in range(n_components):        
        mixturemodel_means[k,:] = \
                                np.sum(scipy.sparse.diags(mixturemodel_componentmemberships[:,k]).dot(X),axis=0)        
        mixturemodel_means[k,:] /= mixturemodel_weights[k]    
    return(mixturemodel_means) 

def run_mstep_covariances(X,mixturemodel_componentmemberships,mixturemodel_weights,mixturemodel_means):    
    # Update diagonal component covariance matrices    
    n_dimensions = np.shape(X)[1]    
    n_components = np.shape(mixturemodel_componentmemberships)[1]    
    tempcovariances = np.zeros((n_components,n_dimensions))    
    mixturemodel_covariances = []    
    mixturemodel_inversecovariances = []    
    for k in range(n_components):        
        tempcovariances[k,:] = \
                        np.sum(scipy.sparse.diags(mixturemodel_componentmemberships[:,k]).dot(X.multiply(X)),axis=0) \
                                                                -mixturemodel_means[k,:].multiply(mixturemodel_means[k,:]) * mixturemodel_weights[k]        
        tempcovariances[k,:] /= mixturemodel_weights[k]        
        # Convert to sparse matrices        
        tempepsilon = 1e-10        
        # Add a small regularization term        
        temp_covariance = scipy.sparse.diags(tempcovariances[k,:]+tempepsilon)        
        temp_inversecovariance = scipy.sparse.diags((tempcovariances[k,:]+tempepsilon)**-1)        
        mixturemodel_covariances.append(temp_covariance)        
        mixturemodel_inversecovariances.append(temp_inversecovariance)     
    return(mixturemodel_covariances,mixturemodel_inversecovariances)   

def run_mstep_normalizeweights(mixturemodel_weights):    
    # Update mixture-component prior probabilities    
    mixturemodel_weights /= sum(mixturemodel_weights)    
    return(mixturemodel_weights)


In [308]:
def perform_emalgorithm(X,n_components,n_emiterations):    
    mixturemodel_weights,mixturemodel_means,mixturemodel_covariances,\
                                mixturemodel_inversecovariances=initialize_mixturemodel(X,n_components)    
    for t in range(n_emiterations):        
        # ====== E-step: Compute the component membership        
        # probabilities of each data point ======        
        print('E-step ' + str(t))        
        mixturemodel_componentmemberships=run_estep(X,mixturemodel_means,mixturemodel_covariances,\
                                                    mixturemodel_inversecovariances,mixturemodel_weights)            
        # ====== M-step: update component parameters======        
        print('M-step ' + str(t))        
        print('M-step part1 ' + str(t))        
        mixturemodel_weights=run_mstep_sumweights(mixturemodel_componentmemberships)        
        print('M-step part2 ' + str(t))        
        mixturemodel_means=run_mstep_means(X,mixturemodel_componentmemberships,mixturemodel_weights)        
        print('M-step part3 ' + str(t))        
        mixturemodel_covariances,mixturemodel_inversecovariances=run_mstep_covariances(X,\
                                                                                       mixturemodel_componentmemberships,mixturemodel_weights,mixturemodel_means)        
        print('M-step part4 ' + str(t))        
        mixturemodel_weights=run_mstep_normalizeweights(mixturemodel_weights)    
        return(mixturemodel_weights,mixturemodel_means,mixturemodel_covariances,\
               mixturemodel_inversecovariances) 
    
# Try out the functions we just defined on the data 
n_components = 10 
n_emiterations = 20
mixturemodel_weights,mixturemodel_means,mixturemodel_covariances,\
                        mixturemodel_inversecovariances = perform_emalgorithm(X,n_components,n_emiterations)


E-step 0
E-step part2 
0
1
2
3
4
5
6
7
8
9
M-step 0
M-step part1 0
M-step part2 0
M-step part3 0
M-step part4 0


In [313]:
for k in range(n_components):    
    print(k, '\\\\')    
    highest_dimensionweight_indices = \
                                np.argsort(-np.squeeze(\
                                                        mixturemodel_means[k,:].toarray()),axis=0)           
    print(' '.join(vocabulary_pruned[\
                                    highest_dimensionweight_indices[1:20]]), '\\\\')


0 \\
rate rat rarely rare rapidly rapid rap rank rang ran raise rainstorm rain rage rather raft quite quietly quiet \\
1 \\
rate rat rarely rare rapidly rapid rap rank rang ran raise rainstorm rain rage rather raft quite quietly quiet \\
2 \\
rate rat rarely rare rapidly rapid rap rank rang ran raise rainstorm rain rage rather raft quite quietly quiet \\
3 \\
rate rat rarely rare rapidly rapid rap rank rang ran raise rainstorm rain rage rather raft quite quietly quiet \\
4 \\
rate rat rarely rare rapidly rapid rap rank rang ran raise rainstorm rain rage rather raft quite quietly quiet \\
5 \\
rate rat rarely rare rapidly rapid rap rank rang ran raise rainstorm rain rage rather raft quite quietly quiet \\
6 \\
rate rat rarely rare rapidly rapid rap rank rang ran raise rainstorm rain rage rather raft quite quietly quiet \\
7 \\
rate rat rarely rare rapidly rapid rap rank rang ran raise rainstorm rain rage rather raft quite quietly quiet \\
8 \\
rate rat rarely rare rapidly rapid rap rank

In [310]:
meanterms = np.zeros((n_components)) 
logdeterminants = np.zeros((n_components)) 
logconstantterms = np.zeros((n_components)) 

for k in range(n_components):    
    # Compute mu_k*inv(Sigma_k)*mu_k    
    meanterms[k] = (mixturemodel_means[k,:] * \
                    mixturemodel_inversecovariances[k] * mixturemodel_means[k,:].T)[0,0] 
    # Compute terms that involve distances of data from components 
xnorms = np.zeros((n_data,n_components)) 
xtimesmu = np.zeros((n_data,n_components)) 
for k in range(n_components):    
    xnorms[:,k] = (X * mixturemodel_inversecovariances[k] * X.T).diagonal(0)    
    xtimesmu[:,k] = np.squeeze((X * mixturemodel_inversecovariances[k] * \
                                 mixturemodel_means[k,:].T).toarray()) 
xdists = xnorms + repmat(meanterms,n_data,1) - 2 * xtimesmu 


0
[570 746 747 748 749 750 751 752 753 754]
“ I am Oz , the Great and Terrible . Who are you , and why do you seek me ? ”
1
[570 746 747 748 749 750 751 752 753 754]
“ I am Oz , the Great and Terrible . Who are you , and why do you seek me ? ”
2
[570 746 747 748 749 750 751 752 753 754]
“ I am Oz , the Great and Terrible . Who are you , and why do you seek me ? ”
3
[570 746 747 748 749 750 751 752 753 754]
“ I am Oz , the Great and Terrible . Who are you , and why do you seek me ? ”
4
[570 746 747 748 749 750 751 752 753 754]
“ I am Oz , the Great and Terrible . Who are you , and why do you seek me ? ”
5
[ 170  792   65 1071  402  362  124  797  206  892]
“ ‘ Why , he is a man , ’ said the other , and I quite agreed with him . The farmer carried me under his arm to the cornfield , and set me up on a tall stick , where you found me . He and his friend soon after walked away and left me alone .
6
[ 170  792   65 1071  402  362  124  797  206  892]
“ ‘ Why , he is a man , ’ said the other

In [311]:
for k in range(n_components):    
    tempdists = np.array(np.squeeze(xdists[:,k]))    
    highest_componentprob_indices = np.argsort(tempdists,axis=0)    
    print(k, '\\\\')    
    print(highest_componentprob_indices[0:10], '\\\\')    
    print(' '.join(wizard_oz_tokenized[highest_componentprob_indices[0]]), '\\\\')


0 \\
[570 746 747 748 749 750 751 752 753 754] \\
“ I am Oz , the Great and Terrible . Who are you , and why do you seek me ? ” \\
1 \\
[570 746 747 748 749 750 751 752 753 754] \\
“ I am Oz , the Great and Terrible . Who are you , and why do you seek me ? ” \\
2 \\
[570 746 747 748 749 750 751 752 753 754] \\
“ I am Oz , the Great and Terrible . Who are you , and why do you seek me ? ” \\
3 \\
[570 746 747 748 749 750 751 752 753 754] \\
“ I am Oz , the Great and Terrible . Who are you , and why do you seek me ? ” \\
4 \\
[570 746 747 748 749 750 751 752 753 754] \\
“ I am Oz , the Great and Terrible . Who are you , and why do you seek me ? ” \\
5 \\
[ 170  792   65 1071  402  362  124  797  206  892] \\
“ ‘ Why , he is a man , ’ said the other , and I quite agreed with him . The farmer carried me under his arm to the cornfield , and set me up on a tall stick , where you found me . He and his friend soon after walked away and left me alone . \\
6 \\
[ 170  792   65 1071  402  362  124

In [315]:
len_longest_par = 0
longest_par = wizard_oz_texts[0]
number_of_longest_text = 0

for idx, par in enumerate(wizard_oz_texts):
    len_par = len(par)
    if len_par > len_longest_par:
        len_longest_par = len_par
        number_of_longest_text = idx
        longest_par = wizard_oz_texts[idx]

print(longest_par)
longest_par = wizard_oz_pruned_texts[number_of_longest_text]

She left Dorothy alone and went back to the others. These she also led to rooms, and each one of them found himself lodged in a very pleasant part of the Palace. Of course this politeness was wasted on the Scarecrow; for when he found himself alone in his room he stood stupidly in one spot, just within the doorway, to wait till morning. It would not rest him to lie down, and he could not close his eyes; so he remained all night staring at a little spider which was weaving its web in a corner of the room, just as if it were not one of the most wonderful rooms in the world. The Tin Woodman lay down on his bed from force of habit, for he remembered when he was made of flesh; but not being able to sleep, he passed the night moving his joints up and down to make sure they kept in good working order. The Lion would have preferred a bed of dried leaves in the forest, and did not like being shut up in a room; but he had too much sense to let this worry him, so he sprang upon the bed and rolled

In [301]:
n_docs = len(wizard_oz_pruned_texts) 
n_vocab = len(vocabulary_pruned) 
# Matrix of term frequencies 
tfmatrix_len_norm = scipy.sparse.lil_matrix((n_docs,n_vocab))
tfmatrix_log = scipy.sparse.lil_matrix((n_docs,n_vocab))
tfmatrix_max = scipy.sparse.lil_matrix((n_docs,n_vocab))

# Row vector of document frequencies 
dfvector = scipy.sparse.lil_matrix((1,n_vocab)) 
# Loop over documents 
for k in range(n_docs):    
    # Row vector of which words occurred in this document  
    temp_dfvector = scipy.sparse.lil_matrix((1,n_vocab))   
    if k == number_of_longest_text:
        temp_total_terms = len(np.unique(wizard_oz_pruned_texts[k]))
    # Loop over words    
    for l in range(len(wizard_oz_pruned_texts[k])):        
    # Add current word to term-frequency count and document-count        
        currentword = vocabulary_pruned_indices[k][l]   
        if k == number_of_longest_text:
            tfmatrix_len_norm[k,currentword] += 1  
            tfmatrix_log[k,currentword] += 1
            tfmatrix_max[k,currentword] += 1
        temp_dfvector[0,currentword] = 1  
    # Length normalization frequence of raw counts
    if k == number_of_longest_text:
        tfmatrix_len_norm /= temp_total_terms
        max_count_word = int(np.max(np.array(tfmatrix_max.todense())))
    # Add which words occurred in this document to overall document counts
    dfvector = dfvector + temp_dfvector
    # Use the count statistics to compute the tf-idf matrix 
tfmatrix_log = np.log(1 + np.array(tfmatrix_log.todense())[0])
alpha = 0.5
tfmatrix_max = alpha + (1 - alpha) * np.array(tfmatrix_max.todense())[0] / max_count_word
# Let's use length-normalized frequency term count, and smoothed logarithmic idf 
idfvector = 1 + np.log((1 / (np.array(dfvector.todense())[0] + 1)) * n_docs)
idfvector_max = np.log((n_docs - (np.array(dfvector.todense())[0])) / (np.array(dfvector.todense())[0]))
tfidfmatrix_len_norm = tfmatrix_len_norm * idfvector
tfidfmatrix_log = tfmatrix_log * idfvector
tfidfmatrix_max = tfmatrix_max * idfvector_max

  idfvector_max = np.log((n_docs - (np.array(dfvector.todense())[0])) / (np.array(dfvector.todense())[0]))


In [316]:
print('Length-normalized frequency (TF) and Smoothed logarithmic inverse document frequency (IDF) ', '\\\\')
highest_tfidf_indices_len_norm = np.argsort(-1 * tfidfmatrix_len_norm,axis=0)
top_20_tfidf_longest_par_len_norm = np.squeeze(vocabulary_pruned[highest_tfidf_indices_len_norm[:20]])
print(top_20_tfidf_longest_par_len_norm, '\\\\')
print('Logarithm of the count (TF) and Smoothed logarithmic inverse document frequency (IDF) ', '\\\\')
highest_tfidf_indices_log = np.argsort(-1 * tfidfmatrix_log,axis=0)
top_20_tfidf_longest_par_log = np.squeeze(vocabulary_pruned[highest_tfidf_indices_log[:20]])
print(top_20_tfidf_longest_par_log, '\\\\')
print('Count relative to most frequent term (TF) and Version proportional to most common term (IDF)', '\\\\')
highest_tfidf_indices_max = np.argsort(-1 * tfidfmatrix_max,axis=0)
top_20_tfidf_longest_par_max = np.squeeze(vocabulary_pruned[highest_tfidf_indices_max[:20]])
print(top_20_tfidf_longest_par_max,'\\\\')

Length-normalized frequency (TF) and Smoothed logarithmic inverse document frequency (IDF)  \\
['directly' '1900' 'furiously' 'funny' 'fun' 'fully' 'full' 'fulfillment'
 'fruit' 'front' 'frock' 'frightened' 'frighten' 'fright' 'friends'
 'furniture' 'future' 'garden' 'garret' 'gentleman'] \\
Logarithm of the count (TF) and Smoothed logarithmic inverse document frequency (IDF)  \\
['1900' 'rate' 'rat' 'rarely' 'rare' 'rapidly' 'rapid' 'rap' 'rank' 'rang'
 'ran' 'raise' 'rainstorm' 'rain' 'rage' 'rather' 'raft' 'quite' 'quietly'
 'quiet'] \\
Count relative to most frequent term (TF) and Version proportional to most common term (IDF) \\
['natural' 'tumble' 'twenty' 'twin' 'cannon' 'rumble' 'kick' 'drawer'
 'twinkling' 'ruler' 'drawing-room' 'cabinet' 'cabbage' 'twisted'
 'polished—and' 'kitten' 'knee' 'burst' 'policeman' 'kinder'] \\
