In [1]:
version = "v1.11.012921"

# SIADS 543 Assignment 3: Text representations, topic modeling and word embeddings

In this week's assignment you'll gain experience applying topic modeling and other latent variable estimation methods. We'll focus on textual data, continuing to work with vectorizers and related text representations like embeddings.

All questions in this assignment are auto-graded. Some parts ask you a short question or two about the results: these are meant to encourage you to reflect on the outcomes, but do not need to be included as part of your graded submission.

In [2]:
# First import some necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Suppress all warnings
import warnings
warnings.filterwarnings('ignore')

np.set_printoptions(precision = 3)

#### Here are some useful utility functions to use with this assignment.

In [3]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Example helper showing how to take the components produced by a topic
    model such as LDA or NMF and dump the top words by weight for each topic.

    Parameters
    ----------
    model : object exposing ``components_``, an iterable with one weight
        vector per topic.
    feature_names : sequence mapping a feature index to its term.
    num_top_words : number of terms to print for each topic.
    """
    for topic_number, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_number))
        # argsort is ascending, so step backwards from the end to collect the
        # num_top_words largest weights in descending order.
        ranked_indices = weights.argsort()[:-num_top_words - 1:-1]
        top_words = [feature_names[idx] for idx in ranked_indices]
        print(" ".join(top_words))

# load_newsgroup_documents: load the pickled 20newsgroups training documents and labels
def load_newsgroup_documents(data_path="./assets/20newsgroups_train_data.pickle",
                             labels_path="./assets/20newsgroups_train_labels.pickle"):
    """Load the pickled training documents and their labels.

    Parameters
    ----------
    data_path : path of the pickle file holding the training documents.
    labels_path : path of the pickle file holding the training labels.

    Returns
    -------
    (documents_train, labels_train) : the two unpickled objects, in that order.

    Raises
    ------
    OSError if either file cannot be opened; whatever ``pickle.load`` raises
    on a corrupt file.
    """
    # The Coursera environment must be self-contained and so APIs that do external fetching
    # aren't allowed. So we use pickle files that can be stored locally instead of the following
    # API calls.
    # dataset_train   = fetch_20newsgroups(subset = 'train', shuffle=True, random_state=42, remove=('headers', 'footers', 'quotes'))
    # dataset_test    = fetch_20newsgroups(subset = 'test', shuffle=True, random_state=42, remove=('headers', 'footers', 'quotes'))

    # NOTE: pickle.load can execute arbitrary code, so only point these paths
    # at trusted files.
    # The with-block closes both handles even if unpickling raises.
    with open(data_path, "rb") as pickle_train_data, \
            open(labels_path, "rb") as pickle_train_labels:
        documents_train = pickle.load(pickle_train_data)
        labels_train = pickle.load(pickle_train_labels)

    return documents_train, labels_train

# Load the training documents and labels once at notebook level; the
# vectorizer / classifier cells further down read these two globals.
documents_train, labels_train = load_newsgroup_documents()

## Question 1 (20 points)  The choice of text processing can impact final classification performance.

There are many different parameter settings for Vectorizer objects in scikit-learn. Small changes in these settings can result in very different text representations and significant changes in final classifier accuracy. For this question you'll train a commonly-used type of text classifier, Multinomial Naive Bayes, using three different input representations for text, to see the effect of different parameter choices on classifier training set accuracy.

Follow these steps:
1. Create a TfidfVectorizer object (let's call it A) with the following settings:

    `max_features = 10000, # only top 10k by freq`
    
    `lowercase = False, # keep capitalization`
    
    `ngram_range = (1,2), # include 2-word phrases`
    
    `min_df=10,  # note: absolute count of documents`
    
    `max_df=0.95,   # note: % of docs in collection`
    
    `stop_words='english'`
    
    
2. Create a CountVectorizer object (let's call it B) with the same settings:

    `max_features = 10000, # only top 10k by freq`
    
    `lowercase = False, # keep capitalization`
    
    `ngram_range = (1,2), # include 2-word phrases`
    
    `min_df=10,  # note: absolute count of doc`
    
    `max_df=0.95,   # note: % of docs`
    
    `stop_words='english'`
    
3. Create a TfidfVectorizer object (let's call it C) with the settings:

    `max_features = 10000, # only top 10k by freq`
    
    `lowercase = False, `
    
    `ngram_range = (1,2), `
    
    `min_df=200,  # note: absolute count of docs`
    
    `max_df=0.95  # note: % of docs` 
    
    
4. Using the training data `documents_train`, along with the ground truth labels `labels_train`, train three Naive Bayes classifiers, corresponding to choices A, B, and C of vectorizer.

5. Normally we'd compute the accuracy of these classifiers on a test set, but for this question we're interested more in the potential upper bound on performance that is achievable with text representation choices A, B, or C.  Thus you should compute, for each of the three classifiers, the accuracy on the *training set*.

6. Your function should return these three accuracy scores as a tuple with three floats: (accuracy_A, accuracy_B, accuracy_C).

It is instructive to examine the difference in accuracy across the three different representations. 


In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Build the three document-term matrices described in Question 1.
# A: tf-idf weights, English stop words removed, min_df=10.
A = TfidfVectorizer(max_features=10000, lowercase=False, ngram_range=(1, 2),
                    min_df=10, max_df=0.95, stop_words='english').fit_transform(documents_train)

# B: raw counts with the same settings as A.
B = CountVectorizer(max_features=10000, lowercase=False, ngram_range=(1, 2),
                    min_df=10, max_df=0.95, stop_words='english').fit_transform(documents_train)

# C: tf-idf weights, no stop-word removal and a much higher min_df (200).
C = TfidfVectorizer(max_features=10000, lowercase=False, ngram_range=(1, 2),
                    min_df=200, max_df=0.95).fit_transform(documents_train)

# Train one Multinomial Naive Bayes classifier per representation and predict
# on the very same training matrix (training-set accuracy is what Q1 asks for).
model1 = MultinomialNB().fit(A, labels_train)
model1_pred = model1.predict(A)
model2 = MultinomialNB().fit(B, labels_train)
model2_pred = model2.predict(B)
model3 = MultinomialNB().fit(C, labels_train)
model3_pred = model3.predict(C)

# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first.
accuracy_1 = accuracy_score(labels_train, model1_pred)
accuracy_2 = accuracy_score(labels_train, model2_pred)
accuracy_3 = accuracy_score(labels_train, model3_pred)

print(accuracy_1, accuracy_2, accuracy_3)


0.8352046680222792 0.7694279904517726 0.5562726549376713


In [5]:
def answer_text_processing():
    """Training-set accuracy of Multinomial Naive Bayes under vectorizers A, B and C.

    The accuracies are computed from ``documents_train`` / ``labels_train``
    rather than pasted in from an earlier run, so the answer cannot drift
    from the data or the library version actually in use.

    Returns
    -------
    tuple of three floats: (accuracy_A, accuracy_B, accuracy_C).
    """
    # Imported locally so the function does not depend on another cell's imports.
    from sklearn.metrics import accuracy_score
    from sklearn.naive_bayes import MultinomialNB

    # Settings common to all three vectorizers in the question.
    shared = dict(max_features=10000, lowercase=False, ngram_range=(1, 2), max_df=0.95)
    vectorizers = (
        TfidfVectorizer(min_df=10, stop_words='english', **shared),   # A
        CountVectorizer(min_df=10, stop_words='english', **shared),   # B
        TfidfVectorizer(min_df=200, **shared),                        # C
    )

    accuracies = []
    for vectorizer in vectorizers:
        features = vectorizer.fit_transform(documents_train)
        classifier = MultinomialNB().fit(features, labels_train)
        # Deliberately scored on the training data: the question asks for the
        # achievable upper bound, not held-out performance.
        predictions = classifier.predict(features)
        accuracies.append(float(accuracy_score(labels_train, predictions)))

    result = tuple(accuracies)

    return result

In [6]:
# Visible sanity checks for Q1: the answer must be a 3-tuple of floats.
stu_ans = answer_text_processing()

assert isinstance(stu_ans, tuple), "Q1: Your function should return a tuple."
assert len(stu_ans) == 3, "Q1: Your tuple should contain three floats."

# Both built-in floats and NumPy floating scalars are accepted.
for i, item in enumerate(stu_ans):
    assert isinstance(item, (float, np.floating)), f"Q1: Your answer at index {i} should be a float number. "

# Some hidden tests

# Remove the temporary name from the notebook namespace.
del stu_ans

## Question 2 (30 points). Latent Semantic Indexing and the vocabulary gap.

One of the original motivations for Latent Semantic Indexing was overcoming the `vocabulary gap` in information retrieval.
A query like `economic budget` should match strongly against text like `government spending on the economy` even though they don't have any exact keywords in common.

In this question we'll create a demonstration of the power of Latent Semantic Indexing to do semantic matching. In the first part, you'll run LSI and use the reduced document matrix to do semantic matching of a query against other text that has no terms explicitly in common.

In the second part, you'll see how this semantic matching is happening by computing the related terms that are included in a query expanded using LSI's latent topics.

### Part 2.1 (15 points) Use the reduced document matrix from LSI to do semantic matching of a query against a document.

As a first step, run the code below that we've provided that creates a tf.idf vectorizer and applies it to the 20newsgroups training set. It also runs LSI (in reality a TruncatedSVD) with a latent space of 200 dimensions.

Suppose we have a query "economic budget" that has the tf.idf vector $q$, with shape 1 x num_terms. We can obtain this vector simply by using vectorizer.transform on the text. Think of the matrix $U_k$ as the super operator that converts from original term space to latent semantic space. To expand text $q$ with related terms according to LSI, compute the expanded query $q_k$ using the formula 

$q_k = \Sigma^{-1}_k U_k \cdot q$. 

With this formula, you'll "expand" both the query and the document vectors to add related terms, and then compute the similarity match between them.

Let's walk through this step. (1) The $\Sigma^{-1}_k$ are just the singular values returned by LSI, but raised to the power -1. (2) $U_k \cdot q$ is the dot product between the vector for $q$ and the term-topic matrix $U_k$ returned by LSI (one row per term, one column per latent topic). Then you just multiply components (1) and (2) to obtain the vector for the expanded text. Think of (1) as just a normalization that scales the LSI latent factor weights (the 'topics') appropriately.

For this question, use cosine similarity to compute the similarity match between any two pieces of text, no matter what their vector representation.

With the formula above, consider the query `"economic budget"` being matched against the (very) short document `"government spending on the economy"`.

Your function should return a tuple of two floats: the cosine similarity score (from sklearn.metrics.pairwise) of (a) the original query and document vectors and (b) the LSI-expanded query and document vectors using the method above.

Did LSI help overcome the vocabulary gap?

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

### Run this preamble code to run LSI. We've also given the line of code that gets the resulting U matrix. 
# Unigram tf.idf vectorizer: terms must occur in at least 2 documents and in at
# most 95% of them, English stop words are removed, and the vocabulary is
# capped at 10000 features.
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,1), 
                                   min_df=2,  
                                   max_df=0.95,  
                                   stop_words='english',
                                   max_features = 10000
                                  ) # stop_words='english' selects the default English stopword list

# Document-term matrix (one row per document) and the term for each column.
tfidf_documents = tfidf_vectorizer.fit_transform(documents_train)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LSI does truncated SVD on the document-term matrix of tf.idf term-weights.
# The matrix we got back from the vectorizer is a 
# document-term matrix, i.e. one row per document.
n_topics = 200
lsi = TruncatedSVD(n_components=n_topics, random_state=0)

# To match the examples and development of LSI in 
# our lectures, we're going to 
# take the transpose of the document-term matrix to give 
# TruncatedSVD the term-document matrix as input.

# This is the matrix the assignment calls U_k:  num_term_features x num_topics
# NOTE(review): scikit-learn documents fit_transform as returning the reduced
# version of the input (the left singular vectors scaled by the singular
# values), not the bare U_k -- confirm this matches the lecture's convention.
reduced_term_matrix = lsi.fit_transform(np.transpose(tfidf_documents)) 

In [8]:
# Quick shape check of the LSI inputs and outputs.
print(len(documents_train))        # number of training documents
print(tfidf_documents.shape)       # document-term matrix
print(reduced_term_matrix.shape)   # one row per term, one column per latent topic
print(lsi.components_.shape)       # one row per latent topic
print(lsi.singular_values_.shape)  # one singular value per latent topic

11311
(11311, 10000)
(10000, 200)
(200, 11311)
(200,)


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import scipy as sp
def answer_semantic_similarity_a():
    """Cosine similarity of a query and a short document, before and after LSI expansion.

    The query ``'economic budget'`` and the document
    ``'government spending on the economy'`` are vectorized with
    ``tfidf_vectorizer``. Each tf.idf row is then multiplied by
    ``reduced_term_matrix`` and scaled by the reciprocal of
    ``lsi.singular_values_`` to obtain its expanded representation.

    Returns
    -------
    tuple of two floats: (similarity of the raw tf.idf vectors,
    similarity of the LSI-expanded vectors).
    """
    query_tfidf = tfidf_vectorizer.transform(['economic budget'])
    document_tfidf = tfidf_vectorizer.transform(['government spending on the economy'])

    # Sigma_k^{-1}: element-wise reciprocal of the singular values.
    inverse_sigma = (1/lsi.singular_values_)

    def expand(tfidf_row):
        # Map the (densified) tf.idf row into the latent space, then rescale.
        return inverse_sigma * np.dot(tfidf_row.toarray(), reduced_term_matrix)

    raw_similarity = cosine_similarity(query_tfidf, document_tfidf).item(0)
    lsi_similarity = cosine_similarity(expand(query_tfidf), expand(document_tfidf)).item(0)

    return (raw_similarity, lsi_similarity)

In [10]:
# stu_ans = answer_semantic_similarity_a()
# stu_ans

In [11]:
# Visible sanity checks for Q2a: the answer must be a 2-tuple of floats.
stu_ans = answer_semantic_similarity_a()

assert isinstance(stu_ans, tuple), "Q2a: Your function should return a tuple. "
assert len(stu_ans) == 2, "Q2a: Your tuple should contain two floats."

# Both built-in floats and NumPy floating scalars are accepted.
for i, item in enumerate(stu_ans):
    assert isinstance(item, (float, np.floating)), f"Q2a: Your answer at index {i} should be a float number. "

# Some hidden tests

# Remove the temporary name from the notebook namespace.
del stu_ans

### Part 2.2 (15 points): We want to understand this semantic matching ability a bit more: what terms does LSI think are similar?

To understand why the LSI-expanded vectors get the results they do, we're going to look at what the operator $U$ does to text. In particular, the term-term matrix $UU^T$ tells us the term expansion behavior of this LSI model. Think of the term-term matrix like an operator that first maps a term to the latent space L_k (using $U$), then back again from L_k to term space (using $U$ transpose). The $(i,j)$ entry of $UU^T$ is a kind of *association weight* between term $i$ and term $j$.

Write a function to get the most related terms (according to LSI) for the word "economy". To do this:

1. Compute the term-term matrix from the matrix U  (the reduced_term_matrix variable).
2. Use the term-term matrix to get the association weights of all words related to the term "economy"
3. Sort by descending weight value.
4. Your function should return the top 5 words and their weights as a list of (string, float) tuples.

Do the related terms match your subjective similarity judgment?

In [12]:
def answer_semantic_similarity_b():
    """Top 5 terms most strongly associated with "economy" under the LSI model.

    The association weight between terms i and j is entry (i, j) of the
    term-term matrix ``U U^T`` with ``U = reduced_term_matrix``. Only the row
    for "economy" is needed, so that single row is computed as
    ``U @ U[economy]`` instead of materialising the full
    num_terms x num_terms matrix.

    Returns
    -------
    list of five (term, weight) tuples, (str, float), sorted by descending weight.

    Raises
    ------
    KeyError if "economy" is not in the vectorizer's vocabulary.
    """
    top_n = 5

    term_index = tfidf_vectorizer.vocabulary_['economy']

    # Row `term_index` of U U^T: association of "economy" with every term.
    association_weights = np.dot(reduced_term_matrix, reduced_term_matrix[term_index])

    # argsort is ascending; reverse it and keep the strongest associations.
    top_related_term_indexes = association_weights.argsort()[::-1][:top_n]

    final_list = [(tfidf_feature_names[this_term], float(association_weights[this_term]))
                  for this_term in top_related_term_indexes]

    return final_list

In [13]:
# stu_ans = answer_semantic_similarity_b()
# stu_ans

In [14]:
# Visible sanity checks for Q2b: the answer must be a list of five
# (term, weight) tuples.
stu_ans = answer_semantic_similarity_b()

assert isinstance(stu_ans, list), "Q2b: Your function should return a list. "
assert len(stu_ans) == 5, "Q2b: Your list should contain five elements (the term, score tuples)."

# Each entry: (str term, float weight); NumPy floating scalars are accepted too.
for i, item in enumerate(stu_ans):
    assert isinstance(item, tuple), f"Q2b: Your answer at index {i} should be a tuple. "
    assert isinstance(item[0], str), f"Q2b: The first element of your tuple at index {i} should be a string. "
    assert isinstance(item[1], (float, np.floating)), f"Q2b: The second element of your tuple at index {i} should be a float. "

# Some hidden tests

# Remove the temporary name from the notebook namespace.
del stu_ans

## Question 3 (20 points) Semantic similarity: comparing your ranking with word2vec's ranking

Before proceeding, set the provided variable `my_ranking` below to your own intuitive ranking of the words in the list as how similar they are to the word `party`. For example, if you think 'event' is the most similar word to 'party', it should be placed second in the list after 'party' and so on.

You are to compute the system's ranking (let's call the variable system_ranking) of the semantic similarity of the words to `party`, according to word2vec. Then you'll compare your ranking to the system ranking using the Spearman R correlation: scipy.stats.spearmanr(.)

Your function should return a tuple with three elements in this order.
   1. The my_ranking tuple containing your subjective ranking of terms according to their semantic similarity with the target word.
   2. A tuple (the value of the system_ranking variable) of the same set of terms in my_ranking, but ranked according to semantic similarity computed using the word embedding, from most to least similar.  The system ranking should use the word embedding object loaded at the beginning of this question (this is a W2VTransformer object containing a word2vec embedding).
   3. A tuple containing the output of the `spearmanr` function between the my_ranking and system_ranking tuples. This will be a tuple containing two floats: the spearman r correlation, and a p-value.  See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html for details. **Use the term tuples, `my_ranking` and `system_ranking`, as inputs to the `spearmanr` function directly, instead of the ranking indices.**


Do you notice any differences between your ranking and the system ranking? What might explain any differences you see?

In [15]:
# EDIT THE FOLLOWING variable my_ranking.
#
# The target word is **'party'**, which you should keep first in the tuple. Edit the order of the rest of the words
# in my_ranking so that it reflects
# YOUR subjective ranking for how semantically similar each word is to the word 'party'. For example,
# if you think 'event' is the most similar word to 'party', it should be placed second in the list after 'party'
# and so on. Make sure you use all the words : just re-order them.
#my_ranking = ('party', 'bicycle', 'vote', 'lead', 'election', 'champagne', 'event', 'fun', 'budget')
my_ranking = ('party', 'event', 'fun', 'champagne', 'budget', 'election', 'vote', 'lead', 'bicycle')

#### Before proceeding, you need to run the following code to load the pre-trained word2vec model.

In [16]:
#### We need to load the pre-trained word2vec model. 
#### The result is an instance of the class W2VTransformer(size=100, min_count=1, seed=2)
#### from gensim.sklearn_api import W2VTransformer

import pickle

# NOTE: pickle.load can execute arbitrary code; only load this file from a
# trusted source.
# The with-block closes the file even if unpickling raises.
with open("./assets/text8_W2V.pickle", "rb") as f:
    text8_model = pickle.load(f)

In [17]:
from gensim.sklearn_api import W2VTransformer
from scipy import stats
from sklearn.metrics.pairwise import cosine_similarity


semantic_similarities = []
first_tuple_elements = []

def answer_word2vec():
    """Compare the subjective ranking ``my_ranking`` with word2vec's ranking.

    Every term in ``my_ranking`` is embedded with ``text8_model`` and scored by
    cosine similarity against the first term (the target word). The terms are
    then sorted from most to least similar to form the system ranking, and the
    two term tuples are passed directly to ``scipy.stats.spearmanr`` as the
    assignment requires.

    The result is computed on every call from the current ``my_ranking``, using
    only local state, so repeated calls return the same answer and the answer
    can never disagree with the ranking defined above.

    Returns
    -------
    tuple of three elements:
        1. ``my_ranking`` as a tuple of terms,
        2. the system ranking as a tuple of the same terms,
        3. ``(spearman_r, p_value)`` as a tuple of two floats.
    """
    terms = tuple(my_ranking)

    # One embedding row per term; row 0 belongs to the target word.
    embeddings = text8_model.transform(terms)
    target_vector = embeddings[0].reshape(1, -1)

    semantic_similarities = [
        (term, cosine_similarity(target_vector, embeddings[i].reshape(1, -1)).item(0))
        for i, term in enumerate(terms)
    ]

    # Most similar first; sorted() is stable, so ties keep my_ranking's order.
    semantic_similarities_sorted = sorted(semantic_similarities, key=lambda tup: tup[1], reverse=True)
    system_ranking = tuple(term for term, _ in semantic_similarities_sorted)

    scipy_spearman = stats.spearmanr(terms, system_ranking)
    spearman = (float(scipy_spearman[0]), float(scipy_spearman[1]))

    final_answer = (terms, system_ranking, spearman)

    return final_answer

In [18]:
# Call the Q3 answer function and leave the result as the cell's last
# expression so the notebook displays it.
stu_ans = answer_word2vec()
stu_ans

(('party',
  'event',
  'fun',
  'champagne',
  'budget',
  'election',
  'vote',
  'lead',
  'bicycle'),
 ('party',
  'election',
  'vote',
  'budget',
  'event',
  'lead',
  'champagne',
  'bicycle',
  'fun'),
 (-0.049999999999999996, 0.8983528043506301))

In [19]:
# The permitted term set for Q3; 'party' (the target word) must come first.
reference_terms = ('party', 'bicycle', 'vote', 'lead', 'election','champagne', 'event', 'fun', 'budget')

stu_ans = answer_word2vec()

assert isinstance(stu_ans, tuple), "Q3: Your function should return a tuple. "
assert len(stu_ans) == 3, "Q3: Your tuple should contain three elements: a tuple of strings (my_ranking), a tuple of strings (system_ranking), a tuple with spearman output (2 floats)."

# check my_rankings
assert isinstance(stu_ans[0], tuple), "Q3: Your first element must be a tuple (of strings). "
assert len(stu_ans[0]) == len(reference_terms), "Q3: Your my_rankings tuple doesn't have the expected number of terms."
assert stu_ans[0][0] == reference_terms[0], "Q3: Your my_rankings tuple must have 'party' as the first term."
assert set(stu_ans[0]) == set(reference_terms), "Q3: Your my_rankings tuple is not a permutation of the permitted terms." # must be a permutation of the official term set

# check system_rankings
assert isinstance(stu_ans[1], tuple), "Q3: Your second element must be a tuple (of strings). "
assert len(stu_ans[1]) == len(reference_terms), "Q3: Your system_rankings tuple doesn't have the expected number of terms."
# Index 1 is the system ranking; index 0 (my_ranking) was already checked above.
assert stu_ans[1][0] == reference_terms[0], "Q3: Your system_rankings tuple must have 'party' as the first term."
assert set(stu_ans[1]) == set(reference_terms), "Q3: Your system_rankings tuple is not a permutation of the permitted terms."  # must be a permutation of the official term set

# check spearmanr
assert isinstance(stu_ans[2], tuple), "Q3: Your third element must be a tuple (of two floats). "
assert len(stu_ans[2]) == 2, "Q3: Your spearman output tuple should contain two floats."
assert isinstance(stu_ans[2][0], (float, np.floating)), "Q3: Your spearman corr should be a float. "
assert isinstance(stu_ans[2][1], (float, np.floating)), "Q3: Your spearman p-val should be a float. "


# Some hidden tests

# Remove the temporary name from the notebook namespace.
del stu_ans

## Question 4: (30 points) Topic coherence.

One measure of topic model quality that is used e.g. to determine the optimal number of topics for a corpus is *topic coherence*. This is a measure of how semantically related the top terms in a topic model are. Topic models with low coherence tend to be filled with seemingly random words and hard to interpret, while high coherence usually indicates a clear semantic theme that's easily understood.

With their ability to represent word semantics, word embeddings are an ideal tool for computing topic coherence. In part 1, you'll implement a simple topic coherence function. In part 2, you'll apply that function to NMF topic modeling to find a setting for the number of topics that gives maximally coherent topic models.

We're going to use the same `text8_model` W2VTransformer object, which implements the word2vec embedding, that you loaded for the previous question.

### Part 4.1. (15 points) Average semantic distance as a text coherence measure.
Implement a function that takes a list of terms (strings) as input and returns a positive float indicating their semantic coherence. Here is the algorithm you should use:

1. For each input term, compute its word2vec embedding vector. One problem you might encounter is that some terms may not exist in the word2vec model. You get a "KeyError" exception when trying to transform that "out-of-vocabulary" term. You should ignore these terms: one way to do this is by wrapping your embedding call with a try/except statement that catches the KeyError and just ignores that word, and continues processing.

2. Once you have the list of embedding vectors for the input terms, compute their pairwise cosine similarity. If there are $n$ embedding vectors, this step will result in an $n \times n$ matrix $D$.  If for some reason there are no input terms remaining (they are all out-of-vocabulary) just return 0.

3. Obviously the most similar word to a term is itself, indicated by a "1" on the diagonal of $D$. But we don't want those: we only care about the pairwise distances to *other* terms, so to deal with that case, set the diagonal to zero.

4. Return the mean over all pairwise distances in D (with self-distances set to zero).  This is our simple coherence measure.

Be sure to try it out on some samples. For example, here's what our reference implementation returns:

`topical_coherence(['car', 'airplane', 'taxi', 'bus', 'vehicle', 'transport'])`

0.46063321000999874

`topical_coherence(['apple', 'banana', 'cherry', 'watermelon', 'lemon', 'orange'])`

0.43306025200419956

`topical_coherence(['possible', 'mean', 'volcano', 'feature', 'record', 'quickly'])`

0.1150558124192887

Your function should return the above measure of topic coherence for the following three lists, as a tuple of three corresponding floats:

`['train', 'car', 'bicycle', 'bus', 'vehicle', 'transport']`

`['scsi', 'drive', 'computer', 'storage', 'megabyte']`

`['introduction', 'pickle', 'guard', 'red', 'valiant']`


In [20]:
def topical_coherence(terms):
    """Mean pairwise cosine similarity of the word2vec embeddings of ``terms``.

    Parameters
    ----------
    terms : list of str.

    Returns
    -------
    float : mean of the n x n cosine-similarity matrix of the in-vocabulary
    terms with its diagonal (self-similarity) set to zero, or 0.0 when no
    term is in the embedding's vocabulary.
    """
    vectors = []
    for term in terms:
        try:
            # Embed one term at a time so a single out-of-vocabulary word
            # does not discard the whole list.
            vectors.append(text8_model.transform([term])[0])
        except KeyError:
            # Out-of-vocabulary term: ignore it, as the question specifies.
            continue

    if not vectors:
        return 0.0

    pairwise_cosine_matrix = cosine_similarity(np.array(vectors))
    # A term's similarity to itself should not count towards coherence.
    np.fill_diagonal(pairwise_cosine_matrix, 0)
    return pairwise_cosine_matrix.mean()


def answer_coherence_a():
    """Topic coherence of the three term lists given in Part 4.1.

    Returns
    -------
    tuple of three floats, one coherence score per list, in the order the
    lists appear in the question.
    """
    list1 = ['train', 'car', 'bicycle', 'bus', 'vehicle', 'transport']
    list2 = ['scsi', 'drive', 'computer', 'storage', 'megabyte']
    list3 = ['introduction', 'pickle', 'guard', 'red', 'valiant']

    return tuple(topical_coherence(terms) for terms in (list1, list2, list3))

In [21]:
# Visible sanity checks for Q4.1: the answer must be a 3-tuple of floats.
stu_ans = answer_coherence_a()

assert isinstance(stu_ans, tuple), "Q4.1: Your function should return a tuple. "
assert len(stu_ans) == 3, "Q4.1: Your function should return a tuple of three elements. "

# Both built-in floats and NumPy floating scalars are accepted.
for i, item in enumerate(stu_ans):
    assert isinstance(item, (float, np.floating)), f"Q4.1: Your answer at index {i} should be a float number. "


# Some hidden tests

# Remove the temporary name from the notebook namespace.
del stu_ans

### Part 4.2 (15 points) Applying semantic coherence to topic model selection.

Now you'll use the semantic coherence measure you developed in Part 1 with topic models computed using Non-Negative Matrix Factorization.

Implement a simple loop that trains an NMF topic model, for number of topics **from 2 to 10 inclusive**. At each iteration, compute your topic coherence measure on the **top 10** words for each topic. Then compute the *median* topic coherence over all these topic scores.

Your function should return a list of 9 median coherence scores, corresponding to each choice of the number of topics to use with NMF.  Which choice gives the highest median semantic coherence?

When creating the NMF object, use these parameter settings: `random_state=42, init="nndsvd"`.  

In [22]:
### Use the following code to prepare input to the NMF topic model.
### It assumes you've loaded the 20newsgroups variables at the beginning of this assignment
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer_NMF = TfidfVectorizer(max_features = 20000, # keep only the top 20k terms by frequency
                                       lowercase = True, # drop capitalization
                                       ngram_range = (1,1), # unigrams only
                                       min_df=2,  # note: absolute count of docs
                                       max_df=0.05,   # note: fraction of docs (5%)
                                       token_pattern = r'\b[a-z]{3,12}\b',   # remove short, non-word-like terms
                                       stop_words='english') # default English stopwords

# Document-term matrix fed to NMF, and the term for each of its columns.
tfidf_documents_NMF = tfidf_vectorizer_NMF.fit_transform(documents_train)
feature_names_NMF = tfidf_vectorizer_NMF.get_feature_names()

In [23]:
from sklearn import decomposition


def calc_cosine_score(item):
    """Topic coherence of a list of terms: mean pairwise cosine similarity.

    Parameters
    ----------
    item : list of str, e.g. the top words of one topic.

    Returns
    -------
    float : mean of the n x n cosine-similarity matrix of the in-vocabulary
    terms' word2vec embeddings, with self-similarities set to zero; 0.0 when
    none of the terms is in the embedding's vocabulary.
    """
    vectors = []
    for term in item:
        try:
            # Embed term by term so one out-of-vocabulary word only drops
            # itself instead of leaving the whole list without a score.
            vectors.append(text8_model.transform([term])[0])
        except KeyError:
            print('term not found')

    if not vectors:
        return 0.0

    # One vectorised call replaces n*n single-pair cosine_similarity calls.
    matrix = cosine_similarity(np.array(vectors))
    # A term's similarity to itself should not count towards coherence.
    np.fill_diagonal(matrix, 0)
    # Accumulate in float64 regardless of the embedding dtype.
    return float(matrix.mean(dtype=np.float64))


def answer_coherence_b():
    """Median topic coherence of NMF models with 2 through 10 topics.

    For each topic count an NMF model (``random_state=42``, ``init='nndsvd'``)
    is fitted on ``tfidf_documents_NMF``. Every topic is scored by
    ``calc_cosine_score`` on its 10 highest-weighted terms, and the median of
    those scores is recorded for the model.

    Returns
    -------
    list of nine medians, ordered by topic count 2, 3, ..., 10.
    """
    words_per_topic = 10
    median_scores = []

    for n_topics in range(2, 11):
        model = decomposition.NMF(n_components=n_topics, random_state=42, init='nndsvd')
        model.fit(tfidf_documents_NMF)

        topic_scores = []
        for topic_weights in model.components_:
            # Indices of the heaviest terms, largest weight first.
            heaviest = topic_weights.argsort()[::-1][:words_per_topic]
            topic_terms = [feature_names_NMF[idx] for idx in heaviest]
            topic_scores.append(calc_cosine_score(topic_terms))

        median_scores.append(np.median(topic_scores))

    return median_scores

In [24]:
# stu_ans = answer_coherence_b()
# stu_ans

In [25]:
# Visible sanity checks for Q4.2: the answer must be a list of nine floats,
# one per topic count from 2 through 10.
stu_ans = answer_coherence_b()

assert isinstance(stu_ans, list), "Q4.2: Your function should return a list. "
assert len(stu_ans) == 9, "Q4.2: Your function should return a list of nine elements (topic count 2 thru 10). "

# Both built-in floats and NumPy floating scalars are accepted.
for i, item in enumerate(stu_ans):
    assert isinstance(item, (float, np.floating)), f"Q4.2: Your answer at index {i} should be a float number. "


# Some hidden tests

# Remove the temporary name from the notebook namespace.
del stu_ans