In [1]:
# Silence TensorFlow's native (C++) log output.
# The environment variable is assigned before the `import tensorflow` line
# below so that it is already set when the library is loaded.
# NOTE(review): '3' is presumably the most restrictive level, with lower
# values letting more message classes through -- confirm against the
# TensorFlow documentation.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}
import tensorflow as tf

In [2]:
# import dependencies
import json
import itertools
from collections import Counter

import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the "en_core_web_sm" spaCy pipeline. The resulting `sp` object is
# called by `tokenizer` further down to parse query text.
sp = spacy.load("en_core_web_sm")

In [4]:
# Load the article summaries. Each entry is a dict; the cells below read its
# "title", "text" and "tokenized_text" keys.
# The encoding is given explicitly so that reading does not depend on the
# platform's default text encoding.
with open('data/summaries.json', 'r', encoding='utf-8') as infile:
    summaries = json.load(infile)

### Build a corpus vocabulary

In [5]:
# Collect the token list of every document
tokenized_texts = [i["tokenized_text"] for i in summaries]

# Flatten, de-duplicate and sort the tokens.
# Sorting pins the vocabulary order, which is also the column order of every
# TF-IDF vector built from it; the iteration order of a bare set() of strings
# can change from one interpreter session to the next.
# (assumes all tokens are strings, so they are mutually comparable)
vocab = sorted(set(itertools.chain.from_iterable(tokenized_texts)))

In [6]:
# Write the vocabulary list to disk as JSON
with open('data/vocab.json', mode='w') as vocab_file:
    json.dump(obj=vocab, fp=vocab_file)

In [7]:
# Per-document term counts: one Counter per summary, in corpus order
docs_token_counter = [Counter(entry["tokenized_text"]) for entry in summaries]

### Count the documents containing each vocabulary token

In [8]:
# Document frequency: for every vocabulary token, the number of documents
# that contain it at least once.
# Tallied in a single pass over the documents (each distinct token of a
# document adds 1) instead of scanning every document once per vocabulary
# token.
docs_per_token = Counter()
for doc_counts in docs_token_counter:
    docs_per_token.update(doc_counts.keys())

# Plain dict keyed by token, in vocabulary order
number_docs_with_token = {token: docs_per_token[token] for token in vocab}

In [9]:
# Spot check: number of documents in which the token 'ebola' appears
number_docs_with_token['ebola']

2

### Compute TF-IDF vectors of the documents

In [10]:
# Inverse document frequency depends only on the corpus, so compute it once
# per vocabulary token instead of once per (document, token) pair.
corpus_size = len(summaries)
idf_by_token = {
    token: np.log(corpus_size / number_docs_with_token[token])
    for token in vocab
}

# Attach a TF-IDF vector (one entry per vocabulary token, in vocabulary order)
# to every summary. Local names deliberately avoid `tf`, which is the alias of
# the TensorFlow import at the top of the notebook.
for summary, token_counts in zip(summaries, docs_token_counter):
    n_tokens = len(summary["tokenized_text"])
    summary['tf_idf'] = [
        # term frequency = occurrences / document length; a document with no
        # tokens gets an all-zero vector instead of a ZeroDivisionError
        (token_counts[token] / n_tokens if n_tokens else 0.0) * idf_by_token[token]
        for token in vocab
    ]

In [11]:
# Persist the summaries, each now carrying its 'tf_idf' vector, back to the
# JSON file they were loaded from
with open('data/summaries.json', mode='w') as summaries_file:
    json.dump(obj=summaries, fp=summaries_file)

### Vectorize query

In [12]:
# Example free-text query; it is passed to `search_tfidf` further down
query = "highest pandemic casualties"

In [13]:
# Reuse the tokenizer from Milestone 1 to tokenize search queries

def tokenizer(document):
    """Turn a text into a list of lemmas.

    The text is lower-cased and parsed with the module-level spaCy pipeline
    `sp`. Stop words, punctuation and tokens whose dependency label is blank
    are dropped; the lemma of every remaining token is returned, in order.

    Args:
        document: the text to tokenize (a string).

    Returns:
        A list of lemma strings.
    """
    lemmas = []
    for tok in sp(document.lower()):
        if tok.is_stop or tok.is_punct:
            continue
        # Skip tokens with an empty/whitespace-only dependency label
        # (presumably whitespace tokens -- TODO confirm)
        if not tok.dep_.strip():
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [14]:
# Reuse the workflow for article Tf-Idf calculation
# to build a vectorizer function for search queries

def vectorize(query, vocab = vocab):
    """Build the TF-IDF vector of a search query.

    The query is tokenized with `tokenizer`; each vocabulary token then gets
    (occurrences in query / query length) * log(corpus size / document
    frequency), using the module-level `summaries` and
    `number_docs_with_token`.

    Args:
        query: the search text.
        vocab: ordered token list defining the vector's entries. Defaults to
            the corpus vocabulary as it was when this function was defined.

    Returns:
        A list with one TF-IDF weight per entry of `vocab`. If the query
        yields no tokens (e.g. it is empty or made only of stop words), the
        vector is all zeros.

    Raises:
        KeyError: if `vocab` holds a token missing from
            `number_docs_with_token`.
    """
    query_tokenized = tokenizer(query)
    n_tokens = len(query_tokenized)

    # Nothing left after tokenization: return a zero vector instead of
    # dividing by zero below.
    if n_tokens == 0:
        return [0.0] * len(vocab)

    query_token_counter = Counter(query_tokenized)
    corpus_size = len(summaries)

    return [
        (query_token_counter[token] / n_tokens)
        * np.log(corpus_size / number_docs_with_token[token])
        for token in vocab
    ]

### Search documents with Sklearn

In [15]:
# Build a search function
def search_tfidf(query, docs):
    """Rank documents against a query by cosine similarity of TF-IDF vectors.

    Args:
        query: the search text; vectorized with `vectorize`.
        docs: iterable of dicts, each with a 'title' and a 'tf_idf' vector
            of the same length as the query vector.

    Returns:
        A list of {'title': ..., 'rank': ...} dicts for every document whose
        similarity to the query is greater than zero, sorted from most to
        least similar. Empty if `docs` is empty or nothing matches.
    """
    docs = list(docs)
    if not docs:
        return []

    # Query as a 1 x V row, documents as a D x V matrix, so all similarities
    # come from a single cosine_similarity call instead of one per document.
    query_arr = np.array(vectorize(query)).reshape(1, -1)
    doc_matrix = np.array([doc['tf_idf'] for doc in docs])
    scores = cosine_similarity(query_arr, doc_matrix)[0]

    # Keep only documents that share at least some weight with the query;
    # store the score as a plain Python float.
    rankings = [
        {'title': doc['title'], 'rank': float(score)}
        for doc, score in zip(docs, scores)
        if score > 0
    ]

    # return sorted results
    return sorted(rankings, key=lambda k: k['rank'], reverse=True)
# Run the example query against the whole corpus and keep the ranked hits
ranking = search_tfidf(query, summaries)

In [16]:
# Single-term query: which documents score above zero for "ebola"?
search_tfidf("ebola", summaries)

[{'title': 'Plague of Cyprian', 'rank': 0.11768601854662974},
 {'title': 'Science diplomacy and pandemics', 'rank': 0.07115769346687584}]

In [17]:
# Let's check whether the article 'Plague of Cyprian' has the word "ebola" in it
matching_texts = (
    entry["text"] for entry in summaries if entry["title"] == 'Plague of Cyprian'
)
for article_text in matching_texts:
    print(article_text)

The Plague of Cyprian was a pandemic that afflicted the Roman Empire about from AD 249 to 262. The plague is thought to have caused widespread manpower shortages for food production and the Roman army, severely weakening the empire during the Crisis of the Third Century. Its modern name commemorates St. Cyprian, bishop of Carthage, an early Christian writer who witnessed and described the plague. The agent of the plague is highly speculative because of sparse sourcing, but suspects have included smallpox, pandemic influenza and viral hemorrhagic fever (filoviruses) like the Ebola virus.
