<a href="https://colab.research.google.com/github/adrianmoses/text-search-nlp/blob/main/TextSearchNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install spacy



In [2]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 1.2 MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-py3-none-any.whl size=98051302 sha256=b242a9b2ed28e4d1ec469eadac4aca40e99bb7d3180204b98f18f47d6bfc2742
  Stored in directory: /tmp/pip-ephem-wheel-cache-n8z3lcyc/wheels/69/c5/b8/4f1c029d89238734311b3269762ab2ee325a42da2ce8edb997
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [3]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [4]:
!ls

drive  sample_data


In [13]:
import json
def tokenize_cdc_data():
    with open('./drive/MyDrive/cdc_sample_data.json') as f:
         data = json.load(f)
    for item in data:
        doc = nlp(item['text'].lower())
        item['tokenized_text'] = [token.lemma_ 
                                  for token in doc 
                                  if not token.is_stop
                                  and not token.is_punct
                                  and token.dep_]

    with open('./cdc_tokenized_sample_data.json', 'w') as nf:
        json.dump(data, nf)
        

In [14]:
tokenize_cdc_data()

In [19]:
from itertools import chain
from collections import Counter


In [48]:
def build_vocabulary(documents):
    with open('./cdc_tokenized_sample_data.json') as f:
        data = json.load(f)
    all_tokens = list(chain(*[item['tokenized_text'] for item in documents]))
    token_counter = Counter(all_tokens)
    return token_counter

In [49]:
def count_docs_with_token(token):
    doc_counter = 0
    for item in data:
        if token in item['tokenized_text']:
            doc_counter += 1
    return doc_counter

In [50]:
def compute_tfidfs(document):
    vocab = build_vocabulary(data)
    tf_idf = []
    for token, token_count in vocab.most_common():
        docs_with_token = count_docs_with_token(token)
        count_in_doc = Counter(document)[token]
        tf = count_in_doc / token_count
        idf = len(data) / docs_with_token 
        tf_idf.append(tf * idf)
    return tf_idf

In [51]:
for item in data:
    item['tf_idfs'] = compute_tfidfs(item['tokenized_text'])

In [52]:
with open('vocab.json', 'w') as vocab_file:
    vocab = build_vocabulary(data)
    json.dump(vocab, vocab_file)

In [53]:
with open('cdc_vectorized.json', 'w') as vec_file:
    json.dump(data, vec_file)

In [54]:
def tokenizer(input_string):
    doc = nlp(input_string.lower())
    tokens = [token.lemma_ 
                                  for token in doc 
                                  if not token.is_stop
                                  and not token.is_punct
                                  and token.dep_]
    return tokens

In [55]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def search_tfids(query, documents):
    tokens = tokenizer(query)
    tf_idfs = compute_tfidfs(tokens)
    doc_sim = [(doc, cosine_similarity(np.array([tf_idfs]), np.array([doc['tf_idfs']]))) for doc in documents]
    doc_sim.sort(key=lambda tup: tup[1])
    ranked_documents = [d[0] for d in doc_sim]
    return ranked_documents

In [56]:
docs = search_tfids("care", data)
[d['title'] for d in docs[:10]]

['Pandemic',
 'Antonine Plague',
 'Basic reproduction number',
 'Bills of mortality',
 'Cholera',
 'COVID-19 pandemic',
 'Crimson Contagion',
 'Disease X',
 'Event 201',
 'HIV/AIDS']

In [68]:
inverted_index = {}

for i, (token, count) in enumerate(vocab.most_common()):
    inverted_index[token] = []
    for item in data:
        tfidf = item['tf_idfs'][i]
        if tfidf != 0:
            inverted_index[token].append((item['title'], tfidf))

In [71]:
def search_inverted_index(query):
    tokens = tokenizer(query)
    doc_results = []
    for token in tokens:
        if token in inverted_index:
            doc_results.extend(inverted_index[token])
    title_results_map = {}
    for title, tf_idf in doc_results:
        if title in title_results_map:
            title_results_map[title] += tf_idf
        else:
            title_results_map[title] = tf_idf

    return sorted(title_results_map.items(), key=lambda tup: tup[1], reverse=True)

In [72]:
search_inverted_index("symptoms of swine flu")

[('Swine influenza', 16.34027777777778),
 ('Spanish flu', 3.25),
 ('Cholera', 2.4375),
 ('HIV/AIDS', 2.4375),
 ('COVID-19 pandemic', 0.8125),
 ('Pandemic', 0.3611111111111111),
 ('Unified Victim Identification System', 0.3611111111111111)]