# Imports

In [1]:
# Third-Party Imports
import spacy
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
import torch
from sentence_transformers import SentenceTransformer, util

# Standard Library Imports
import os
import sys
from math import inf

# Local Imports
from queries import get_text_cli
from get_documents import search
from get_documents import article_id

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bhekimaenetja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Utility Functions

In [2]:
def get_text(term=None):
    """Run ``search`` for a term, asking for one when none is supplied.

    Args:
        term: The search term. Any falsy value (``None``, ``""``) causes
            ``get_text_cli('Enter a search term')`` to be called to obtain
            one instead (presumably an interactive prompt -- confirm in
            ``queries``).

    Returns:
        Whatever ``search`` returns for the resolved term.
    """
    # ``or`` short-circuits, so the CLI helper only runs for a falsy term.
    resolved_term = term or get_text_cli('Enter a search term')
    return search(resolved_term)

def load_docs(dirname='corpus', encoding='utf-8'):
    """Read every document file in ``dirname`` into a dictionary.

    File names are expected to look like ``<id>-<title>.<ext>``. The
    extension is removed and the remainder is split on the *first* hyphen
    only, so titles that themselves contain hyphens or dots
    (``Q2-Jean-Luc Picard.txt``, ``Q3-St. Louis.txt``) keep their full
    title and still yield a two-element key.

    Args:
        dirname: Directory holding the documents. A relative path is
            resolved against the current working directory.
        encoding: Text encoding used to read each file.

    Returns:
        dict mapping ``(id, title)`` tuples to the file's text. A file name
        without a hyphen produces a one-element tuple key.

    Raises:
        OSError: If ``dirname`` cannot be listed or a file cannot be read.
    """
    # ``os.path.dirname('__file__')`` on the quoted literal is always '',
    # so the join leaves ``dirname`` as given.
    main_path = os.path.join(os.path.dirname('__file__'), dirname)
    corpus = dict()

    # Sorted for a reproducible insertion order across runs/platforms.
    for file in sorted(os.listdir(main_path)):
        path = os.path.join(main_path, file)

        # Skip hidden files (e.g. macOS ``.DS_Store``) and sub-directories;
        # neither is a corpus document.
        if file.startswith('.') or not os.path.isfile(path):
            continue

        stem = os.path.splitext(file)[0]
        id_and_name = tuple(stem.split('-', 1))

        with open(path, 'r', encoding=encoding) as f:
            corpus[id_and_name] = f.read()

    return corpus

def chunk_text(text, chunk_len):
    """Split ``text`` into chunks of whole sentences under a token budget.

    Sentences (from ``nltk.sent_tokenize``) are accumulated into the current
    chunk. When adding the next sentence would bring the chunk to
    ``chunk_len`` or more tokens (counted with ``nltk.word_tokenize``), the
    current chunk is closed and the sentence starts the next chunk, so no
    sentence is ever discarded.

    Args:
        text: The text to split.
        chunk_len: Token count at which a chunk is considered full.

    Returns:
        list of str: The chunks, each being its sentences joined by a single
        space. Empty chunks are never emitted, so empty input yields ``[]``.
        A single sentence that alone reaches ``chunk_len`` becomes a chunk
        of its own.
    """
    chunks = []
    current_sents = []

    for sent in nltk.sent_tokenize(text):
        candidate = " ".join(current_sents + [sent])

        # Only close a chunk that actually holds something; otherwise an
        # over-long first sentence would produce an empty chunk.
        if current_sents and len(nltk.word_tokenize(candidate)) >= chunk_len:
            chunks.append(" ".join(current_sents))
            # Carry the overflowing sentence into the next chunk.
            current_sents = [sent]
        else:
            current_sents.append(sent)

    if current_sents:
        chunks.append(" ".join(current_sents))

    return chunks

# Name of the model used when the caller does not supply one.
_DEFAULT_SIMILARITY_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
# Cache of already-instantiated default models, keyed by model name.
_SIMILARITY_MODEL_CACHE = {}


def cosine_similarity(text_1, text_2, model=None):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        text_1: First text to encode.
        text_2: Second text to encode.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.
            When ``None``, a ``SentenceTransformer`` for
            ``'sentence-transformers/all-MiniLM-L6-v2'`` is created on first
            use and reused on later calls instead of being rebuilt each time.

    Returns:
        float: ``util.pytorch_cos_sim`` of the two embeddings, converted with
        ``float()``.
    """
    if model is None:
        model = _SIMILARITY_MODEL_CACHE.get(_DEFAULT_SIMILARITY_MODEL)
        if model is None:
            model = SentenceTransformer(_DEFAULT_SIMILARITY_MODEL)
            _SIMILARITY_MODEL_CACHE[_DEFAULT_SIMILARITY_MODEL] = model

    embedding_1 = model.encode(text_1, convert_to_tensor=True)
    embedding_2 = model.encode(text_2, convert_to_tensor=True)

    return float(util.pytorch_cos_sim(embedding_1, embedding_2))

# WikiBot Query Parsing

In [3]:
import wikipediaapi
from string import punctuation

## Getting Searchable Entities

In [4]:
# Name of the spaCy pipeline used for entity extraction.
_SPACY_MODEL_NAME = "en_core_web_sm"
# Cache of loaded spaCy pipelines, keyed by pipeline name.
_SPACY_PIPELINES = {}


def get_named_entities(query):
    """Return the distinct named-entity strings spaCy finds in ``query``.

    The ``en_core_web_sm`` pipeline is loaded on the first call and reused
    afterwards rather than being reloaded on every call.

    Args:
        query: Text to analyse.

    Returns:
        set of str: ``ent.text`` for every entity in the parsed document's
        ``ents``.
    """
    # Initialise the nlp model once and keep it for later calls.
    nlp = _SPACY_PIPELINES.get(_SPACY_MODEL_NAME)
    if nlp is None:
        nlp = spacy.load(_SPACY_MODEL_NAME)
        _SPACY_PIPELINES[_SPACY_MODEL_NAME] = nlp

    # Get entities from the query.
    doc = nlp(query)
    return {ent.text for ent in doc.ents}

def word_tokenize(text):
    """Tokenize ``text`` and drop punctuation and English stop words.

    Args:
        text: Text to tokenize with ``nltk.word_tokenize``.

    Returns:
        list of str: Tokens in their original order and casing, excluding
        any token whose lowercase form is a single character from
        ``string.punctuation`` or an entry of NLTK's English stop-word list.
    """
    # A set gives constant-time membership tests; the previous list was
    # scanned linearly for every token.
    banned = set(punctuation)
    banned.update(nltk.corpus.stopwords.words("english"))

    return [
        w for w in nltk.word_tokenize(text)
        if w.lower() not in banned
    ]

def get_improper_nouns(query):
    """Return the lemmatized, lowercase common nouns found in ``query``.

    The query is tokenized with this notebook's ``word_tokenize``, tagged
    with ``nltk.pos_tag``, and only tokens tagged ``NN`` or ``NNS`` are kept.

    Args:
        query: Text to analyse.

    Returns:
        set of str: The lemma of each kept token.
    """
    lemma = WordNetLemmatizer()
    pos_tags = nltk.pos_tag(word_tokenize(query))
    return {
        # Lowercase *before* lemmatizing: the WordNet lookup is
        # case-sensitive, so a capitalised plural such as "Movies" would
        # otherwise come back unchanged and end up as "movies".
        lemma.lemmatize(word.lower())
        for word, tag in pos_tags
        if tag in ("NN", "NNS")
    }

def searchable_entities(query):
    """Collect every term in ``query`` worth searching for.

    Args:
        query: Text to analyse.

    Returns:
        set of str: The results of ``get_improper_nouns(query)`` combined
        with those of ``get_named_entities(query)``.
    """
    # Start from the common nouns, then fold the named entities in.
    entities = set(get_improper_nouns(query))
    entities.update(get_named_entities(query))
    return entities

In [5]:
# Sample question used to try out the entity extraction defined above.
query = "How many movies has Tom Cruise been in?"

In [6]:
# Display the set of search terms extracted from the sample query.
searchable_entities(query)

{'Tom Cruise', 'movie'}

## Building Wikipedia Corpus

In [7]:
def build_wiki_corpus(search_ents, corpus=None):
    """Build a corpus of Wikipedia articles for the given search terms.

    Each term is looked up through ``wikipediaapi``. Pages that exist are
    stored under the key ``(article_id(term), page.title)`` with the page
    text as the value, unless an entry with the same id is already present,
    in which case a notice is printed and the entry is left alone.

    Args:
        search_ents: Iterable of terms to look up.
        corpus: Optional existing corpus whose keys are tuples with the
            document id first. It is copied, never modified, so the caller's
            dictionary stays as it was.

    Returns:
        dict: A new dictionary holding the entries of ``corpus`` (if given)
        plus one entry per newly fetched page.
    """
    wiki = wikipediaapi.Wikipedia('en')

    # Shallow copy: adding pages must not mutate the dict the caller passed.
    corpus = dict(corpus) if corpus else dict()
    # Set of known ids for constant-time "already have it?" checks.
    ids = {key[0] for key in corpus}

    for ent in search_ents:
        page = wiki.page(ent)
        if not page.exists():
            continue

        # NOTE(review): the id is derived from the search term rather than
        # from page.title -- confirm that is what article_id expects.
        doc_id = article_id(ent)
        if doc_id in ids:
            print(f"Ha! {ent} is already in there!")
            continue

        corpus[(doc_id, page.title)] = page.text
        ids.add(doc_id)

    return corpus

In [8]:
# Extract the search terms from a multi-animal question and display them.
ents = searchable_entities("Whats's the difference between a lion, tiger, leopard, cheetah, and meerkat?")
ents

{'cheetah', 'difference', 'leopard', 'lion', 'meerkat', 'tiger'}

In [9]:
# Load the documents from load_docs' default directory ('corpus') and
# display the resulting dictionary.
old_corps = load_docs()
old_corps

{('Q34706',
  'Leopard'): 'See text\nThe leopard (Panthera pardus) is one of the five extant species in the genus Panthera, a member of the cat family, Felidae. It occurs in a wide range in sub-Saharan Africa, in some parts of Western and Central Asia, Southern Russia, and on the Indian subcontinent to Southeast and East Asia. It is listed as Vulnerable on the IUCN Red List because leopard populations are threatened by habitat loss and fragmentation, and are declining in large parts of the global range. The leopard is considered locally extinct in Hong Kong, Singapore, South Korea, Jordan, Morocco, Togo, the United Arab Emirates, Uzbekistan, Lebanon, Mauritania, Kuwait, Syria, Libya, Tunisia and most likely in North Korea, Gambia, Laos, Lesotho, Tajikistan, Vietnam and Israel.\nContemporary records suggest that the leopard occurs in only 25% of its historical global range.\nCompared to other wild cats, the leopard has relatively short legs and a long body with a large skull. Its fur is

In [10]:
# Extend the loaded corpus with Wikipedia pages for the extracted terms.
new_corps = build_wiki_corpus(ents, old_corps)

# Print every (key, text) pair, each followed by a line of 100 '=' characters.
for k, v in new_corps.items():
    print(k, v)
    print(f"\n\n{100*'='}\n")

Ha! tiger is already in there!
Ha! leopard is already in there!
Ha! lion is already in there!
Ha! cheetah is already in there!
('Q34706', 'Leopard') See text
The leopard (Panthera pardus) is one of the five extant species in the genus Panthera, a member of the cat family, Felidae. It occurs in a wide range in sub-Saharan Africa, in some parts of Western and Central Asia, Southern Russia, and on the Indian subcontinent to Southeast and East Asia. It is listed as Vulnerable on the IUCN Red List because leopard populations are threatened by habitat loss and fragmentation, and are declining in large parts of the global range. The leopard is considered locally extinct in Hong Kong, Singapore, South Korea, Jordan, Morocco, Togo, the United Arab Emirates, Uzbekistan, Lebanon, Mauritania, Kuwait, Syria, Libya, Tunisia and most likely in North Korea, Gambia, Laos, Lesotho, Tajikistan, Vietnam and Israel.
Contemporary records suggest that the leopard occurs in only 25% of its historical global r

# Document Clustering