In [10]:
# Imports
import json
import requests
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import TreebankWordTokenizer
from collections import defaultdict
import re
from nltk.corpus import stopwords 

In [2]:
# Load the facility records from the remote API.
url = "http://api.apomden.com/v2/facilities"
# A timeout (seconds) keeps the cell from hanging indefinitely when the server
# does not answer; requests applies no timeout by default.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
response.raise_for_status()
# The payload is a JSON object; the records used below live under its "data" key.
hospitals = response.json()["data"]

### Search Algorithm

In [3]:
# Record fields that the field-based search (extractJson) looks up.
SEARCH_FIELDS = [
    "name",
    "street",
    "city",
    "district",
    "regions",
    "services",
]

def getTokens(sentence, tokenizer):
    """
    Tokenize `sentence` with the supplied tokenizer.

    sentence: value handed unchanged to the tokenizer
    tokenizer: any object exposing a `tokenize(sentence)` method

    Returns whatever `tokenizer.tokenize` returns.
    """
    pieces = tokenizer.tokenize(sentence)
    return pieces

def extractJson(list_dicts, field_list):
    """
    Collect, per field, the values found for that field across all records.

    list_dicts: iterable of records (nested dicts/lists)
    field_list: iterable of field names to look up

    Returns a dict mapping every name in `field_list` to the list of values
    that `extract` found for it, accumulated over all records in order.
    """
    json_tokens = {field: [] for field in field_list}

    for item in list_dicts:
        # Iterate the caller's fields, not the global SEARCH_FIELDS: the result
        # dict only has keys for `field_list`, so looping over a different
        # list either raised KeyError or silently ignored requested fields.
        for field in field_list:
            json_tokens[field].extend(extract(item, field))

    return json_tokens

def extract(obj, key):
    """
    Recursively collect every value stored under `key` in a nested structure.

    obj: a dict, a list, or any other value
    key: the dictionary key to search for

    Returns a list with one entry per occurrence of `key`, in traversal order.
    A matched value is appended as-is (a matched list stays a single list
    entry) and is not searched further. Anything that is not a dict or a list
    has no keys and therefore contributes nothing.
    """
    tokens = []

    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key:
                tokens.append(v)
            elif isinstance(v, (dict, list)):
                # Always descend into containers. The previous version only
                # descended into a nested dict when the *parent* key happened
                # to also be one of its keys, so nested matches were missed.
                tokens.extend(extract(v, key))

    elif isinstance(obj, list):
        for item in obj:
            # Scalar list items yield [] here; previously bare strings/bools
            # were returned regardless of `key`, leaking values of unrelated
            # fields into every result.
            tokens.extend(extract(item, key))

    return tokens


def vectorizeTokens(token_dict):
    """
    Placeholder for turning tokens into a numpy matrix.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None


def buildIndices(token_list, key=None):
    """
    Build forward and reverse lookup tables between items and their positions.

    token_list: iterable of items; when `key` is given, each item must support
                `item[key]`
    key: optional lookup applied to every item to obtain its label; when None
         the items themselves are used as labels

    Returns (label_to_index, index_to_label). When a label occurs more than
    once, label_to_index keeps the position of its last occurrence.
    """
    if key is None:
        labels = list(token_list)
    else:
        labels = [entry[key] for entry in token_list]

    label_to_index = {}
    index_to_label = {}
    for position, label in enumerate(labels):
        label_to_index[label] = position
        index_to_label[position] = label

    return label_to_index, index_to_label
   

### Method 1

In [4]:
def get_all_tokens(d, exclude):
    """
    Flatten a nested structure into a list of its leaf values.

    d: a list, a dict, or a leaf value
    exclude: container of dict keys whose values are skipped entirely

    Lists are walked in order, dicts in item order (skipping excluded keys),
    and anything else is treated as a leaf and returned as a one-item list.
    """
    if isinstance(d, list):
        children = d
    elif isinstance(d, dict):
        children = [value for name, value in d.items() if name not in exclude]
    else:
        return [d]

    collected = []
    for child in children:
        collected += get_all_tokens(child, exclude)
    return collected

def extract2(d, exclude):
    """
    Collect leaf values from the lists found inside a nested structure.

    d: a list, a dict, or any other value
    exclude: container of dict keys whose values are skipped at every depth

    For a list, every element is flattened with `get_all_tokens`. For a dict,
    each non-excluded value is processed recursively. Any other value yields
    an empty list, so scalar values sitting directly under a dict key are not
    collected.
    NOTE(review): dropping those scalars may be unintended, but it is kept
    as-is because downstream code lower-cases every collected item.
    """
    tokens = []
    if isinstance(d, list):
        for i in d:
            tokens.extend(get_all_tokens(i, exclude))
    elif isinstance(d, dict):
        for k, v in d.items():
            if k not in exclude:
                # Pass the exclusion container down unchanged. The previous
                # code passed the current key `k` instead, which discarded the
                # exclusions for everything nested below and turned the
                # `not in` test into a substring check against that key.
                tokens.extend(extract2(v, exclude))
    return tokens
            


In [5]:
def print_dict(d):
    """
    Print each key of `d` as a banner followed by its value.

    List values are printed one element per line; any other value is printed
    on a single line. Every entry is followed by a printed newline string.
    """
    for label, value in d.items():
        print(f"------ {label} -----")
        entries = value if isinstance(value, list) else [value]
        for entry in entries:
            print(entry)
        print("\n")

In [45]:
def getDocuments(list_dicts, exclude):
    """
    Turn each record into a flat "document" (list of values).

    list_dicts: iterable of dict records
    exclude: container of top-level keys to skip; it is also forwarded to
             `extract2` for nested values

    For every non-excluded top-level field, a dict or list value is expanded
    through `extract2`, and any other value is taken as-is.

    Returns (docs, all_words): `docs` maps the record's position to its list
    of values, and `all_words` is the concatenation of all those lists in
    record order.
    """
    docs = {}
    all_words = []

    for idx, record in enumerate(list_dicts):
        collected = []
        for field, value in record.items():
            if field in exclude:
                continue
            if isinstance(value, (dict, list)):
                collected.extend(extract2(value, exclude))
            else:
                collected.append(value)
        docs[idx] = collected
        # Same values, same order as the per-document list.
        all_words.extend(collected)

    return docs, all_words


def convert_to_lowercase(docs):
    """
    Return a copy of `docs` in which every string item is lower-cased.

    docs: mapping of document id -> iterable of items

    Non-string items (numbers, booleans, None, ...) are kept unchanged; they
    used to raise AttributeError because `.lower()` was called on every item.
    The input mapping is not modified.
    """
    return {
        ix: [item.lower() if isinstance(item, str) else item for item in lst]
        for ix, lst in docs.items()
    }
    

In [58]:
def cleanTokens(wordlist, stopwords):
    """
    Split the strings in `wordlist` into lower-cased tokens without stopwords.

    wordlist: iterable of values; non-string values are ignored
    stopwords: container of words to drop (presumably lower-case entries,
               as in NLTK's English list -- confirm against the caller)

    Returns a list of lower-cased, whitespace-separated tokens in input order.
    """
    cleaned = []
    for entry in wordlist:
        # Keep the original whole-string check so an entry that is itself
        # listed as a stopword is still dropped.
        if not isinstance(entry, str) or entry in stopwords:
            continue
        for piece in entry.split():
            token = piece.lower()
            # Filter per token, after splitting and lower-casing. The previous
            # code only compared the whole, un-lowercased string, so words
            # like "The" or "of" inside a phrase were never removed.
            if piece in stopwords or token in stopwords:
                continue
            cleaned.append(token)
    return cleaned

def inverted_index(docs):
    """
    Build an inverted index: word -> list of ids of documents containing it.

    docs: mapping of document id -> iterable of hashable words

    Returns a defaultdict(list). Each document id appears at most once per
    word, in the iteration order of `docs`.
    """
    # Named `postings` so the local no longer shadows this function's name.
    postings = defaultdict(list)

    for ix, doc in docs.items():
        # dict.fromkeys de-duplicates the words of one document while keeping
        # first-seen order. Each id is visited once, so this replaces the
        # linear `ix not in postings[word]` scan, which made indexing
        # quadratic for words shared by many documents.
        for word in dict.fromkeys(doc):
            postings[word].append(ix)

    return postings

def retrieve_docs(query, docs, tokenizer):
    """
    Unfinished retrieval step.

    Currently only tokenizes `query` via `getTokens`; `docs` is not used and
    nothing is returned (None).
    """
    query_tokens = getTokens(query, tokenizer)
    return None
    
    
    
        

In [59]:
def main():
    """
    Build an inverted index over the module-level `hospitals` records and
    print it.

    The tokenizer, the cleaned vocabulary and the name/index lookup tables
    are computed here but not used further yet.
    """
    tokenizer = TreebankWordTokenizer()

    # Keys skipped when turning records into documents.
    excluded_keys = [
        "_id",
        "createdAt",
        "updatedAt",
        "__v",
        "status",
        "isOccupied",
        "isVerified",
    ]
    raw_docs, raw_words = getDocuments(hospitals, excluded_keys)
    lowered_docs = convert_to_lowercase(raw_docs)

    english_stopwords = set(stopwords.words("english"))
    vocabulary = list(set(cleanTokens(raw_words, english_stopwords)))

    name_to_idx, idx_to_name = buildIndices(hospitals, "name")

    # Build the inverted index and show it.
    print(inverted_index(lowered_docs))


In [60]:
# Run the pipeline when this code is executed directly.
if __name__ == "__main__":
    main()

defaultdict(<class 'list'>, {'secondary': [0, 1, 4, 6, 9, 10, 11, 13, 14], '5d5b70925c0f901f2d91d190': [0], 'cocoaclinic': [0], 'cocoa clinic': [0], 'hospital': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'medical facility': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'male': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'gw-1-b1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'gw-1-b2': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'gw-1-b3': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'general ward - room 1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'female': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'gw-2-b1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'gw-2-b2': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'general ward - room 2': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'unisex': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'gw-3-b1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],