In [3]:
# imports
import json
import requests
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import TreebankWordTokenizer

In [4]:
# Load the facility records from the API.
url = "http://api.apomden.com/v2/facilities"
# requests has no default timeout; without one this cell can hang forever
# if the server never answers.
response = requests.get(url, timeout=30)
# Fail with a clear HTTPError on a 4xx/5xx reply instead of a confusing
# JSON-decoding error or KeyError on the lines below.
response.raise_for_status()
# The JSON body wraps the records in a top-level "data" field.
hospitals = response.json()["data"]

### Search Algorithm

In [23]:
# Field names whose values are collected for search (the field list meant for extractJson).
SEARCH_FIELDS = ["name", "street", "city", "district", "regions", "services"]

def getTokens(sentence, tokenizer):
    """
    Splits `sentence` into tokens with the supplied tokenizer.

    sentence: text to tokenize
    tokenizer: any object exposing a tokenize(text) method
    Returns whatever tokenizer.tokenize produces for `sentence`.
    """
    tokenize = tokenizer.tokenize
    return tokenize(sentence)

def extractJson(list_dicts, field_list):
    """
    Collects, per field, the values found for that field across all records.

    list_dicts: iterable of (possibly nested) dict records
    field_list: iterable of field names to look up
    Returns a dict mapping each name in field_list to a flat list of the
    values that extract() found for it, in record order.
    """
    json_tokens = {field: [] for field in field_list}

    for item in list_dicts:
        # Iterate the caller's fields (not the global SEARCH_FIELDS) so the
        # keys looked up always match the keys created above; previously a
        # field_list that differed from SEARCH_FIELDS raised KeyError or left
        # requested fields empty.
        for field in json_tokens:
            json_tokens[field].extend(extract(item, field))
    return json_tokens

def extract(obj, key):
    """
    Recursively collects the values stored under `key` in a nested structure.

    obj: a dict, a list, or a scalar
    key: the dictionary key to search for
    Returns a list:
      * dict     -> every value stored under `key` at any nesting depth (a
                    matching value is appended as-is and not searched further)
      * list     -> the concatenated results for each element
      * str/bool -> [obj] itself; any other type -> []
    """
    if isinstance(obj, (str, bool)):
        # NOTE(review): bare strings/bools are returned regardless of `key`, so
        # string items of any list that gets searched end up in the result;
        # confirm this is intended.
        return [obj]

    tokens = []
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key:
                tokens.append(v)
            elif isinstance(v, (dict, list)):
                # Always descend into nested containers. The previous guard
                # (`k in v.keys()`) tested the parent's key name instead of
                # `key`, so nested dicts were almost never searched.
                tokens.extend(extract(v, key))
    elif isinstance(obj, list):
        for item in obj:
            tokens.extend(extract(item, key))

    return tokens


def vectorizeTokens(token_dict):
    """
    Placeholder for token vectorization (not implemented yet).

    Intended to return a numpy matrix of vectorized tokens; at present the
    body does nothing with `token_dict` and the function returns None.
    """
    return None


def buildIndices(token_list, key=None):
    """
    Builds forward and reverse position lookups for a sequence.

    token_list: list of items (dict-like items when `key` is given)
    key: optional string; when provided, each item is represented by item[key]
    Returns (var_to_index, index_to_var). When a value repeats, var_to_index
    keeps the last position at which it appears.
    """
    if key is None:
        labels = list(token_list)
    else:
        labels = [entry[key] for entry in token_list]

    index_to_var = dict(enumerate(labels))
    var_to_index = {label: position for position, label in index_to_var.items()}
    return var_to_index, index_to_var
   

### Method 1

In [15]:
def get_all_tokens(d):
    """
    Flattens a nested list/dict structure into a list of its leaf values.

    d: a list, a dict, or any other value
    Returns [d] when d is neither a list nor a dict; otherwise the leaves of
    every element (list) or every value (dict), in iteration order. Dict keys
    are not included.
    """
    if isinstance(d, dict):
        children = d.values()
    elif isinstance(d, list):
        children = d
    else:
        return [d]

    flattened = []
    for child in children:
        flattened += get_all_tokens(child)
    return flattened

def extract2(d, key):
    """
    Gathers leaf values from a nested list/dict structure.

    d: value to walk
    key: forwarded to the recursive calls; not otherwise consulted
    Returns a list:
      * list -> every leaf of every element (via get_all_tokens)
      * dict -> the concatenated extract2() results of its values, so values
                that are neither list nor dict contribute nothing
      * anything else -> []
    """
    if isinstance(d, list):
        return [leaf for element in d for leaf in get_all_tokens(element)]
    if isinstance(d, dict):
        return [leaf for value in d.values() for leaf in extract2(value, key)]
    return []
            


In [16]:
def getDocuments(list_dicts):
    """
    Turns each record into a flat list of values, keyed by its position.

    list_dicts: iterable of dict records
    Returns a dict {position: values}. Top-level values that are neither dict
    nor list are kept as-is; dict/list values are replaced by whatever
    extract2(value, field_name) yields for them.
    """
    documents = {}
    for position, record in enumerate(list_dicts):
        flattened = []
        for field, value in record.items():
            if isinstance(value, (dict, list)):
                flattened += extract2(value, field)
            else:
                flattened.append(value)
        documents[position] = flattened

    return documents
    

In [None]:
def inverted_index():
    """
    Placeholder for building the inverted index (not implemented yet).

    Takes no arguments and currently returns None.
    """
    return None

In [17]:
def print_dict(d):
    """
    Pretty-prints a dict, one section per key.

    d: dict to print
    Each section is a "------ key -----" header, then the value (one line per
    element when the value is a list), then a blank separator.
    """
    for heading, content in d.items():
        print(f"------ {heading} -----")
        rows = content if isinstance(content, list) else [content]
        for row in rows:
            print(row)
        print("\n")

In [37]:
def main():
    """
    Notebook driver: prints the flattened documents and the name indices,
    then prompts for a name.

    Reads the module-level `hospitals`. The tokenizer and the entered name are
    created/collected but not used yet (the inverted index is still a TODO).
    """
    tokenizer = TreebankWordTokenizer()

    documents = getDocuments(hospitals)
    print(documents)

    name_to_position, position_to_name = buildIndices(hospitals, "name")
    print(name_to_position, position_to_name)

    # Ask the user for the name to look up.
    query = input("Please enter name: ")

    # TODO: build an inverted index


In [None]:
# Run main() only when this code executes as the top-level module.
if __name__ == "__main__":
    
    main()

{0: ['SECONDARY', False, '5d5b70925c0f901f2d91d190', '5d5b70925c0f901f2d91d16b', 'cocoaclinic', 'Cocoa Clinic', '5d5b70925c0f901f2d91d16f', 'hospital', '2019-08-20T04:01:22.346Z', '2019-08-20T04:01:22.346Z', '5d5b70925c0f901f2d91d16e', 'medical facility', '2019-08-20T04:01:22.346Z', '2019-08-20T04:01:22.346Z', 'MALE', True, '5d5b70925c0f901f2d91d189', 'GW-1-B1', 'OCCUPIED', '2019-08-20T04:01:22.347Z', '2019-09-26T19:00:51.229Z', True, '5d5b70925c0f901f2d91d188', 'GW-1-B2', 'OCCUPIED', '2019-08-20T04:01:22.347Z', '2019-08-20T04:02:00.601Z', False, '5d5b70925c0f901f2d91d187', 'GW-1-B3', 'AVAILABLE', '2019-08-20T04:01:22.347Z', '2019-08-20T04:01:22.347Z', '5d5b70925c0f901f2d91d186', 'General Ward - Room 1', '2019-08-20T04:01:22.346Z', '2019-09-26T19:00:51.229Z', 'FEMALE', False, '5d5b70925c0f901f2d91d185', 'GW-2-B1', 'AVAILABLE', '2019-08-20T04:01:22.347Z', '2019-08-20T04:01:22.347Z', True, '5d5b70925c0f901f2d91d184', 'GW-2-B2', 'OCCUPIED', '2019-08-20T04:01:22.347Z', '2019-08-20T21:11:43