Skip to content

Commit

Permalink
Merge pull request #1 from sdhutchins/master
Browse files: browse the repository at this point in the history
Added return as dataframe for duplicate and missing queries.
  • Loading branch information
newgene committed Oct 31, 2018
2 parents 2bba68c + fddb64f commit 521b073
Showing 1 changed file with 24 additions and 17 deletions.
41 changes: 24 additions & 17 deletions biothings_client/base.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -50,8 +50,9 @@ def alwayslist(value):
else:
return [value]


def safe_str(s, encoding='utf-8'):
'''if input is an unicode string, do proper encoding.'''
'''Perform proper encoding if input is an unicode string.'''
try:
_s = str(s)
except UnicodeEncodeError:
Expand Down Expand Up @@ -88,9 +89,9 @@ def iter_n(iterable, n, with_cnt=False):
else:
yield chunk


class BiothingClient(object):
'''This is the client for a biothing web service.
'''
'''This is the client for a biothing web service.'''
def __init__(self, url=None):
if url is None:
url = self._default_url
Expand All @@ -110,9 +111,7 @@ def __init__(self, url=None):
self._cached = False

def _dataframe(self, obj, dataframe, df_index=True):
"""
converts object to DataFrame (pandas)
"""
'''Converts object to DataFrame (pandas)'''
if not df_avail:
print("Error: pandas module must be installed for as_dataframe option.")
return
Expand All @@ -123,7 +122,7 @@ def _dataframe(self, obj, dataframe, df_index=True):
if dataframe == 1:
df = json_normalize(obj['hits'])
else:
df = DataFrame.from_dict(obj['hits'])
df = DataFrame.from_dict(obj)
else:
if dataframe == 1:
df = json_normalize(obj)
Expand Down Expand Up @@ -192,7 +191,7 @@ def _repeated_query_old(self, query_fn, query_li, verbose=True, **fn_kwargs):
time.sleep(self.delay)

def _repeated_query(self, query_fn, query_li, verbose=True, **fn_kwargs):
'''run query_fn for input query_li in a batch (self.step).
'''Run query_fn for input query_li in a batch (self.step).
return a generator of query_result in each batch.
input query_li can be a list/tuple/iterable
'''
Expand All @@ -212,7 +211,7 @@ def _repeated_query(self, query_fn, query_li, verbose=True, **fn_kwargs):

@property
def _from_cache_notification(self):
''' Notification to alert user that a cached result is being returned.'''
'''Notification to alert user that a cached result is being returned.'''
return "[ from cache ]"

def _metadata(self, verbose=True, **kwargs):
Expand All @@ -225,7 +224,7 @@ def _metadata(self, verbose=True, **kwargs):
return ret

def _set_caching(self, cache_db=None, verbose=True, **kwargs):
''' Installs a local cache for all requests.
'''Installs a local cache for all requests.
**cache_db** is the path to the local sqlite cache database.'''
if caching_avail:
Expand All @@ -241,7 +240,7 @@ def _set_caching(self, cache_db=None, verbose=True, **kwargs):
return

def _stop_caching(self):
''' Stop caching.'''
'''Stop caching.'''
if self._cached and caching_avail:
requests_cache.uninstall_cache()
self._cached = False
Expand All @@ -255,7 +254,7 @@ def _clear_cache(self):
pass

def _get_fields(self, search_term=None, verbose=True):
''' Wrapper for /metadata/fields
'''Wrapper for /metadata/fields
**search_term** is a case insensitive string to search for in available field names.
If not provided, all available fields will be returned.
Expand Down Expand Up @@ -419,7 +418,7 @@ def _query(self, q, **kwargs):
return out

def _fetch_all(self, url, verbose=True, **kwargs):
''' Function that returns a generator to results. Assumes that 'q' is in kwargs.'''
'''Function that returns a generator to results. Assumes that 'q' is in kwargs.'''
# function to get the next batch of results, automatically disables cache if we are caching
def _batch():
if caching_avail and self._cached:
Expand Down Expand Up @@ -521,13 +520,18 @@ def _querymany(self, qterms, scopes=None, **kwargs):
if len(out) == 1:
out = out[0]
return out
if dataframe:
out = self._dataframe(out, dataframe, df_index=df_index)

# check dup hits
if li_query:
li_dup = [(query, cnt) for query, cnt in list_itemcnt(li_query) if cnt > 1]
del li_query
del li_query

if dataframe:
out = self._dataframe(out, dataframe, df_index=df_index)
li_dup_df = DataFrame.from_records(li_dup,
columns=['query',
'duplicate hits'])
li_missing_df = DataFrame(li_missing, columns=['query'])

if verbose:
if li_dup:
Expand All @@ -537,7 +541,10 @@ def _querymany(self, qterms, scopes=None, **kwargs):
print("{0} input query terms found no hit:".format(len(li_missing)))
print("\t"+str(li_missing)[:100])
if returnall:
return {'out': out, 'dup': li_dup, 'missing': li_missing}
if dataframe:
return {'out': out, 'dup': li_dup_df, 'missing': li_missing_df}
else:
return {'out': out, 'dup': li_dup, 'missing': li_missing}
else:
if verbose and (li_dup or li_missing):
print('Pass "returnall=True" to return complete lists of duplicate or missing query terms.')
Expand Down

0 comments on commit 521b073

Please sign in to comment.