#Helper Functions

In [0]:
def company_run_match(df, company_name_field='receiver_name', row_id='transaction_id'):
  '''
  Sends one company-match request per row of df and joins the responses back onto df.

  Relies on the module-level names endpoint, api_headers and requests.

  Parameters
  ----------
  df: DataFrame holding at least the company_name_field and row_id columns
  company_name_field: column whose value is sent as the company name
  row_id: column identifying a row; used as the join key between df and the API results

  Output
  ----------
  out_df: df left-merged with the match results on row_id; the _merge indicator column
          shows which rows received a result ('both') and which did not ('left_only')
  errors: list of the response objects whose status code was not 200
  '''
  errors = []
  match_results = []
  # positional 0..n-1 index so the progress counter below is meaningful
  df = df.reset_index(drop=True)
  start_time = pd.Timestamp.now()
  for ix, row in df.iterrows():
    if ix % 100 == 0:
      print(ix)
    company_name = row[company_name_field]
    # NOTE(review): the facility endpoints in this file use '?company_name=' while this URL
    # uses '/company_name=' -- confirm against the API reference before changing it.
    company_match_url =f'https://{endpoint}.altana.ai/atlas/v1/company/match/company_name={company_name}'
    resp = requests.get(company_match_url, headers=api_headers)
    if resp.status_code != 200:
      errors.append(resp)
      continue
    # the whole JSON body becomes a single result row tagged with the source row id
    result_df = pd.DataFrame([resp.json()])
    result_df[row_id] = row[row_id]
    match_results.append(result_df)
  end_time = pd.Timestamp.now()

  if match_results:
    results = pd.concat(match_results)
  else:
    # pd.concat raises on an empty list (empty df, or every request failed);
    # an empty key-only frame lets the merge mark every row as 'left_only' instead.
    results = df[[row_id]].iloc[:0]
  out_df = pd.merge(df, results, on=row_id, how='left', indicator=True)

  n_rows = len(df)
  # guard against division by zero when df has no rows
  avg_sec = (end_time - start_time).total_seconds() / n_rows if n_rows else 0.0
  print(f'avg api response time in seconds: {avg_sec}')
  return out_df, errors

#---------------------------------------------------------------------------------------------------------------------------------------------------
def company_run_search(df, company_name_field='receiver_name', row_id='transaction_id'):
  '''
  Sends one company-search request per row of df and joins the candidates back onto df.

  Relies on the module-level names endpoint, api_headers and requests.

  Parameters
  ----------
  df: DataFrame holding at least the company_name_field and row_id columns
  company_name_field: column whose value is sent as the company name
  row_id: column identifying a row; used as the join key between df and the API results

  Output
  ----------
  out_df: df left-merged with the search results on row_id (up to 30 candidate rows per
          input row); the _merge indicator column shows which rows received a result
  errors: list of the response objects whose status code was not 200
  '''
  errors = []
  match_results = []
  # positional 0..n-1 index so the progress counter below is meaningful
  df = df.reset_index(drop=True)
  start_time = pd.Timestamp.now()
  for ix, row in df.iterrows():
    if ix % 100 == 0:
      print(ix)
    company_name = row[company_name_field]
    # NOTE(review): the facility endpoints in this file use '?company_name=' while this URL
    # uses '/company_name=' -- confirm against the API reference before changing it.
    company_match_url =f'https://{endpoint}.altana.ai/atlas/v1/company/search/company_name={company_name}'
    resp = requests.get(company_match_url, headers=api_headers)
    if resp.status_code != 200:
      errors.append(resp)
      continue
    # keep only the first 30 entries of the 'companies' list in the response
    result_df = pd.DataFrame(resp.json()['companies']).head(30)
    result_df[row_id] = row[row_id]
    match_results.append(result_df)
  end_time = pd.Timestamp.now()

  if match_results:
    results = pd.concat(match_results)
  else:
    # pd.concat raises on an empty list (empty df, or every request failed);
    # an empty key-only frame lets the merge mark every row as 'left_only' instead.
    results = df[[row_id]].iloc[:0]
  out_df = pd.merge(df, results, on=row_id, how='left', indicator=True)

  n_rows = len(df)
  # guard against division by zero when df has no rows
  avg_sec = (end_time - start_time).total_seconds() / n_rows if n_rows else 0.0
  print(f'avg api response time in seconds: {avg_sec}')
  return out_df, errors
#---------------------------------------------------------------------------------------------------------------------------------------------------

def facility_run_match(df, facility_name_field='receiver_name', address_string_field='receiver_full_address', row_id='transaction_id'):
  '''
  Sends one facility-match request per row of df and joins the responses back onto df.

  Relies on the module-level names endpoint, api_headers and requests.

  Parameters
  ----------
  df: DataFrame holding at least the facility_name_field, address_string_field and row_id columns
  facility_name_field: column whose value is sent as the company_name query parameter
  address_string_field: column whose value is sent as the full_address query parameter
  row_id: column identifying a row; used as the join key between df and the API results

  Output
  ----------
  out_df: df left-merged with the match results on row_id; the _merge indicator column
          shows which rows received a result ('both') and which did not ('left_only')
  errors: list of the response objects whose status code was not 200
  '''
  errors = []
  match_results = []
  # positional 0..n-1 index so the progress counter below is meaningful
  df = df.reset_index(drop=True)
  facility_match_url = f'https://{endpoint}.altana.ai/atlas/v1/facility/match'
  start_time = pd.Timestamp.now()
  for ix, row in df.iterrows():
    if ix % 100 == 0:
      print(ix)
    # Passing the values through params= lets requests percent-encode them; pasting them
    # into the URL broke names/addresses containing '&', '#', '+' or '='.
    # (requests leaves out any parameter whose value is None.)
    query = {
        'company_name': row[facility_name_field],
        'full_address': row[address_string_field],
    }
    resp = requests.get(facility_match_url, params=query, headers=api_headers)
    if resp.status_code != 200:
      errors.append(resp)
      continue
    # the whole JSON body becomes a single result row tagged with the source row id
    result_df = pd.DataFrame([resp.json()])
    result_df[row_id] = row[row_id]
    match_results.append(result_df)
  end_time = pd.Timestamp.now()

  if match_results:
    results = pd.concat(match_results)
  else:
    # pd.concat raises on an empty list (empty df, or every request failed);
    # an empty key-only frame lets the merge mark every row as 'left_only' instead.
    results = df[[row_id]].iloc[:0]
  out_df = pd.merge(df, results, on=row_id, how='left', indicator=True)

  n_rows = len(df)
  # guard against division by zero when df has no rows
  avg_sec = (end_time - start_time).total_seconds() / n_rows if n_rows else 0.0
  print(f'avg api response time in seconds: {avg_sec}')
  return out_df, errors

#---------------------------------------------------------------------------------------------------------------------------------------------------

def facility_run_search(df, facility_name_field='receiver_name', address_string_field='receiver_full_address', row_id='transaction_id'):
  '''
  Sends one facility-search request per row of df and joins the candidates back onto df.

  Relies on the module-level names endpoint, api_headers and requests.

  Parameters
  ----------
  df: DataFrame holding at least the facility_name_field, address_string_field and row_id columns
  facility_name_field: column whose value is sent as the company_name query parameter
  address_string_field: column whose value is sent as the full_address query parameter
  row_id: column identifying a row; used as the join key between df and the API results

  Output
  ----------
  out_df: df left-merged with the search results on row_id (up to 30 candidate rows per
          input row); the _merge indicator column shows which rows received a result
  errors: list of the response objects whose status code was not 200
  '''
  errors = []
  match_results = []
  # positional 0..n-1 index so the progress counter below is meaningful
  df = df.reset_index(drop=True)
  start_time = pd.Timestamp.now()
  for ix, row in df.iterrows():
    if ix % 100 == 0:
      print(ix)
    facility_match_url = f'https://{endpoint}.altana.ai/atlas/v1/facility/search'
    # Passing the values through params= lets requests percent-encode them; pasting them
    # into the URL broke names/addresses containing '&', '#', '+' or '='.
    # (requests leaves out any parameter whose value is None.)
    query = {
        'company_name': row[facility_name_field],
        'full_address': row[address_string_field],
    }
    resp = requests.get(facility_match_url, params=query, headers=api_headers)
    if resp.status_code != 200:
      errors.append(resp)
      continue
    # keep only the first 30 entries of the 'facilities' list in the response
    result_df = pd.DataFrame(resp.json()['facilities']).head(30)
    result_df[row_id] = row[row_id]
    match_results.append(result_df)
  end_time = pd.Timestamp.now()

  if match_results:
    results = pd.concat(match_results)
  else:
    # pd.concat raises on an empty list (empty df, or every request failed);
    # an empty key-only frame lets the merge mark every row as 'left_only' instead.
    results = df[[row_id]].iloc[:0]
  out_df = pd.merge(df, results, on=row_id, how='left', indicator=True)

  n_rows = len(df)
  # guard against division by zero when df has no rows
  avg_sec = (end_time - start_time).total_seconds() / n_rows if n_rows else 0.0
  print(f'avg api response time in seconds: {avg_sec}')
  return out_df, errors


#---------------------------------------------------------------------------------------------------------------------------------------------------

def address_filter(df, min_geocode_level=18, street_level_match=True, min_geo_confidence=0.4):
  '''
  Keeps the rows whose address resolved at 'geocode_str' level and whose input/output
  geocodes agree and clear the confidence and level thresholds.

  The dict-valued columns geo_string_address_model_metadata and geocoder_metadata are expanded
  into regular columns, and the resulting layer_properties_in / layer_properties_out dicts are
  expanded again with an '_in' / '_out' suffix on each key.

  Parameters
  ----------
  df: DataFrame with the columns address_model_output_level, geo_string_address_model_metadata
      and geocoder_metadata (the latter two holding one dict per row)
  min_geocode_level: rows are kept only if geo_level_in and geo_level_out are both strictly greater
  street_level_match: when True, street_in must equal street_out and, if both house-number
      columns are present, housenumber_in must equal housenumber_out
  min_geo_confidence: rows are kept only if geo_confidence_in and geo_confidence_out are both
      strictly greater

  Output
  ----------
  out_df: the filtered rows with all expanded columns; an empty frame when no row is at
          'geocode_str' level
  '''
  df_filtered = df[df['address_model_output_level'] == 'geocode_str'].reset_index(drop=True)
  if df_filtered.empty:
    # nothing to expand: the metadata columns would yield frames without any columns
    # and the lookups below would raise KeyError
    return df_filtered

  # expand the two dict-valued metadata columns side by side with the original columns
  out_df = pd.concat(
      [
          df_filtered,
          pd.DataFrame(list(df_filtered['geo_string_address_model_metadata'])),
          pd.DataFrame(list(df_filtered['geocoder_metadata'])),
      ],
      axis=1,
  )

  # expand layer_properties_in / layer_properties_out, suffixing every key with its side
  for side in ('in', 'out'):
    props = pd.DataFrame(list(out_df[f'layer_properties_{side}']))
    props.columns = [f'{col}_{side}' for col in props.columns]
    out_df = pd.concat([out_df, props], axis=1)

  if street_level_match:
    out_df = out_df[out_df['street_in'] == out_df['street_out']]
    # The previous guard (`a is not None & b is not None`) was parsed as a chained identity
    # test against `None & b`, so it never checked anything and raised KeyError when no
    # record carried a house number. Test for the columns explicitly instead.
    if 'housenumber_in' in out_df.columns and 'housenumber_out' in out_df.columns:
      # rows missing a house number on either side are dropped: NaN never compares equal
      out_df = out_df[out_df['housenumber_in'] == out_df['housenumber_out']]

  out_df = out_df[out_df['geo_confidence_in'] > min_geo_confidence]
  out_df = out_df[out_df['geo_confidence_out'] > min_geo_confidence]

  out_df = out_df[out_df['geo_level_in'] > min_geocode_level]
  out_df = out_df[out_df['geo_level_out'] > min_geocode_level]

  return out_df

#---------------------------------------------------------------------------------------------------------------------------------------------------
def awesome_cossim_top(A:csr_matrix, B:csr_matrix, ntop:int, lower_bound:float=0)->csr_matrix:
  '''
  Multiplies two sparse matrices through ct.sparse_dot_topn and returns the result as CSR.

  Parameters
  ----------
  A: "dirty" - companies we are trying to weed out
  B: "clean" - company name (query) search
  ntop: number of most similar items stored per row of A (if compared against several strings)
  lower_bound: only items with a similarity above lower_bound are stored

  Output
  ----------
  csr_matrix of shape (rows of A, columns of B) assembled from the result buffers
  handed to ct.sparse_dot_topn
  '''
  left = A.tocsr()
  right = B.tocsr()
  n_rows = left.shape[0]
  n_cols = right.shape[1]

  index_type = np.int32

  # result buffers: room for ntop entries per row of the left matrix
  capacity = n_rows * ntop
  out_indptr = np.zeros(n_rows + 1, dtype=index_type)
  out_indices = np.zeros(capacity, dtype=index_type)
  out_data = np.zeros(capacity, dtype=left.dtype)

  # CSR index arrays of both operands, cast to the index type used for the buffers
  left_indptr = np.asarray(left.indptr, dtype=index_type)
  left_indices = np.asarray(left.indices, dtype=index_type)
  right_indptr = np.asarray(right.indptr, dtype=index_type)
  right_indices = np.asarray(right.indices, dtype=index_type)

  ct.sparse_dot_topn(
      n_rows, n_cols,
      left_indptr, left_indices, left.data,
      right_indptr, right_indices, right.data,
      ntop,
      lower_bound,
      out_indptr, out_indices, out_data)

  return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))

#---------------------------------------------------------------------------------------------------------------------------------------------------
def find_similarity(query_name:str, canon_name:str, n_gram:int=3)-> float:
  '''
  Evaluates cosine similarity of two strings based on their n-gram TFIDF. Leverages sparse matrix for fast calculations.

  Both names are stripped of every non-alphanumeric character and upper-cased before the
  character n-grams are taken. The TF-IDF vocabulary is fitted on query_name only.

  Parameters
  ----------
  query_name: company to search
  canon_name: output of company names best matched from database (companies we want to narrow down)
  n_gram: number of contiguous sequence of n characters, 3 is the default. Note: the higher you go -> the more precise the matching and the lower the cosine similarity will be

  Output
  ----------
  cosine similarity based on the TFIDF of the n-grams, metric between 0 and 1;
  0 when either name is too short to produce a single n-gram or the names share no n-gram
  '''
  def ngrams(string, n=n_gram):
    cleaned = re.sub(r'[^A-Za-z0-9]+', r'', string).upper()
    return [''.join(gram) for gram in zip(*[cleaned[i:] for i in range(n)])]

  # A name with fewer than n_gram alphanumeric characters yields no n-grams. For the query
  # that means an empty vocabulary, which makes fit_transform raise, so answer 0 up front.
  if not ngrams(query_name) or not ngrams(canon_name):
    return 0.0

  # constructs your vectorizer for building the TF-IDF matrix
  vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)

  # builds a sparse document term matrix of the query company name
  tf_idf_matrix_clean = vectorizer.fit_transform([query_name])

  # builds a sparse document term matrix of the canon company name
  tf_idf_matrix_dirty = vectorizer.transform([canon_name])

  # if there is completely no overlap between the two strings, the sparse matrix will be empty
  # and awesome_cossim_top would throw an error, so short-circuit here
  if tf_idf_matrix_clean.size == 0 or tf_idf_matrix_dirty.size == 0:
    return 0.0

  # runs optimized cosine similarity on the two sparse matrices (top 1 match, lower bound 0.0)
  matches = awesome_cossim_top(tf_idf_matrix_dirty, tf_idf_matrix_clean.transpose(), 1, 0.0)

  # single stored value of the 1x1 result
  return matches.data[0]