In [51]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm.notebook import tqdm

In [52]:
# Load our data.
df = pd.read_csv("data/Normalized_Purchase_Card_Transactions.csv")
df_agencies = pd.read_csv("data/List_Agencies.csv")
df_vendors = pd.read_csv("data/List_Vendors.csv")

In [53]:
blank = np.zeros((df['AGENCY_NUM'].max() + 1, df['VENDOR_NAME_NUM'].max() + 1))

for index, row in tqdm(df.iterrows(), total = df.shape[0]):
    blank[row['AGENCY_NUM'], row['VENDOR_NAME_NUM']] += row['TRANSACTION_AMOUNT_NORM']

np.count_nonzero(blank)

  0%|          | 0/390109 [00:00<?, ?it/s]

44494

In [54]:
wordCounts = pd.DataFrame(blank)

bags = {i: df[df['AGENCY_NUM'] == i]['VENDOR_NAME_NUM'].unique() for i in df['AGENCY_NUM'].unique()}

In [55]:
def tf(counts):
    tfDF = pd.DataFrame()
    
    for index, row in counts.iterrows():
        tfDF[index] = row.div(row.sum())
        
    return tfDF

tf = pd.DataFrame(tf(wordCounts)).T

In [56]:
idf = tf.apply(lambda x: np.log((df['AGENCY_NUM'].max() + 1) / (np.count_nonzero(x.to_numpy())+1)), axis=0)

In [57]:
prod = tf*idf

In [58]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(prod).astype("float")
for i in range(len(similarity)):
    similarity[i][i] = 0

In [59]:
mostsimilar = np.argmax(similarity, axis=1)
mostsimilar_verbose = np.array([df_agencies.iloc[i][1] for i in mostsimilar])
mostsimilar_vendors = np.array([bags[i] for i in mostsimilar], dtype = object)

In [60]:
def summarize(agency = 0):
    """
    Gives brief information on the agency and its most similar agency.
    """
    agency_bag = bags[agency]
    agency_bag_verbose = np.array([df_vendors.iloc[i][1] for i in agency_bag])
    similar_agency_bag = bags[mostsimilar[agency]]
    similar_agency_bag_verbose = np.array([df_vendors.iloc[i][1] for i in similar_agency_bag])
    
    print("Selected Agency:", df_agencies.iloc[agency][1])
    print("Most Similar Agency:", mostsimilar_verbose[agency])
    print()
    print("Agency Vendors:", agency_bag_verbose)
    print()
    print("Most Similar Agency Vendors:", similar_agency_bag_verbose)

In [61]:
summarize(3)

Selected Agency: Board of Real Property Assessment & Appeals
Most Similar Agency: Commission of Judicial Disabilities & Tenure

Agency Vendors: ['STANDARD_OFFICE_SUPPLY' 'LASER_ART_INC' 'FEDEX_'
 'SUPRETECH_INCORPORATED' 'PAYPAL_CORPORATEEX' 'DUPONT_COMPUTER_INC'
 'METROPOLITAN_OFFICE_PR' 'CAPITAL_SERVICES_AND_S' 'BENJAMIN_OFFICE_SU'
 'PROBAR' 'SUPERIOR_COURIERS_LLC' 'DELTA_ASSOCIATES'
 'GRAYMAR_BUSINESS_SOLUT' 'UNITEDBUSINESSTECH' 'STAR_OFFICE_PRODUCTS_I'
 'DMI_DELL_KGOVT' 'DS_WATERS' 'HIGDON_INC' 'FLASH_GLASS__MIRROR_C'
 'DS_WATERS_STANDARD_COF' 'THOMSON_WESTTCD' 'HENDERSON_PROFESSIO'
 'MCKISSOCK_LP' 'APPRAISAL_INSTITUTE' 'US_FACILITIES_INC'
 'BANNER_STAFFING' 'SQ_KARMIC_KOLLECTIONS' 'COSTAR_GROUP_INC'
 'STAR_OFFICE_PRODUCTS' 'SQ_BWH_CONSULTANTS' 'GRTR_CPTL_ASSOC_REALTO'
 'DC_BAR' 'MICROFRAME' 'PITNEY_BOWES' 'WASHINGTON_BIZ_JOURNAL'
 'EDUCATION_TO_GO' 'PBILEASEDEQUIPMENT' 'SQ_SYLVIA_PARKER_E'
 'DIGITAL_COPIER' 'MB_STAFFING_SERVIC' 'COMCAST_OF_WASHINGTON'
 'DS_SERVICES_STANDARD_C' 'SQ

In [64]:
grouped = df.groupby(['AGENCY_NUM', 'VENDOR_NAME_NUM']).sum('TRANSACTION_AMOUNT')

topn = pd.DataFrame(grouped['TRANSACTION_AMOUNT'].groupby('AGENCY_NUM', group_keys=False).nlargest(25))

In [65]:
topn

Unnamed: 0_level_0,Unnamed: 1_level_0,TRANSACTION_AMOUNT
AGENCY_NUM,VENDOR_NAME_NUM,Unnamed: 2_level_1
0,32622,16490.67
0,10283,3341.96
0,14975,2050.00
0,34339,1982.25
0,10897,1934.62
...,...,...
88,19337,23279.48
88,21777,21266.99
88,5448,20561.59
88,8467,20534.58


In [134]:
descriptions = df.loc[:, ['VENDOR_NAME_NUM', 'TRANSACTION_AMOUNT', 'MCC_DESCRIPTION']]
descriptions['MCC_DESCRIPTION'] = descriptions['MCC_DESCRIPTION'].str.replace('Not Elsewhere Classified', '').str.replace('NOT ELSEWHERE CLASSIFIED', '')
descriptions['MCC_DESCRIPTION_LIST'] = descriptions['MCC_DESCRIPTION'].str.split(",")

exploded = descriptions.explode('MCC_DESCRIPTION_LIST').replace(' ', np.nan).dropna(0)

grouped_descriptions = exploded.groupby('MCC_DESCRIPTION_LIST').sum('TRANSACTION_AMOUNT')
topn_descriptions = pd.DataFrame(grouped_descriptions['TRANSACTION_AMOUNT'].groupby('MCC_DESCRIPTION_LIST', group_keys=False).nlargest(5))

array([[ 164611.95],
       [  39074.87],
       [1245654.54],
       [  32626.57]])

In [139]:
topn_descriptions[topn_descriptions['MCC_DESCRIPTION_LIST']]

KeyError: 'MCC_DESCRIPTION_LIST'

In [None]:

merged = topn.join(descriptions.drop('TRANSACTION_AMOUNT', axis=1), on = 'VENDOR_NAME_NUM', how='left')
merged