In [26]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd

In [27]:
# Read the raw purchase-card transactions and, in the same step, discard every
# row that has a missing value in any column.
df = pd.read_csv("data/Purchase_Card_Transactions.csv").dropna()

In [28]:
# Frequency table of MCC descriptions, positions 1 through 19.
# value_counts() orders by descending frequency, so position 0 (the single most
# common description) is not shown.
# NOTE(review): confirm that skipping the top entry is intentional.
df['MCC_DESCRIPTION'].value_counts().iloc[1:20]

Stationery, Office & School Supply Stores             22884
Stationery,Office Supplies,Printing/Writing Paper     16396
Business Services Not Elsewhere Classified            15096
BOOK STORES                                           14531
Local/Suburban Commuter Transportation                14522
Computers,Computer Peripheral Equipment, Software      9722
Charitable And Social Service Organizations            8200
Membership Organizations, Not Elsewhere Classified     7760
Educational Services, Not Elsewhere Classified         7721
Specialty Retail Stores-Miscellaneous                  7432
Government Services, Not Elsewhere Classified          6674
Industrial Supplies, Not Elsewhere Classified          6422
Professional Services Not Elsewhere Classified         6083
Direct Marketing-Not Elsewhere Classified              6082
Home Supply Warehouse Stores                           5762
Direct Marketing - Comb. Catalog &Retail Merchants     5713
Computer Software Stores                

In [29]:
# Collect the index labels of all transactions whose amount is zero or
# negative, then remove those rows from df in place.
index_names = df.index[(df['TRANSACTION_AMOUNT'] <= 0).to_numpy()]
df.drop(index=index_names, inplace=True)

In [30]:
# Clean the vendor name so that only letters and underscores remain:
#   1. collapse each run of whitespace into a single underscore,
#   2. strip every non-word character (anything other than letters, digits, "_"),
#   3. strip the digits.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would leave the names
# untouched.
df['VENDOR_NAME_CLEAN'] = (
    df['VENDOR_NAME']
    .str.replace(r'\s+', '_', regex=True)
    .str.replace(r'\W+', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
)

In [31]:
# Distinct cleaned vendor names and distinct agency names, each cast to str
# and put in ascending order.
list_vendors = list(df['VENDOR_NAME_CLEAN'].unique().astype(str))
list_vendors.sort()

list_agencies = list(df['AGENCY'].unique().astype(str))
list_agencies.sort()

In [32]:
# Lookup tables: each name maps to its position in the corresponding sorted list.
list_vendors_map = {vendor: position for position, vendor in enumerate(list_vendors)}
list_agencies_map = {agency: position for position, agency in enumerate(list_agencies)}

In [33]:
# Replace each vendor / agency name by its integer position from the lookup
# tables. A name that is absent from its table raises KeyError.
df['VENDOR_NAME_NUM'] = df['VENDOR_NAME_CLEAN'].map(list_vendors_map.__getitem__)
df['AGENCY_NUM'] = df['AGENCY'].map(list_agencies_map.__getitem__)

In [34]:
# Build an independent (deep-copied) frame holding only the numeric
# identifiers plus the transaction fields that are kept.
newdf = df.loc[
    :,
    [
        'AGENCY_NUM',
        'VENDOR_NAME_NUM',
        'TRANSACTION_AMOUNT',
        'TRANSACTION_DATE',
        'VENDOR_STATE_PROVINCE',
        'MCC_DESCRIPTION',
    ],
].copy(deep=True)

In [35]:
# Persist the cleaned transactions to disk as CSV.
newdf.to_csv(path_or_buf="data/Cleaned_Purchase_Card_Transactions.csv")

In [36]:
# Write the sorted vendor and agency name lists to their own CSV files.
# Each list becomes a one-column frame, so a row's position in the frame is
# the numeric identifier assigned to that name.
list_vendors_df = pd.DataFrame(data=list_vendors)
list_vendors_df.to_csv("data/List_Vendors.csv")

list_agencies_df = pd.DataFrame(data=list_agencies)
list_agencies_df.to_csv("data/List_Agencies.csv")

In [37]:
# Normalization, step 1: for every row, look up the smallest and largest
# transaction amount recorded for that row's vendor. transform() broadcasts
# the per-vendor value back onto each row, so both results align with df.
grouper = df['TRANSACTION_AMOUNT'].groupby(df['VENDOR_NAME_NUM'])
mins = grouper.transform('min')
maxes = grouper.transform('max')

In [38]:
# Normalization, step 2: min-max scale each amount within its vendor,
# (amount - vendor_min) / (vendor_max - vendor_min).
# When a vendor's amounts are all equal, max == min and the ratio is
# 0/0 = NaN; the trailing dropna() removes those rows (and any other row
# containing a missing value).
newnormdf = newdf.assign(
    TRANSACTION_AMOUNT_NORM=lambda frame: (frame['TRANSACTION_AMOUNT'] - mins) / (maxes - mins)
).dropna()

In [39]:
# Persist the normalized transactions to disk as CSV.
newnormdf.to_csv(path_or_buf="data/Normalized_Purchase_Card_Transactions.csv")