In [1]:
import warnings
# Ignore every FutureWarning raised from here on. Note that this also hides
# any library notice about upcoming behaviour changes that is issued as a
# FutureWarning.
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd

In [2]:
# Read the raw purchase-card transactions and discard every row that
# contains at least one missing value.
df = pd.read_csv("data/Purchase_Card_Transactions.csv").dropna()

In [25]:
# Frequency of each MCC description, positions 1 through 19 of the counts
# (the first entry of value_counts() is skipped).
mcc_description_counts = df['MCC_DESCRIPTION'].value_counts()
mcc_description_counts.iloc[1:20]

Stationery, Office & School Supply Stores             21561
Stationery,Office Supplies,Printing/Writing Paper     15427
Business Services Not Elsewhere Classified            14774
Local/Suburban Commuter Transportation                14503
BOOK STORES                                           13508
Computers,Computer Peripheral Equipment, Software      9316
Charitable And Social Service Organizations            8036
Membership Organizations, Not Elsewhere Classified     7596
Educational Services, Not Elsewhere Classified         7506
Specialty Retail Stores-Miscellaneous                  7234
Government Services, Not Elsewhere Classified          6345
Industrial Supplies, Not Elsewhere Classified          6173
Professional Services Not Elsewhere Classified         6024
Direct Marketing - Comb. Catalog &Retail Merchants     5536
Home Supply Warehouse Stores                           5485
Direct Marketing-Not Elsewhere Classified              5419
Computer Software Stores                

In [3]:
# Collect the row labels of all transactions whose amount is zero or negative.
nonpositive_rows = df.loc[df['TRANSACTION_AMOUNT'] <= 0]
index_names = nonpositive_rows.index
# Remove those rows from df in place.
df.drop(index=index_names, inplace=True)

In [4]:
# Clean the vendor name in three passes:
#   1. collapse each run of whitespace into a single "_",
#   2. strip every remaining non-word character (anything outside \w),
#   3. strip all digits,
# so that only letters and underscores remain.
#
# regex=True is passed explicitly. Since pandas 2.0 the default of
# Series.str.replace is regex=False, under which these patterns would be
# matched as literal text and the cleaning would silently do nothing.
df['VENDOR_NAME_CLEAN'] = (
    df['VENDOR_NAME']
    .str.replace(r'\s+', '_', regex=True)
    .str.replace(r'\W+', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
)

In [5]:
# Distinct cleaned vendor names and distinct agency names, each cast to str
# and then ordered alphabetically.
unique_vendor_names = df['VENDOR_NAME_CLEAN'].unique().astype(str)
unique_agency_names = df['AGENCY'].unique().astype(str)

list_vendors = sorted(unique_vendor_names)
list_agencies = sorted(unique_agency_names)

In [6]:
# Lookup tables from name to its position in the corresponding sorted list.
list_vendors_map = {vendor: position for position, vendor in enumerate(list_vendors)}
list_agencies_map = {agency: position for position, agency in enumerate(list_agencies)}

In [7]:
# Translate each vendor / agency name into its integer index by looking it
# up in the corresponding map (a name missing from the map raises KeyError).
df['VENDOR_NAME_NUM'] = df['VENDOR_NAME_CLEAN'].map(list_vendors_map.__getitem__)
df['AGENCY_NUM'] = df['AGENCY'].map(list_agencies_map.__getitem__)

In [8]:
# Build a separate frame holding only the selected columns; it is a deep
# copy of the selection taken from df.
selected_columns = [
    'AGENCY_NUM',
    'VENDOR_NAME_NUM',
    'TRANSACTION_AMOUNT',
    'TRANSACTION_DATE',
    'VENDOR_STATE_PROVINCE',
]
newdf = df[selected_columns].copy(deep=True)

In [9]:
# Write the reduced frame to disk as CSV.
cleaned_csv_path = "data/Cleaned_Purchase_Card_Transactions.csv"
newdf.to_csv(cleaned_csv_path)

In [10]:
# Write the sorted vendor names and the sorted agency names to their own
# CSV files, one single-column frame per list.
list_vendors_df = pd.DataFrame(list_vendors)
list_vendors_df.to_csv("data/List_Vendors.csv")

list_agencies_df = pd.DataFrame(list_agencies)
list_agencies_df.to_csv("data/List_Agencies.csv")

In [11]:
# Normalization: for every row, look up the smallest and largest
# transaction amount recorded for that row's vendor.
grouper = df['TRANSACTION_AMOUNT'].groupby(df['VENDOR_NAME_NUM'])
mins = grouper.transform('min')
maxes = grouper.transform('max')

In [12]:
# Min-max scale each amount using the row-aligned mins / maxes series.
amount_above_min = newdf['TRANSACTION_AMOUNT'] - mins
amount_span = maxes - mins
# Where max == min the ratio is 0/0 = NaN; dropna() then removes those rows
# (along with any other row containing a missing value).
newnormdf = newdf.assign(TRANSACTION_AMOUNT_NORM=amount_above_min / amount_span).dropna()

In [13]:
# Write the normalized frame to disk as CSV.
normalized_csv_path = "data/Normalized_Purchase_Card_Transactions.csv"
newnormdf.to_csv(normalized_csv_path)