In [None]:
import warnings
# Silence every FutureWarning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used
# below (e.g. changed default arguments) - confirm that is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd

In [None]:
# Read the raw purchase-card transactions and immediately discard every row
# that contains at least one missing value.
df = pd.read_csv("data/Purchase_Card_Transactions.csv").dropna()

In [None]:
# Collect the index labels of the transactions whose amount is zero or
# negative, then remove those rows from df in place.
index_names = df.loc[df['TRANSACTION_AMOUNT'].le(0)].index
df.drop(index=index_names, inplace=True)

In [None]:
# Clean the vendor name in three passes:
#   1. every run of whitespace becomes a single "_"
#   2. every run of non-word characters (\W) is removed
#   3. every run of digits is removed
# What remains are the non-digit word characters, i.e. letters and "_".
#
# regex=True is passed explicitly. pandas 2.0 changed the default of
# Series.str.replace to regex=False, under which these patterns are matched
# as literal text and the names would be left unchanged.
df['VENDOR_NAME_CLEAN'] = (
    df['VENDOR_NAME']
    .str.replace(r'\s+', '_', regex=True)
    .str.replace(r'\W+', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
)

In [None]:
# Distinct vendor and agency names, cast to str and put in sorted order.
list_vendors = list(df['VENDOR_NAME_CLEAN'].unique().astype(str))
list_vendors.sort()

list_agencies = list(df['AGENCY'].unique().astype(str))
list_agencies.sort()

In [None]:
# Give every vendor / agency name an integer ID equal to its position in the
# corresponding sorted list.
list_vendors_map = {name: position for position, name in enumerate(list_vendors)}
list_agencies_map = {name: position for position, name in enumerate(list_agencies)}

In [None]:
# Replace each name by its integer ID. The dict's __getitem__ is used as the
# mapping function (rather than the dict itself) so that a name missing from
# the map raises KeyError instead of silently becoming NaN.
df['VENDOR_NAME_NUM'] = df['VENDOR_NAME_CLEAN'].map(list_vendors_map.__getitem__)
df['AGENCY_NUM'] = df['AGENCY'].map(list_agencies_map.__getitem__)

In [None]:
# Independent copy of df restricted to the columns kept for the cleaned
# data set (the two integer IDs plus amount, date and vendor state).
newdf = df.loc[
    :,
    [
        'AGENCY_NUM',
        'VENDOR_NAME_NUM',
        'TRANSACTION_AMOUNT',
        'TRANSACTION_DATE',
        'VENDOR_STATE_PROVINCE',
    ],
].copy()

In [None]:
# Write the cleaned, ID-encoded transactions to disk. The DataFrame index is
# written as well (pandas default), as the first column of the file.
newdf.to_csv(path_or_buf="data/Cleaned_Purchase_Card_Transactions.csv", index=True)

In [None]:
# Persist the sorted vendor and agency name lists as separate CSV files.
# Each file's row index is the position in the sorted list, i.e. the integer
# ID assigned to that name in list_vendors_map / list_agencies_map.
list_vendors_df = pd.DataFrame(list_vendors)
list_vendors_df.to_csv(path_or_buf="data/List_Vendors.csv", index=True)

list_agencies_df = pd.DataFrame(list_agencies)
list_agencies_df.to_csv(path_or_buf="data/List_Agencies.csv", index=True)

In [None]:
# Normalization: smallest and largest transaction amount per vendor ID,
# each returned as a Series indexed by VENDOR_NAME_NUM.
# These are computed from newdf. The previous version read newnormdf, which
# is only created in a later cell, so a fresh top-to-bottom run stopped here
# with a NameError.
min_transaction_amount = newdf.groupby(['VENDOR_NAME_NUM'])['TRANSACTION_AMOUNT'].min()
max_transaction_amount = newdf.groupby(['VENDOR_NAME_NUM'])['TRANSACTION_AMOUNT'].max()

In [None]:
# Transaction amounts grouped by vendor ID. transform() broadcasts each
# vendor's extreme back onto that vendor's rows, so mins and maxes are
# aligned row-for-row with df.
grouper = df.groupby(by='VENDOR_NAME_NUM')['TRANSACTION_AMOUNT']
mins = grouper.transform('min')
maxes = grouper.transform('max')

In [None]:
# Min-max scale every amount within its vendor:
#     (amount - vendor_min) / (vendor_max - vendor_min)
# Where a vendor's max equals its min the quotient is 0/0 = NaN; the
# trailing dropna() removes those rows along with any other row holding NaN.
newnormdf = (
    newdf
    .assign(
        TRANSACTION_AMOUNT_NORM=lambda frame: (frame['TRANSACTION_AMOUNT'] - mins) / (maxes - mins)
    )
    .dropna()
)

In [None]:
# Write the normalized transactions to disk. The DataFrame index is written
# as well (pandas default), as the first column of the file.
newnormdf.to_csv(path_or_buf="data/Normalized_Purchase_Card_Transactions.csv", index=True)