In [1]:
import os
import re
import warnings

import pandas as pd
import yaml
from IPython.display import HTML, display

try:
    # Public location of the warning class in newer pandas releases.
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    # Fallback for older pandas, where it is only importable from here.
    from pandas.core.common import SettingWithCopyWarning

# Silence SettingWithCopyWarning for the whole notebook session.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Widen the classic-notebook container to 80% of the browser window.
display(HTML("<style>.container { width:80% !important; }</style>"))

# Let long frames/series render up to 2000 rows before pandas truncates them.
pd.set_option("display.max_rows", 2000)

In [16]:
os.listdir("./app")

['categories.yml', 'descriptions.yml', '__init__.py', 'main.py']

In [8]:
def load_yaml(path):
    """Parse the YAML file at ``path`` with ``yaml.safe_load``.

    Args:
        path: Filesystem path of the YAML document to read.

    Returns:
        The parsed document, or ``None`` when the file is not valid YAML
        (the ``yaml.YAMLError`` is printed rather than raised).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(path, 'r', encoding='utf-8') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Best-effort: report the parse error and hand back None.
            print(exc)
            return None

descript_map = load_yaml("./app/descriptions.yml")

In [9]:
descript_map

{'description_maps': {'identity_phrases': ['ironnet',
   'robinhood',
   'home depot',
   'amazon',
   'venmo',
   'remote online deposit',
   'simple',
   'atm',
   'coinbase',
   'chase card',
   'citi',
   'usaa',
   'vhc',
   'parking',
   '5guys',
   'prime video',
   'cubesmart',
   'hbo',
   'bird app',
   'bund up',
   'total wine',
   'bread & water',
   'giant',
   'idego',
   'goat',
   'lim*ride',
   'onlyfans',
   'postmates',
   'philz coffee',
   'northside',
   'origin coffee lab',
   'citicards cash reward'],
  'phrase_maps': [{'key_phrase': 'electronic funds transfer',
    'description': 'transfer'},
   {'key_phrase': 'Payment Thank You - Web', 'description': 'transfer'},
   {'key_phrase': 'citi card online payment', 'description': 'transfer'},
   {'key_phrase': 'amzn', 'description': 'amazon'},
   {'key_phrase': 'cromwell', 'description': 'rent cromwell'},
   {'key_phrase': 'education student ln', 'description': 'student loans'},
   {'key_phrase': 'harris', 'descript

In [42]:
# Column names presumably used together to identify one transaction — TODO confirm.
TRANSACT_KEY_COLS = ["date", "original_description", "amount"]

# Directory prefix for raw input; load_raw_transactions reads "transactions.csv" from it.
PATH_TO_RAW = "./data/raw/"

# NOTE(review): not referenced in the visible cells — presumably the output
# directory for categorized data; confirm.
PATH_TO_CATEGORIZED = "./data/categorized/"

# YAML file with the description mappings, loaded by get_pretty_descriptions.
PATH_TO_DESCRIPTION_MAP = "./app/descriptions.yml"

def regex_filter(self, descr):
    """Remove every match of each pattern in ``self.regex_filters`` from ``descr``.

    Args:
        self: Any object exposing an iterable ``regex_filters`` attribute of
            regular-expression patterns.
        descr: The description string to clean.

    Returns:
        ``descr`` after the patterns have been applied one after another,
        each match replaced by the empty string.

    NOTE: a later cell in this notebook defines another ``regex_filter`` with
    a different signature; once that cell runs, it shadows this function.
    """
    cleaned = descr
    for regex in self.regex_filters:
        cleaned = re.sub(regex, '', cleaned)
    return cleaned


def to_snake(txt):
    """Lower-case ``txt`` and turn every single space character into an underscore.

    Each space maps to exactly one underscore, so consecutive spaces yield
    consecutive underscores.
    """
    return txt.lower().replace(" ", "_")


def get_transact_keys(df, key_cols=None):
    """Return the distinct combinations of the transaction key columns.

    Args:
        df: DataFrame containing every column named in ``key_cols``.
        key_cols: Optional iterable of column names to keep. Defaults to the
            module-level ``TRANSACT_KEY_COLS``.

    Returns:
        A DataFrame restricted to the key columns with duplicate rows dropped.
    """
    # The previous body referenced TRANS_KEY_COLS, which is not defined
    # anywhere; the constant is named TRANSACT_KEY_COLS.
    if key_cols is None:
        key_cols = TRANSACT_KEY_COLS
    return df[list(key_cols)].drop_duplicates()
    
    
def lookup_description(original_description, descr_map):
    """Map a raw transaction description to a title-cased pretty description.

    Args:
        original_description: Raw description string of a transaction.
        descr_map: Mapping with a ``'description_maps'`` entry holding
            ``'identity_phrases'`` (list of phrases) and ``'phrase_maps'``
            (list of dicts with ``'key_phrase'`` and ``'description'``).

    Returns:
        The longest candidate description (first one on ties), or
        ``original_description`` unchanged when nothing matches.
    """
    maps = descr_map['description_maps']

    # Identity phrases: a phrase found (case-insensitively) inside the raw
    # description is itself the pretty description.
    candidates = [
        phrase.title()
        for phrase in maps['identity_phrases']
        if phrase.lower() in original_description.lower()
    ]

    # Phrase maps: a matching key phrase contributes its mapped description.
    candidates.extend(
        entry['description'].title()
        for entry in maps['phrase_maps']
        if entry['key_phrase'].lower() in original_description.lower()
    )

    if not candidates:
        return original_description

    # Several phrases can match at once; keep the longest resulting
    # description (max returns the earliest candidate among equal lengths).
    return max(candidates, key=len)
    
    
def get_pretty_descriptions(original_descriptions):
    """Look up a pretty description for each raw description.

    The description map is loaded once from ``PATH_TO_DESCRIPTION_MAP`` and
    every entry of ``original_descriptions`` is passed through
    ``lookup_description``.

    Returns:
        A list of descriptions in the same order as the input.
    """
    mapping = load_yaml(PATH_TO_DESCRIPTION_MAP)
    return [lookup_description(raw, mapping) for raw in original_descriptions]
    

def load_raw_transactions():
    """Load, normalize and aggregate the raw transactions CSV.

    Reads ``transactions.csv`` under ``PATH_TO_RAW``, snake-cases the column
    names, collapses whitespace runs in ``original_description``, sums
    ``amount`` over rows sharing the same date / description / transaction
    type / account, and adds a ``description`` column with the pretty
    description of each row.

    Returns:
        The aggregated DataFrame, one row per group.
    """
    transactions = pd.read_csv(PATH_TO_RAW + "transactions.csv")
    transactions.columns = [to_snake(name) for name in transactions.columns]

    # Collapse repeated whitespace so identical descriptions compare equal.
    transactions['original_description'] = transactions['original_description'].apply(
        lambda txt: ' '.join(txt.split())
    )

    # Rows identical on all of these keys are treated as duplicates of the
    # same transaction and their amounts are added together.
    group_keys = ["date", "original_description", "transaction_type", "account_name"]
    aggregated = transactions.groupby(group_keys, as_index=False)["amount"].sum()

    # Derive the human-friendly description from the raw one.
    aggregated["description"] = get_pretty_descriptions(
        aggregated["original_description"].values
    )

    return aggregated
    

In [45]:
# TODO: the lookup is not returning the longest description for some
# transactions; try "citicards cash reward".
raw_transact = load_raw_transactions()
mentions_citi = raw_transact["original_description"].apply(lambda text: "citi" in text.lower())
raw_transact.loc[mentions_citi]

Unnamed: 0,date,original_description,transaction_type,account_name,amount,description
106,1/27/2021,CITI CARD ONLINE PAYMENT 4203,debit,CHASE COLLEGE,386.73,Transfer
582,2/02/2021,CITI CARD ONLINE PAYMENT 4203,debit,CHASE COLLEGE,398.48,Transfer
659,2/18/2021,CITI CARD ONLINE PAYMENT 4303,debit,CHASE COLLEGE,674.57,Transfer
715,3/05/2021,CITI CARD ONLINE PAYMENT 4203,debit,CHASE COLLEGE,511.13,Transfer
789,3/16/2021,CITI CARD ONLINE PAYMENT 4303,debit,CHASE COLLEGE,983.74,Transfer
878,4/13/2021,CITI AUTOPAY PAYMENT 0804,debit,CHASE COLLEGE,76.2,Citi
888,4/15/2021,CITI CARD ONLINE PAYMENT 4304,debit,CHASE COLLEGE,109.92,Transfer
950,4/30/2021,CITI CARD ONLINE PAYMENT 4204,debit,CHASE COLLEGE,216.76,Transfer
1080,5/24/2021,CITI CARD ONLINE PAYMENT 4204,debit,CHASE COLLEGE,786.27,Transfer
1162,6/11/2021,CITI CARD ONLINE PAYMENT 4304,debit,CHASE COLLEGE,431.43,Transfer


In [46]:
raw_transact.columns

Index(['date', 'original_description', 'transaction_type', 'account_name',
       'amount', 'description'],
      dtype='object')

In [67]:
raw_transact['account_name'].value_counts()

CHASE COLLEGE                      1674
Venmo                                50
CREDIT CARD                          37
Citi® Double Cash Card               24
Fidelity Rewards Visa Signature      17
ETH Wallet                            5
PayPal Account                        4
My Wallet                             3
INDIVIDUAL                            2
ADA Wallet                            2
Name: account_name, dtype: int64

In [71]:
# Rows that are Venmo either by pretty description or by source account.
described_as_venmo = raw_transact['description'] == 'Venmo'
from_venmo_account = raw_transact['account_name'] == 'Venmo'
venmo_stuff = raw_transact.loc[described_as_venmo | from_venmo_account]

In [80]:
venmo_stuff.groupby(['description', 'transaction_type']).amount.sum().sort_values()

description                        transaction_type
Anotha One                         credit                 3.00
Lvkbbq                             credit                 3.00
Exvhange Rate Fees                 credit                 3.00
ATM fee                            debit                  5.00
Atm fees                           credit                 5.00
Chug                               debit                  5.00
Kai                                debit                 10.43
Victoria Garcia Paid               credit                12.00
Kai March                          credit                12.22
Airline Seat Change                credit                14.40
Rest mia                           debit                 17.88
Grill V Pescadores                 credit                20.00
Indian                             credit                21.50
Parking                            credit                24.00
Food                               debit                 24.57
Dra

In [48]:
categories = load_yaml("./app/categories.yml")

In [54]:
# Index the category list by name: category name -> its list of subcategories.
organized_cats = {
    entry["name"]: entry["subcategories"]
    for entry in categories['categories']
}

In [56]:
sorted(organized_cats)

['amusement',
 'eating & drinking out',
 'entertainment media',
 'government',
 'grocery store',
 'health',
 'home',
 'insurance',
 'lodging',
 'shopping',
 'student loans',
 'transportation',
 'travel',
 'uncategorized']

In [57]:
organized_cats

{'eating & drinking out': ['',
  'delivery',
  'resturants',
  'coffee shops',
  'bars'],
 'home': ['', 'rent', 'utilities', 'home items'],
 'grocery store': [''],
 'entertainment media': ['', 'movies', 'music', 'news', 'books'],
 'amusement': [''],
 'student loans': [''],
 'transportation': ['', 'gas', 'uber', 'public'],
 'lodging': ['', 'airbnb', 'hotel'],
 'shopping': ['', 'music', 'sports', 'clothing', 'electronics', 'durables'],
 'health': ['', 'doctor', 'pharmacy', 'hygene', 'gym', 'therapy'],
 'travel': ['', 'airplane'],
 'government': ['', 'taxes', 'dmv', 'tickets'],
 'insurance': ['', 'renters', 'auto'],
 'uncategorized': ['']}

In [22]:
# df[['date','original_description','amount']].value_counts().sort_values(ascending=False)

In [439]:
# `df` is not defined at notebook scope in any visible cell, so this cell
# fails with NameError on Restart & Run All; split the loaded frame instead.
# NOTE(review): assumes `raw_transact` is the frame `df` referred to — confirm.
credits = raw_transact[raw_transact['transaction_type'] == 'credit']
debits = raw_transact[raw_transact['transaction_type'] == 'debit']

In [442]:
def regex_filter(descr):
    """Strip boilerplate fragments from a transaction description.

    Each pattern below is applied in order with ``re.sub`` and every match is
    replaced by the empty string. Matching is case-sensitive.

    Args:
        descr: The description string to clean.

    Returns:
        The description with all matched fragments removed.
    """
    # Raw strings keep the regex escapes (\s, \d, \*, ...) intact without
    # relying on Python passing unknown string escapes through, which emits
    # a warning on current interpreters.
    patterns = [
        # whitespace followed by a run of digits
        r'\s\d+',

        # state and forward slash
        r'\s[A-Z]{1,2}/[0-9]{2}',

        # filter LLC extension
        r', LLC.*',
        r'LLC.*',

        # filter website extension
        r'\sWWW.*',
        # NOTE(review): matches only ONE non-space character before ".com";
        # `\S+` may have been intended — confirm before changing.
        r'\s\S\.com',

        # simple filters
        r'\sWEB\s',
        r'ID:',

        r'TST\*\s',

        r'\sHTTP.*',
    ]

    new_descr = descr
    for pattern in patterns:
        new_descr = re.sub(pattern, '', new_descr)

    return new_descr