In [4]:
import os
import re
import warnings

import pandas as pd
import yaml
from IPython.display import HTML, display

# pandas 1.5 moved SettingWithCopyWarning to pandas.errors; the old
# pandas.core.common location is kept as a fallback for older installs.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    from pandas.core.common import SettingWithCopyWarning

# Suppress SettingWithCopyWarning for the rest of the session.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Inject CSS that sets the notebook's .container element to 80% width.
display(HTML("<style>.container { width:80% !important; }</style>"))

# Let pandas render up to 2000 rows before truncating a displayed frame.
pd.set_option("display.max_rows", 2000)

In [89]:
# Parse the description-mapping YAML so its contents can be inspected.
# NOTE(review): load_yaml is defined in a later cell of this notebook, so on a
# fresh kernel that cell has to run before this one.
descript_map = load_yaml("./app/descriptions.yml")

In [90]:
# Show the parsed mapping (its 'identity_phrases' and 'phrase_maps' entries).
descript_map

{'description_maps': {'identity_phrases': ['ironnet',
   'robinhood',
   'home depot',
   'amazon',
   'venmo',
   'remote online deposit',
   'simple',
   'atm',
   'coinbase',
   'chase card',
   'citi',
   'usaa',
   'vhc',
   'parking',
   '5guys',
   'prime video',
   'cubesmart',
   'hbo',
   'bird app',
   'bund up',
   'total wine',
   'bread & water',
   'giant',
   'idego',
   'goat',
   'lim*ride',
   'onlyfans',
   'postmates',
   'philz coffee',
   'northside',
   'origin coffee lab'],
  'phrase_maps': [{'key_phrase': 'electronic funds transfer',
    'description': 'transfer'},
   {'key_phrase': 'citi card online payment',
    'description': 'credit card payment'},
   {'key_phrase': 'amzn', 'description': 'amazon'},
   {'key_phrase': 'cromwell', 'description': 'rent cromwell'},
   {'key_phrase': 'education student ln', 'description': 'student loans'},
   {'key_phrase': 'harris', 'description': 'harris teeter'},
   {'key_phrase': 'garage', 'description': 'parking'},
   {'ke

In [20]:
# Columns that get_trans_keys selects before dropping duplicate rows.
TRANS_KEY_COLS = ["date", "original_description", "amount"]

# Directory prefix that load_raw_transactions prepends to "transactions.csv".
PATH_TO_RAW = "./data/raw/"

# NOTE(review): not referenced by any visible cell; presumably the directory
# for categorized output. Confirm before relying on it.
PATH_TO_CATEGORIZED = "./data/categorized/"

def load_yaml(path):
    """Parse the YAML file at ``path`` with ``yaml.safe_load``.

    Returns the parsed document. If parsing raises ``yaml.YAMLError`` the
    error is printed and ``None`` is returned instead; errors from opening
    the file are not caught.
    """
    parsed = None
    with open(path, 'r') as handle:
        try:
            parsed = yaml.safe_load(handle)
        except yaml.YAMLError as parse_error:
            print(parse_error)
    return parsed

def to_snake(txt):
    """Lower-case ``txt`` and turn every space character into an underscore."""
    return txt.lower().replace(" ", "_")


def get_trans_keys(df):
    """Return the distinct rows of ``df`` restricted to ``TRANS_KEY_COLS``."""
    key_columns = df[TRANS_KEY_COLS]
    return key_columns.drop_duplicates()


def get_net_amount(row):
    """Return the signed amount for one transaction row.

    A row whose ``transaction_type`` is ``'debit'`` keeps its ``amount``
    as-is; any other type gets the amount multiplied by -1.
    """
    is_debit = row['transaction_type'] == 'debit'
    return row['amount'] if is_debit else row['amount'] * -1
    
    
def get_pretty_description(original_descr, descr_map):
    """Pick a display description for a raw transaction description.

    Matching is a case-insensitive substring test against ``original_descr``:

    * every entry of ``descr_map['description_maps']['identity_phrases']``
      that occurs in the text contributes itself (title-cased);
    * every entry of ``descr_map['description_maps']['phrase_maps']`` whose
      ``key_phrase`` occurs in the text contributes its ``description``
      (title-cased).

    Several candidates can match because some phrases are contained in
    others. The longest candidate wins, and the earliest one wins a tie.
    When nothing matches, ``original_descr`` is returned unchanged.
    """
    def occurs_in_original(needle):
        return needle.lower() in original_descr.lower()

    maps = descr_map['description_maps']

    candidates = [
        phrase.title()
        for phrase in maps['identity_phrases']
        if occurs_in_original(phrase)
    ]
    candidates += [
        entry['description'].title()
        for entry in maps['phrase_maps']
        if occurs_in_original(entry['key_phrase'])
    ]

    if not candidates:
        return original_descr

    # max() keeps the first of equally long candidates.
    return max(candidates, key=len)
    
    
def get_updated_descriptions(original_descriptions):
    """Map each raw description to its pretty description.

    The mapping is loaded once from ``./app/descriptions.yml`` and every item
    of ``original_descriptions`` is passed through ``get_pretty_description``.
    Returns a list in the same order as the input.
    """
    mapping = load_yaml("./app/descriptions.yml")
    return [
        get_pretty_description(raw_text, mapping)
        for raw_text in original_descriptions
    ]
    

def load_raw_transactions():
    """Read the raw transactions CSV and attach pretty descriptions.

    Steps: read ``transactions.csv`` from ``PATH_TO_RAW``, convert the column
    names with ``to_snake``, set ``description`` from ``original_description``
    via ``get_updated_descriptions``, and return the frame without the
    ``category``, ``labels`` and ``notes`` columns.

    Open question: filter some transactions out completely here? Probably not
    strictly necessary, since later analysis looks at specific categories.
    An alternative is to keep everything, label some rows as transfers, and
    filter the transfers out when doing trend analysis. Candidates:

        description = robinhood
        description = credit card payment
        description = transfer
        description = venmo
        account     = venmo
    """
    transactions = pd.read_csv(PATH_TO_RAW + "transactions.csv")
    transactions = transactions.rename(columns=to_snake)
    transactions['description'] = get_updated_descriptions(
        transactions['original_description'].values
    )
    return transactions.drop(columns=["category", "labels", "notes"])
    

In [6]:
# TODO: the longest matching description is not always the one chosen for a
# transaction; check the "citcards cash rewards" rows to reproduce.
raw_transact = load_raw_transactions()
raw_transact.head(200)

Unnamed: 0,date,description,original_description,amount,transaction_type,account_name
0,6/19/2021,Origin Coffee Lab,AUTH : ORIGIN COFFEE LAB & KITCH,3.02,debit,Fidelity Rewards Visa Signature
1,6/18/2021,Amazon,AMZN Mktp US,7.41,debit,CREDIT CARD
2,6/18/2021,Northside,AUTH : TST* NORTHSIDE SOCIAL COF,17.88,debit,Fidelity Rewards Visa Signature
3,6/18/2021,Amazon,AMZN Mktp US,52.99,debit,CREDIT CARD
4,6/18/2021,Prime Video,Prime Video,3.99,debit,CREDIT CARD
5,6/17/2021,Amazon,AMZN Mktp US,31.79,debit,CREDIT CARD
6,8/14/2018,Amazon,POS DEBIT AMZN Mktp US ...,19.02,debit,CHASE COLLEGE
7,8/14/2018,Amazon,POS DEBIT AMZN Mktp US ...,245.4,debit,CHASE COLLEGE
8,6/18/2021,GOLDS GYM VA SOUTH AR 703-683-46,GOLDS GYM VA SOUTH AR 703-683-46,24.99,debit,CHASE COLLEGE
9,6/18/2021,FID BKG SVC LLC MONEYLINE,FID BKG SVC LLC MONEYLINE,1000.0,debit,CHASE COLLEGE


In [67]:
# Number of rows per account_name value, most frequent first.
raw_transact['account_name'].value_counts()

CHASE COLLEGE                      1674
Venmo                                50
CREDIT CARD                          37
Citi® Double Cash Card               24
Fidelity Rewards Visa Signature      17
ETH Wallet                            5
PayPal Account                        4
My Wallet                             3
INDIVIDUAL                            2
ADA Wallet                            2
Name: account_name, dtype: int64

In [71]:
# Keep rows that are Venmo by description or by account name.
venmo_stuff = raw_transact[
    raw_transact['description'].eq('Venmo')
    | raw_transact['account_name'].eq('Venmo')
]

In [80]:
# Total amount per (description, transaction_type) pair, smallest total first.
(
    venmo_stuff
    .groupby(['description', 'transaction_type'])['amount']
    .sum()
    .sort_values()
)

description                        transaction_type
Anotha One                         credit                 3.00
Lvkbbq                             credit                 3.00
Exvhange Rate Fees                 credit                 3.00
ATM fee                            debit                  5.00
Atm fees                           credit                 5.00
Chug                               debit                  5.00
Kai                                debit                 10.43
Victoria Garcia Paid               credit                12.00
Kai March                          credit                12.22
Airline Seat Change                credit                14.40
Rest mia                           debit                 17.88
Grill V Pescadores                 credit                20.00
Indian                             credit                21.50
Parking                            credit                24.00
Food                               debit                 24.57
Dra

In [22]:
# df[['date','original_description','amount']].value_counts().sort_values(ascending=False)

In [439]:
# Split the frame into credit rows and debit rows by transaction_type.
# NOTE(review): `df` is not assigned in any visible cell; presumably it is the
# transactions frame. Confirm its origin before re-running on a fresh kernel.
credits = df[df['transaction_type'] == 'credit']
debits = df[df['transaction_type'] == 'debit']

In [442]:
def regex_filter(descr):
    """Strip noise fragments from a transaction description.

    Every pattern listed below is deleted from ``descr`` with ``re.sub``, one
    pattern after another in the listed order, so each pattern operates on
    the output of the previous ones.

    Args:
        descr: Description text to clean (a ``str``).

    Returns:
        ``descr`` with all pattern matches removed. The result is not
        trimmed, so leading or trailing whitespace may remain.
    """
    # Raw strings: regex escapes such as \s, \d and \* are not valid Python
    # string escapes, and plain literals containing them trigger
    # invalid-escape warnings at compile time.
    patterns = (
        # whitespace followed by a run of digits
        r'\s\d+',

        # whitespace, one or two capital letters, a slash and two digits
        # (state and forward slash)
        r'\s[A-Z]{1,2}/[0-9]{2}',

        # LLC suffix and everything after it
        r', LLC.*',
        r'LLC.*',

        # website fragments
        r'\sWWW.*',
        # NOTE(review): matches whitespace + exactly ONE non-space character
        # + ".com"; possibly meant to be \S+ to match a whole domain. Confirm.
        r'\s\S\.com',

        # simple filters
        r'\sWEB\s',
        r'ID:',

        # literal "TST*" marker followed by one whitespace character
        r'TST\*\s',

        r'\sHTTP.*',
    )

    cleaned = descr
    for pattern in patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned