In [24]:
import git
import os
import pandas as pd
import numpy as np

### Settings

In [25]:
def get_git_root(path):
    git_repo = git.Repo(path, search_parent_directories=True)
    git_root = git_repo.git.rev_parse("--show-toplevel")
    return git_root

data_root_dir_append = "data"
data_raw_dir_append = "data/raw"
data_interim_dir_append = "data/interim"
data_processed_dir_append = "data/processed"

In [26]:
git_root_path = get_git_root(os.getcwd())

raw_data_path = os.path.join(git_root_path, data_raw_dir_append)

txn_files = [f for f in os.listdir(raw_data_path) if os.path.isfile(os.path.join(raw_data_path, f))]

### Drop into config file

In [27]:
txn_date_col_name = "txn_date"
txn_description_col_name = "txn_description"
txn_amount_col_name = "txn_amount"
bank_col_name = "bank"
id_col_name = "unique_id"
txn_strftime_format = "%Y-%m-%d"

usaa_config = {
    "columns" : [
        "status", 
        None,
        txn_date_col_name,
        None,
        txn_description_col_name,
        "categorization",
        txn_amount_col_name
    ],
    "filter_positives" : True
}

citizens_config = {
    "columns" : [
        "Transaction Type",
        txn_date_col_name,
        "Account Type",
        txn_description_col_name,
        txn_amount_col_name,
        "Reference No.",
        "Credits",
        "Debits"
    ],
    "filter_positives" : True
}

### Classes - Txn and TxnFile

In [32]:
class TxnFile():
    
    def __init__(self, path, file):
        
        self.root_dir = path
        self.file = file
        self.full_path = os.path.join(path, file)
        self.citizens_bank_key = 'citizens'
        self.usaa_bank_key = 'usaa'
        self.fidelity_bank_key = 'fidelity'
        self.discover_key = 'discover'
        
        self.bank = self.get_bank()
        
        if self.bank == self.citizens_bank_key:
            self.bank_config = citizens_config
        elif self.bank == self.usaa_bank_key:
            self.bank_config = usaa_config
            
        self.account_type = self.get_account_type()
        self.txn_df = self.get_txn_df()
        
    def get_bank(self):
        
        if 'bk_download' in self.file:
            bank = self.usaa_bank_key
        elif 'EXPORT' in self.file:
            bank = self.citizens_bank_key
            
        return bank
                 
    def get_account_type(self):

        acceptable_inputs = ['credit', 'checking', 'investment']
        prompt = "What kind of account is this file? {}\nMust be one of {}".format(self.file, acceptable_inputs)
        
        while True:
            acct_type = input(prompt)
            
            if acct_type not in acceptable_inputs:
                print("ERROR! Input not one of {}\n".format(acceptable_inputs))
                continue
            else:
                break
                
        return acct_type
            
    def get_txn_df(self):

        bank = self.bank
        acct_type = self.account_type
        
        header = 0 if self.bank in [self.citizens_bank_key] else None
        raw_txn_df = pd.read_csv(self.full_path, header=header)
        
        columns = self.bank_config['columns']
        filter_positives = self.bank_config['filter_positives']
        raw_txn_df.columns = columns

        txn_df = raw_txn_df[[txn_date_col_name, txn_amount_col_name, txn_description_col_name]].copy()
        #print(txn_df.dtypes)
#        txn_df = txn_df
        txn_df[txn_amount_col_name] = txn_df[txn_amount_col_name].astype(str)
        txn_df[txn_amount_col_name] = txn_df[txn_amount_col_name].str.replace("--", "")
        txn_df[txn_amount_col_name] = txn_df[txn_amount_col_name].astype(float)
        txn_df[txn_date_col_name]= pd.to_datetime(txn_df[txn_date_col_name]) 
        
        txn_df["bank"] = self.bank
        txn_df["acct_type"] = self.account_type
        
        txn_df["unique_id"] = txn_df.apply(
            lambda x:"{}_{}_{}_{}".format(
                bank, 
                acct_type, 
                x[txn_date_col_name].strftime("%Y%m%d"), 
                abs(x[txn_amount_col_name])
            ), axis = 1
        )
        
#         if filter_positives is True:
#             txn_df = txn_df[txn_df[txn_amount_col_name] > 0]
        return txn_df
    
        

In [20]:
full_txn_df = pd.DataFrame()

for txn_file_path in txn_files:

    txn_file = TxnFile(raw_data_path, txn_file_path)
    full_txn_df = full_txn_df.append(txn_file.txn_df)

What kind of account is this file? bk_download.csv
Must be one of ['credit', 'checking', 'investment']checking
What kind of account is this file? bk_download_cc_20180101_20190101.csv
Must be one of ['credit', 'checking', 'investment']credit
What kind of account is this file? EXPORT.CSV
Must be one of ['credit', 'checking', 'investment']checking


In [33]:
full_txn_df

Unnamed: 0,txn_date,txn_amount,txn_description,bank,acct_type,unique_id
0,2019-12-17,-21.73,AMZN Mktp US*XA2802ID3 AMZN.COM BILLWA,usaa,checking,usaa_checking_20191217_21.73
1,2019-12-17,-8.00,TST* STIR COOKING SCHOOLTDENVER CO,usaa,checking,usaa_checking_20191217_8.0
2,2019-12-17,-4.20,SQ *ALLEGRO COFFEE DENVER CO,usaa,checking,usaa_checking_20191217_4.2
3,2019-12-16,-686.92,USAA CREDIT CARD PAYMENT,usaa,checking,usaa_checking_20191216_686.92
4,2019-12-16,-86.67,WAL-MART #0986840 SUMMIT FRISCO CO,usaa,checking,usaa_checking_20191216_86.67
...,...,...,...,...,...,...
7,2019-12-13,1000.00,ORACLE AMERICA SALARY,citizens,checking,citizens_checking_20191213_1000.0
8,2019-12-03,-900.00,USAA.COM PAYMNT CREDIT CRD,citizens,checking,citizens_checking_20191203_900.0
9,2019-12-02,-100.00,FID BKG SVC LLC MONEYLINE,citizens,checking,citizens_checking_20191202_100.0
10,2019-12-02,-500.00,TO CHECKING 6300106334,citizens,checking,citizens_checking_20191202_500.0


In [48]:
# txn_date_col_name = "txn_date"
# txn_description_col_name = "txn_description"
# txn_amount_col_name = "txn_amount"

class Txn():
    
    # date, amt, description, bank, acct
    def __init__(self, txn):
        
        self.txn = txn
        self.txn_date = txn[txn_date_col_name].strftime(txn_strftime_format)
        self.txn_description = txn[txn_description_col_name]
        self.txn_amount = txn[txn_amount_col_name]
        self.bank = txn[bank_col_name]
        self.id = txn[id_col_name]
        
        self.txn_categorization_config = self.get_txn_categorization_config()
        self.txn_config_df = self.get_txn_config()
        
    @staticmethod
    def get_txn_config():
        
        txn_config_path = os.path.join(git_root_path, "configs", "txn_config.csv")
        txn_config_df = pd.read_csv(txn_config_path)
        
        return txn_config_df
    
    @staticmethod
    def get_txn_categorization_config():
        
        return {
            "type" : ["discretionary", "fixed", "intermittent", "variable"],
            "mapping" : {
                "Auto and Transport" : [
                    "Gas", "Insurance", "Lease Payment",
                    "Maintenance", "Miscellaneous", "Parking",  "Parts",
                    "Public Transport", "Ride Share", "Ticket", "Tolls"
                ], 
                "Bills and Utilities" : [
                    "Credit Card Payment", "Energy", "Phone", "Rent", "TV", "Utilities (Energy, TV, Wifi)", "Wifi"
                ],
                "Dining Out" : [
                    "Bars", "Coffee", "Group", "Solo"
                ],
                "Education" : [
                    "Student Loan Payment",
                    "Miscellaneous",
                    "Tuition"
                ],
                "Experiences" : [
                    "Flight", "Group", "Hobbies", "Lodging", "Tickets"
                ],
                "Entertainment" : [
                    "Audible", "Amazon Prime", "Other", "Sports", "Spotify", "TV", "Gambling"
                ],
                "Fees and Charges" : [
                    "Bank Service Fee", "Late Fee"
                ],
                "Gifts and Donations" : [
                    "Charity", "Holiday/Birthday", "Political", "Other"
                ],
                "Groceries" : [
                    "Combo", "Food", "Non-Food", "Unknown"
                ],
                "Health and Fitness" : [
                    "Gym", "Medical Care", "Body Maintenance", "Miscellaneous" , "Therapist", "Yoga"
                ], 
                "Hobbies" : [
                    "MTB", "Other",
                    "Snowboarding - Gear", "Snowboarding - Miscellaneous", "Snowboarding - Pass"
                ],
                "Income" : [
                    "Gift", "Lyft", "Other", "Oracle"
                ],
                "Investments" : [
                    "Investment - Withdrawal", "Investment - Deposit"
                ],
                "Miscellaneous" : ["Shipping", "Tools", "Wifi", "Other"],
                "Personal Care" : [
                    "Drycleaning", "Haircut", "Laundry"
                ],
                "Shopping" : [
                    "Clothes", "Gear", "Alcohol", "Technology"
                ], 
                "Taxes" : [
                    "Taxes - Federal", "Taxes - State", "Taxes - Local"
                ],
                "Uncategorized" : [
                    "ATM Withdrawal", "Bank Transfer", "Credit Card Redemption" , "Other"
                ],
                "Vanlife" : [
                    "Accessories", "Auto Parts", "Gym", "Loan Payment", "Maintenance", "PO Box", "Storage", "Taxes"
                ]
            }
        }
    
    def categorize_txn(self):
        
        txn = self.txn
        txn_description = txn['txn_description'] if txn['txn_description'] == txn['txn_description'] else None
        txn_amount = - 1 * txn['txn_amount']
        bank = txn['bank']
        txn_mapping = self.txn_categorization_config['mapping']
        
        txn_config_subset = self.txn_config_df.copy()
        txn_config_subset["recipient_match"] = txn_config_subset.apply(
            lambda x: True if txn_description is not None and x["recipient"] in txn_description else False, axis = 1
        )
        txn_config_subset = txn_config_subset[txn_config_subset["recipient_match"] == True]
        
        if len(txn_config_subset) > 1:
        
            txn_config_subset["dynamic_logic_match"] = txn_config_subset.apply(
                lambda x: eval(x["dynamic_logic"], {'txn_amount' : txn_amount, 'bank' : bank}) \
                    if x["dynamic_logic"] not in [np.NaN, None] else True, axis = 1
            )
            txn_config_subset = txn_config_subset[txn_config_subset["dynamic_logic_match"] == True]
        
        assert len(txn_config_subset) <= 1, "multiple categorizations found in config for txn:\n{}\n{}".format(self.txn, txn_config_subset)
        
        if len(txn_config_subset) == 1:
            
            txn_category = txn_config_subset.iloc[0]['category']
            txn_sub_category = txn_config_subset.iloc[0]['sub_category']
            
        elif len(txn_config_subset) == 0:
            
            txn_category, txn_sub_category = self.get_categorization_prompt()
                
        return txn_category, txn_sub_category
    
    def get_categorization_prompt(self):
        
        txn_mapping = self.txn_categorization_config['mapping']
        
        cat_prompt = "----------${} / {} / {}\nWhat category is this txn? Must be one of:\n{}".format(
            self.txn_amount,
            self.txn_date, 
            self.txn_description,
            [cat for cat in txn_mapping.keys()]
        )
            
        while True:
            txn_category = input(cat_prompt)

            if txn_category not in txn_mapping.keys():
                print("ERROR! Input not one of {}\n".format(txn_mapping.keys()))
                continue
            else:
                sub_cats = txn_mapping[txn_category]
                break

        while True:

            sub_cats.append('Redo')
            subcat_prompt = "What sub-category is the above txn? Must be one of:\n{}".format(
                sub_cats
            )

            txn_sub_category = input(subcat_prompt)
            
            if txn_sub_category not in sub_cats:
                print("ERROR! Input not one of {}\n".format(sub_cats))
                continue
            else:
                if txn_sub_category == "Redo":
                    self.get_categorization_prompt()
                break
                
        return txn_category, txn_sub_category

In [49]:
categorized_txn_df = pd.DataFrame()
categorized_txn_path = os.path.join(git_root_path , data_interim_dir_append, "categorized_txn.csv")

previously_categorized_txn_df = pd.read_csv(categorized_txn_path)

full_txn_df_dupes_rm = full_txn_df.copy()
full_txn_df_dupes_rm['remove_flg'] = full_txn_df.apply(
    lambda x: x.unique_id in previously_categorized_txn_df.unique_id.tolist(), axis=1
)
full_txn_df_dupes_rm = full_txn_df_dupes_rm[full_txn_df_dupes_rm['remove_flg'] == False]

for idx, txn in full_txn_df_dupes_rm.iterrows():
    indiv_txn = Txn(txn)
    indiv_txn_cat, indiv_txn_subcat = indiv_txn.categorize_txn()
    
    categorized_txn = {
        id_col_name : indiv_txn.id,
        txn_date_col_name : indiv_txn.txn_date,
        txn_amount_col_name : indiv_txn.txn_amount,
        txn_description_col_name : indiv_txn.txn_description,
        "category" : indiv_txn_cat,
        "sub_category" : indiv_txn_subcat
    }
    
    categorized_txn_df = categorized_txn_df.append(categorized_txn, ignore_index=True)
        
    full_categorized_txn_df = categorized_txn_df.append(previously_categorized_txn_df).reset_index(drop = True)
    full_categorized_txn_df = full_categorized_txn_df.sort_values(by=txn_date_col_name, ascending=False)
    full_categorized_txn_df.to_csv(categorized_txn_path, index=False)
    


None
----------$-11.99 / 2019-12-17 / nan
What category is this txn? Must be one of:
['Auto and Transport', 'Bills and Utilities', 'Dining Out', 'Education', 'Experiences', 'Entertainment', 'Fees and Charges', 'Gifts and Donations', 'Groceries', 'Health and Fitness', 'Hobbies', 'Income', 'Investments', 'Miscellaneous', 'Personal Care', 'Shopping', 'Taxes', 'Uncategorized', 'Vanlife']Fees and Charges
What sub-category is the above txn? Must be one of:
['Bank Service Fee', 'Late Fee', 'Redo']Bank Service Fee
None
----------$0.01 / 2019-12-17 / nan
What category is this txn? Must be one of:
['Auto and Transport', 'Bills and Utilities', 'Dining Out', 'Education', 'Experiences', 'Entertainment', 'Fees and Charges', 'Gifts and Donations', 'Groceries', 'Health and Fitness', 'Hobbies', 'Income', 'Investments', 'Miscellaneous', 'Personal Care', 'Shopping', 'Taxes', 'Uncategorized', 'Vanlife']Fees and Charges
What sub-category is the above txn? Must be one of:
['Bank Service Fee', 'Late Fee', 'R

In [None]:
full_categorized_txn_df = categorized_txn_df.append(previously_categorized_txn_df).reset_index(drop = True)

