In [1]:
import git
import os
import pandas as pd

### Settings

In [2]:
def get_git_root(path):
    git_repo = git.Repo(path, search_parent_directories=True)
    git_root = git_repo.git.rev_parse("--show-toplevel")
    return git_root

data_root_dir_append = "data"
data_raw_dir_append = "data/raw"
data_interim_dir_append = "data/interim"
data_processed_dir_append = "data/processed"

In [3]:
git_root_path = get_git_root(os.getcwd())

raw_data_path = os.path.join(git_root_path, data_raw_dir_append)

txn_files = [f for f in os.listdir(raw_data_path) if os.path.isfile(os.path.join(raw_data_path, f))]

### Drop into config file

In [4]:
txn_date_col_name = "txn_date"
txn_description_col_name = "txn_description"
txn_amount_col_name = "txn_amount"

usaa_config = {
    "columns" : [
        "status", 
        None,
        txn_date_col_name,
        None,
        txn_description_col_name,
        "categorization",
        txn_amount_col_name
    ],
    "filter_positives" : True
}

citizens_config = {
    "columns" : [
        "Transaction Type",
        txn_date_col_name,
        "Account Type",
        txn_description_col_name,
        txn_amount_col_name,
        "Reference No.",
        "Credits",
        "Debits"
    ],
    "filter_positives" : True
}

In [5]:
class TxnFile():
    
    def __init__(self, path, file):
        
        self.root_dir = path
        self.file = file
        self.full_path = os.path.join(path, file)
        self.citizens_bank_key = 'citizens'
        self.usaa_bank_key = 'usaa'
        self.fidelity_bank_key = 'fidelity'
        self.discover_key = 'discover'
        
        self.bank = self.get_bank()
        
        if self.bank == self.citizens_bank_key:
            self.config = citizens_config
        elif self.bank == self.usaa_bank_key:
            self.config = usaa_config
            
        self.account_type = self.get_account_type()
        self.txn_df = self.get_txn_df()
        
    def get_bank(self):
        
        if 'bk_download' in self.file:
            bank = self.usaa_bank_key
        elif 'EXPORT' in self.file:
            bank = self.citizens_bank_key
            
        return bank
                 
    def get_account_type(self):

        acceptable_inputs = ['credit', 'checking', 'investment']
        prompt = "What kind of account is this file? {}\nMust be one of {}".format(self.file, acceptable_inputs)
        
        while True:
            acct_type = input(prompt)
            
            if acct_type not in acceptable_inputs:
                print("ERROR! Input not one of {}\n".format(acceptable_inputs))
                continue
            else:
                break
                
        return acct_type
            
    def get_txn_df(self):

        bank = self.bank
        acct_type = self.account_type
        
        header = 0 if self.bank in [self.citizens_bank_key] else None
        raw_txn_df = pd.read_csv(self.full_path, header=header)
        
        columns = self.config['columns']
        filter_positives = self.config['filter_positives']
        raw_txn_df.columns = columns

        txn_df = raw_txn_df[[txn_date_col_name, txn_amount_col_name, txn_description_col_name]].copy()
        #print(txn_df.dtypes)
#        txn_df = txn_df
        txn_df[txn_amount_col_name] = txn_df[txn_amount_col_name].astype(str)
        txn_df[txn_amount_col_name] = txn_df[txn_amount_col_name].str.replace("--", "")
        txn_df[txn_amount_col_name] = txn_df[txn_amount_col_name].astype(float)
        txn_df[txn_date_col_name]= pd.to_datetime(txn_df[txn_date_col_name]) 
        
        txn_df["bank"] = self.bank
        txn_df["acct_type"] = self.account_type
        
        txn_df["unique_id"] = txn_df.apply(
            lambda x:"{}_{}_{}_{}".format(
                bank, 
                acct_type, 
                x[txn_date_col_name].strftime("%Y%m%d"), 
                abs(x[txn_amount_col_name])
            ), axis = 1
        )
        
#         if filter_positives is True:
#             txn_df = txn_df[txn_df[txn_amount_col_name] > 0]
        return txn_df
    
        

In [6]:
full_txn_df = pd.DataFrame()

for txn_file_path in txn_files:

    txn_file = TxnFile(raw_data_path, txn_file_path)
    full_txn_df = full_txn_df.append(txn_file.txn_df)

What kind of account is this file? bk_download.csv
Must be one of ['credit', 'checking', 'investment']checking
What kind of account is this file? EXPORT.CSV
Must be one of ['credit', 'checking', 'investment']checking


In [7]:
full_txn_df

Unnamed: 0,txn_date,txn_amount,txn_description,bank,acct_type,unique_id
0,2019-12-17,-21.73,AMZN Mktp US*XA2802ID3 AMZN.COM BILLWA,usaa,checking,usaa_checking_20191217_21.73
1,2019-12-17,-8.00,TST* STIR COOKING SCHOOLTDENVER CO,usaa,checking,usaa_checking_20191217_8.0
2,2019-12-17,-4.20,SQ *ALLEGRO COFFEE DENVER CO,usaa,checking,usaa_checking_20191217_4.2
3,2019-12-16,-686.92,USAA CREDIT CARD PAYMENT,usaa,checking,usaa_checking_20191216_686.92
4,2019-12-16,-86.67,WAL-MART #0986840 SUMMIT FRISCO CO,usaa,checking,usaa_checking_20191216_86.67
5,2019-12-16,-81.33,SAFEWAY #0322 GOLDEN CO,usaa,checking,usaa_checking_20191216_81.33
6,2019-12-16,-56.70,EXXONMOBIL 48225817 FRISCO CO,usaa,checking,usaa_checking_20191216_56.7
7,2019-12-16,-53.20,KiwiCo Inc. 800-7144828 CA,usaa,checking,usaa_checking_20191216_53.2
8,2019-12-16,-23.65,WILDWOOD VAIL CO,usaa,checking,usaa_checking_20191216_23.65
9,2019-12-16,-14.20,BRECKENRIDGE PASSPORT 414-4316555 CO,usaa,checking,usaa_checking_20191216_14.2


In [10]:
# txn_date_col_name = "txn_date"
# txn_description_col_name = "txn_description"
# txn_amount_col_name = "txn_amount"

class Txn():
    
    # date, amt, description, bank, acct
    def __init__(self, txn):
        
        self.txn = txn
        self.txn_date = txn[txn_date_col_name]
        self.txn_description = txn[txn_description_col_name]
        self.txn_amount = txn[txn_amount_col_name]
        
        self.txn_categorization_config = self.get_txn_categorization_config()
        self.txn_config = self.get_txn_config()
        
    @staticmethod
    def get_txn_config():
        
        txn_config = [
            {"recipient" : "AMZN", "type" : "discretionary", "category" : None, "sub_category" : None, "dynamic_logic": None},
            {"recipient" : "7-ELEVEN", "type" : "variable", "category" : "Auto and Transport", "sub_category" : "Auto - Gas", "dynamic_logic": "{} >= 10".format(txn_amount_col_name)},
            {"recipient" : "CONOCO", "type" : "variable", "category" : "Auto and Transport", "sub_category" : "Auto - Gas", "dynamic_logic": "{} >= 10".format(txn_amount_col_name)},
            {"recipient" : "CONOCO", "type" : "variable", "category" : "Food and Dining", "sub_category" : "Groceries", "dynamic_logic": "{} < 10".format(txn_amount_col_name)}, 
            {"recipient" : "7-ELEVEN", "type" : "variable", "category" : "Food and Dining", "sub_category" : "Groceries", "dynamic_logic": "{} < 10".format(txn_amount_col_name)},
            {"recipient" : "SAFEWAY", "type" : "variable", "category" : "Food and Dining", "sub_category" : "Groceries", "dynamic_logic": None},  
            {"recipient" : "EXXONMOBIL", "type" : "variable", "category" : "Auto and Transport", "sub_category" : "Auto - Gas", "dynamic_logic": "{} >= 10".format(txn_amount_col_name)},
            {"recipient" : "EXXONMOBIL", "type" : "variable", "category" : "Food and Dining", "sub_category" : "Groceries", "dynamic_logic": "{} < 10".format(txn_amount_col_name)},
        ]
        
        txn_config_df = pd.DataFrame(txn_config)
        
        return txn_config_df
    
    @staticmethod
    def get_txn_categorization_config():
        
        return {
            "type" : ["discretionary", "fixed", "intermittent", "variable"],
            "category" : ["Auto and Transport", "Bills and Utilities", "Education", "Entertainment", 
                          "Fees and Charges", "Food and Dining", "Gifts and Donations", "Health and Fitness", 
                          "Hobbies", "Investments", "Personal Care", "Shopping", "Taxes", "Travel", "Uncategorized"],
            "sub_category" : [
                "Alcohol",
                "ATM - Withdrawal",
                "Bank Service Fee",
                "Bars",
                "Ride Share",
                "Auto - Gas"
                "Auto - Insurance",
                "Auto - Loan Payment",
                "Auto - Maintenance",
                "Auto - Miscellaneous",
                "Auto - Parking",
                "Auto - Tolls",
                "Coffee",
                "Credit Card Payment",
                "Drycleaning",
                "Education - Loan Payment",
                "Education - Miscellaneous",
                "Education - Tuition",
                "Flight",
                "Groceries",
                "Gym",
                "Haircut",
                "Investment - Withdrawal",
                "Investment - Deposit",
                "Late Fee",
                "Laundry",
                "Lodging",
                "Medical Care",
                "MTB - Gear",
                "MTB - Miscellaneous",
                "Parking",
                "Restaurant - Solo",
                "Restaurant - Group",
                "Public Transport",
                "Snowboarding - Gear",
                "Snowboarding - Miscellaneous",
                "Snowboarding - Pass",
                "Taxes - Federal",
                "Taxes - State",
                "Taxes - Local",
                "Therapist",
                "Vanlife - Miscellaneous",
                "Vanlife - Storage",
                "Yoga"
            ]
        }
    def categorize_txn(self):
        
        txn = self.txn
        txn_description = txn['txn_description']
        txn_amount = abs(txn['txn_amount'])
        
        txn_config_subset = txn_config_df.copy()
        txn_config_subset["recipient_match"] = txn_config_subset.apply(
            lambda x: True if x["recipient"] in txn_description else False, axis = 1
        )
        txn_config_subset = txn_config_subset[txn_config_subset["recipient_match"] == True]
        txn_config_subset["dynamic_logic_match"] = txn_config_subset.apply(
            lambda x: eval(x["dynamic_logic"]), axis = 1
        )
        txn_config_subset = txn_config_subset[txn_config_subset["dynamic_logic_match"] == True]

        

In [22]:
row = full_txn_df.iloc[6]


In [23]:
row

txn_date                                2019-12-16 00:00:00
txn_amount                                            -56.7
txn_description    EXXONMOBIL    48225817   FRISCO       CO
bank                                                   usaa
acct_type                                          checking
unique_id                       usaa_checking_20191216_56.7
Name: 6, dtype: object

In [27]:
txn_category = txn_config_subset.iloc[0]['category']
txn_sub_category = txn_config_subset.iloc[0]['sub_category']

'Auto - Gas'