<a href="https://colab.research.google.com/github/axjasf/YNAB-Categorizer/blob/bugfixes/budget.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0 - Setup

## Paths

In [65]:
# Path settings
HOME_PATH = "/content/drive/MyDrive/Colab Notebooks/budget/"
CONFIG_PATH = HOME_PATH + "config/"
TRANSACTIONS_PATH = HOME_PATH + "transactions/"
ORDERS_PATH = HOME_PATH + "orders/"

## Loading of Libraries

In [66]:
import json
import pandas as pd
import numpy as np

## Define global Variables

In [67]:
# Define the transactions dataframe and load the JSON configuration for the different banks
bank_transactions = {}

transactions_file = "transactions.csv"

config_files = {
    "Payee Matching" : "payee_matching.json",
    "Split Distribution" : "payee_category_split.csv",
    "Exchange Rates EUR USD": 'eur_usd_exchange_rates.csv',
    "Amazon Item Categories": 'amazon_item_categories.csv'
}

# If we are using Google Drive, prefix each value in the dictionary with the ..._PATH variable
config_files = {key: f"{CONFIG_PATH}{value}" for key, value in config_files.items()}

# 1 - Payees

## 1.1 - Read Transaction files

In [68]:
# Reset variables before reading the file
all_transactions = []

# Define overall transactions dataframe
all_transactions = pd.read_csv(TRANSACTIONS_PATH + transactions_file)

# Convert 'Amount (USD)' to float
all_transactions['Amount (USD)'] = pd.to_numeric(all_transactions['Amount (USD)'], errors='coerce')

# Convert the date columns to consistent datetime format
all_transactions['Date'] = pd.to_datetime(all_transactions['Date'])

print(len(all_transactions))

1610


## 1.2 - Payee mapping and Category assignment

In [69]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

class MerchantMatcher:
    def __init__(self, data_df):
        self.data = data_df
        self.vectorizer = self._train_vectorizer()
        self.payee_vectors = self._compute_payee_vectors()
        self.positive_list_descriptions = self._get_positive_list_descriptions()

    def _match_prefix(self, description, merchant_details):
        prefix_length = merchant_details.get('PrefixLength', 50)
        for category in merchant_details['Categories']:
            for known_description in category['Descriptions']:
                truncated_payee = known_description.lower()[:prefix_length]
                if description.lower().startswith(truncated_payee):
                    return True
        return False

    def _train_vectorizer(self):
        all_descriptions = [desc.lower() for category_list in self.data['Categories'] for category in category_list for desc in category['Descriptions']]
        return TfidfVectorizer().fit(all_descriptions)

    def _compute_payee_vectors(self):
        payee_vectors = {}
        for merchant, details in self.data.iterrows():
            descriptions = [desc.lower() for category in details['Categories'] for desc in category['Descriptions']]
            if not descriptions:
                continue
            tfidf_matrix = self.vectorizer.transform(descriptions)
            avg_vector = np.asarray(tfidf_matrix.mean(axis=0))
            payee_vectors[merchant] = avg_vector
        return payee_vectors

    def _get_positive_list_descriptions(self):
        return set(desc.lower() for category_list in self.data['Categories'] for category in category_list for desc in category['Descriptions'])

    def predict_payees(self, transaction_df):
        mg_values = []
        chkpayee_values = []
        category_types = []
        categories = []
        candidates = []

        for _, row in transaction_df.iterrows():
            description_lower = row['Description'].lower() if row['Description'] else None
            current_merchant = None
            current_chkpayee = None
            current_category_type = None
            current_category = None
            max_match_length = 0  # To keep track of the length of the matching description

            if pd.isna(description_lower) or not description_lower.strip():
                mg_values.append(None)
                chkpayee_values.append(None)
                category_types.append(None)
                categories.append(None)
                continue

            for merchant, details in self.data.iterrows():
                for category in details.get('Categories', []):
                    for desc in category.get('Descriptions', []):
                        if desc.lower() in description_lower and len(desc) > max_match_length:
                            current_merchant = merchant
                            current_chkpayee = 'A'
                            current_category_type = category.get('Category Type')
                            current_category = category.get('Category')
                            max_match_length = len(desc)  # Update the max_match_length
                            break

                    # Check for prefix matching if no match is found yet
                    if not current_merchant and self._match_prefix(description_lower, details):
                        current_merchant = merchant
                        current_chkpayee = 'P'
                        current_category_type = category.get('Category Type')
                        current_category = category.get('Category')
                        break

                if current_merchant:  # Break the outer loop if a match is found
                    break

            if not current_merchant:
                description_vector = self.vectorizer.transform([description_lower])
                similarities = {merchant: linear_kernel(description_vector, np.asarray(vector))[0][0] for merchant, vector in self.payee_vectors.items()}
                predicted_merchant = max(similarities, key=similarities.get)
                max_similarity = similarities[predicted_merchant]

                if max_similarity > self.data.loc[predicted_merchant, 'Threshold']:
                    candidates.append({'Payee': predicted_merchant, 'Description': row['Description'], 'Probability': max_similarity})

            mg_values.append(current_merchant)
            chkpayee_values.append(current_chkpayee or 'C')
            category_types.append(current_category_type)
            categories.append(current_category)

        transaction_df['Payee'] = mg_values
        transaction_df['chkPayee'] = chkpayee_values
        transaction_df['Category Type'] = category_types
        transaction_df['Category'] = categories
        candidates_df = pd.DataFrame(candidates)
        return transaction_df, candidates_df

data_df = pd.read_json(config_files["Payee Matching"], orient="index")

matcher = MerchantMatcher(data_df)
payees_identified_df, payees_candidates_df = matcher.predict_payees(all_transactions)
payees_identified_df = payees_identified_df[payees_identified_df['chkPayee'] != 'C']

file_payees_identified = "1_2_chk_payees_identified.csv"
file_payees_candidates = "1_2_chk_payees_candidates.csv"

if os.path.exists(file_payees_identified): os.remove(file_payees_identified)
if os.path.exists(file_payees_candidates): os.remove(file_payees_candidates)
payees_identified_df.to_csv(file_payees_identified, index=False)
payees_candidates_df.to_csv(file_payees_candidates, index=False)

print(len(all_transactions)) #Achtung: M und S Transaktionen, Splits!

all_transactions.to_csv("1_2_df_all_payee_mapping_done.csv", index=False)

1610


# 2 - Categories

## 2.1 - Split assignment

In [70]:
class SplitProcessor:
    def __init__(self, all_transactions, split_data):
        self.all_transactions = all_transactions
        self.split_data = split_data
        self.split_transactions = []

    def _mark_as_master(self, idx):
        self.all_transactions.at[idx, 'Category Type'] = ''
        self.all_transactions.at[idx, 'Category'] = ''
        self.all_transactions.at[idx, 'SplitID'] = str(self.all_transactions.at[idx, 'Account-ID']) + '-' + 'M'
        self.all_transactions.at[idx, 'chkSplit'] = 'M'
        self.all_transactions.at[idx, 'chkCategory'] = 'A'

    def _add_split_rows(self, row, payee):
        split_details = self.split_data[self.split_data['Payee'] == payee]
        for _, category in split_details.iterrows():
            new_row = row.copy()
            new_row['Category Type'] = category['Category Type']
            new_row['Category'] = category['Category']
            new_row['SplitID'] = str(row['Account-ID']) + '-' + 'S'
            new_row['chkSplit'] = 'S'
            new_row['chkCategory'] = 'A'
            new_row['Amount (USD)'] = row['Amount (USD)'] * category['Percentage']
            self.split_transactions.append(new_row)

    def process_splits(self):
        for idx, row in self.all_transactions.iterrows():
            payee = row['Payee']
            split_details = self.split_data[self.split_data['Payee'] == payee]
            if not split_details.empty:
                self._mark_as_master(idx)  # Mark the original row in all_transactions as Master
                self._add_split_rows(row, payee)
        # Add the split rows to the all_transactions dataframe
        self.all_transactions = pd.concat([self.all_transactions, pd.DataFrame(self.split_transactions)], ignore_index=True)
        return len(self.all_transactions)

    def get_updated_transactions(self):
        return self.all_transactions

split_data = pd.read_csv(config_files['Split Distribution'])
processor = SplitProcessor(all_transactions, split_data)
processor.process_splits()
all_transactions = processor.get_updated_transactions()

all_transactions.to_csv("2_1_df_all_split_done.csv", index=False)

## 2.2 Amazon Order Matching

### 2.2.1 Header to Transaction Matching

In [71]:
amazon_order_headers = pd.read_csv(ORDERS_PATH + "amazon_headers.csv", parse_dates=['Order Date', 'Payment Date'])

In [72]:
class AmazonDataMatcher:
    def __init__(self, headers_df, transactions_df):
        self.headers_df = headers_df
        self.transactions_df = transactions_df
        self.matched_df = None

    def match_records(self):
        # Filter transactions based on Payee criteria
        valid_payees = ["Amazon", "Amazon Grocery", "Amazon Prime Video"]
        filtered_transactions = self.transactions_df[self.transactions_df['Payee'].isin(valid_payees)]

        # Create an empty dataframe to store the matched records
        matched_records = pd.DataFrame()

        # Iterate over each row in the Amazon order headers
        for idx, row in self.headers_df.iterrows():
            # Filter for transactions within a 5-day range
            date_range_mask = (filtered_transactions['Date'] >= row['Payment Date'] - pd.Timedelta(days=5)) & \
                              (filtered_transactions['Date'] <= row['Payment Date'] + pd.Timedelta(days=5))

            # Filter for matching payment amount
            amount_mask = filtered_transactions['Amount (USD)'] == -row['Payment Amount']

            # Get the matching transaction if it exists
            matching_transaction = filtered_transactions[date_range_mask & amount_mask]

            # If a matching transaction is found, concatenate to the matched_records dataframe
            if not matching_transaction.empty:
                matched_row = pd.concat([row, matching_transaction.iloc[0]])
                matched_records = pd.concat([matched_records, matched_row.to_frame().T], ignore_index=True)

        self.matched_df = matched_records

    def save_to_csv(self, output_path):
        if self.matched_df is not None:
            self.matched_df.to_csv(output_path, index=False)
            print(f"Matching complete and saved to {output_path}!")
        else:
            print("No matched data to save.")

    def verify_match(self):
        # Verification on the Amount (USD) level
        matched_amounts_sum = self.matched_df['Payment Amount'].sum()
        transactions_amounts_sum = self.transactions_df['Amount (USD)'].sum()

        # Check if the sums are approximately equal, considering potential rounding errors
        if abs(matched_amounts_sum + transactions_amounts_sum) < 0.05:
            print("Verification passed: Matched amounts are consistent with the transaction amounts.")
        else:
            print("Verification failed: There's a discrepancy in the matched amounts.")

    def update_transactions_with_matches(self, all_transactions):
        # Left merge to get matching 'Order ID' into the all_transactions DataFrame
        merged = all_transactions.merge(self.matched_df[['Account-ID', 'Order ID']],
                                        on='Account-ID',
                                        how='left')

        # Update the 'Category' column with 'Order ID' where matches occurred
        merged['Category'] = merged['Order ID'].combine_first(merged['Category'])

        # Drop the 'Order ID' column as it's no longer needed
        merged.drop('Order ID', axis=1, inplace=True)

        return merged

# Create an instance of the class
matcher = AmazonDataMatcher(amazon_order_headers, all_transactions)

# Match the records
matcher.match_records()

# Verify the match
matcher.verify_match()

# Save the matched data (optional)
matcher.save_to_csv('2_2_1_amazon_order_matched_records.csv')

matched_transactions_with_headers = matcher.matched_df

all_transactions.to_csv("2_2_1_df_all_header_to_txn_matching_done.csv", index=False)

Verification failed: There's a discrepancy in the matched amounts.
Matching complete and saved to 2_2_1_amazon_order_matched_records.csv!


### 2.2.2 Item to Transaction Categorizer

In [73]:
### 2.2.2.1 Item to Transaction mapping

amazon_order_items = pd.read_csv(ORDERS_PATH + "amazon_items.csv", parse_dates=['Order Date'])
keyword_df = pd.read_csv(config_files['Amazon Item Categories'])

# Merge the dataframes
final_df = pd.merge(matched_transactions_with_headers, amazon_order_items[['Order ID', 'Item Description', 'Item Category', 'Price', 'Quantity', 'chkMatch']], on='Order ID', how="left")

final_df = final_df[[
    'Date',
    'Account-ID',
#    'SplitID',
#    'Payee',
#    'Category Type',
#    'Category',
#    'Amount (USD)',
    'Order ID',
    'Payment Date',
    'Payment Amount',
#    'Category Type',
#    'Category',
#    'Description',
    'Item Description',
    'Item Category',
    'chkMatch',
    'Price',
    'Quantity',
    'Total',
    'Shipping',
    'Shipping Refund',
    'Gift',
    'Tax',
    'Refund',
    'Origin',
#    'Currency',
    'chkPayee',
    'chkCategory',
#    'chkEURUSD'
    ]]

final_df.to_csv("2_2_2_1_df_item_to_txn_mapping_done.csv", index=False)

In [74]:
### 2.2.2.2 - Transaction splitting and amount reconciliation by category group and item

class AmazonProcessorDebugStep2V5Revised:
    def __init__(self, final_df, all_transactions):
        self.final_df = final_df.sort_values(by='Order ID')
        self.all_transactions = all_transactions

    def modify_all_transactions_step2(self):
        modified_amazon_transactions = []

        # Process Amazon Transactions by grouping by Account-ID
        for account_id, account_group in self.final_df.groupby(['Account-ID']):
            total_summe = (account_group['Price'] * account_group['Quantity']).sum()
            actual_payment = self.all_transactions[self.all_transactions['Account-ID'] == account_id]['Amount (USD)'].iloc[0]

            # For single record
            if len(account_group) == 1:
                master_row = self.all_transactions[self.all_transactions['Account-ID'] == account_id].iloc[0].copy()
                master_row['Category'] = account_group['Item Category'].iloc[0]
                master_row['chkSplit'] = "U"
                master_row['SplitID'] = master_row['Account-ID'] + "-U"
                master_row['Item Description'] = account_group['Item Description'].iloc[0]  # This line is the adjustment
                modified_amazon_transactions.append(master_row.to_dict())
            else:
                # Master record (only if there are splits)
                master_row = self.all_transactions[self.all_transactions['Account-ID'] == account_id].iloc[0].copy()
                master_row['chkSplit'] = "M"
                master_row['SplitID'] = master_row['Account-ID'] + "-M"
                master_row['Category'] = np.nan  # Removing category for master record with splits
                modified_amazon_transactions.append(master_row.to_dict())

                # Splits for categorized items
                split_counter = 0
                for _, category_group in account_group.groupby('Item Category'):
                    category_summe = (category_group['Price'] * category_group['Quantity']).sum()
                    category_percentage = category_summe / total_summe
                    adjusted_amount = actual_payment * category_percentage

                    # Create a new row for the modified transaction
                    split_row = category_group.iloc[0].copy()
                    split_row['Payee'] = 'Amazon'
                    split_row['SplitID'] = split_row['Account-ID'] + "-S" + str(split_counter)
                    split_row['chkSplit'] = "S"
                    split_row['Category'] = category_group['Item Category'].iloc[0]
                    split_row['Amount (USD)'] = adjusted_amount
                    split_row['Item Description'] = np.nan
                    modified_amazon_transactions.append(split_row.to_dict())

                    split_counter += 1

                # Additional Splits for uncategorized items
                uncategorized_items = account_group[account_group['Item Category'].isna()]
                for _, row in uncategorized_items.iterrows():
                    uncategorized_split = row.copy()
                    uncategorized_split['Payee'] = 'Amazon'
                    uncategorized_split['SplitID'] = uncategorized_split['Account-ID'] + "-S" + str(split_counter)
                    uncategorized_split['chkSplit'] = "S"
                    uncategorized_split['Category'] = np.nan

                    # Adjusting the amount for uncategorized splits
                    uncategorized_summe = uncategorized_split['Price'] * uncategorized_split['Quantity']
                    uncategorized_percentage = uncategorized_summe / total_summe
                    uncategorized_adjusted_amount = actual_payment * uncategorized_percentage
                    uncategorized_split['Amount (USD)'] = uncategorized_adjusted_amount

                    modified_amazon_transactions.append(uncategorized_split.to_dict())

                    split_counter += 1

        # Combine the modified Amazon transactions with the all_transactions dataframe
        modified_amazon_df = pd.DataFrame(modified_amazon_transactions)
        # Remove original Amazon records
        self.all_transactions = self.all_transactions[~self.all_transactions['Account-ID'].isin(modified_amazon_df['Account-ID'])]
        # Combine with modified Amazon transactions
        combined_df = pd.concat([self.all_transactions, modified_amazon_df], ignore_index=True)

        return combined_df


# Apply the method
processor_debug_step2_v5_revised = AmazonProcessorDebugStep2V5Revised(final_df, all_transactions)
combined_transactions_step2_v5_revised = processor_debug_step2_v5_revised.modify_all_transactions_step2()

# Output to CSV
combined_transactions_step2_v5_revised.to_csv("2_2_2_2_txn_splitting_done.csv", index=False)

all_transactions = combined_transactions_step2_v5_revised

  for account_id, account_group in self.final_df.groupby(['Account-ID']):


# 3 - Output

## 3.1 - Dataframe preparation

In [75]:
# Reorder Columns
all_transactions = all_transactions[[
    'Date',
    'Account-ID',
#    'Order ID', #just for testing issue #16
    'SplitID',
    'Payee',
    'Category Type',
    'Category',
    'Amount (USD)',
    'Item Description',
    'Description',
    'chkPayee',
    'chkCategory',
    'chkSplit',
    'chkEURUSD']]

# Sort rows
all_transactions = all_transactions.sort_values(by=['Date', 'Account-ID', 'SplitID'], ascending=[False, True, True])

# Formating
all_transactions['Amount (USD)'] = all_transactions['Amount (USD)'].round(2)
all_transactions['Amount (USD)'] = all_transactions['Amount (USD)'].apply(lambda x: "${:,.2f}".format(x))


## 3.2 - Output file generation

In [76]:
if os.path.exists("z_output.csv"): os.remove("z_output.csv")
all_transactions.to_csv("z_output.csv", index=False)