<a href="https://colab.research.google.com/github/axjasf/YNAB-Categorizer/blob/main/YNAB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Libraries

In [19]:
# Installing libraries
!pip install fuzzywuzzy
!pip install python-Levenshtein



In [20]:
# Importing libraries
import pandas as pd
import numpy as np
import os
from ipywidgets import widgets
from IPython.display import display
import json
import io
from fuzzywuzzy import process


## Files

In [21]:
# Folder names and File names
#
# HOME_PATH already ends with "/", so the sub-folders are appended without a
# leading slash; this avoids "//" inside CONFIG_PATH and TRANSACTIONS_PATH.

HOME_PATH = "/content/drive/MyDrive/Colab Notebooks/YNAB/"
CONFIG_PATH = HOME_PATH + "config/"
TRANSACTIONS_PATH = HOME_PATH + "transactions/"

TRANSACTIONS_FILE = "chase.csv"


In [22]:
# Mounting Google Drive
# HOME_PATH points below /content/drive, so the mount must succeed before any
# of the config or transaction files are read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
### Config Files ###

# Read Grocery keywords: first CSV column, lower-cased for case-insensitive matching
grocery_keywords_df = pd.read_csv(CONFIG_PATH + 'grocery_keywords.csv')
grocery_keywords = [
    keyword
    for keyword in grocery_keywords_df.iloc[:, 0].dropna().astype(str).str.lower()
    # A blank keyword is a substring of every description and would mark
    # every item as grocery, so blank entries are dropped here.
    if keyword.strip()
]

# Checking, if data is valid
# grocery_keywords_df.head()

In [24]:
### Amazon Orders ###

# Load the Amazon order export: headers and items live in separate CSV files
amazon_order_headers_df, amazon_order_items_df = (
    pd.read_csv(TRANSACTIONS_PATH + csv_name)
    for csv_name in ('amazon_order_headers.csv', 'amazon_order_items.csv')
)

# Uncomment to inspect the loaded data
# amazon_order_headers_df.head()
# amazon_order_items_df.head()

In [25]:
### Transactions ###

# Load the bank export named by TRANSACTIONS_FILE from the transactions folder
transactions_df = pd.read_csv(f"{TRANSACTIONS_PATH}{TRANSACTIONS_FILE}")

# Uncomment to inspect the loaded data
# transactions_df.head()


## Global Variables

In [26]:
### CONSTANTS ###

# Major categories: short key -> category string used for the QIF output
category = dict(
    grocery="Food, Sports, Entertainment:Food - Groceries & Errands",
)

# Transactions preparation

## Amazon Payee identification

In [31]:
# interactive


def identify_candidates(transactions_df, config, payee_field="Description"):
    """Find the payees that fuzzy-match the configured patterns.

    All settings are read from ``config`` rather than from module-level
    variables, so the function works for any pattern config passed in.

    Parameters:
    - transactions_df: DataFrame of transactions containing the ``payee_field`` column.
    - config: Dict with the keys ``patterns`` (list of strings) and
      ``fuzzy_threshold`` (minimum score for a match) and, optionally,
      ``positive_list`` / ``negative_list`` (lists of payee names).
    - payee_field: Name of the column that holds the payee text.

    Returns:
    - matches: List of ``(payee, best_score)`` tuples, one per matching payee,
      without the payees on the negative list.
    - widget_matches: ``matches`` without the payees on the positive list
      (these still need user feedback).
    - matched_df: Rows of ``transactions_df`` whose payee is in ``matches``.
    """
    patterns = config["patterns"]
    threshold = config["fuzzy_threshold"]
    positive_list = config.get("positive_list", [])
    negative_list = config.get("negative_list", [])

    def fuzzy_match(transactions_df, patterns, payee_field):
        """Return ``(payee, best_score)`` for every payee matching any pattern."""
        best_scores = {}
        # dropna: missing payees cannot be scored by the fuzzy matcher
        for payee in transactions_df[payee_field].dropna().unique():
            for pattern in patterns:
                score = process.extractOne(pattern, [payee])[1]
                # Keep one entry per payee (its best score) instead of one
                # entry per matching pattern, which produced duplicates.
                if score >= threshold and score > best_scores.get(payee, -1):
                    best_scores[payee] = score
        return list(best_scores.items())

    matches = fuzzy_match(transactions_df, patterns, payee_field)

    # Filter out any payees present in the negative_list from the all_matches
    matches = [match for match in matches if match[0] not in negative_list]

    # For the purpose of the widgets, exclude items from the positive list
    widget_matches = [match for match in matches if match[0] not in positive_list]

    # Assign matched_df to all matches
    matched_names = [match[0] for match in matches]
    matched_df = transactions_df[transactions_df[payee_field].isin(matched_names)]

    return matches, widget_matches, matched_df


def create_feedback_widgets(matches_with_scores, transactions_df):
    """Build a checkbox/dropdown form for reviewing fuzzy-matched payees.

    One row (checkbox + action dropdown) is created for every
    ``(entry, score)`` pair whose score reaches the module-level ``threshold``.
    Clicking "Submit Feedback" records every checked row in the module-level
    ``config`` (positive, negative or pending list), updates the module-level
    ``matched_payees`` and writes ``config`` to ``CONFIG_PATH + filename``.

    Parameters:
    - matches_with_scores: List of ``(payee, score)`` tuples to show.
    - transactions_df: Not used; kept so existing calls keep working.

    Returns:
    - A ``widgets.VBox`` holding all rows followed by the submit button.
    """
    # Dropdown action -> name of the config list that records the payee
    target_lists = {
        "Approve": "positive_list",
        "Reject": "negative_list",
        "Pending": "pending_list",
    }

    def process_feedback(entry, action, matches):
        """Record ``entry`` under ``action`` and return the remaining matches."""
        list_name = target_lists.get(action)
        if list_name is None:
            return matches

        # setdefault: the loaded config may not contain this list yet
        recorded = config.setdefault(list_name, [])
        # Submitting twice must not store the same payee twice
        if entry not in recorded:
            recorded.append(entry)

        # Rejected and pending payees no longer count as matches
        if action != "Approve":
            matches = [match for match in matches if match[0] != entry]
        return matches

    def on_submit(button):
        """Apply the feedback of all checked rows and persist the config."""
        global matched_payees

        for entry, checkbox, action_dropdown in rows:
            # If the checkbox is checked, handle feedback
            if checkbox.value:
                matched_payees = process_feedback(entry, action_dropdown.value, matched_payees)

        with open(CONFIG_PATH + filename, 'w') as file:
            json.dump(config, file, indent=4)

    # Each row keeps its payee next to its widgets, so the payee never has to
    # be parsed back out of the checkbox label.
    rows = []
    widgets_list = []

    for entry, score in matches_with_scores:
        # Only show matches that reach the threshold
        if score < threshold:
            continue

        checkbox = widgets.Checkbox(value=False, description=f"{entry} (Score: {score})")
        action_dropdown = widgets.Dropdown(
            options=['Approve', 'Reject', 'Pending'],
            value='Approve',  # preselected action
            description='Action:'
        )

        rows.append((entry, checkbox, action_dropdown))
        widgets_list.append(widgets.HBox([checkbox, action_dropdown]))

    submit_button = widgets.Button(description="Submit Feedback")
    submit_button.on_click(on_submit)

    return widgets.VBox(widgets_list + [submit_button])

# Load the config
filename = "gas_stations_patterns_config.json"
# filename = 'amazon_patterns_config.json'
# The context manager closes the config file right after it has been read.
with open(CONFIG_PATH + filename) as config_file:
    config = json.load(config_file)
patterns = config["patterns"]
threshold = config["fuzzy_threshold"]
payee_field = "Description"
positive_list = config.get("positive_list",[])
negative_list = config.get("negative_list", [])

# Identify potential outliers and matches
matched_payees, widget_payees, matched_df = identify_candidates(transactions_df, config)
# Display the feedback widgets for all fuzzy matches above the threshold
feedback_widgets = create_feedback_widgets(widget_payees, transactions_df)
display(feedback_widgets)






VBox(children=(Button(description='Submit Feedback', style=ButtonStyle()),))

In [33]:
# Preview the first rows of the transactions whose payee matched the patterns
matched_df.head()

Unnamed: 0,Transaction Date,Post Date,Description,Category,Type,Amount,Memo
17,8/21/2023,8/23/2023,SHELL OIL 57444216105,Gas,Sale,-83.03,
34,8/15/2023,8/16/2023,CHEVRON 0090878,Gas,Sale,-64.01,
61,8/8/2023,8/10/2023,Aral Station 140980178,Gas,Sale,-110.53,
68,8/3/2023,8/6/2023,Aral Station 140486127,Gas,Sale,-82.11,
126,7/10/2023,7/11/2023,CHEVRON 0090878,Gas,Sale,-78.66,


## NEXT / General ideas
General algorithm
*   For all Amazon transactions
  * Run the Amazon Orders program and write the results to a QIF file
*   Go through all other payees
  *   TODO: define the steps for the remaining payees

* Open question: does "Submit Feedback" actually work?


# Amazon Orders

## Pricing

In [None]:
### PRICE TOTALS CALCULATIONS ###
### HEADER ###

# Extract the individual payments for one row and put them into columns
def extract_individual_payments_into_columns(df, column_name):
    """Split the dollar amounts found in a text column into numeric columns.

    Every ``$<amount>`` with a decimal part found in ``df[column_name]`` is
    written to the float columns ``payment_1``, ``payment_2``, ... in order of
    appearance. Thousands separators are accepted (``$1,234.56``). Rows with
    fewer amounts than the longest row are padded with NaN; cells that are
    missing or not text contribute no amounts. If no amount is found at all
    (including an empty frame), ``df`` is left unchanged.

    Parameters:
    - df: DataFrame to extend; it is modified in place.
    - column_name: Name of the text column to scan.

    Returns:
    - None.
    """
    import re  # local import: ``re`` is not part of the notebook's import cell

    # "$" followed by either comma-grouped digits or plain digits, then decimals
    amount_pattern = re.compile(r'\$((?:\d{1,3}(?:,\d{3})+|\d+)\.\d+)')

    def parse_amounts(cell):
        """Return all amounts in one cell as floats ([] for non-text cells)."""
        if not isinstance(cell, str):
            return []
        return [float(found.replace(',', '')) for found in amount_pattern.findall(cell)]

    payments_per_row = [parse_amounts(cell) for cell in df[column_name]]

    # default=0 covers an empty frame, where there is no row to take a max of
    max_payments = max(map(len, payments_per_row), default=0)

    for position in range(max_payments):
        df[f'payment_{position + 1}'] = [
            row_payments[position] if position < len(row_payments) else float('nan')
            for row_payments in payments_per_row
        ]

# Apply the function to a working copy of the loaded headers.
# The data is loaded as amazon_order_headers_df; copying it here defines
# amazon_order_headers, keeps the raw frame untouched and lets this cell be re-run.
amazon_order_headers = amazon_order_headers_df.copy()
extract_individual_payments_into_columns(amazon_order_headers, "payments")

# amazon_order_headers.head()

In [None]:
### PRICE TOTALS CALCULATIONS ###
### ITEMS ###

# Item Totals (Quantity * Price) #
# Start from the loaded items frame (amazon_order_items_df). Working on a copy
# keeps the raw data intact and makes this cell safe to re-run.
amazon_order_items = amazon_order_items_df.copy()
if not pd.api.types.is_numeric_dtype(amazon_order_items['price']):
    # Strip "$" and thousands separators before converting; unparseable values become NaN
    amazon_order_items['price'] = pd.to_numeric(
        amazon_order_items['price'].astype(str).str.replace(r'[\$,]', '', regex=True),
        errors='coerce',
    )
amazon_order_items['quantity'] = amazon_order_items['quantity'].replace([' ', ''], np.nan)
# A missing or unparseable quantity counts as 1
amazon_order_items['quantity'] = pd.to_numeric(amazon_order_items['quantity'], errors='coerce').fillna(1)
amazon_order_items['total'] = amazon_order_items['price'] * amazon_order_items['quantity']

### Distribution of taxes etc. to individual order items ###

# Sum of the item totals per order
items_sum = amazon_order_items.groupby('order id')['total'].sum().reset_index()

# Merge the summed items total with the headers on 'order id'; the suffixes
# rename the two 'total' columns to 'total_header' and 'total_items'
merged_df = amazon_order_headers.merge(items_sum, on='order id', how='left', suffixes=('_header', '_items'))

# Convert 'total_header' to numeric, setting non-numeric values to NaN
# (skipped when the column is already numeric)
if not pd.api.types.is_numeric_dtype(merged_df['total_header']):
    merged_df['total_header'] = pd.to_numeric(
        merged_df['total_header'].astype(str).str.replace(r'[\$,]', '', regex=True),
        errors='coerce',
    )

# Difference between the header total and the summed item totals of each order
merged_df['difference'] = merged_df['total_header'] - merged_df['total_items']

# Number of items per order
items_count = amazon_order_items.groupby('order id').size().reset_index(name='count')
merged_df = merged_df.merge(items_count, on='order id', how='left')

# Spread the difference evenly over the items of the order
merged_df['residual'] = merged_df['difference'] / merged_df['count']
amazon_order_items = amazon_order_items.merge(merged_df[['order id', 'residual']], on='order id', how='left')

# Items without a usable residual (no header row or no parseable header total)
# keep their own total instead of turning into NaN
amazon_order_items['grand_total'] = amazon_order_items['total'] + amazon_order_items['residual'].fillna(0).round(2)

amazon_order_items.head()

## Item categorization

In [None]:
# Initialize the category column with the default value 'n/a' (not categorized)
amazon_order_items['category'] = 'n/a'

# Lower-case the descriptions once instead of once per keyword
descriptions_lower = amazon_order_items['description'].str.lower()

# Check each item description against the keywords
for keyword in grocery_keywords:
    # Skip missing or blank keywords: '' is contained in every description
    if not isinstance(keyword, str) or not keyword.strip():
        continue
    # regex=False: keywords are plain substrings, so characters such as
    # '+' or '(' are matched literally instead of being read as a pattern
    is_match = descriptions_lower.str.contains(keyword, regex=False, na=False)
    amazon_order_items.loc[is_match, 'category'] = 'grocery'

amazon_order_items.head()


# Output

## Transaction creation

In [None]:
### SPLIT TRANSACTION ###

def create_split(category, memo, amount):
    """Bundle one split line as a ``(category, memo, amount)`` tuple."""
    split = category, memo, amount
    return split

def create_transaction(date, total_amount, payee, memo, category=None, splits=None):
    """Return a transaction dict with the keys date, total_amount, payee, memo, category and splits.

    ``category`` and ``splits`` default to None when not given.
    """
    transaction = dict(date=date, total_amount=total_amount, payee=payee, memo=memo)
    transaction["category"] = category
    transaction["splits"] = splits
    return transaction

def _qif_transaction_lines(transaction):
    """Return the QIF lines (without line breaks) of one transaction dict."""
    lines = [
        f"D{transaction['date']}",
        f"T{transaction['total_amount']}",
        f"P{transaction['payee']}",
        f"M{transaction['memo']}",
    ]

    splits = transaction.get('splits')
    if splits:
        # Split transaction: one S/E/$ triple per split
        lines.append("LSplit Transaction")
        for category, memo, amount in splits:
            lines += [f"S{category}", f"E{memo}", f"${amount}"]
    elif transaction.get('category'):
        # Simple transaction with a category
        lines.append(f"L{transaction['category']}")

    # End the transaction
    lines.append("^")
    return lines


def _write_qif_file(filepath, transactions):
    """Write the transactions as one QIF file, creating its folder if needed."""
    lines = ["!Type:Cash"]
    for transaction in transactions:
        lines.extend(_qif_transaction_lines(transaction))

    # The results folder may not exist yet on a fresh Drive
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as file:
        file.write("\n".join(lines) + "\n")


def export_qif_transactions(filename, transactions):
    """
    Export all transactions in QIF format into a single file.

    Parameters:
    - filename: Name of the output file inside the results folder below HOME_PATH.
    - transactions: List of transaction dictionaries with the keys date,
      total_amount, payee and memo and, optionally, category and splits
      (list of (category, memo, amount) tuples).
    """
    _write_qif_file(HOME_PATH + "/results/" + filename, transactions)


def export_qif_transactions_by_chunk(transactions, x, base_filename="transactions"):
    """
    Export the transactions in QIF format, splitting them into separate files,
    each containing up to x transactions.

    Parameters:
    - transactions: List of transaction dictionaries.
    - x: Number of transactions per file.
    - base_filename: Base name for the output files. Files will be named as base_filename_1.qif, base_filename_2.qif, ...

    Returns:
    - List of saved file paths.

    Raises:
    - ValueError: If x is not between 1 and 100.
    """

    # Validate x
    if not (1 <= x <= 100):
        raise ValueError("x should be between 1 and 100")

    saved_files = []

    for idx, start in enumerate(range(0, len(transactions), x)):
        # Define filename for this chunk
        filepath = HOME_PATH + "/results/" + f"{base_filename}_{idx+1}.qif"
        _write_qif_file(filepath, transactions[start:start + x])
        saved_files.append(filepath)

    return saved_files


## QIF File creation

In [None]:
# Build one QIF transaction per Amazon order and export them in chunks

from datetime import datetime  # NOTE(review): not used in this cell - confirm nothing else needs it before removing

# Largest chunk size accepted by export_qif_transactions_by_chunk
QIF_TRANSACTIONS_PER_FILE = 100


def _memo_from_description(description):
    """Return the item description as memo text; a missing description becomes ''."""
    return "" if pd.isna(description) else str(description)


# Generate the QIF transactions
# NOTE(review): amounts are exported with the sign they have in the order data - confirm the importing tool treats them as outflows

transactions = []

# Iterate over each unique order
for order_id, group in amazon_order_items.groupby('order id'):
    order_date = group['order date'].iloc[0]
    total_amount = f"{group['grand_total'].sum():.2f}"
    payee = f"Amazon {order_id}"
    is_grocery = group['category'] == 'grocery'

    if len(group) == 1:
        # Single item: simple transaction, categorized only when it is a grocery item
        if is_grocery.iloc[0]:
            transaction = create_transaction(order_date, total_amount, payee, "", category=category["grocery"])
        else:
            transaction = create_transaction(
                order_date, total_amount, payee,
                memo=_memo_from_description(group['description'].iloc[0]),
            )
    else:
        # Multiple items: one uncategorized split per non-grocery item ...
        non_grocery = group[~is_grocery]
        splits = [
            create_split("", _memo_from_description(description), f"{amount:.2f}")
            for description, amount in zip(non_grocery['description'], non_grocery['grand_total'])
        ]

        # ... plus a single split that sums up all grocery items, if there are any
        grocery_total = group.loc[is_grocery, 'grand_total'].sum()
        if grocery_total != 0:
            splits.append(create_split(category["grocery"], "", f"{grocery_total:.2f}"))

        transaction = create_transaction(order_date, total_amount, payee, "", splits=splits)

    transactions.append(transaction)

# Export the transactions to a QIF file by chunks
export_qif_transactions_by_chunk(transactions, QIF_TRANSACTIONS_PER_FILE)
