<a href="https://colab.research.google.com/github/axjasf/YNAB-Categorizer/blob/main/budget.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# About

* This project is meant to bring all my personal finance related transactions into one easy to understand view.
* Scope / Value descriptoon
    * ...
* Mechanism
    * It reads CSV files from several US and German banks and Credit Card processors and harmonizes them into one dataframe.
    * It maps fields such as descriptions into payees
        * Lookup mechanism (direct hit and prefix hit) against a payee config JSON file
        * Matching against similarity vectors per payee to identify candidates (manual adjustment of payee JSON afterwards)
    * It categorizes each transaction or splits it into several categories
        * by payee
        * by pre-determination of a percentage split (e.g. for Walgreens that should be sufficient, given that I have categorized transactions since 2014)
        * by semi-automatic order-item review split (e.g. for Apple or Amazon transactions where these files exist and where a split between utility and subscription or grocery, household products or general shopping is of interest)
    * It works with a set of indicator field to mark aspects of interest
        * Indicator for transactions in which automatic determinations have been taken place
        * Task field to address open tasks
        * ...

# Setup

## Paths

In [None]:
# Path settings
HOME_PATH = "/content/drive/MyDrive/Colab Notebooks/budget/"
CONFIG_PATH = HOME_PATH + "config/"
TRANSACTIONS_PATH = HOME_PATH + "transactions/"
ORDERS_PATH = HOME_PATH + "orders/"

## Loading of Libraries

In [None]:
import json
import pandas as pd
import numpy as np

## Define global Variables

In [None]:
# Define the transactions dataframe and load the JSON configuration for the different banks
bank_transactions = {}

bank_files = {
        "Chase": "chase.csv",
        "Wells Fargo Checking": "wellsfargo_checking.csv",
        "Apple": "apple.csv",
        "Commerzbank": "commerzbank.csv"
    }

config_files = {
    "Payee Matching" : "payee_matching.json",
    "Exchange Rates EUR USD": 'eur_usd_exchange_rates.csv',
    "Amazon Item Categories": 'amazon_item_categories.csv'
}

# If we are using Google Drive, prefix each value in the dictionary with the ..._PATH variable
bank_files = {key: f"{TRANSACTIONS_PATH}{value}" for key, value in bank_files.items()}
config_files = {key: f"{CONFIG_PATH}{value}" for key, value in config_files.items()}

# Define overall transactions dataframe
all_transactions = []

# File Conversion

* For each bank file:
    * Load file into individual df
    * Basic quality control on the individual df level
    * Transform columns into target columns
        * Add Bank ID field as well as numberical ID field
    * Add individual df to transactions df

* Special transformations for non-US banks:
    * Date conversion
    * EUR to USD conversion based on an existing file (date and exchange rate or an API call to a free service)

In [None]:
def quality_control(df):
    missing_values = df.isnull().sum()
    column_data_types = df.dtypes

    return missing_values, column_data_types

In [None]:
def adjust_field_names(df, bank=""):

    if 'Category' in df.columns:
        df = df.rename(columns={"Category" : "oldCategory"})

    df.insert(4, 'SplitID',"")
    df.insert(0, 'Date','')
    df.insert(1, 'Payee','')
    df.insert(2, 'Category Type','')
    df.insert(3, 'Category','')
    df.insert(4, 'chkPayee','')
    df.insert(5, 'chkCategory','')
    df.insert(6, 'chkSplit','')
    df.insert(7, 'chkEURUSD','')

#    if bank == "Commerzbank":
#        df.insert("Amount (USD)")
#        df = df.rename(columns={"Booking text" : "Description"})

    return df

## Wells Fargo

### Wells Fargo Checking

In [None]:
# Read the Wells Fargo Checking CSV
bank = 'Wells Fargo Checking'
bank_transactions[bank] = pd.read_csv(bank_files[bank], header=None, names=["Transaction Date", "Amount (USD)", "Status", "Memo", "Description"])

# Adjust field names (if any specific adjustments are required)

# Convert 'Transaction Date' column to datetime
bank_transactions[bank]['Date'] = pd.to_datetime(bank_transactions[bank]['Transaction Date'], errors='coerce')

# Check for problematic dates (rows where the date conversion failed)
problematic_dates = bank_transactions[bank][bank_transactions[bank]['Date'].isna()]

# Perform quality control checks
missing_values, column_data_types = quality_control(bank_transactions[bank])

# Final touches for Wells Fargo only
bank_transactions[bank] = bank_transactions[bank].drop(columns=['Transaction Date', 'Status', 'Memo'])  # Assuming 'Status' is not needed, adjust as necessary
bank_transactions[bank]['Amount (USD)'] *= -1

## Chase

In [None]:
bank = 'Chase'
bank_transactions[bank] = pd.read_csv(bank_files[bank])

bank_transactions[bank] = adjust_field_names(bank_transactions[bank])

bank_transactions[bank]['Date'] = pd.to_datetime(bank_transactions[bank]['Transaction Date'], errors='coerce')
problematic_dates = bank_transactions[bank][bank_transactions[bank]['Date'].isna()]
missing_values, column_data_types = quality_control(bank_transactions[bank])

bank_transactions[bank] = bank_transactions[bank].drop(columns=['Post Date', 'oldCategory', 'Type', 'Memo', 'Transaction Date'])
bank_transactions[bank] = bank_transactions[bank].rename(columns={"Amount" : "Amount (USD)"})


## Apple

In [None]:
bank = 'Apple'
bank_transactions[bank] = pd.read_csv(bank_files[bank])

bank_transactions[bank] = adjust_field_names(bank_transactions[bank], bank)

bank_transactions[bank]['Date'] = pd.to_datetime(bank_transactions[bank]['Transaction Date'], errors='coerce')
problematic_dates = bank_transactions[bank][bank_transactions[bank]['Date'].isna()]
missing_values, column_data_types = quality_control(bank_transactions[bank])

# Final touches for Apple Card only
bank_transactions[bank] = bank_transactions[bank].drop(columns=['Transaction Date', 'Clearing Date', 'Merchant', 'oldCategory', 'Type', 'Purchased By'])
bank_transactions[bank]['Amount (USD)'] *= -1

#bank_transactions[bank]

## Commerzbank

In [None]:
bank = 'Commerzbank'
bank_transactions[bank] = pd.read_csv(bank_files[bank])

bank_transactions[bank] = adjust_field_names(bank_transactions[bank])


bank_transactions[bank]['Date'] = pd.to_datetime(bank_transactions[bank]['Transaction date'], errors='coerce', format='%d.%m.%Y') # For Commerzbank, Day.Month.Year
problematic_dates = bank_transactions[bank][bank_transactions[bank]['Date'].isna()]
missing_values, column_data_types = quality_control(bank_transactions[bank])

bank_transactions[bank] = bank_transactions[bank][bank_transactions[bank]['Amount'] != 0]


bank_transactions[bank].insert(7, "Amount (USD)","")

bank_transactions[bank] = bank_transactions[bank].rename(columns={"Booking text" : "Description"})

### EUR to USD conversion
# https://www.wsj.com/market-data/quotes/fx/EURUSD/historical-prices

exchange_rates_data = pd.read_csv(config_files['Exchange Rates EUR USD'])

# Convert the date columns to consistent datetime format
exchange_rates_data['Date'] = pd.to_datetime(exchange_rates_data['Date'], format='%m/%d/%Y')

# Merge on the date columns to add the exchange rate to bank_transactions[bank]
bank_transactions[bank] = bank_transactions[bank].merge(exchange_rates_data[['Date', ' Close']], on='Date', how='left')

# Add the chkEURUSD column based on the ' Close' column value
bank_transactions[bank]['chkEURUSD'] = np.where(bank_transactions[bank][' Close'].isna(), 'E', 'A')

# Convert the Amount from EUR to USD
bank_transactions[bank]['Amount (USD)'] = bank_transactions[bank]['Amount'] * bank_transactions[bank][' Close']

# Drop the ' Close' column as it's not needed anymore in bank_transactions[bank]
bank_transactions[bank].drop(' Close', axis=1, inplace=True)

### End of currency conversion

#bank_transactions[bank].drop(bank_transactions[bank].columns[[16, 15, 14, 13, 12, 10, 9, 8]], axis=1, inplace=True)
bank_transactions[bank] = bank_transactions[bank].drop(columns=['Transaction date', 'Value date', 'Transaction type', 'Amount', 'Account of initiator', 'Bank code of account of initiator', 'IBAN of account of initiator'])


In [None]:
all_transactions = pd.concat(bank_transactions, keys=bank_transactions.keys())
all_transactions['Account-ID'] = all_transactions.index.get_level_values(0) + "-" + all_transactions.index.get_level_values(1).astype(str)

# Payees

## Payee Harmonization

In [150]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

class MerchantMatcher:
    def __init__(self, data_df):
        self.data = data_df
        self.vectorizer = self._train_vectorizer()
        self.payee_vectors = self._compute_payee_vectors()
        self.positive_list_descriptions = self._get_positive_list_descriptions()

    def _match_prefix(self, description, merchant_details):
        prefix_length = merchant_details.get('Prefix Length', 50)
        for known_description in merchant_details['Positive List']:
            truncated_payee = known_description.lower()[:prefix_length]
            if description.lower().startswith(truncated_payee):
                return True
        return False


    def _train_vectorizer(self):
        all_descriptions = [desc.lower() for descriptions in self.data['Positive List'] for desc in descriptions]
        return TfidfVectorizer().fit(all_descriptions)

    def _compute_payee_vectors(self):
        payee_vectors = {}
        for merchant, details in self.data.iterrows():
            tfidf_matrix = self.vectorizer.transform([desc.lower() for desc in details['Positive List']])
            avg_vector = np.asarray(tfidf_matrix.mean(axis=0))
            payee_vectors[merchant] = avg_vector
        return payee_vectors

    def _get_positive_list_descriptions(self):
        return set(desc.lower() for descriptions in self.data['Positive List'] for desc in descriptions)

    def predict_payees(self, transaction_df):
        mg_values = []
        chkpayee_values = []
        candidates = []

        for _, row in transaction_df.iterrows():
            description_lower = row['Description'].lower() if row['Description'] else None
            current_merchant = None
            current_chkpayee = None

            if pd.isna(description_lower) or not description_lower.strip():
                mg_values.append(None)
                chkpayee_values.append(None)
                continue

            for merchant, details in self.data.iterrows():
                if description_lower in [desc.lower() for desc in details['Positive List']]:
                    current_merchant = merchant
                    current_chkpayee = 'A'
                    break

                # Check for prefix matching
                if self._match_prefix(description_lower, details):
                    current_merchant = merchant
                    current_chkpayee = 'P'
                    break

            if not current_merchant:
                description_vector = self.vectorizer.transform([description_lower])
                similarities = {merchant: linear_kernel(description_vector, np.asarray(vector))[0][0] for merchant, vector in self.payee_vectors.items()}
                predicted_merchant = max(similarities, key=similarities.get)
                max_similarity = similarities[predicted_merchant]

                if max_similarity > self.data.loc[predicted_merchant, 'Threshold']:
                    candidates.append({'Payee': predicted_merchant, 'Description': row['Description'], 'Probability': max_similarity})

            mg_values.append(current_merchant)
            chkpayee_values.append(current_chkpayee or 'C')

        transaction_df['Payee'] = mg_values
        transaction_df['chkPayee'] = chkpayee_values
        candidates_df = pd.DataFrame(candidates)
        return transaction_df, candidates_df



data_df = pd.read_json(config_files["Payee Matching"], orient="index")

matcher = MerchantMatcher(data_df)
payees_identified_df, payees_candidates_df = matcher.predict_payees(all_transactions)
payees_identified_df = payees_identified_df[payees_identified_df['chkPayee'] != 'C']

file_payees_identified = "z_payees_identified.csv"
file_payees_candidates = "z_payees_candidates.csv"

if os.path.exists(file_payees_identified): os.remove(file_payees_identified)
if os.path.exists(file_payees_candidates): os.remove(file_payees_candidates)
payees_identified_df.to_csv(file_payees_identified, index=False)
payees_candidates_df.to_csv(file_payees_candidates, index=False)



# Categories

* Direct assignment Transactions <--> Payee mapping (1:1)
* Transactions <--> Amazon Orders mapping and splitting
* Transactions <--> Apple Orders mapping and splitting
* Transactions <--> Walgreens splitting

## Direct assignment

In [151]:
# Transactions <--> Payee mapping (1:1)

with open(config_files['Payee Matching'], 'r') as file:
    payee_data = json.load(file)

# List to hold split transactions
split_transactions = []

# Iterate over each row in the dataframe
for idx, row in all_transactions.iterrows():
    payee = row['Payee']

    # Check if payee exists in the JSON data
    if payee in payee_data:
        categories = payee_data[payee]['Categories']

        # If no category exists, update the row's category columns
        if len(categories) == 0:
            all_transactions.at[idx, 'chkCategory'] = 'E'

        # If only one category exists, update the row's category columns
        if len(categories) == 1:
            all_transactions.at[idx, 'Category Type'] = categories[0]['Category Type']
            all_transactions.at[idx, 'Category'] = categories[0]['Category']
            all_transactions.at[idx, 'chkCategory'] = 'A'

        # If multiple categories exist, create split transactions
        elif len(categories) > 1:
            all_transactions.at[idx, 'Category Type'] = ''  # Empty the master row's category columns
            all_transactions.at[idx, 'Category'] = ''
            all_transactions.at[idx, 'SplitID'] = str(row['Account-ID']) + '-' + 'M'
            all_transactions.at[idx, 'chkCategory'] = 'A'

            for idx_split, category in enumerate(categories, start=1):
                new_row = row.copy()
                new_row['Category Type'] = category['Category Type']
                new_row['Category'] = category['Category']
                new_row['SplitID'] = str(row['Account-ID']) + '-' + 'S' + str(idx_split-1)
                new_row['chkCategory'] = 'A'

                # Update the 'Amount (USD)' based on the percentage split from the JSON
                new_row['Amount (USD)'] = row['Amount (USD)'] * category.get('Percentage', 1)

                split_transactions.append(new_row)

# Append the split transactions to the main dataframe
all_transactions = pd.concat([all_transactions, pd.DataFrame(split_transactions)], ignore_index=False)

## Amazon categorization

1. **Identification of Amazon Transactions**:
* Filter transactions with the Payee set to "Amazon" or "Amazon Grocery".
* From this subset, take those transactions that don't already have a chkCategory flag.
   
2. **Match the Transactions to Orders**:
   - For each identified Amazon transaction, we need to match it with an order from the Amazon order file. This matching will be based on the transaction date (with a tolerance of a few days) and the payment amount.
   
3. **Extract Items for the Matched Orders**:
   - Once we have identified the matching order, we will then look up the items related to that order from the Amazon order items file.
   
4. **Categorize the Items**:
   - We will categorize the items into two groups:
     - Groceries (Split 1)
     - All other line items (Split 2...n)
   
5. **Modify the Transactions**:
   - We will then modify the transactions to reflect these splits, updating the description for each split with the appropriate line item description.

Step 1: Split and Identify Payments
1.1. Split the payments up from the Amazon header data.
1.2. Identify and match the transactions associated with these individual payments.
1.3. Retire the identified transactions and replace them with a new, merged transaction that sums up these payments. This merged transaction will have the chkmerged marker and the associated order ID.

Step 2: Categorize Items
2.1. For each item in an order, identify its category as we've done before.

Step 3: Compute Total for Each Order
3.1. For each item, calculate the sum of the product of quantity and price. If quantity is NaN, assume it to be 1.
3.2. Add taxes and shipping, and subtract the gift card amount. The result is the computed total for the order.
3.3. Compare this computed total with the calculated overall payment for the order from the merged transaction. These two values should match.

Step 4: Create Splits for Each Item
4.1. Make the merged transaction the master transaction.
4.2. Associate each item as a split transaction, where the split amount is the item price plus a portion of taxes, shipping, etc.
4.3. The splits should add up to match the master transaction amount.

Step 5: Roll-Up Splits by Category
5.1. Group the splits by their category for each order.
5.2. Sum up the amounts within these groups.
5.3. Retain only one split per category, with the summed amount.
5.4. These rolled-up splits should still add up to match the master transaction amount.

Going forward plan

Thank you for the comprehensive code. Let's go through it step by step and re-engineer based on our new plan:

1. AmazonProcessor (Cell 1)
This class is responsible for processing the Amazon order headers, particularly to split multiple payments.

**Usefulness**: This class is essential. We need to split multiple payments to associate them with individual transactions.

2. AmazonTransactionMatcher (Cell 2)
This class matches the processed Amazon payments (from the AmazonProcessor) with the actual bank transactions.

**Usefulness**: This class is crucial. After processing the Amazon headers, we need to match them to actual transactions.

3. CategoryIdentification (Cell 3)
This class categorizes each item based on a keyword list.

**Usefulness**: Still essential. We want to identify each item's category.

4. AmazonProcessor (Cell 4)
This class processes the merged DataFrame (`final_df`) to handle grocery and non-grocery splits and integrate them into the `all_transactions`.

**Usefulness**: This needs restructuring. The logic here is to be modified as per our new approach.

---

Proposed Structure:
1. **AmazonPaymentProcessor**: Refactor `AmazonProcessor` from Cell 1. This will process the Amazon headers and split multiple payments.
   
2. **TransactionMatcher**: Refactor `AmazonTransactionMatcher` from Cell 2. This will match Amazon payments to actual transactions. After matching, create a single merged transaction for each order, marking it with `chkmerged`.

3. **ItemCategorizer**: Refactor `CategoryIdentification` from Cell 3. This class will assign categories to items.

4. **TransactionUpdater**: Create a new class (replacing `AmazonProcessor` from Cell 4). This class will:
    - Calculate the total for each order.
    - Check the totals against the merged transaction values.
    - Create splits for each item.
    - Roll up splits by category.

---

Steps for Implementation:

1. Rename and restructure `AmazonProcessor` from Cell 1 to `AmazonPaymentProcessor`.
   
2. Rename and restructure `AmazonTransactionMatcher` from Cell 2 to `TransactionMatcher`. Modify it to create the merged transactions.

3. Rename and slightly modify `CategoryIdentification` from Cell 3 to `ItemCategorizer`.

4. Replace `AmazonProcessor` from Cell 4 with a new class named `TransactionUpdater`. Modify the logic as per our new approach.

Would you like to proceed with this structure?

### Amazon Payment Processor

In [152]:
amazon_order_headers = pd.read_csv(ORDERS_PATH + "amazon_headers_combined.csv", parse_dates=['Order Date', 'Payment Date'])

### Amazon Transaction Matcher

In [153]:
class AmazonDataMatcher:
    def __init__(self, headers_df, transactions_df):
        self.headers_df = headers_df
        self.transactions_df = transactions_df
        self.matched_df = None

    def match_records(self):
        # Filter transactions based on Payee criteria
        valid_payees = ["Amazon", "Amazon Grocery", "Amazon Prime"]
        filtered_transactions = self.transactions_df[self.transactions_df['Payee'].isin(valid_payees)]

        # Sort DataFrames for merge_asof
        self.headers_df = self.headers_df.sort_values('Payment Date')
        filtered_transactions = filtered_transactions.sort_values('Date')

        # Basic merge on the closest date. This doesn't consider the +/- 2 days condition yet.
        merged_df = pd.merge_asof(self.headers_df, filtered_transactions,
                                left_on='Payment Date',
                                right_on='Date',
                                direction='nearest')

        # Filter out rows where the date difference is more than 2 days.
        merged_df = merged_df[abs((merged_df['Payment Date'] - merged_df['Date']).dt.days) <= 5]

        # Convert 'Amount (USD)' to float
        merged_df['Amount (USD)'] = pd.to_numeric(merged_df['Amount (USD)'], errors='coerce')

        # Handle the sign difference between 'Payment Amount' and 'Amount (USD)'
        merged_df = merged_df[(merged_df['Payment Amount'] == -merged_df['Amount (USD)']) |
                            (merged_df['Payment Amount'] == merged_df['Amount (USD)'])]

        self.matched_df = merged_df

    def save_to_csv(self, output_path):
        if self.matched_df is not None:
            self.matched_df.to_csv(output_path, index=False)
            print(f"Matching complete and saved to {output_path}!")
        else:
            print("No matched data to save.")

    def verify_match(self):
        # Verification on the Amount (USD) level
        matched_amounts_sum = self.matched_df['Payment Amount'].sum()
        transactions_amounts_sum = self.transactions_df['Amount (USD)'].sum()

        # Check if the sums are approximately equal, considering potential rounding errors
        if abs(matched_amounts_sum + transactions_amounts_sum) < 0.05:
            print("Verification passed: Matched amounts are consistent with the transaction amounts.")
        else:
            print("Verification failed: There's a discrepancy in the matched amounts.")

    def update_transactions_with_matches(self, all_transactions):
        # Left merge to get matching 'Order ID' into the all_transactions DataFrame
        merged = all_transactions.merge(self.matched_df[['Account-ID', 'Order ID']],
                                        on='Account-ID',
                                        how='left')

        # Update the 'Category' column with 'Order ID' where matches occurred
        merged['Category'] = merged['Order ID'].combine_first(merged['Category'])

        # Drop the 'Order ID' column as it's no longer needed
        merged.drop('Order ID', axis=1, inplace=True)

        return merged

In [154]:
# Create an instance of the class
matcher = AmazonDataMatcher(amazon_order_headers, all_transactions)


# Match the records
matcher.match_records()

# Verify the match
matcher.verify_match()

# Save the matched data (optional)
matcher.save_to_csv('matched_records.csv')

#all_transactions = matcher.update_transactions_with_matches(all_transactions)
matched_transactions_with_headers = matcher.matched_df


Verification failed: There's a discrepancy in the matched amounts.
Matching complete and saved to matched_records.csv!


### Amazon Categorizer

In [155]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class CategoryIdentification:
    def __init__(self, order_items_df, config_df, similarity_threshold=0.3):
        self.order_items = order_items_df
        self.config = config_df
        self.similarity_threshold = similarity_threshold

    def identify_category(self, item_description):
        # Check if item_description is a string
        if not isinstance(item_description, str):
            return None, None

        # Direct match with config
        matched_category = self.config[self.config['Item Desciption'].str.lower() == item_description.lower()]['Item Category'].values
        if matched_category.size > 0:
            return matched_category[0], "A"  # A for Automatic/Exact

        # Compute vector for item description
        vectorizer = TfidfVectorizer().fit(self.config['Item Keyword'].tolist() + [item_description])
        item_vector = vectorizer.transform([item_description])

        # Compute vectors for all keywords in config
        keyword_vectors = vectorizer.transform(self.config['Item Keyword'])

        # Compute cosine similarities
        similarities = cosine_similarity(item_vector, keyword_vectors)

        # Identify best match above threshold
        best_match_index = similarities.argmax()
        if similarities[0, best_match_index] > self.similarity_threshold:
            return self.config.iloc[best_match_index]['Item Category'], "S"  # S for Similar

        return None, None

    def categorize_order_items(self):
        self.order_items['Item Category'], self.order_items['chkMatch'] = zip(*self.order_items['Description'].map(self.identify_category))
        return self.order_items

    def get_uncategorized_items(self):
        return self.order_items[self.order_items['Item Category'].isnull()]



amazon_order_items = pd.read_csv(ORDERS_PATH + "amazon_items_combined.csv", parse_dates=['Order Date'])
order_items_df = amazon_order_items # refactor this later
keyword_df = pd.read_csv(config_files['Amazon Item Categories'])
category_identifier = CategoryIdentification(order_items_df, keyword_df)

categorized_order_items = category_identifier.categorize_order_items()

categorized_order_items = categorized_order_items.rename(columns={'Description': 'Item Description'})
categorized_order_items = categorized_order_items.rename(columns={'category': 'Item Category'})

# Rename the description column in matched_transactions to avoid conflict
#matched_transactions = matched_transactions.rename(columns={'description': 'Transaction Description'})

# Merge the dataframes
final_df = pd.merge(matched_transactions_with_headers, categorized_order_items[['Order ID', 'Item Description', 'Item Category', 'Price', 'Quantity', 'chkMatch']], on='Order ID', how="left")

print(categorized_order_items.columns)
#final_df = pd.merge(matched_transactions, categorized_order_items, on="order id", how="left")
#final_df = pd.merge(matched_transactions, categorized_order_items[['order id', 'description', 'category']], on="order id", how="left")

uncategorized_items = category_identifier.get_uncategorized_items()

final_df = final_df[[
    'Date',
    'Account-ID',
    'SplitID',
    'Payee',
    'Category Type',
    'Category',
    'Amount (USD)',
    'Payment Date',
    'Payment Amount',
    'Category Type',
    'Category',
    'Description',
    'Item Description',
    'Item Category',
    'chkMatch',
    'Price',
    'Quantity',
    'Total',
    'Shipping',
    'Shipping Refund',
    'Gift',
    'Tax',
    'Refund',
    'Origin',
    'Order ID',
    'Currency',
    'chkPayee',
    'chkCategory',
    'chkEURUSD']]


if os.path.exists("final_df.csv"): os.remove("final_df.csv")
final_df.to_csv("final_df.csv", index=False)


Index(['Origin', 'ID', 'Order ID', 'Order Date', 'Quantity',
       'Item Description', 'Price', 'chkQuantity', 'Item Category',
       'chkMatch'],
      dtype='object')


In [158]:
class AmazonProcessorDebugStep2V4:
    def __init__(self, final_df, all_transactions):
        self.final_df = final_df.sort_values(by='Order ID')
        self.all_transactions = all_transactions

    def modify_all_transactions_step2(self):
        modified_amazon_transactions = []

        # Process Amazon Transactions by grouping by Account-ID and Item Category
        for (account_id, item_category), group in self.final_df.groupby(['Account-ID', 'Item Category']):
            main_transaction = group.iloc[0]  # Use the first row for the main transaction details

            # If Item Category exists, aggregate by category
            if pd.notna(item_category):
                total_amount = (group['Price'] * group['Quantity']).sum()
                main_row = main_transaction.copy()
                main_row['Payee'] = np.nan
                main_row['chkSplit'] = "SPLIT"  # Set chkSplit to "SPLIT"
                main_row['Amount (USD)'] = np.nan  # Set Amount (USD) to NaN for splits
                main_row['Item Description'] = np.nan  # Set Amount (USD) to NaN for splits
                modified_amazon_transactions.append(main_row.to_dict())

        # Set chkSplit to "MSTR" for master transactions in all_transactions
        amazon_account_ids = self.final_df['Account-ID'].unique()
        self.all_transactions.loc[self.all_transactions['Account-ID'].isin(amazon_account_ids), 'chkSplit'] = "MSTR"

        # Combine the modified Amazon transactions with the all_transactions dataframe
        modified_amazon_df = pd.DataFrame(modified_amazon_transactions)
        combined_df = pd.concat([self.all_transactions, modified_amazon_df], ignore_index=True, sort=False)

        return combined_df

# Debug Step 2
processor_debug_step2_v4 = AmazonProcessorDebugStep2V4(final_df, all_transactions)
combined_transactions_step2_v4 = processor_debug_step2_v4.modify_all_transactions_step2()

combined_transactions_step2_v4 = combined_transactions_step2_v4[[
    'Date',
    'Account-ID',
    'SplitID',
    'chkSplit',
    'Payee',
    'Category Type',
    'Category',
    'Amount (USD)',
    'Payment Date',
    'Payment Amount',
    'Category Type',
    'Category',
    'Item Description',
    'Item Category',
    'Description',
    'chkMatch',
    'Price',
    'Quantity',
    'Total',
    'Shipping',
    'Shipping Refund',
    'Gift',
    'Tax',
    'Refund',
    'Origin',
    'Order ID',
    'Currency',
    'chkPayee',
    'chkCategory',
    'chkEURUSD']]

# Output
if os.path.exists("all_df.csv"): os.remove("all_df.csv")
combined_transactions_step2_v4.to_csv("all_df.csv", index=False)

# Return a message indicating the completion
"Updated transactions have been saved to all_df.csv"


'Updated transactions have been saved to all_df.csv'

In [157]:
### later or not
class AmazonProcessorStep123:
    def __init__(self, final_df, all_transactions):
        self.final_df = final_df
        self.all_transactions = all_transactions

    def process_transactions_step123(self):
        # Step 1: Calculate SUMME for each Account-ID and Item Category
        self.final_df['line_total'] = self.final_df['Price'] * self.final_df['Quantity']
        category_summe = self.final_df.groupby(['Account-ID', 'Item Category'])['line_total'].sum()

        # Step 2: Calculate Overall SUMME for each Order
        order_summe = self.final_df.groupby('Account-ID')['line_total'].sum()

        # Step 3: Retrieve Paid Amount
        paid_amounts = self.all_transactions.set_index('Account-ID')['Amount (USD)']

        # Joining the results for verification
        results = pd.concat([order_summe, category_summe, paid_amounts], axis=1)
        results.columns = ['Order SUMME', 'Category SUMME', 'Paid Amount']

        return results

# Debug Steps 1-3
processor_123 = AmazonProcessorStep123(final_df, all_transactions)
results_123 = processor_123.process_transactions_step123()
print(results_123)


  results = pd.concat([order_summe, category_summe, paid_amounts], axis=1)


ValueError: ignored

In [None]:
### backup before summe
class AmazonProcessorRevisedV2:
    def __init__(self, final_df, all_transactions):
        self.final_df = final_df.sort_values(by='Order ID')
        self.all_transactions = all_transactions

    def modify_all_transactions(self):
        # Step 1: Mark Amazon-related transactions in all_transactions with chkSplit as "MSTR"
        amazon_account_ids = self.final_df['Account-ID'].unique()
        self.all_transactions.loc[self.all_transactions['Account-ID'].isin(amazon_account_ids), 'chkSplit'] = "MSTR"

        modified_amazon_transactions = []

        # Step 2: Process Amazon Transactions by grouping by Account-ID and Item Category
        for (account_id, item_category), group in self.final_df.groupby(['Account-ID', 'Item Category']):
            main_transaction = group.iloc[0]  # Use the first row for the main transaction details

            print(account_id + " " + item_category)
            # If Item Category exists, aggregate by category
            if pd.notna(item_category):
                total_amount = (group['Price'] * group['Quantity']).sum()
                main_row = main_transaction.copy()
                main_row['Payee'] = 'Split Transaction'
#                main_row['Description'] = f'Amazon {item_category}'
#                main_row['Amount (USD)'] = total_amount
                main_row['chkSplit'] = "SPLIT"  # Set chkSplit to "SPLIT"
                modified_amazon_transactions.append(main_row)
            else:  # For items without an Item Category, create separate rows
                for _, row in group.iterrows():
                    item_row = row.copy()
                    item_row['Payee'] = 'Split Transaction'
#                    item_row['Description'] = row['Description']
#                    item_row['Amount (USD)'] = row['Price'] * row['Quantity']
                    item_row['chkSplit'] = "SPLIT"  # Set chkSplit to "SPLIT"
                    modified_amazon_transactions.append(item_row)

        # Step 3: Append the modified Amazon transactions back to all_transactions
        amazon_df = pd.DataFrame(modified_amazon_transactions)
        # Resetting indices before concatenation
        self.all_transactions = self.all_transactions.reset_index(drop=True)
        amazon_df = amazon_df.reset_index(drop=True)
        self.all_transactions = pd.concat([self.all_transactions, amazon_df], ignore_index=True, sort=False)

        # Dropping any potential duplicate rows
        self.all_transactions.drop_duplicates(inplace=True)

        return self.all_transactions


# Apply the revised method
processor_revised_v2 = AmazonProcessorRevisedV2(final_df, all_transactions)
updated_all_transactions_revised_v2 = processor_revised_v2.modify_all_transactions()

# Output
if os.path.exists("all_df.csv"): os.remove("all_df.csv")
updated_all_transactions_revised_v2.to_csv("all_df.csv", index=False)

# Return a message indicating the completion
"Updated transactions have been saved to all_df.csv"


In [None]:
### BACKUP

class AmazonProcessor:
    def __init__(self, final_df, all_transactions):
        self.final_df = final_df.sort_values(by='Order ID')
        self.all_transactions = all_transactions

    def modify_all_transactions(self):
        # Step 1: Remove Amazon-related transactions from the all_transactions DataFrame
        self.all_transactions = self.all_transactions[~self.all_transactions['Payee'].str.contains('Amazon', na=False)]

        modified_amazon_transactions = []

        # Step 2: Process Amazon Transactions
        for order_id, group in self.final_df.groupby('order id'):
            main_transaction = group.iloc[0]  # Use the first row for the main transaction details

            # Main transaction row: ensure it has all columns from all_transactions
            main_row = main_transaction.to_dict()
            modified_amazon_transactions.append(main_row)

            # Splits for grocery items
            grocery_rows = group[group['category'] == 'grocery']
            if not grocery_rows.empty:
                total_grocery_amount = grocery_rows['price'].sum()
                grocery_row = main_transaction.copy()
                grocery_row['Payee'] = 'Split Transaction'
                grocery_row['Description'] = 'Amazon Groceries'
                grocery_row['Amount (USD)'] = total_grocery_amount
                modified_amazon_transactions.append(grocery_row)

            # Splits for non-grocery items
            non_grocery_rows = group[group['category'] != 'grocery']
            for _, row in non_grocery_rows.iterrows():
                non_grocery_row = row.copy()
                non_grocery_row['Payee'] = 'Split Transaction'
                non_grocery_row['Description'] = row['description']
                non_grocery_row['Amount'] = row['price']
                modified_amazon_transactions.append(non_grocery_row)

        # Step 3: Append the modified Amazon transactions back to all_transactions
        amazon_df = pd.DataFrame(modified_amazon_transactions)
        self.all_transactions = pd.concat([self.all_transactions, amazon_df], ignore_index=True)

        return self.all_transactions

processor = AmazonProcessor(final_df, all_transactions)
updated_all_transactions = processor.modify_all_transactions()

processor = AmazonProcessor(final_df, all_transactions)
updated_all_transactions = processor.modify_all_transactions()





if os.path.exists("final_df.csv"): os.remove("final_df.csv")
updated_all_transactions.to_csv("final_df.csv", index=False)


# Output

## Dataframe preparation

In [None]:
# Reorder Columns

all_transactions = all_transactions[[
    'Date',
    'Account-ID',
    'SplitID',
    'Payee',
    'Category Type',
    'Category',
    'Amount (USD)',
    'Description',
    'chkPayee',
    'chkCategory',
    'chkEURUSD']]

# Sort rows
all_transactions = all_transactions.sort_values(by=['Date', 'Account-ID', 'SplitID'], ascending=[False, True, True])

# Formating
all_transactions['Amount (USD)'] = all_transactions['Amount (USD)'].round(2)
all_transactions['Amount (USD)'] = all_transactions['Amount (USD)'].apply(lambda x: "${:,.2f}".format(x))


## Output file generation

In [None]:
if os.path.exists("z_output.csv"): os.remove("z_output.csv")
all_transactions.to_csv("z_output.csv", index=False)