<a href="https://colab.research.google.com/github/axjasf/YNAB-Categorizer/blob/main/Combine_Amazon_Files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [15]:
# Libraries
import os
import pandas as pd
import re

# Folder layout on the mounted Google Drive; every path keeps its trailing
# slash so file names can be appended directly.
HOME_PATH = "/content/drive/MyDrive/Colab Notebooks/budget/"
CONFIG_PATH = os.path.join(HOME_PATH, "config/")
TRANSACTIONS_PATH = os.path.join(HOME_PATH, "transactions/")
ORDERS_PATH = os.path.join(HOME_PATH, "orders/")

In [16]:
# Input files grouped by kind. Each entry pairs a CSV file name with the
# one-letter origin tag that is stamped onto every row read from that file.
file_dict = dict(
    header_files=[
        {"filename": "amazon_order_headers_axel.csv", "origin": "A"},
        {"filename": "amazon_order_headers_danielle.csv", "origin": "D"},
    ],
    item_files=[
        {"filename": "amazon_order_items_axel.csv", "origin": "A"},
        {"filename": "amazon_order_items_danielle.csv", "origin": "D"},
    ],
)

# Combine Header and Item files

In [17]:
class AmazonOrderFilesProcessor:
    """Load, clean and combine Amazon order header and item CSV exports.

    ``file_dict`` maps ``"header_files"`` and ``"item_files"`` to lists of
    ``{"filename": ..., "origin": ...}`` entries. Every file is read from
    ``base_path``, cleaned, tagged with its origin and concatenated into one
    header DataFrame and one item DataFrame.
    """

    _MONTH_NAMES = (
        "January", "February", "March", "April", "May", "June", "July",
        "August", "September", "October", "November", "December",
    )
    # Month name -> month number as an unpadded string ("March" -> "3").
    _MONTH_NUMBERS = {
        name: str(number) for number, name in enumerate(_MONTH_NAMES, start=1)
    }
    # Month-first date whose parts are separated by whitespace, "-" or "/"
    # (or nothing at all, as the original whitespace-only pattern allowed).
    _DATE_PATTERN = re.compile(r'(\d{1,2})[\s/-]*(\d{1,2})[\s/-]*(\d{4})')
    # One payment entry: optional card prefix, a date (month-name or ISO
    # form), then a dollar amount with two decimals.
    _PAYMENT_PATTERN = re.compile(
        r"(?:MasterCard|Visa)?(?:\s*ending\s*in\s*\d{4})?:?\s*"
        r"([A-Za-z]*\s*\d{1,2},?\s*\d{4}|\d{4}-\d{1,2}-\d{1,2})"
        r"[^$]*\$\s*([\d,]+\.\d{2})"
    )

    def __init__(self, base_path, file_dict):
        """Remember where the files live and which files to process.

        Args:
            base_path: Directory containing the CSV files.
            file_dict: Mapping with ``"header_files"`` and ``"item_files"``
                lists of ``{"filename", "origin"}`` dictionaries.
        """
        self.base_path = base_path
        self.file_dict = file_dict
        self.combined_header_df = None
        self.combined_item_df = None

    @staticmethod
    def normalize_date_format(date_str):
        """Rewrite a month-first date string as ``YYYY-MM-DD``.

        Accepts ``M D YYYY`` with whitespace, ``-`` or ``/`` between the
        parts. Anything else is returned unchanged so that
        ``pd.to_datetime`` can still try to parse it: non-strings (with a
        printed warning), strings that do not match, and strings whose
        month/day are out of range.

        Args:
            date_str: The raw date value.

        Returns:
            The ISO formatted date string, or ``date_str`` unchanged.
        """
        if not isinstance(date_str, str):
            print(f"Warning: Encountered non-string date value: {date_str}")
            return date_str

        match = AmazonOrderFilesProcessor._DATE_PATTERN.match(date_str)
        if match is None:
            return date_str

        month, day, year = match.groups()
        month, day = int(month), int(day)
        # Do not emit an impossible ISO date (e.g. for day-first input);
        # leave such values untouched instead.
        if not (1 <= month <= 12 and 1 <= day <= 31):
            return date_str
        return f"{year}-{month:02d}-{day:02d}"

    @staticmethod
    def extract_multiple_payments(payment_string):
        """Parse every ``(date, amount)`` pair out of a payments cell.

        Args:
            payment_string: Text of the ``Payments`` column; may describe
                several payments.

        Returns:
            A list of ``(date_string, amount)`` tuples, where the date is
            normalised via :meth:`normalize_date_format` and the amount is a
            float. Non-string input yields an empty list.
        """
        if not isinstance(payment_string, str):
            return []

        cls = AmazonOrderFilesProcessor
        payments = []
        for raw_date, raw_amount in cls._PAYMENT_PATTERN.findall(payment_string):
            # "March 5, 2023" -> "3 5, 2023" -> "3-5-2023"
            for name, number in cls._MONTH_NUMBERS.items():
                raw_date = raw_date.replace(name, number)
            raw_date = raw_date.replace(",", "").replace(" ", "-")
            payments.append(
                (
                    cls.normalize_date_format(raw_date),
                    float(raw_amount.replace(",", "")),
                )
            )
        return payments

    def split_multiple_payments(self, df):
        """Expand each order row into one row per payment.

        Rows whose ``Payments`` value yields no parsable payment produce no
        output rows.

        Args:
            df: Header DataFrame containing a ``Payments`` column.

        Returns:
            A DataFrame with the columns of ``df`` plus ``Payment Date`` and
            ``Payment Amount``. The columns are present even when no payment
            was found, so downstream column selection keeps working.
        """
        new_rows = []
        for _, row in df.iterrows():
            for date, amount in self.extract_multiple_payments(row['Payments']):
                new_row = row.copy()
                new_row['Payment Date'] = pd.to_datetime(date)
                new_row['Payment Amount'] = amount
                new_rows.append(new_row)

        if not new_rows:
            # pd.DataFrame([]) would have no columns at all.
            return pd.DataFrame(
                columns=[*df.columns, 'Payment Date', 'Payment Amount']
            )
        return pd.DataFrame(new_rows)

    def process_header_file(self, file_info):
        """Read and clean one order-header CSV file.

        Args:
            file_info: ``{"filename": ..., "origin": ...}`` entry.

        Returns:
            The cleaned header rows, split into one row per payment.
        """
        df = pd.read_csv(os.path.join(self.base_path, file_info["filename"]))

        # Standardize column spelling and drop unnecessary columns
        df.columns = df.columns.str.capitalize()
        df = df.rename(columns={'Order id': 'Order ID', 'Shipping_refund': 'Shipping Refund', 'Date': 'Order Date'})
        df = df.drop(columns=['Items', 'To'], errors='ignore')

        # Remove repeated header rows and pending orders; copy so the
        # assignments below do not write into a slice of the raw frame.
        keep = (
            (df['Order Date'] != 'date')
            & (df['Payments'] != 'payments')
            & (df['Total'] != 'pending')
        )
        df = df[keep].copy()

        # Normalize the order date and turn it into a datetime
        df['Order Date'] = pd.to_datetime(
            df['Order Date'].apply(self.normalize_date_format)
        )

        # Add Origin and ID (assigned before splitting, so all payments of
        # one order share the same ID)
        df["Origin"] = file_info["origin"]
        df["ID"] = range(1, len(df) + 1)

        return self.split_multiple_payments(df)

    def process_item_file(self, file_info):
        """Read and clean one order-items CSV file.

        Missing or non-numeric quantities are replaced by 1 and flagged with
        ``chkQuantity == 'Y'``; ``Quantity`` is returned as integers and
        ``Price`` as floats.

        Args:
            file_info: ``{"filename": ..., "origin": ...}`` entry.

        Returns:
            The cleaned item DataFrame.
        """
        df = pd.read_csv(os.path.join(self.base_path, file_info["filename"]))

        # Standardize column spelling
        df.columns = df.columns.str.capitalize()
        df = df.rename(columns={'Order id': 'Order ID', 'Order date': 'Order Date'})

        # Remove repeated header rows; copy to avoid chained assignment
        df = df[df['Price'] != 'price'].copy()

        # Normalize the order date and turn it into a datetime
        df['Order Date'] = pd.to_datetime(
            df['Order Date'].apply(self.normalize_date_format)
        )

        # Default empty quantities to 1 and remember which rows were patched
        quantity = pd.to_numeric(df['Quantity'], errors='coerce')
        is_empty_quantity = quantity.isna()
        df['Quantity'] = quantity.fillna(1).astype(int)
        df['chkQuantity'] = is_empty_quantity.map({True: 'Y', False: 'N'})

        # Strip the literal "$" (regex=False: as a regex, "$" is the
        # end-of-string anchor) and thousands separators, then parse.
        price_text = (
            df['Price']
            .astype(str)
            .str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
        )
        df['Price'] = price_text.astype(float)

        # Add Origin and ID
        df["Origin"] = file_info["origin"]
        df["ID"] = range(1, len(df) + 1)

        return df

    @staticmethod
    def _origin_and_id_first(df):
        """Return ``df`` with ``Origin`` and ``ID`` as the leading columns."""
        leading = ['Origin', 'ID']
        return df[leading + [col for col in df if col not in leading]]

    def process_files(self):
        """Process every configured file and combine the results.

        Returns:
            ``(combined_header_df, combined_item_df)``; both are also stored
            on the instance.
        """
        header_dfs = [
            self.process_header_file(file_info)
            for file_info in self.file_dict["header_files"]
        ]
        self.combined_header_df = self._origin_and_id_first(
            pd.concat(header_dfs, ignore_index=True)
        )

        item_dfs = [
            self.process_item_file(file_info)
            for file_info in self.file_dict["item_files"]
        ]
        self.combined_item_df = self._origin_and_id_first(
            pd.concat(item_dfs, ignore_index=True)
        )

        return self.combined_header_df, self.combined_item_df

# Run the whole pipeline over the files in the orders folder and keep both
# combined frames for the export step.
processor = AmazonOrderFilesProcessor(base_path=ORDERS_PATH, file_dict=file_dict)
combined_headers_df, combined_items_df = processor.process_files()



# Export Combined Files

In [18]:
# Write the combined header and item tables next to the source files on
# Google Drive, without the pandas row index.
combined_headers_df.to_csv(
    path_or_buf=f"{ORDERS_PATH}amazon_headers_combined.csv",
    index=False,
)
combined_items_df.to_csv(
    path_or_buf=f"{ORDERS_PATH}amazon_items_combined.csv",
    index=False,
)