In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd

In [362]:
def convert_written_date_to_number(date_str):
    """Convert a compact written date such as ``'23FEB20'`` to ``'23/02/2020'``.

    The expected layout is a one- or two-digit day, a three-letter English
    month abbreviation (any letter case) and a two- or four-digit year, with
    no separators in between. Two-digit years are taken to be in the 2000s.
    Leading and trailing whitespace is ignored.

    Args:
        date_str: The written date, e.g. ``'23FEB20'``, ``'8mar20'`` or
            ``'01JAN2021'``.

    Returns:
        The date formatted as ``'DD/MM/YYYY'``.

    Raises:
        ValueError: If ``date_str`` does not follow the expected layout, or
            names a day that does not exist (e.g. ``'31FEB20'``).
        KeyError: If the three letters are not a known month abbreviation.
    """
    # Mapping of month abbreviations to month numbers
    month_map = {
        'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4,
        'MAY': 5, 'JUN': 6, 'JUL': 7, 'AUG': 8,
        'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12
    }

    # Match by pattern instead of fixed slice positions so that a
    # single-digit day ('8MAR20') does not shift the month and year fields.
    match = re.fullmatch(
        r'\s*([0-9]{1,2})([A-Za-z]{3})([0-9]{4}|[0-9]{2})\s*', date_str
    )
    if match is None:
        raise ValueError(f"Unrecognised written date: {date_str!r}")
    day_text, month_abbr, year_text = match.groups()

    day = int(day_text)
    month_num = month_map[month_abbr.upper()]
    # Two-digit years are expanded into the 2000s; four-digit years are used as given.
    year = int(year_text) if len(year_text) == 4 else 2000 + int(year_text)

    # datetime() validates the day/month combination and raises ValueError if impossible.
    return datetime(year, month_num, day).strftime('%d/%m/%Y')

In [363]:
# Load the raw statement export. The file has no header row, so the four
# column names are supplied explicitly.
# Every column is read as text (dtype=str): the later cells concatenate
# descriptions and look for a "CR" marker inside Value, both of which need
# strings. With inferred dtypes, a file whose amounts are all plain numbers
# would produce a float Value column and break those steps.
df = pd.read_csv(
    "~/Desktop/Bank Statements/csv/tabula-Statements_Mar-20.csv",
    header=None,
    names=["Date", "Date2", "Description", "Value"],
    dtype=str,
)

# Initialize empty lists to store cleaned data (one entry per transaction)
cleaned_dates = []
cleaned_descriptions = []
cleaned_values = []

# Initialize variables to hold the transaction currently being assembled
current_date = None
current_description = ""
current_value = None

In [364]:
# Collapse the raw rows into one record per transaction.
# A row with a Date starts a new transaction; a row without one only carries
# more description text for the transaction currently being built.
for index, row in df.iterrows():
    # Empty cells are read as NaN. Normalise them to "" so they can neither
    # break string concatenation nor add a literal "nan" to the narrative.
    raw_description = row["Description"]
    description_text = "" if pd.isnull(raw_description) else str(raw_description)

    if pd.notnull(row["Date"]):
        # A new date closes the previous transaction (if any): store it.
        if current_date is not None:
            cleaned_dates.append(current_date)
            cleaned_descriptions.append(current_description)
            cleaned_values.append(current_value)

        # Start accumulating the transaction described by this row.
        current_date = row["Date"]
        current_description = description_text
        current_value = row["Value"]
    elif description_text:
        # Continuation row: extend the description of the open transaction.
        current_description = (
            f"{current_description} {description_text}"
            if current_description
            else description_text
        )

In [365]:
# Add the last transaction's data after reaching the end of the loop
if current_date is not None:
    cleaned_dates.append(current_date)
    cleaned_descriptions.append(current_description)
    cleaned_values.append(current_value)


In [366]:
# Create a new DataFrame from the cleaned data
cleaned_df = pd.DataFrame({
    "Date": cleaned_dates,
    "Txn Date": cleaned_dates,
    "Narrative": cleaned_descriptions,
    "Value": cleaned_values
})

In [367]:
# Add empty (all-NaN) Credit and Debit columns, to be filled from Value later.
cleaned_df = cleaned_df.assign(Credit=np.nan, Debit=np.nan)


In [368]:
# Display the assembled frame; Credit and Debit are still empty at this point.
cleaned_df

Unnamed: 0,Date,Txn Date,Narrative,Value,Credit,Debit
0,23/02,23/02,MARKUP FEES CAIRO,4.95,,
1,23/02,23/02,MARKUP FEES CAIRO,1.50,,
2,24/02,24/02,MARKUP FEES CAIRO,2.25,,
3,8/03,8/03,MARKUP FEES CAIRO,1.44,,
4,17/03,17/03,MARKUP FEES CAIRO,.57,,
5,20/02,20/02,STARBUCKS DRIVE THRU CAIRO N. -07A,40.00,,
6,21/02,21/02,Netflix.com INTERNET,165.00,,
7,21/02,21/02,APPLE.COM/BILL ITUNES.COM,49.99,,
8,22/02,22/02,APPLE.COM/BILL ITUNES.COM,74.99,,
9,23/02,23/02,EMARAT MISR-RING ROAD CAIRO N. -07A,215.00,,


In [369]:
# Split the raw "Value" text into numeric Credit / Debit columns.
# An amount containing "CR" is a credit; every other amount is a debit.
# The "CR" marker and thousands separators are removed before conversion.
# The conversion to float is explicit: writing strings cell by cell into
# NaN-initialised (float64) columns relies on implicit coercion whose result
# differs between pandas versions and is deprecated in pandas 2.1+.
raw_values = cleaned_df["Value"]
# Work on text, but keep missing values missing rather than the string "nan".
value_text = raw_values.astype(str).where(raw_values.notna())
is_credit = value_text.str.contains("CR", regex=False, na=False)

# to_numeric raises ValueError on anything that is not a number, so a
# malformed amount is reported instead of being written out silently.
amounts = pd.to_numeric(
    value_text.str.replace("CR", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.strip()
)

cleaned_df["Credit"] = amounts.where(is_credit)
cleaned_df["Debit"] = amounts.where(~is_credit)

In [370]:
# The raw Value text has been split into Credit/Debit, so remove it.
cleaned_df = cleaned_df.drop(columns=["Value"])

In [371]:
# Display the frame after the Value column has been removed.
cleaned_df

Unnamed: 0,Date,Txn Date,Narrative,Credit,Debit
0,23/02,23/02,MARKUP FEES CAIRO,,4.95
1,23/02,23/02,MARKUP FEES CAIRO,,1.5
2,24/02,24/02,MARKUP FEES CAIRO,,2.25
3,8/03,8/03,MARKUP FEES CAIRO,,1.44
4,17/03,17/03,MARKUP FEES CAIRO,,0.57
5,20/02,20/02,STARBUCKS DRIVE THRU CAIRO N. -07A,,40.0
6,21/02,21/02,Netflix.com INTERNET,,165.0
7,21/02,21/02,APPLE.COM/BILL ITUNES.COM,,49.99
8,22/02,22/02,APPLE.COM/BILL ITUNES.COM,,74.99
9,23/02,23/02,EMARAT MISR-RING ROAD CAIRO N. -07A,,215.0


In [372]:
# Replace missing amounts with empty strings.
# The result is assigned back to the column: calling
# replace(..., inplace=True) on a column selection is chained assignment,
# which under pandas Copy-on-Write leaves cleaned_df itself unchanged.
cleaned_df["Credit"] = cleaned_df["Credit"].replace(np.nan, "")
cleaned_df["Debit"] = cleaned_df["Debit"].replace(np.nan, "")

In [373]:
# Write the cleaned statement to disk; the row index is left out of the file.
cleaned_df.to_csv(path_or_buf="../bank_statements/raw/15.csv", index=False)