In [62]:
def refactor_monthly_data_frame(data_frame_name):
    """Clean one monthly statement CSV and write it to ``processed_data/2023/``.

    Reads ``input_data/2023/<data_frame_name>`` and
    ``input_data/category_mapper.csv`` (both ``;``-separated), then:

    * appends one zero-amount ``<sub-category>_empty_entry`` row for every
      mapper sub-category that does not occur in the month, in mapper order;
    * forward-fills blank ``Date`` cells and replaces ``Date`` with ``day``,
      ``month`` (English month name) and ``year`` columns;
    * converts ``Credit`` / ``Debit`` from decimal-comma text to float;
    * replaces every remaining NaN with 0;
    * adds ``Category`` and ``Category Type`` looked up by ``Sub-category``.

    Parameters
    ----------
    data_frame_name : str
        File name of the monthly CSV, used for both the input and output path.

    Returns
    -------
    None
        The result is written to ``processed_data/2023/<data_frame_name>``.

    Raises
    ------
    ValueError
        If the ``Date`` values match neither ``%d.%m.%y`` nor ``%d.%m.%Y`` and
        pandas' format inference cannot parse them either.
    """
    print("Refactoring ", data_frame_name, " as per project requirement")
    import pandas as pd

    df = pd.read_csv('input_data/2023/' + data_frame_name, sep=";")
    df_category = pd.read_csv('input_data/category_mapper.csv', sep=';')

    # Sub-categories known to the mapper but absent this month. Walking the
    # mapper column (instead of a set difference) keeps the row order of the
    # written file identical from run to run.
    present_sub_categories = set(df["Sub-category"])
    missing_sub_categories = [
        sub_cat
        for sub_cat in df_category["Sub-category"].dropna().drop_duplicates()
        if sub_cat not in present_sub_categories
    ]
    if missing_sub_categories:
        # Build all placeholder rows at once rather than concatenating per row.
        placeholder_rows = pd.DataFrame({
            "Description": [f"{sub_cat}_empty_entry" for sub_cat in missing_sub_categories],
            "Debit": 0.0,
            "Credit": 0.0,
            "Sub-category": missing_sub_categories,
        })
        df = pd.concat([df, placeholder_rows], ignore_index=True)

    # Fill down the NaN values in the Date column (this also gives the
    # placeholder rows the last date seen above them).
    df["Date"] = df["Date"].ffill()

    # Dates are day-first. Try the two explicit layouts before falling back to
    # pandas' inference, so "01.02.2023" is never read as January 2nd and a
    # format mismatch is no longer silently swallowed.
    parsed_dates = None
    for date_format in ("%d.%m.%y", "%d.%m.%Y"):
        try:
            parsed_dates = pd.to_datetime(df["Date"], format=date_format)
            break
        except (ValueError, TypeError):
            continue
    if parsed_dates is None:
        parsed_dates = pd.to_datetime(df["Date"])
    df["Date"] = parsed_dates

    # Split the date into day / month name / year and drop the original column.
    df["day"] = df["Date"].dt.day
    df["month"] = df["Date"].dt.month_name()
    df["year"] = df["Date"].dt.year
    df = df.drop(columns=["Date"])

    # Amounts use a decimal comma in the source files.
    for amount_column in ("Credit", "Debit"):
        df[amount_column] = (
            df[amount_column]
            .astype(str)
            .str.replace(",", ".", regex=False)
            .astype(float)
        )
    df = df.fillna(0)

    # Add Category and Category Type from the mapper, keyed by Sub-category.
    # Duplicate mapper keys are dropped (first wins) because Series.map needs
    # a unique index.
    category_lookup = (
        df_category
        .drop_duplicates(subset="Sub-category")
        .set_index("Sub-category")
    )
    df["Category"] = df["Sub-category"].map(category_lookup["Category"])
    df["Category Type"] = df["Sub-category"].map(category_lookup["Category Type"])

    df.to_csv('processed_data/2023/' + data_frame_name, sep=";", index=False)
    print("Refactoring of ", data_frame_name, " is completed !! ")
    return None

In [63]:
import pandas as pd

# File names of the monthly statements to process, listed in calendar order.
considered_months = ["01_Jan.csv", "02_Feb.csv", "03_March.csv", "04_April.csv", "05_May.csv", "06_June.csv", "07_July.csv"]

# Clean every monthly file first; each call writes processed_data/2023/<name>.
for location in considered_months:
    refactor_monthly_data_frame(location)

# Read the processed files back and stack them with a single concat instead of
# growing a DataFrame inside the loop (which re-copies all rows per iteration).
df = pd.concat(
    [pd.read_csv('processed_data/2023/' + location, sep=';') for location in considered_months],
    ignore_index=True,
)

# Signed amount per row (credits positive, debits negative) and its running
# total in row order, i.e. the order of considered_months.
df["effective_amount"] = df["Credit"] - df["Debit"]
df["Balance_Amount"] = df["effective_amount"].cumsum()

df.tail(50)

Refactoring  01_Jan.csv  as per project requirement
Refactoring of  01_Jan.csv  is completed !! 
Refactoring  02_Feb.csv  as per project requirement
Refactoring of  02_Feb.csv  is completed !! 
Refactoring  03_March.csv  as per project requirement
Refactoring of  03_March.csv  is completed !! 
Refactoring  04_April.csv  as per project requirement
Refactoring of  04_April.csv  is completed !! 
Refactoring  05_May.csv  as per project requirement
Refactoring of  05_May.csv  is completed !! 
Refactoring  06_June.csv  as per project requirement
Refactoring of  06_June.csv  is completed !! 
Refactoring  07_July.csv  as per project requirement
Refactoring of  07_July.csv  is completed !! 


Unnamed: 0,Description,Debit,Credit,Sub-category,day,month,year,Category,Category Type,effective_amount,Balance_Amount
189,Udemy Course TensorFlow,0.0,109.99,PDP Income,16,June,2023,Extra Income,Income,109.99,-765.68
190,PayPal Berlin Room,70.41,0.0,Office Travel Spending,19,June,2023,PDP Expenses,Expense,-70.41,-836.09
191,PayPal Berlin Travel,59.98,0.0,Office Travel Spending,23,June,2023,PDP Expenses,Expense,-59.98,-896.07
192,LOHN / GEHALT 06/23,0.0,5667.0,Salary,26,June,2023,Salary,Income,5667.0,4770.93
193,Gift card,0.0,50.0,Tax less Income,26,June,2023,Extra Income,Income,50.0,4820.93
194,Gift card,50.0,0.0,Groceries,26,June,2023,Living Expenses,Expense,-50.0,4770.93
195,Tax less Income,0.0,150.0,Tax less Income,26,June,2023,Extra Income,Income,150.0,4920.93
196,Tax less Pension Income,0.0,43.8,Tax less Income,26,June,2023,Extra Income,Income,43.8,4964.73
197,Company Pension,292.0,0.0,Company Pension,26,June,2023,Europe Savings,Savings,-292.0,4672.73
198,Income Tax,1015.5,0.0,Tax,26,June,2023,Income Tax,State Cuttings,-1015.5,3657.23


In [89]:
# Sub-categories whose Credit totals are reported as income; every other
# sub-category is reported by its Debit total.
Income_sources = ['Salary', 'Tax less Income', 'PDP Income', 'Office Travel Income']

# aggfunc="sum" replaces np.sum: same result, no dependency on a numpy import,
# and no pandas deprecation warning for passing the numpy callable.
yearly_expenses = pd.pivot_table(df, index=['Sub-category'], values=['Debit'], aggfunc="sum").reset_index()
yearly_expenses = yearly_expenses[~yearly_expenses['Sub-category'].isin(Income_sources)]
yearly_expenses.columns = ['Sub-category', 'amount']

yearly_income = pd.pivot_table(df, index=['Sub-category'], values=['Credit'], aggfunc="sum").reset_index()
yearly_income = yearly_income[yearly_income['Sub-category'].isin(Income_sources)]
yearly_income.columns = ['Sub-category', 'amount']

# Income rows first, then expense rows; the original pivot index labels are kept.
df_yearly_detailed_metrics = pd.concat([yearly_income, yearly_expenses], axis=0)

# Percentages are relative to Salary + Tax less Income only.
# NOTE(review): 'PDP Income' and 'Office Travel Income' are listed as income
# sources but are not part of this denominator - confirm that is intended.
temp_dict = dict(zip(df_yearly_detailed_metrics["Sub-category"], df_yearly_detailed_metrics["amount"]))
total_income = temp_dict["Salary"] + temp_dict["Tax less Income"]
df_yearly_detailed_metrics["percentage"] = ((df_yearly_detailed_metrics["amount"] / total_income) * 100).round(2)

df_yearly_detailed_metrics.to_csv("processed_data/yearly_detailed_metrics.csv", sep=";", index=False)
df_yearly_detailed_metrics

Unnamed: 0,Sub-category,amount,percentage
10,Office Travel Income,130.39,0.32
12,PDP Income,524.25,1.27
18,Salary,40087.22,97.12
20,Tax less Income,1188.85,2.88
0,Cloths,15.0,0.04
1,Company Pension,1168.0,2.83
2,Entertainment,425.59,1.03
3,Family & Kids,258.6,0.63
4,Furnishings,0.0,0.0
5,Gifts,26.09,0.06


In [65]:
# Sub-categories whose Credit totals are reported as income; every other
# sub-category is reported by its Debit total.
Income_sources = ['Salary', 'Tax less Income', 'PDP Income', 'Office Travel Income']

# aggfunc="sum" replaces np.sum: same result, no dependency on a numpy import,
# and no pandas deprecation warning for passing the numpy callable.
# Note: the pivot sorts its index, so months come out alphabetically
# (April, February, ...), not in calendar order.
monthly_expenses = pd.pivot_table(df, index=['month', 'Sub-category'], values=['Debit'], aggfunc="sum").reset_index()
monthly_expenses = monthly_expenses[~monthly_expenses['Sub-category'].isin(Income_sources)]
monthly_expenses.columns = ['month', 'Sub-category', 'amount']

monthly_income = pd.pivot_table(df, index=['month', 'Sub-category'], values=['Credit'], aggfunc="sum").reset_index()
monthly_income = monthly_income[monthly_income['Sub-category'].isin(Income_sources)]
monthly_income.columns = ['month', 'Sub-category', 'amount']

# Income rows first, then expense rows; the original pivot index labels are kept.
df_monthly_detailed_metrics = pd.concat([monthly_income, monthly_expenses], axis=0)
df_monthly_detailed_metrics.to_csv("processed_data/monthly_detailed_metrics.csv", sep=";", index=False)
df_monthly_detailed_metrics

Unnamed: 0,month,Sub-category,amount
10,April,Office Travel Income,0.00
12,April,PDP Income,117.80
18,April,Salary,5667.00
20,April,Tax less Income,243.80
35,February,Office Travel Income,0.00
...,...,...,...
169,May,Tax,1015.50
171,May,Taxi,0.00
172,May,To India,2600.00
173,May,Travel Pass,98.00
