# Personal Expenditure Analysis
## Data gathered from "Money Manager", to calculate personal income and expenditure

## Python Flask App Integration to Mutual Fund Backend, to get personal finance visualization

### Problem Statement

The daily finaces are recorded on an Android Based Application, "Money Manager"
The data from the same is exxported and loaded to Google Sheet, to achieve the following -

1. Identify MoM Expenditure sum change.
2. Calculate the overall expenditure change every Month (SUM, MEAN & STD).
3. For each month get the different categories of expenditure and income.
4. For each month get the Top 5 Spend Categories (Grouped Column Chart)
5. Once the datapoints cross a certain threshold, expenditure prediction (Time Series)
6. Compare the sum change of income and expenditure every month

### All imports at one place

In [1]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
from datetime import datetime,timedelta
import smtplib
from email.message import EmailMessage
import json
import numpy as np

  if sys.version_info[0] is 2 and  sys.version_info[1] is 1:
  if sys.version_info[0] is 2 and  sys.version_info[1] is 1:


### Fetch Data from Google Sheet

In [2]:
#Defining the scope of the OAuth Authentication
scope = ["https://spreadsheets.google.com/feeds",'https://www.googleapis.com/auth/spreadsheets',"https://www.googleapis.com/auth/drive.file","https://www.googleapis.com/auth/drive"]
         
#Getting the credentials
creds = ServiceAccountCredentials.from_json_keyfile_name("./flask_app/src/secret_config/google_credentials.json", scope)
#Connecting to the Google Spreadsheet Client
client = gspread.authorize(creds)

#Getting the spreadsheet
sheet = client.open("Personal_Expenditure").sheet1

### Convert the data to a base data-frame

In [3]:
all_personal_finance_data = sheet.get_all_records()

#Create the dataframe
base_data = pd.DataFrame(all_personal_finance_data)

### Exporatory Ananlysis, Data cleanup

In [4]:
base_data.replace([''],'Unknown',inplace=True)

print(base_data)

           Date Income/Expenses   Category              Memo   Amount
0    11/28/2020        Expenses  Cigarette           Unknown   -311.0
1    11/14/2020        Expenses  Cigarette           Unknown   -480.0
2    11/13/2020        Expenses  Cigarette           Unknown    -72.0
3    11/12/2020        Expenses  Telephone  Airtel_My_Second    -49.0
4     11/9/2020        Expenses      Bills         Rentickle   -986.0
..          ...             ...        ...               ...      ...
182    5/1/2020        Expenses      Bills         Rentickle   -200.0
183    5/1/2020        Expenses  Insurance             Paytm  -1000.0
184    5/1/2020        Expenses      Bills       Electricity    -40.0
185    5/1/2020        Expenses       Home                Ma  -5000.0
186    5/1/2020          Income     Salary           Unknown  28158.0

[187 rows x 5 columns]


### Addition of extra columns

The followings columns are added to aid calculations -
1. Month-Year (MM-YYYY)
2. Day of the Week

The following columns are removed -
1. Details of each category

The expenses column is converted to Integer

In [5]:
#Dropping the Memo column
base_data = base_data.drop(['Memo'],axis=1)

#Renaming the columns to maintain consistency in snake_case
base_data.columns = ['date','type','category','amount']

#Adding the Month-Year Column
def month_year(date_text):
    try:
        return datetime.strptime(date_text, '%m/%d/%Y').strftime("%b-%Y")
    except ValueError:
        print(date_text)
        return date_text
    
#Adding the Day of Week Column
def day_of_week(date_text):
    try:
        return datetime.strptime(date_text, '%m/%d/%Y').strftime("%A")
    except ValueError:
        print(date_text)
        return date_text
    
base_data['month_year'] = base_data['date'].apply(month_year)
base_data['day_of_week'] = base_data['date'].apply(day_of_week)

print(base_data)

           date      type   category   amount month_year day_of_week
0    11/28/2020  Expenses  Cigarette   -311.0   Nov-2020    Saturday
1    11/14/2020  Expenses  Cigarette   -480.0   Nov-2020    Saturday
2    11/13/2020  Expenses  Cigarette    -72.0   Nov-2020      Friday
3    11/12/2020  Expenses  Telephone    -49.0   Nov-2020    Thursday
4     11/9/2020  Expenses      Bills   -986.0   Nov-2020      Monday
..          ...       ...        ...      ...        ...         ...
182    5/1/2020  Expenses      Bills   -200.0   May-2020      Friday
183    5/1/2020  Expenses  Insurance  -1000.0   May-2020      Friday
184    5/1/2020  Expenses      Bills    -40.0   May-2020      Friday
185    5/1/2020  Expenses       Home  -5000.0   May-2020      Friday
186    5/1/2020    Income     Salary  28158.0   May-2020      Friday

[187 rows x 6 columns]


### Divide to expense and income frame

This is the point in the .py code where it is declared as global.
For all the upcoming cells the values will be taken from income and expense dataframe.

In [7]:
expense_data = base_data.loc[base_data['type'] == 'Expenses']
income_data = base_data.loc[base_data['type'] == 'Income']

#Convert the negative strings of expense to positive
expense_data['amount'] = pd.to_numeric(expense_data['amount'])
expense_data['amount'] = expense_data['amount'].apply(lambda x:abs(x))

#Convert Income data to Integer
income_data['amount'] = pd.to_numeric(income_data['amount'])

#Sort both the dataframes
expense_data = expense_data.sort_values(by=['date'],ascending=True)
income_data = income_data.sort_values(by=['date'],ascending=True)

print(income_data)

          date    type category   amount month_year day_of_week
38   10/1/2020  Income   Salary  28158.0   Oct-2020    Thursday
29   10/9/2020  Income  Refunds    472.0   Oct-2020      Friday
21   11/1/2020  Income   Salary  38158.0   Nov-2020      Sunday
5    11/7/2020  Income  Refunds    472.0   Nov-2020    Saturday
186   5/1/2020  Income   Salary  28158.0   May-2020      Friday
161   6/1/2020  Income   Salary  28830.0   Jun-2020      Monday
129  6/20/2020  Income  Refunds    680.0   Jun-2020    Saturday
144   6/3/2020  Income   Grants    550.0   Jun-2020   Wednesday
121   7/1/2020  Income   Salary  28830.0   Jul-2020   Wednesday
94    8/1/2020  Income   Salary  28830.0   Aug-2020    Saturday
70   8/25/2020  Income  Refunds   1416.0   Aug-2020     Tuesday
64    9/1/2020  Income   Salary  28158.0   Sep-2020     Tuesday
47   9/12/2020  Income  Refunds    472.0   Sep-2020    Saturday


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  expense_data['amount'] = pd.to_numeric(expense_data['amount'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  expense_data['amount'] = expense_data['amount'].apply(lambda x:abs(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income_data['amount'] = pd.to_numeric(income_data['amount'])


### Calculations

1. API 1 - Get the MoM changes for expenditure
2. API 2 - Calculate the overall expenditure change every Month (SUM, MEAN & STD).
3. API 3 - Get All the Month Names
4. API 4 - For each month get the different categories of expenditure and income.
5. API 5 - For each month get the Top 5 Spend Categories (Grouped Column Chart)
6. API 6 - Get the sum change of income and expenditure every month

### Get the MoM change of expenditure for the last two months

In [8]:
grouped_expense_date = expense_data.groupby('month_year',as_index=False).agg({'amount':[np.sum]})
grouped_expense_date = grouped_expense_date.tail(2)

mom_expense_change = {}
list_mom_change = grouped_expense_date.values.tolist()

#Getting the difference
diff_change = list(map(lambda x: x[1], list_mom_change))

mom_expense_change['mom'] = round(diff_change[0] - diff_change[1],2)
print(mom_expense_change)

{'mom': 294.76}


### Highcharts Converter Functions

Functions identified are -

1. create_box_plot() - Creates a box-plot series
2. create_single_series() - Creates a simple single series for Pie Chart
3. create_single_series() - Create a simple single series for column/bar chart

In [9]:
#Get the color based on the last two values
def determine_color(data,type_positive):
    value_list = list(map(lambda x:x[1],data.values.tolist()))
    
    if(type_positive):
        if(value_list[-1]>value_list[-2]):
            return 'green'
        elif(value_list[-1]<value_list[-2]):
            return 'red'
    else:
        if(value_list[-1]>value_list[-2]):
            return 'red'
        elif(value_list[-1]<value_list[-2]):
            return 'green'
    return 'orange'

In [10]:
def create_box_plot(data, x_axis_title, y_axis_title):
    series = {}
    
    #Creating the x-axis
    x_axis_categories = list(map(lambda x: x[0], data.values.tolist()))
    series['xAxis'] = {}
    series['xAxis']['categories'] = x_axis_categories
    series['xAxis']['title'] = {'text': x_axis_title}
    
    #Creating the y-axis
    series['yAxis'] = {}
    series['yAxis']['title'] = {'text': y_axis_title}
    plot_lines_array = []
    series['yAxis']['plotLines'] = plot_lines_array
    plot_lines_dict = {}
    plot_lines_dict['value'] = round(np.average(list(map(lambda x: x[3], data.values.tolist()))),2)
    plot_lines_dict['color'] = 'red'
    plot_lines_dict['width'] = 1
    plot_lines_dict['label'] = {'text': 'Average Mean - ' + str(round(plot_lines_dict['value'],2)), 'align': 'center', 'style': {'color': 'black'}}
    plot_lines_array.append(plot_lines_dict)
    
    #Creating the series
    #Drop the name column
    data = data.drop('month_year', axis=1)
    series_array = []
    series['series'] = series_array
    series_dict = {}
    series_dict['name'] = 'Observations'
    series_dict['data'] = data.values.tolist()
    series_array.append(series_dict)
    
    return series

def create_single_series_pie(data, series_name):
    series = {}
    
    #Creating the series
    series_array = []
    series['series'] = series_array
    series_dict = {}
    series_dict['type'] = 'pie'
    series_dict['name'] = series_name
    series_dict['data'] = data.values.tolist()
    series_array.append(series_dict)
    
    return series

def create_multiple_series(data1, data2, y_axis_title, series_1_name, series_2_name):
    series = {}
    
    #Creating the x-axis
    x_axis_categories = list(map(lambda x: x[0], data1.values.tolist()))
    series['xAxis'] = {}
    series['xAxis']['categories'] = x_axis_categories
    
    #Creating the y-axis
    series['yAxis'] = {}
    series['yAxis']['title'] = {'text': y_axis_title}
    
    #Creating the series
    series_array = []
    series['series'] = series_array
    #1st Series
    series_dict1 = {}
    series_dict1['type'] = 'line'
    series_dict1['color'] = determine_color(data1, False)
    series_dict1['name'] = series_1_name
    series_dict1['data'] = list(map(lambda x:x[1],data1.values.tolist()))
    series_array.append(series_dict1)
    #2nd Series
    series_dict2 = {}
    series_dict2['type'] = 'line'
    series_dict2['color'] = determine_color(data2, True)
    series_dict2['name'] = series_2_name
    series_dict2['data'] = list(map(lambda x:x[1],data2.values.tolist()))
    series_array.append(series_dict2)
    
    return series
    

### Calculate the overall expenditure change every Month (SUM, MEAN & STD) & get all the month names

In [11]:
grouped_expense_date = expense_data.groupby('month_year',as_index=False).agg({'amount':[np.sum, np.median, np.mean, np.average, np.std]})
grouped_expense_date.columns = ['month_year','sum', 'median', 'mean', 'averagae', 'standard_deviation']

#Sort the date
grouped_expense_date['date'] = grouped_expense_date['month_year'].apply(lambda x: datetime.strptime(x, '%b-%Y').strftime("%m/%d/%Y"))
grouped_expense_date = grouped_expense_date.sort_values(by=['date'])

grouped_expense_date = grouped_expense_date.drop(['date'],axis=1)

#Round the three numeric columns
grouped_expense_date['sum'] = grouped_expense_date['sum'].apply(lambda x: round(x,2))
grouped_expense_date['median'] = grouped_expense_date['median'].apply(lambda x: round(x,2))
grouped_expense_date['mean'] = grouped_expense_date['mean'].apply(lambda x: round(x,2))
grouped_expense_date['averagae'] = grouped_expense_date['averagae'].apply(lambda x: round(x,2))
grouped_expense_date['standard_deviation'] = grouped_expense_date['standard_deviation'].apply(lambda x: round(x,2))

series_dict = create_box_plot(grouped_expense_date, 'Month-Year', 'Observations')
print(series_dict)

month_names_dict = {}

month_names_list = list(map(lambda x: x[0], grouped_expense_date.values.tolist()))
month_names_dict['month_names'] = month_names_list
print(month_names_dict)

{'xAxis': {'categories': ['May-2020', 'Jun-2020', 'Jul-2020', 'Aug-2020', 'Sep-2020', 'Oct-2020', 'Nov-2020'], 'title': {'text': 'Month-Year'}}, 'yAxis': {'title': {'text': 'Observations'}, 'plotLines': [{'value': 1332.41, 'color': 'red', 'width': 1, 'label': {'text': 'Average Mean - 1332.41', 'align': 'center', 'style': {'color': 'black'}}}]}, 'series': [{'name': 'Observations', 'data': [[29108.05, 575.0, 1212.84, 1212.84, 1824.76], [33945.61, 298.0, 917.45, 917.45, 2000.14], [30051.9, 509.5, 1155.84, 1155.84, 2145.2], [31371.44, 501.0, 1161.91, 1161.91, 1797.29], [28568.29, 540.0, 1242.1, 1242.1, 1802.5], [28863.05, 570.0, 1697.83, 1697.83, 2261.48], [38777.91, 490.0, 1938.9, 1938.9, 3313.45]]}]}
{'month_names': ['May-2020', 'Jun-2020', 'Jul-2020', 'Aug-2020', 'Sep-2020', 'Oct-2020', 'Nov-2020']}


### Calculate the overall income change every Month (SUM, MEAN & MEDIAN) & get all the month names

In [13]:
grouped_income_date = income_data.groupby('month_year', as_index=False).agg({'amount': [np.sum, np.median, np.mean]})
grouped_income_date.columns = ['month_year', 'sum', 'median', 'mean']

# Sort the date
grouped_income_date['date'] = grouped_income_date['month_year'].apply(lambda x: datetime.strptime(x, '%b-%Y').strftime("%m/%d/%Y"))
grouped_income_date = grouped_income_date.sort_values(by=['date'])

grouped_income_date = grouped_income_date.drop(['date'], axis=1)

# Round the three numeric columns
grouped_income_date['sum'] = grouped_income_date['sum'].apply(lambda x: round(x, 2))
grouped_income_date['median'] = grouped_income_date['median'].apply(lambda x: round(x, 2))
grouped_income_date['mean'] = grouped_income_date['mean'].apply(lambda x: round(x, 2))

series_dict = create_box_plot(grouped_income_date, 'Month-Year', 'Observations')
print(series_dict)

{'xAxis': {'categories': ['May-2020', 'Jun-2020', 'Jul-2020', 'Aug-2020', 'Sep-2020', 'Oct-2020', 'Nov-2020'], 'title': {'text': 'Month-Year'}}, 'yAxis': {'title': {'text': 'Observations'}, 'plotLines': [{'value': 18582.29, 'color': 'red', 'width': 1, 'label': {'text': 'Average Mean - 18582.29', 'align': 'center', 'style': {'color': 'black'}}}]}, 'series': [{'name': 'Observations', 'data': [[28158.0, 28158.0, 28158.0], [30060.0, 680.0, 10020.0], [28830.0, 28830.0, 28830.0], [30246.0, 15123.0, 15123.0], [28630.0, 14315.0, 14315.0], [28630.0, 14315.0, 14315.0], [38630.0, 19315.0, 19315.0]]}]}


### For each month get the different categories of expenditure and income.

In [14]:
def get_expenditure_category_by_month_name(month_name):
    expense_month_year = expense_data.loc[expense_data['month_year']==month_name]
    
    expense_month_year = expense_month_year.groupby('category', as_index=False).agg({'amount' : [np.sum]})
    expense_month_year.columns = ['category_name', 'sum_amount']
    return create_single_series_pie(expense_month_year, 'Monthly Expenditure')

def get_income_category_by_month_name(month_name):
    income_month_year = income_data.loc[income_data['month_year']==month_name]
    
    income_month_year = income_month_year.groupby('category', as_index=False).agg({'amount' : [np.sum]})
    return create_single_series_pie(income_month_year, 'Monthly Income')


#driver example
print(get_expenditure_category_by_month_name('May-2020'))
print(get_income_category_by_month_name('May-2020'))

{'series': [{'type': 'pie', 'name': 'Monthly Expenditure', 'data': [['Bills', 5175.8], ['Cigarette', 2979.0], ['Electronics', 800.0], ['Food', 3222.25], ['Home', 12750.0], ['Insurance', 1000.0], ['Shopping', 1100.0], ['Social', 1500.0], ['Telephone', 201.0], ['Wine', 380.0]]}]}
{'series': [{'type': 'pie', 'name': 'Monthly Income', 'data': [['Salary', 28158.0]]}]}


### Get the sum change of income and expenditure every month

In [15]:
#Expense
grouped_expense_date = expense_data.groupby('month_year',as_index=False).agg({'amount':[np.sum]})
grouped_expense_date.columns = ['month_year','sum']

#Sort the date
grouped_expense_date['date'] = grouped_expense_date['month_year'].apply(lambda x: datetime.strptime(x, '%b-%Y').strftime("%m/%d/%Y"))
grouped_expense_date = grouped_expense_date.sort_values(by=['date'])

grouped_expense_date = grouped_expense_date.drop(['date'],axis=1)

#Round the three numeric columns
grouped_expense_date['sum'] = grouped_expense_date['sum'].apply(lambda x: round(x,2))

#Income
grouped_income_date = income_data.groupby('month_year',as_index=False).agg({'amount':[np.sum]})
grouped_income_date.columns = ['month_year','sum']

#Sort the date
grouped_income_date['date'] = grouped_income_date['month_year'].apply(lambda x: datetime.strptime(x, '%b-%Y').strftime("%m/%d/%Y"))
grouped_income_date = grouped_income_date.sort_values(by=['date'])

grouped_income_date = grouped_income_date.drop(['date'],axis=1)

#Round the three numeric columns
grouped_income_date['sum'] = grouped_income_date['sum'].apply(lambda x: round(x,2))

print(create_multiple_series(grouped_expense_date, grouped_income_date, 'Total Count', 'Expense/Month', 'Income/Month'))

{'xAxis': {'categories': ['May-2020', 'Jun-2020', 'Jul-2020', 'Aug-2020', 'Sep-2020', 'Oct-2020', 'Nov-2020']}, 'yAxis': {'title': {'text': 'Total Count'}}, 'series': [{'type': 'line', 'color': 'red', 'name': 'Expense/Month', 'data': [29108.05, 33945.61, 30051.9, 31371.44, 28568.29, 28863.05, 38777.91]}, {'type': 'line', 'color': 'green', 'name': 'Income/Month', 'data': [28158.0, 30060.0, 28830.0, 30246.0, 28630.0, 28630.0, 38630.0]}]}
