# 1. Data Import & Exploration


## Load the large dataset


In [None]:
import pandas as pd
import numpy as np
# Load the FIES CSV extract into a DataFrame and preview its first rows.
fies_csv_path = '../datasets/fies_2023_volume1_494887610821.csv'
fies_df = pd.read_csv(filepath_or_buffer=fies_csv_path)
fies_df.head()

# 2. Identifying Data and Attributes


## List all column types and data types


In [None]:
# Tabulate the pandas dtype of every column: one row per column, labelled by column name.
columns_and_datatypes = fies_df.dtypes.to_frame(name='Data Type')
# Print one line per column so that every entry is shown.
for column_name, data_type in columns_and_datatypes['Data Type'].items():
    print(f"{column_name}: {data_type}")

All columns seem to be either an integer or a float value.


In [None]:
# Remove double-quote characters from both ends of every column name.
fies_df.columns = fies_df.columns.map(lambda name: name.strip('"'))

# 3. Determining the Type of Dataset


## Check if columns are numerical, categorical, or mixed.


In [None]:
# Infer the value-level type of each column (e.g. 'integer', 'floating', 'string', 'mixed').
# The table is built in one pass so that every column keeps its OWN inferred type.
# Assigning to column_check['Column Type'] inside a loop overwrites the whole column on
# each iteration, which leaves every row showing the type of the last column only.
column_check = pd.DataFrame(
    {'Column Type': [pd.api.types.infer_dtype(fies_df[column]) for column in fies_df.columns]},
    index=fies_df.columns,
)

# Print one line per column so that every entry is shown.
for index, row in column_check.iterrows():
    print(f"{index}: {row['Column Type']}")

All the columns seem to be numerical. However, the PSA provides an Excel file containing metadata that explains what the numeric codes in certain columns mean (e.g., the region number equivalents).


## Making a Data Dictionary


In [None]:
# fies_column_descriptions = {
#     'RDMD_ID': 'Unique identifier for the record',
#     'Region': 'Region code',
#     'Province': 'Province code',
#     'Household ID': 'Unique household identifier',
#     'RECODED PROVINCE': 'Recoded province information',
#     'Family Size': 'Number of people in the household',
#     'Salaries/Wages from Regular Employment': 'Income from regular employment',
#     'Salaries/Wages from Seasonal Employment': 'Income from seasonal employment',
#     'Income from Salaries and Wages': 'Total income from salaries and wages',
#     'Net Share of Crops, Fruits, etc. (Tot. Net Value of Share)': 'Net value from crop and fruit share',
#     'Cash Receipts, Support, etc. from Abroad': 'Cash support received from abroad',
#     'Cash Receipts, Support, etc. from Domestic Source': 'Cash support received domestically',
#     'Rentals Received from Non-Agri Lands, etc.': 'Income from land rentals (non-agricultural)',
#     'Unnamed: 13': 'Unknown or unnamed column',
#     'Pension and Retirement Benefits': 'Income from pensions and retirement',
#     'Dividends from Investment': 'Income from dividends',
#     'Other Sources of Income NEC': 'Other sources of income not elsewhere classified',
#     'Family Sustenance Activities': 'Income from family sustenance activities',
#     'Total Received as Gifts': 'Total gifts received by the household',
#     'Crop Farming and Gardening': 'Income from crop farming and gardening',
#     'Livestock and Poultry Raising': 'Income from livestock and poultry raising',
#     'Fishing': 'Income from fishing activities',
#     'Forestry and Hunting': 'Income from forestry and hunting',
#     'Wholesale and Retail': 'Income from wholesale and retail business',
#     'Manufacturing': 'Income from manufacturing activities',
#     'Transportation, Storage Services': 'Income from transportation and storage services',
#     'Entrep. Activities NEC': 'Income from entrepreneurial activities (not elsewhere classified)',
#     'Entrep. Activities NEC.1': 'Income from entrepreneurial activities (additional category 1)',
#     'Entrep. Activities NEC.2': 'Income from entrepreneurial activities (additional category 2)',
#     'Hhld, Income from Entrepreneurial Activities, Total': 'Total household income from entrepreneurial activities',
#     'Losses from EA': 'Losses from entrepreneurial activities',
#     'Cereal and Cereal Preparations (Total)': 'Expenditure on cereals and cereal preparations',
#     'Meat and Meat Preparations': 'Expenditure on meat and meat preparations',
#     'Fish and Marine Products (Total)': 'Expenditure on fish and marine products',
#     'Dairy Products and Eggs (Total)': 'Expenditure on dairy products and eggs',
#     'Oils and Fats (Total)': 'Expenditure on oils and fats',
#     'Fruits and Vegetables': 'Expenditure on fruits and vegetables',
#     'Vegetables (Total)': 'Expenditure on vegetables',
#     'Sugar, Jam and Honey (Total)': 'Expenditure on sugar, jam, and honey',
#     'Food Not Elsewhere Classified (Total)': 'Expenditure on other food items',
#     'Fruit and vegetable juices': 'Expenditure on fruit and vegetable juices',
#     'Coffee, Cocoa and Tea (Total)': 'Expenditure on coffee, cocoa, and tea',
#     'Tea (total)  expenditure': 'Expenditure on tea',
#     'Cocoa (total)  expenditure': 'Expenditure on cocoa',
#     'Main Source of Water Supply (2nd visit only)': 'Main source of water supply (second visit)',
#     'Softdrinks': 'Expenditure on soft drinks',
#     'Other Non Alcoholic Beverages': 'Expenditure on other non-alcoholic beverages',
#     'Alcoholic Beverages (Total)': 'Expenditure on alcoholic beverages',
#     'Tobacco (Total)': 'Expenditure on tobacco products',
#     'Other Vegetables (Total)': 'Expenditure on other types of vegetables',
#     'Services_Primary_Goods': 'Expenditure on services and primary goods',
#     'Alcohol Procduction Services': 'Expenditure on alcohol production services',
#     'Total Food Consumed at Home (Total)': 'Total food consumed at home',
#     'Food Regularly Consumed Outside The Home (Total)': 'Food consumed outside the home',
#     'Hhld, Food': 'Household expenditure on food',
#     'Clothing, Footwear and Other Wear': 'Expenditure on clothing, footwear, and other wear',
#     'Housing and water (Total)': 'Expenditure on housing and water',
#     'Actual House Rent': 'Expenditure on actual house rent',
#     'Imputed House Rental Value': 'Imputed value of house rental',
#     'Imputed Housing Benefit Rental Value': 'Imputed value of housing benefit rental',
#     'House Rent/Rental Value': 'Expenditure on house rent/rental value',
#     'Furnishings, Household Equipment & Routine Household Mainte': 'Expenditure on furnishings and household equipment',
#     'Health (Total)': 'Expenditure on health services and products',
#     'Transportation (Total)': 'Expenditure on transportation',
#     'Communication (Total)': 'Expenditure on communication services',
#     'Recreation and Culture (Total)': 'Expenditure on recreation and culture',
#     'Education (Total)': 'Expenditure on education',
#     'Insurance': 'Expenditure on insurance',
#     'Miscellaneous Goods and Services (Total)': 'Expenditure on miscellaneous goods and services',
#     'Durable Furniture': 'Expenditure on durable furniture',
#     'Special Family Occasion': 'Expenditure on special family occasions',
#     'Other Expenditure (inc. Value Consumed, Losses)': 'Other expenditures including losses',
#     'Other Disbursements': 'Other household disbursements',
#     'Accomodation Services': 'Expenditure on accommodation services',
#     'Total Non-Food Expenditure': 'Total non-food expenditure',
#     'Hhld, Income, Total': 'Total household income',
#     'Hhld, Expenditures, Total': 'Total household expenditures',
#     'Total Household Disbursements': 'Total household disbursements',
#     'Other Receipts': 'Other household receipts',
#     'Total Receipts': 'Total receipts',
#     'Psu (Recode)': 'Primary Sampling Unit (recoded)',
#     'Raising Factor': 'Raising factor for survey results',
#     'Final Population Weights': 'Final weights for population data',
#     'Urban / Rural': 'Urban or rural classification',
#     'Per Capita Income': 'Household per capita income',
#     'NPCINC': 'National per capita income',
#     'RPCINC': 'Regional per capita income',
#     'Per Capita Income Decile (Province)': 'Per capita income decile in the province',
#     'pPCINC': 'Provincial per capita income decile',
#     'Per Capita Income Decile (Region with Negros Island Region (NIR))': 'Per capita income decile (region with NIR)',
#     'Region (with NIR)': 'Region code including NIR'
# }


In [None]:
# fies_derivations = {
#     'Total Receipts': 'Total Household Income + Other Receipts',
#     'Hhld, Income, Total': 'Net Share of Crops, Fruits, etc. + Cash Receipts, Support, etc. from Abroad + Cash Receipts, Support, etc. from Domestic Source + Unnamed + Pension and Retirement Benefits + Dividends from Investment + Other Sources of Income NEC + Family Sustenance Activities + Total Received as Gifts + Household, Income from Entrep Activities, Total + Imputed House Rental Value',
#     'Hhld, Income from Entrepreneurial Activities, Total': 'Crop Farming and Gardening + Livestock and Poultry Raising + Fishing + Forestry and Hunting + Wholesale and Retail + Manufacturing + Transportation, Storage Services + Entrep. Activities NEC + Entrep. Activities NEC 1 + Entrep. Activities NEC 2',
#     'Total Household Disbursements': 'Total Household Expenditure + Other Disbursements',
#     'Hhld, Expenditures, Total': 'Household Food + Total Non-Food Expenditure',
#     'Hhld, Food': 'Total Food Consumed at Home + Food Regularly Consumed Outside The Home',
#     'Total Food Consumed at Home (Total)': 'Cereal and Cereal Preparations + Meat and Meat Preparations + Fish and Marine Products + Dairy and Eggs + Oils and Fats + Fruits and Vegetables + Vegetables + Sugar, jam and Honey + Food Not Elsewhere Classified + Fruit and Vegetable Juices + Coffee, Cocoa and Tea + Tea + Cocoa + Main Source of Water Supply + Softdrinks + Other Non Alcoholic Beverages',
#     'Total Non-Food Expenditure': 'Alcoholic Beverages + Tobacco + Other Vegetables + Services_Primary_Goods + Alcoholic Production Services + Housing and water (Total) + Furnishings, Household Equipment & Routine Household Maintenance + Health + Transportation + Communication + Recreation and Culture + Education + Insurance + Miscellaneous Goods and Services + Durable Furniture + Special Family Occasion + Other Expenditure + Accommodation Services + Clothing, Footwear and Other Wear',
# }


In [None]:
# fies_volume1_data_dict = pd.DataFrame({
#     'Column Name': fies_df.columns,
#     'Data Type': fies_df.dtypes,
#     'Non-Null Count': fies_df.notnull().sum(),
#     'Unique Values': fies_df.nunique(),
#     'Description': [fies_column_descriptions.get(col, 'No desciption available') for col in fies_df.columns],
#     'Derivations from other columns': [fies_derivations.get(col, '') for col in fies_df.columns]
# })
# fies_volume1_data_dict.to_csv('../fies_volume1_data_dict.csv', index=False)

# 4. Data Quality and Assessment


## Check for missing values, duplicates, outliers, and wrong data.


Check for duplicates


In [None]:
# Row count of the raw frame: the baseline for the duplicate check.
number_of_rows = len(fies_df)
print(f"Number of rows: {number_of_rows}")

# drop_duplicates() returns a new frame, so fies_df itself is left untouched.
removed_duplicates = fies_df.drop_duplicates()
print(f"Number of rows after dropping duplicates: {len(removed_duplicates)}")

No duplicates are found.


From the data dictionary, the Total Household Disbursements column is the only one with an object datatype, suggesting mixed values of numbers, strings, etc.


In [None]:
# Report every column that contains at least one null (NaN/None) entry.
null_flags = removed_duplicates.isnull().any()
for column in null_flags[null_flags].index:
    print(f"Column {column} has missing values")

The code block above does not report any null values, so missing data may instead be stored as whitespace-only strings. The code below counts the rows that contain such whitespace-only values
to find the true number of missing values.


In [None]:
def has_whitespace(val):
    """Return True when ``val`` is a string that is empty or contains only whitespace.

    Any non-string value (numbers, None, NaN, ...) yields False.
    """
    if not isinstance(val, str):
        return False
    return not val.strip()

# Flag every row in which at least one cell is an empty / whitespace-only string,
# then count the flagged rows.
whitespace_cells = removed_duplicates.map(has_whitespace)
whitespace_rows = whitespace_cells.any(axis='columns')
whitespace_count = whitespace_rows.sum()
print(f"Number of rows with whitespace: {whitespace_count}")

There are whitespaces. Whitespaces could mean that the value for that data is zero. Therefore, a check must be made to ensure that there are zeroes in the dataset as well to know that whitespaces and zeroes are equivalent.


In [None]:
# Count the households whose Total Household Disbursements is exactly zero.
# The values are converted to numbers first because the column can hold text (numbers mixed
# with blank strings); blanks and other non-numeric entries become NaN and are not counted.
# Chaining value_counts().where(...) cannot answer this: the counts are indexed by value
# while the condition is indexed by row label, so the two never align.
disbursements_numeric = pd.to_numeric(
    removed_duplicates['Total Household Disbursements'], errors='coerce'
)
zero_disbursement_count = int((disbursements_numeric == 0).sum())
print(f"Number of zero values in Total Household Disbursements: {zero_disbursement_count}")

Now we detect potential outliers using statistical methods.
The main columns to look at are the Total Household Income and Total Household Expenditure columns.


In [None]:
# Central tendency and spread of total household income (before outlier handling).
income_total = removed_duplicates['Hhld, Income, Total']
income_mean, income_median, income_std = (
    income_total.mean(),
    income_total.median(),
    income_total.std(),
)

print(f"Income Mean: {income_mean}")
print(f"Income Median: {income_median}")
print(f"Income Standard Deviation: {income_std}")

In [None]:
# Central tendency and spread of total household expenditure (before outlier handling).
expenditure_total = removed_duplicates['Hhld, Expenditures, Total']
expenditure_mean, expenditure_median, expenditure_std = (
    expenditure_total.mean(),
    expenditure_total.median(),
    expenditure_total.std(),
)

print(f"Expenditure Mean: {expenditure_mean}")
print(f"Expenditure Median: {expenditure_median}")
print(f"Expenditure Standard Deviation: {expenditure_std}")

From the results, the means of the income and expenditure columns are quite large. To see more, a boxplot can be used to visualize the distribution.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Box plot of total household income; labels are set on the Axes that seaborn returns.
income_axes = sns.boxplot(removed_duplicates['Hhld, Income, Total'])
income_axes.set_title('Boxplot of Income')
income_axes.set_xlabel('Income')
income_axes.set_ylabel('Value')

In [None]:
# Box plot of total household expenditure; labels are set on the Axes that seaborn returns.
expenditure_axes = sns.boxplot(removed_duplicates['Hhld, Expenditures, Total'])
expenditure_axes.set_title('Boxplot of Expenditures')
expenditure_axes.set_xlabel('Expenditures')
expenditure_axes.set_ylabel('Value')

In [None]:
# Scatter of total income (x) against total expenditure (y), one point per household row.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(
    x=removed_duplicates['Hhld, Income, Total'],
    y=removed_duplicates['Hhld, Expenditures, Total'],
)
ax.set(xlabel='Income', ylabel='Expenditure')
plt.show()

From the boxplots and scatter plot, there definitely are high-value outliers for both Income and Expenditure. Because the other columns are components of these two totals (see the column derivations), addressing the outliers in these two columns also addresses the outliers in the remaining columns.


## Impute, discretize and data wrangling


Since there are zeroes present, the Total Household Disbursements column must be addressed. Upon inspection, Total Household Disbursements can be imputed from the sum of Hhld, Expenditures, Total and Other Disbursements.


In [None]:
# Work on a copy so removed_duplicates stays available for comparison.
removed_null = removed_duplicates.copy()

disbursements_column = 'Total Household Disbursements'

# Build the mask from the disbursements column itself. A row-level "any column is blank"
# mask would also overwrite valid disbursement values in rows where only some OTHER
# column is blank.
blank_disbursements = removed_null[disbursements_column].map(has_whitespace)

# Total Household Disbursements = Hhld, Expenditures, Total + Other Disbursements
removed_null.loc[blank_disbursements, disbursements_column] = (
    removed_null.loc[blank_disbursements, 'Hhld, Expenditures, Total']
    + removed_null.loc[blank_disbursements, 'Other Disbursements']
)

# The blanks are now filled, so convert the column to a numeric dtype. Otherwise it would
# keep mixing text and numbers and could not be used in arithmetic or statistics.
# NOTE(review): errors='coerce' turns any remaining non-numeric text into NaN - confirm
# that no such values exist in the source file.
removed_null[disbursements_column] = pd.to_numeric(
    removed_null[disbursements_column], errors='coerce'
)
print(f"Imputed Total Household Disbursements for {int(blank_disbursements.sum())} rows")

Double-check for missing values


In [None]:
# Re-run the blank-string scan on the imputed frame and report how many rows still have blanks.
whitespace_cells = removed_null.map(has_whitespace)
whitespace_rows = whitespace_cells.any(axis='columns')
whitespace_count = whitespace_rows.sum()
print(f"Number of rows with whitespace: {whitespace_count}")

We now address the outliers using the IQR method


In [None]:
# First/third quartiles and interquartile range of total household income.
income_Q1, income_Q3 = removed_null['Hhld, Income, Total'].quantile([0.25, 0.75])
income_IQR = income_Q3 - income_Q1
print(f"Income Q1: {income_Q1}")
print(f"Income Q3: {income_Q3}")
print(f"Income IQR: {income_IQR}")

# First/third quartiles and interquartile range of total household expenditure.
expenditure_Q1, expenditure_Q3 = removed_null['Hhld, Expenditures, Total'].quantile([0.25, 0.75])
expenditure_IQR = expenditure_Q3 - expenditure_Q1
print(f"Expenditure Q1: {expenditure_Q1}")
print(f"Expenditure Q3: {expenditure_Q3}")
print(f"Expenditure IQR: {expenditure_IQR}")

In [None]:
# Remove IQR outliers: rows at or beyond Q1 - 1.5*IQR / Q3 + 1.5*IQR are dropped,
# first for income and then for expenditure.

# Income
removed_outliers = removed_null.copy()
print('Shape before removing outliers:', removed_outliers.shape)
lower_bound_income = income_Q1 - 1.5 * income_IQR
upper_bound_income = income_Q3 + 1.5 * income_IQR
# np.where(...)[0] yields POSITIONS, whereas DataFrame.drop(index=...) expects LABELS.
# The two only coincide while the index is an untouched 0..n-1 range, so the positions are
# translated to labels first. Both sets are resolved before anything is dropped, because
# dropping rows shifts the positions of the rows that follow.
upper_income = np.where(removed_outliers['Hhld, Income, Total'] >= upper_bound_income)[0]
lower_income = np.where(removed_outliers['Hhld, Income, Total'] <= lower_bound_income)[0]
income_outlier_labels = removed_outliers.index[np.concatenate([upper_income, lower_income])]

removed_outliers.drop(index=income_outlier_labels, inplace=True)
print('Shape after removing outliers for Income:', removed_outliers.shape)

# Expenditure
lower_bound_expenditure = expenditure_Q1 - 1.5 * expenditure_IQR
upper_bound_expenditure = expenditure_Q3 + 1.5 * expenditure_IQR
removed_outliers.reset_index(drop=True, inplace=True)
upper_expenditure = np.where(removed_outliers['Hhld, Expenditures, Total'] >= upper_bound_expenditure)[0]
lower_expenditure = np.where(removed_outliers['Hhld, Expenditures, Total'] <= lower_bound_expenditure)[0]
expenditure_outlier_labels = removed_outliers.index[np.concatenate([upper_expenditure, lower_expenditure])]

removed_outliers.drop(index=expenditure_outlier_labels, inplace=True)
print('Shape after removing outliers for Expenditure:', removed_outliers.shape)

Time to check using the same methods.


In [None]:
# Recompute the income summary statistics on the outlier-free frame.
income_total = removed_outliers['Hhld, Income, Total']
income_mean, income_median, income_std = (
    income_total.mean(),
    income_total.median(),
    income_total.std(),
)

print(f"Income Mean: {income_mean}")
print(f"Income Median: {income_median}")
print(f"Income Standard Deviation: {income_std}")

# Recompute the expenditure summary statistics on the outlier-free frame.
expenditure_total = removed_outliers['Hhld, Expenditures, Total']
expenditure_mean, expenditure_median, expenditure_std = (
    expenditure_total.mean(),
    expenditure_total.median(),
    expenditure_total.std(),
)

print(f"Expenditure Mean: {expenditure_mean}")
print(f"Expenditure Median: {expenditure_median}")
print(f"Expenditure Standard Deviation: {expenditure_std}")

In [None]:
# Box plot of total household income after outlier removal.
income_axes = sns.boxplot(removed_outliers['Hhld, Income, Total'])
income_axes.set_title('Boxplot of Income')
income_axes.set_xlabel('Income')
income_axes.set_ylabel('Value')

In [None]:
# Box plot of total household expenditure after outlier removal.
expenditure_axes = sns.boxplot(removed_outliers['Hhld, Expenditures, Total'])
expenditure_axes.set_title('Boxplot of Expenditures')
expenditure_axes.set_xlabel('Expenditures')
expenditure_axes.set_ylabel('Value')

In [None]:
# Scatter of total income (x) against total expenditure (y) after outlier removal.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(
    x=removed_outliers['Hhld, Income, Total'],
    y=removed_outliers['Hhld, Expenditures, Total'],
)
ax.set(xlabel='Income', ylabel='Expenditure')
plt.show()

The outliers are now removed.


There is a column named `Unnamed: 13` in the dataset. We opted to add it to Other Sources of Income NEC because the unnamed column contributes to the total household income: some of the income totals were inaccurate when the unnamed column wasn't included.


In [None]:
# Fold the 'Unnamed: 13' column into 'Other Sources of Income NEC', then remove it.
imputted_column_13 = removed_outliers.copy()
other_income_combined = imputted_column_13['Other Sources of Income NEC'].add(
    imputted_column_13['Unnamed: 13']
)
imputted_column_13['Other Sources of Income NEC'] = other_income_combined
print("Number of columns before dropping:", len(imputted_column_13.columns))
imputted_column_13 = imputted_column_13.drop(columns='Unnamed: 13')
print("Number of columns after dropping:", len(imputted_column_13.columns))

# 5. Quantitative Statistics


## Generate statistics and provide EDA. Provide illustration


# 6. Application of Proximity (Distance Analysis)


## Check for correlation. Provide illustration.


To start off the distance analysis, we look at various correlations to see the relationship of income and expenditures to each other, and to each of their sources


In [None]:
# income_expenditure_dataframe = imputted_column_13[['Hhld, Income, Total', 'Hhld, Expenditures, Total']]
# income_expenditure_corr = income_expenditure_dataframe.corr(method='pearson')
# plt.figure(figsize=(20,10), dpi = 500)
# sns.heatmap(income_expenditure_corr,annot=True,fmt=".2f", linewidth=.5, cmap='coolwarm')
# plt.show()

The heatmap says that when income increases, expenditures slightly decreases. But overall, the two variables are highly correlated to each other, meaning that households are more likely to spend much of what they earn.


We will now correlate income and expenditures with the various sources found in the dataset.
Income will be correlated to monetary sources while expenditures will be correlated to various expenses


In [None]:
# monetary_columns = [
#     'Salaries/Wages from Regular Employment',
#     'Salaries/Wages from Seasonal Employment',
#     'Net Share of Crops, Fruits, etc. (Tot. Net Value of Share)',
#     'Cash Receipts, Support, etc. from Abroad',
#     'Cash Receipts, Support, etc. from Domestic Source',
#     'Rentals Received from Non-Agri Lands, etc.',
#     'Pension and Retirement Benefits',
#     'Dividends from Investment',
#     'Other Sources of Income NEC',
#     'Family Sustenance Activities',
#     'Total Received as Gifts',
#     'Crop Farming and Gardening',
#     'Livestock and Poultry Raising',
#     'Fishing',
#     'Forestry and Hunting',
#     'Wholesale and Retail',
#     'Manufacturing',
#     'Transportation, Storage Services',
#     'Entrep. Activities NEC',
#     'Entrep. Activities NEC.1',
#     'Entrep. Activities NEC.2',
# ]

# cost_columns = [
#     'Cereal and Cereal Preparations (Total)',
#     'Meat and Meat Preparations',
#     'Fish and Marine Products (Total)',
#     'Dairy Products and Eggs (Total)',
#     'Oils and Fats (Total)',
#     'Fruits and Vegetables',
#     'Vegetables (Total)',
#     'Sugar, Jam and Honey (Total)',
#     'Food Not Elsewhere Classified (Total)',
#     'Fruit and vegetable juices',
#     'Coffee, Cocoa and Tea (Total)',
#     'Tea (total)  expenditure',
#     'Cocoa (total)  expenditure',
#     'Main Source of Water Supply (2nd visit only)',
#     'Softdrinks',
#     'Other Non Alcoholic Beverages',
#     'Alcoholic Beverages (Total)',
#     'Tobacco (Total)',
#     'Other Vegetables (Total)',
#     'Services_Primary_Goods',
#     'Alcohol Procduction Services',
#     'Food Regularly Consumed Outside The Home (Total)',
#     'Clothing, Footwear and Other Wear',
#     'Housing and water (Total)',
#     'Actual House Rent',
#     'Furnishings, Household Equipment & Routine Household Mainte',
#     'Health (Total)',
#     'Transportation (Total)',
#     'Communication (Total)',
#     'Recreation and Culture (Total)',
#     'Education (Total)',
#     'Insurance',
#     'Miscellaneous Goods and Services (Total)',
#     'Durable Furniture',
#     'Special Family Occasion',
#     'Other Expenditure (inc. Value Consumed, Losses)',
#     'Accomodation Services',
# ]

To have a smoother heatmap, all similar columns will be combined to a category.


In [None]:
# income_dataframe = imputted_column_13[monetary_columns]

# # Combine similar columns to reduce dimensionality
# income_dataframe['Salaries/Wages'] = income_dataframe['Salaries/Wages from Regular Employment'] + income_dataframe['Salaries/Wages from Seasonal Employment']
# income_dataframe['Cash Receipts'] = income_dataframe['Cash Receipts, Support, etc. from Abroad'] + income_dataframe['Cash Receipts, Support, etc. from Domestic Source']
# income_dataframe['Farming'] = income_dataframe['Crop Farming and Gardening'] + income_dataframe['Net Share of Crops, Fruits, etc. (Tot. Net Value of Share)']
# income_dataframe['Logistics and Manufacturing'] = income_dataframe['Wholesale and Retail'] + income_dataframe['Transportation, Storage Services'] + income_dataframe['Manufacturing']
# income_dataframe['Entrep. Activities'] = income_dataframe['Entrep. Activities NEC'] + income_dataframe['Entrep. Activities NEC.1'] + income_dataframe['Entrep. Activities NEC.2']
# income_dataframe['Passive Income'] = income_dataframe['Total Received as Gifts'] + income_dataframe['Family Sustenance Activities'] + income_dataframe['Pension and Retirement Benefits'] + income_dataframe['Dividends from Investment'] + income_dataframe['Rentals Received from Non-Agri Lands, etc.']
# income_dataframe['Livestocks'] = income_dataframe['Livestock and Poultry Raising'] + income_dataframe['Fishing'] + income_dataframe['Forestry and Hunting']
# income_dataframe['Other Income NEC'] = income_dataframe['Other Sources of Income NEC']

# # Drop the original columns
# income_dataframe.drop(columns=monetary_columns, inplace=True)

In [None]:
# income_corr = income_dataframe.corr(method='pearson')
# plt.figure(figsize=(20,10), dpi = 500)
# sns.heatmap(income_corr,annot=True,fmt=".2f", linewidth=.5, cmap='coolwarm')
# plt.show()

As seen from the heatmap above, all monetary sources are surprisingly not much correlated to each other. Negative correlations can be interpreted as households not having much revenue streams in many categories. Particularly in the Salaries/Wages column not having a positive correlation with other sources.


In [None]:
# expenditure_dataframe = imputted_column_13[cost_columns]

# expenditure_dataframe['Processed Foods'] = expenditure_dataframe['Cereal and Cereal Preparations (Total)'] + expenditure_dataframe['Sugar, Jam and Honey (Total)'] + expenditure_dataframe['Softdrinks'] + expenditure_dataframe['Oils and Fats (Total)']
# expenditure_dataframe['Non-Processed Foods'] = expenditure_dataframe['Meat and Meat Preparations'] + expenditure_dataframe['Fish and Marine Products (Total)'] + expenditure_dataframe['Dairy Products and Eggs (Total)']
# expenditure_dataframe['Other Foods'] = expenditure_dataframe['Food Not Elsewhere Classified (Total)'] + expenditure_dataframe['Food Regularly Consumed Outside The Home (Total)']
# expenditure_dataframe['Fruits and Vegetables and Juices'] = expenditure_dataframe['Fruits and Vegetables'] + expenditure_dataframe['Vegetables (Total)'] + expenditure_dataframe['Fruit and vegetable juices'] + expenditure_dataframe['Other Vegetables (Total)']
# expenditure_dataframe['Non-Alcoholic Beverages'] = expenditure_dataframe['Coffee, Cocoa and Tea (Total)'] + expenditure_dataframe['Tea (total)  expenditure'] + expenditure_dataframe['Cocoa (total)  expenditure'] + expenditure_dataframe['Other Non Alcoholic Beverages']
# expenditure_dataframe['Non-Essential Expenditures'] = expenditure_dataframe['Alcoholic Beverages (Total)'] + expenditure_dataframe['Tobacco (Total)'] 
# expenditure_dataframe['Services and Primary Goods'] = expenditure_dataframe['Services_Primary_Goods'] + expenditure_dataframe['Main Source of Water Supply (2nd visit only)'] + expenditure_dataframe['Accomodation Services'] + expenditure_dataframe['Alcohol Procduction Services']
# expenditure_dataframe['Miscellaneous Expenditures'] = expenditure_dataframe['Miscellaneous Goods and Services (Total)'] + expenditure_dataframe['Durable Furniture'] + expenditure_dataframe['Special Family Occasion']
# expenditure_dataframe['Essential Expenditures'] = expenditure_dataframe['Clothing, Footwear and Other Wear'] + expenditure_dataframe['Housing and water (Total)'] + expenditure_dataframe['Actual House Rent'] + expenditure_dataframe['Furnishings, Household Equipment & Routine Household Mainte'] + expenditure_dataframe['Health (Total)']+ expenditure_dataframe['Transportation (Total)'] + expenditure_dataframe['Communication (Total)'] + expenditure_dataframe['Recreation and Culture (Total)'] + expenditure_dataframe['Education (Total)'] + expenditure_dataframe['Insurance']
# expenditure_dataframe['Other Expenditure NEC'] = expenditure_dataframe['Other Expenditure (inc. Value Consumed, Losses)']

# expenditure_dataframe.drop(columns=cost_columns, inplace=True)

In [None]:
# expenditure_corr = expenditure_dataframe.corr(method='pearson')
# plt.figure(figsize=(20,10), dpi = 500)
# sns.heatmap(expenditure_corr,annot=True,fmt=".2f", linewidth=.5, cmap='coolwarm')
# plt.show()

In the expenditure heatmap, it is very different from the income heatmap wherein all of the expenditure sources have positive correlations with one another. This indicates that households spend on various things while not having much revenue streams. Both of the Food categories and the Essential Expenditures have the highest correlation with the rest of the columns.

The three heatmaps give a picture on the relationship of income-expenditure ratio for households, the relationship of revenue stream categories, and the relationship of spending habit of various households in the FIES dataset.


## Calculate a distance matrix (e.g., Euclidean distance) for numeric data as required. Provide illustration


For the computation of the distance matrix, Euclidean distance will be used to cluster households based on their income-expenditure ratio. Household Numbers provided in the dataset will be the main index of the distance matrix


In [None]:
# distance_dataframe = imputted_column_13.copy()
# household_incomes = distance_dataframe['Hhld, Income, Total'].values
# household_expenditures = distance_dataframe['Hhld, Expenditures, Total'].values
# household_ids = distance_dataframe['Household ID'].values
# household_df = pd.DataFrame({'Total Income': household_incomes, 'Total Expenditures': household_expenditures}, index=household_ids)
# household_df.index = household_df.index.map(lambda x: f"Household No. {x}")
# household_df

Since the dataset has about 149,622 entries even without the outliers, it is essential to reduce these entries so as to save computational power. The challenge now lies in finding a way to reduce the number of rows while still keeping a good representation of the data. One such way researched is the <b>Freedman-Diaconis Rule</b> for getting the optimal number of bins to group the dataset into.

The <b>Freedman-Diaconis Rule</b> is a method of determining the number of bins in a histogram. It is based on the interquartile range of the data. It was devised from the Scott's Rule, obtained by asymptotically minimizing the integral mean square error of the density estimate with respect to a Gaussian reference (Markov, 2022).

The <b>Freedman-Diaconis Rule</b> will be implemented by finding the optimal number of bins to group the dataset into by income and by expenditure. A stratum label will be created by combining the income bin and the expenditure bin into a string, and Stratified Sampling will then be applied over these strata to obtain a representative sample of the data. Strata that have fewer than two members will be dropped.

Sources

1. <https://medium.com/@maxmarkovvision/optimal-number-of-bins-for-histograms-3d7c48086fde>


In [None]:
# num_rows = household_df.shape[0]

# income_iqr = household_df['Total Income'].quantile(0.75) - household_df['Total Income'].quantile(0.25)
# expenditure_iqr = household_df['Total Expenditures'].quantile(0.75) - household_df['Total Expenditures'].quantile(0.25)

# # Implement Freedman-Diaconis Rule
# bin_width_income = 2 * income_iqr / (num_rows ** (1/3))
# bin_width_expenditure = 2 * expenditure_iqr / (num_rows ** (1/3))

# income_bins = int((household_df['Total Income'].max() - household_df['Total Income'].min()) / bin_width_income)
# expenditure_bins = int((household_df['Total Expenditures'].max() - household_df['Total Expenditures'].min()) / bin_width_expenditure)

# print(f"Income number of bins: {income_bins}")
# print(f"Expenditure number of bins: {expenditure_bins}")

In [None]:
# household_df['Income Bin'] = pd.qcut(household_df['Total Income'], q=income_bins, labels=False, duplicates='drop')
# household_df['Expenditure Bin'] = pd.qcut(household_df['Total Expenditures'], q=expenditure_bins, labels=False, duplicates='drop')

# household_df['Strata'] = household_df['Income Bin'].astype(str) + '-' + household_df['Expenditure Bin'].astype(str)
# print(household_df['Strata'].value_counts())

In [None]:
# print("Shape of the data before removing rare strata:", household_df.shape)
# strata_counts = household_df['Strata'].value_counts()
# rare_strata = strata_counts[strata_counts < 2].index
# # Drop the rare strata
# household_df = household_df[~household_df['Strata'].isin(rare_strata)]

# print("Shape of the data after removing rare strata:", household_df.shape)

We will now implement <b>Stratified Sampling</b> to get 5% of the data which will account for 7,434 rows for a balance of good representation of the data and a save in computational power


In [None]:
# from sklearn.model_selection import train_test_split

# sampled_df, _ = train_test_split(household_df, test_size=0.95, random_state=42, stratify=household_df['Strata']) # Get 5% of the data

# sampled_df.drop(columns=['Income Bin', 'Expenditure Bin', 'Strata'], inplace=True)
# print('Shape of sampled data:', sampled_df.shape)

Since Euclidean Distance is sensitive to the scale of the data, we will normalize the data using Standard Scaling.


In [None]:
# from sklearn.preprocessing import StandardScaler
# # Scale the data for Euclidean distance
# scaled_data = StandardScaler().fit_transform(sampled_df)
# scaled_data_df = pd.DataFrame(scaled_data, columns=sampled_df.columns, index=sampled_df.index)
# scaled_data_df

Scipy will be used to calculate the Euclidean Distance and obtain the distance matrix


In [None]:
# from scipy.spatial import distance_matrix

# euclidean_distances = pd.DataFrame(distance_matrix(scaled_data, scaled_data), index=sampled_df.index, columns=sampled_df.index)
# euclidean_distances

As a sample visualization, 10 samples will be used to create a heatmap. In this case, cooler colors mean that households are closer to each other, while warmer colors mean that households are farther away from each other.


In [None]:
# n_samples = 10
# sampled_indices = euclidean_distances.sample(n=n_samples, random_state=42).index
# euclidean_distances_sampled = euclidean_distances.loc[sampled_indices, sampled_indices]
# plt.figure(figsize=(10, 8))
# sns.heatmap(euclidean_distances_sampled, cmap="coolwarm", annot=False)
# plt.title("Euclidean Distance Heatmap")
# plt.show()

Agglomerative Clustering directly works with a distance matrix as an input. We use the distance matrix of euclidean distances and form clusters based on similar household incomes and expenditures, and we add them to the sampled dataframe.


In [None]:
# from sklearn.cluster import AgglomerativeClustering
# from sklearn.metrics import silhouette_score

# silhouette_scores = []

# for n_clusters in range(2, 11):
#     clustering = AgglomerativeClustering(n_clusters=n_clusters)
#     labels = clustering.fit_predict(scaled_data)
#     score = silhouette_score(scaled_data, labels)
#     silhouette_scores.append(score)

# plt.figure(figsize=(10, 6))
# plt.plot(range(2, 11), silhouette_scores, marker='o')
# plt.xlabel('Number of Clusters')
# plt.ylabel('Silhouette Score')
# plt.title('Finding the Optimal Number of Clusters')
# plt.show()

As seen from the silhouette score plot, the optimal number of clusters is 2.


In [None]:
# num_clusters = 2  

# clustering = AgglomerativeClustering(n_clusters=num_clusters)
# cluster_labels = clustering.fit_predict(euclidean_distances)

# sampled_df['Cluster'] = cluster_labels
# sampled_df.head()

In [None]:
# plt.figure(figsize=(8, 6))
# sns.scatterplot(
#     x='Total Income', 
#     y='Total Expenditures', 
#     hue='Cluster', 
#     palette='viridis', 
#     data=sampled_df, 
#     s=100
# )
# plt.title('Cluster Visualization')
# plt.xlabel('Total Income')
# plt.ylabel('Total Expenditures')
# plt.legend(title='Cluster')
# plt.show()

A scatterplot shows the clusters formed by the agglomerative clustering. We can see that the clusters are formed by similar households. In this case, two clusters are formed.

1. Cluster 0 consists of households with relatively high to upper-middle income and high expenditures.
2. Cluster 1 consists of households with relatively low to lower-middle income and expenditures.


In [None]:
# income_dataframe.columns[0]

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity

# similarity_matrix = cosine_similarity(sampled_df[sampled_df.columns])
# print(similarity_matrix[:5, :5])

# 7. Data Mining: Association Rule Mining


## If needed, transform the dataset (one-hot encoding) and apply the Apriori algorithm to extract association rules.


In [None]:
# distance_dataframe = imputted_column_13.copy()
# household_incomes = distance_dataframe['Hhld, Income, Total'].values
# household_expenditures = distance_dataframe['Hhld, Expenditures, Total'].values
# household_ids = distance_dataframe['Household ID'].values
# household_df = pd.DataFrame({'Total Income': household_incomes, 'Total Expenditures': household_expenditures}, index=household_ids)
# household_df.index = household_df.index.map(lambda x: f"Household No. {x}")
# household_df

In [None]:
# Work on a private copy so the helper columns added while sampling never touch the imputed frame.
assoc_rule_sampling = imputted_column_13.copy()

income_total = assoc_rule_sampling['Hhld, Income, Total']
expenditure_total = assoc_rule_sampling['Hhld, Expenditures, Total']

num_rows = len(assoc_rule_sampling)
cube_root_rows = num_rows ** (1/3)

# Interquartile range (Q3 - Q1) of each household total.
income_quartiles = income_total.quantile([0.25, 0.75])
expenditure_quartiles = expenditure_total.quantile([0.25, 0.75])
income_iqr = income_quartiles.loc[0.75] - income_quartiles.loc[0.25]
expenditure_iqr = expenditure_quartiles.loc[0.75] - expenditure_quartiles.loc[0.25]

# Freedman-Diaconis rule: bin width = 2 * IQR / n^(1/3).
bin_width_income = 2 * income_iqr / cube_root_rows
bin_width_expenditure = 2 * expenditure_iqr / cube_root_rows

# Bin count = value range / bin width, truncated to an integer.
income_bins = int((income_total.max() - income_total.min()) / bin_width_income)
expenditure_bins = int((expenditure_total.max() - expenditure_total.min()) / bin_width_expenditure)

print(f"Income number of bins: {income_bins}")
print(f"Expenditure number of bins: {expenditure_bins}")

In [None]:
# Quantile-bin each household total; duplicates='drop' merges bins whose edges coincide,
# so the effective bin count can be lower than requested.
binning_plan = (
    ('Income Bin', 'Hhld, Income, Total', income_bins),
    ('Expenditure Bin', 'Hhld, Expenditures, Total', expenditure_bins),
)
for bin_column, total_column, bin_count in binning_plan:
    assoc_rule_sampling[bin_column] = pd.qcut(
        assoc_rule_sampling[total_column],
        q=bin_count,
        labels=False,
        duplicates='drop',
    )

# A stratum is the "<income bin>-<expenditure bin>" pair of a household.
income_bin_label = assoc_rule_sampling['Income Bin'].astype(str)
expenditure_bin_label = assoc_rule_sampling['Expenditure Bin'].astype(str)
assoc_rule_sampling['Strata'] = income_bin_label + '-' + expenditure_bin_label
print(assoc_rule_sampling['Strata'].value_counts())

In [None]:
print("Shape of the data before removing rare strata:", assoc_rule_sampling.shape)

# Strata with a single household are removed before the stratified split that follows.
strata_counts = assoc_rule_sampling['Strata'].value_counts()
rare_strata = strata_counts.loc[strata_counts < 2].index
in_rare_stratum = assoc_rule_sampling['Strata'].isin(rare_strata)
assoc_rule_sampling = assoc_rule_sampling.loc[~in_rare_stratum]

print("Shape of the data after removing rare strata:", assoc_rule_sampling.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Keep the 5% side of a stratified split as the working sample; the 95% remainder is discarded into `_`.
assoc_rule_sampled, _ = train_test_split(
    assoc_rule_sampling,
    test_size=0.95,
    random_state=42,
    stratify=assoc_rule_sampling['Strata'],
)

# The binning/strata columns only existed to drive the split.
helper_columns = ['Income Bin', 'Expenditure Bin', 'Strata']
assoc_rule_sampled = assoc_rule_sampled.drop(columns=helper_columns)
print('Shape of sampled data:', assoc_rule_sampled.shape)

In [None]:
# Income-source columns. The association-rule cells select these from the household frame,
# add them up into broader income groups, and then drop the originals.
# The strings are used as DataFrame column labels, so they must match the data exactly.
monetary_columns = [
    'Salaries/Wages from Regular Employment',
    'Salaries/Wages from Seasonal Employment',
    'Net Share of Crops, Fruits, etc. (Tot. Net Value of Share)',
    'Cash Receipts, Support, etc. from Abroad',
    'Cash Receipts, Support, etc. from Domestic Source',
    'Rentals Received from Non-Agri Lands, etc.',
    'Pension and Retirement Benefits',
    'Dividends from Investment',
    'Other Sources of Income NEC',
    'Family Sustenance Activities',
    'Total Received as Gifts',
    'Crop Farming and Gardening',
    'Livestock and Poultry Raising',
    'Fishing',
    'Forestry and Hunting',
    'Wholesale and Retail',
    'Manufacturing',
    'Transportation, Storage Services',
    'Entrep. Activities NEC',
    'Entrep. Activities NEC.1',
    'Entrep. Activities NEC.2',
]
# Expenditure columns. The spending-pattern cell adds these up into broader expenditure groups
# and then drops the originals.
# NOTE(review): spellings such as 'Procduction' and 'Accomodation' are column labels, not prose;
# presumably they mirror the labels in the data - confirm before "correcting" them.
cost_columns = [
    'Cereal and Cereal Preparations (Total)',
    'Meat and Meat Preparations',
    'Fish and Marine Products (Total)',
    'Dairy Products and Eggs (Total)',
    'Oils and Fats (Total)',
    'Fruits and Vegetables',
    'Vegetables (Total)',
    'Sugar, Jam and Honey (Total)',
    'Food Not Elsewhere Classified (Total)',
    'Fruit and vegetable juices',
    'Coffee, Cocoa and Tea (Total)',
    'Tea (total)  expenditure',
    'Cocoa (total)  expenditure',
    # NOTE(review): the name suggests a categorical code rather than a peso amount, yet it is
    # summed with expenditures downstream - confirm this column belongs here.
    'Main Source of Water Supply (2nd visit only)',
    'Softdrinks',
    'Other Non Alcoholic Beverages',
    'Alcoholic Beverages (Total)',
    'Tobacco (Total)',
    'Other Vegetables (Total)',
    'Services_Primary_Goods',
    'Alcohol Procduction Services',
    'Food Regularly Consumed Outside The Home (Total)',
    'Clothing, Footwear and Other Wear',
    'Housing and water (Total)',
    'Actual House Rent',
    'Furnishings, Household Equipment & Routine Household Mainte',
    'Health (Total)',
    'Transportation (Total)',
    'Communication (Total)',
    'Recreation and Culture (Total)',
    'Education (Total)',
    'Insurance',
    'Miscellaneous Goods and Services (Total)',
    'Durable Furniture',
    'Special Family Occasion',
    'Other Expenditure (inc. Value Consumed, Losses)',
    'Accomodation Services',
]



In [None]:
# Broader income groups: new column name -> source columns that are added together.
income_group_members = {
    'Salaries/Wages': [
        'Salaries/Wages from Regular Employment',
        'Salaries/Wages from Seasonal Employment',
    ],
    'Cash Receipts': [
        'Cash Receipts, Support, etc. from Abroad',
        'Cash Receipts, Support, etc. from Domestic Source',
    ],
    'Farming': [
        'Crop Farming and Gardening',
        'Net Share of Crops, Fruits, etc. (Tot. Net Value of Share)',
    ],
    'Logistics and Manufacturing': [
        'Wholesale and Retail',
        'Transportation, Storage Services',
        'Manufacturing',
    ],
    'Entrep. Activities': [
        'Entrep. Activities NEC',
        'Entrep. Activities NEC.1',
        'Entrep. Activities NEC.2',
    ],
    'Passive Income': [
        'Total Received as Gifts',
        'Family Sustenance Activities',
        'Pension and Retirement Benefits',
        'Dividends from Investment',
        'Rentals Received from Non-Agri Lands, etc.',
    ],
    'Livestocks': [
        'Livestock and Poultry Raising',
        'Fishing',
        'Forestry and Hunting',
    ],
    'Other Income NEC': ['Other Sources of Income NEC'],
}

# Broader expenditure groups, built the same way.
expenditure_group_members = {
    'Processed Foods': [
        'Cereal and Cereal Preparations (Total)',
        'Sugar, Jam and Honey (Total)',
        'Softdrinks',
        'Oils and Fats (Total)',
    ],
    'Non-Processed Foods': [
        'Meat and Meat Preparations',
        'Fish and Marine Products (Total)',
        'Dairy Products and Eggs (Total)',
    ],
    'Other Foods': [
        'Food Not Elsewhere Classified (Total)',
        'Food Regularly Consumed Outside The Home (Total)',
    ],
    'Fruits and Vegetables and Juices': [
        'Fruits and Vegetables',
        'Vegetables (Total)',
        'Fruit and vegetable juices',
        'Other Vegetables (Total)',
    ],
    'Non-Alcoholic Beverages': [
        'Coffee, Cocoa and Tea (Total)',
        'Tea (total)  expenditure',
        'Cocoa (total)  expenditure',
        'Other Non Alcoholic Beverages',
    ],
    'Non-Essential Expenditures': [
        'Alcoholic Beverages (Total)',
        'Tobacco (Total)',
    ],
    'Services and Primary Goods': [
        'Services_Primary_Goods',
        'Main Source of Water Supply (2nd visit only)',
        'Accomodation Services',
        'Alcohol Procduction Services',
    ],
    'Miscellaneous Expenditures': [
        'Miscellaneous Goods and Services (Total)',
        'Durable Furniture',
        'Special Family Occasion',
    ],
    'Essential Expenditures': [
        'Clothing, Footwear and Other Wear',
        'Housing and water (Total)',
        'Actual House Rent',
        'Furnishings, Household Equipment & Routine Household Mainte',
        'Health (Total)',
        'Transportation (Total)',
        'Communication (Total)',
        'Recreation and Culture (Total)',
        'Education (Total)',
        'Insurance',
    ],
    'Other Expenditure NEC': ['Other Expenditure (inc. Value Consumed, Losses)'],
}

# Start from just the income and expenditure columns of the stratified sample.
assoc_rule_1 = assoc_rule_sampled[monetary_columns + cost_columns].copy()

# Combine similar columns to reduce dimensionality (income groups first, then expenditure groups).
for group_name, members in {**income_group_members, **expenditure_group_members}.items():
    assoc_rule_1[group_name] = sum(assoc_rule_1[member] for member in members)

# Drop the original columns so only the combined groups remain.
assoc_rule_1 = assoc_rule_1.drop(columns=monetary_columns + cost_columns)
# Convert monetary and cost amounts to None/Low/Medium/High using percentile cut-offs.
def categorize_lmh(value, quantiles):
    """Label one amount relative to a column's 33rd/66th percentile cut-offs.

    Args:
        value: The amount to label.
        quantiles: Mapping with keys 0.33 and 0.66 holding the cut-off values.

    Returns:
        "None" when value equals 0; otherwise "Low" (value <= quantiles[0.33]),
        "Medium" (value <= quantiles[0.66]) or "High" (everything else, which
        includes values that fail both comparisons such as NaN).
    """
    if value == 0:
        return "None"
    for cutoff_key, label in ((0.33, "Low"), (0.66, "Medium")):
        if value <= quantiles[cutoff_key]:
            return label
    return "High"
    
# Replace every combined amount with its None/Low/Medium/High label. The cut-offs are the
# column's own 33rd and 66th percentiles, computed over all rows (zeros included).
for col in assoc_rule_1.columns:
    column_values = assoc_rule_1[col]
    quantiles = column_values.quantile([0.33, 0.66]).to_dict()
    assoc_rule_1[col] = column_values.apply(categorize_lmh, args=(quantiles,))

assoc_rule_1.shape

In [None]:
# One-hot encode the categorized columns; the resulting indicator columns are the "items"
# passed to the frequent-pattern miner in the next cell.
assoc_rule_1_transactions = pd.get_dummies(assoc_rule_1)
assoc_rule_1_transactions.shape

In [None]:
from mlxtend.frequent_patterns import fpgrowth
# Mine itemsets with FP-Growth using a minimum support of 0.3; use_colnames=True keeps the dummy column names as item labels.
frequent_itemsets = fpgrowth(assoc_rule_1_transactions, min_support=0.3, use_colnames=True, verbose=1)
# With a sample of about 7,434 rows, 0.3 support means roughly 7434 * 0.3 = 2,230 transactions must contain the item / combination

In [None]:
from mlxtend.frequent_patterns import association_rules

# Derive rules from the frequent itemsets, keeping those whose lift is at least 1.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Save frequent itemsets and association rules to CSV.
for frame, csv_path in (
    (frequent_itemsets, '../itemsets_and_rules/frequent_itemsets_spendingPatterns.csv'),
    (rules, '../itemsets_and_rules/association_rules_spendingPatterns.csv'),
):
    frame.to_csv(csv_path, index=False)

In [None]:
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

# Convert frozensets to strings for JSON serialization.
# - sorted(): set iteration order is not guaranteed, so sorting keeps each label stable
#   across kernel restarts and identical for equal itemsets.
# - isinstance guard: makes the cell safe to re-run; without it a second run would join the
#   characters of the already-converted strings ("a, b" -> "a, ,,  , b").
for side in ('antecedents', 'consequents'):
    rules[side] = rules[side].apply(
        lambda items: ', '.join(sorted(items)) if isinstance(items, (set, frozenset)) else items
    )

# Dropdown choices: "All" followed by every distinct antecedent/consequent label.
unique_values = sorted(set(rules["antecedents"]).union(set(rules["consequents"])))
unique_values.insert(0, "All")  # Add "All" at the beginning

value_dropdown = widgets.Dropdown(
    options=unique_values,
    value="All",  # Default selection
    description='Filter:'
)

def update_plot(selected_value):
    """Draw a support-vs-confidence scatter of the rules matching the dropdown choice.

    Args:
        selected_value: "All" to plot every rule, or an itemset label; a rule is kept
            when its whole antecedent label or whole consequent label equals it.

    Returns:
        None. Shows a plotly figure, or prints a message when no rule matches.
    """
    if selected_value == "All":
        filtered_rules = rules  # Show full dataset
    else:
        # Exact match on the full label (not a substring/containment test).
        filtered_rules = rules[
            (rules["antecedents"] == selected_value) | (rules["consequents"] == selected_value)
        ]

    if filtered_rules.empty:
        print(f"No rules found for '{selected_value}'")
        return

    # Marker size and color both encode lift.
    fig = px.scatter(
        filtered_rules,
        x='support',
        y='confidence',
        size='lift',
        color='lift',
        hover_name='antecedents',
        hover_data=['consequents', 'support', 'confidence', 'lift'],
        title=f'Rules Involving "{selected_value}"' if selected_value != "All" else "All Rules",
        labels={'support': 'Support', 'confidence': 'Confidence'},
    )
    fig.show()

# Display interactive widgets: the dropdown re-invokes update_plot on every change.
interactive_plot = widgets.interactive(update_plot, selected_value=value_dropdown)
display(interactive_plot)

In [None]:
# Broader income groups: new column name -> source columns that are added together.
income_group_members = {
    'Salaries/Wages': [
        'Salaries/Wages from Regular Employment',
        'Salaries/Wages from Seasonal Employment',
    ],
    'Cash Receipts': [
        'Cash Receipts, Support, etc. from Abroad',
        'Cash Receipts, Support, etc. from Domestic Source',
    ],
    'Farming': [
        'Crop Farming and Gardening',
        'Net Share of Crops, Fruits, etc. (Tot. Net Value of Share)',
    ],
    'Logistics and Manufacturing': [
        'Wholesale and Retail',
        'Transportation, Storage Services',
        'Manufacturing',
    ],
    'Entrep. Activities': [
        'Entrep. Activities NEC',
        'Entrep. Activities NEC.1',
        'Entrep. Activities NEC.2',
    ],
    'Passive Income': [
        'Total Received as Gifts',
        'Family Sustenance Activities',
        'Pension and Retirement Benefits',
        'Dividends from Investment',
        'Rentals Received from Non-Agri Lands, etc.',
    ],
    'Livestocks': [
        'Livestock and Poultry Raising',
        'Fishing',
        'Forestry and Hunting',
    ],
    'Other Income NEC': ['Other Sources of Income NEC'],
}

# Income-only view of the stratified sample.
assoc_rule_2 = assoc_rule_sampled[monetary_columns].copy()

# Combine similar columns to reduce dimensionality.
for group_name, members in income_group_members.items():
    assoc_rule_2[group_name] = sum(assoc_rule_2[member] for member in members)

# Only the combined groups are kept.
assoc_rule_2 = assoc_rule_2.drop(columns=monetary_columns)

def categorize_lmh(value, quantiles):
    """Label one amount relative to a column's 33rd/66th percentile cut-offs.

    Args:
        value: The amount to label.
        quantiles: Mapping with keys 0.33 and 0.66 holding the cut-off values.

    Returns:
        "None" when value equals 0; otherwise "Low" (value <= quantiles[0.33]),
        "Medium" (value <= quantiles[0.66]) or "High" (everything else, which
        includes values that fail both comparisons such as NaN).
    """
    if value == 0:
        return "None"
    for cutoff_key, label in ((0.33, "Low"), (0.66, "Medium")):
        if value <= quantiles[cutoff_key]:
            return label
    return "High"
    
# Replace every combined income amount with its None/Low/Medium/High label, using the
# column's own 33rd and 66th percentiles (computed over all rows, zeros included).
for col in assoc_rule_2.columns:
    column_values = assoc_rule_2[col]
    quantiles = column_values.quantile([0.33, 0.66]).to_dict()
    assoc_rule_2[col] = column_values.apply(categorize_lmh, args=(quantiles,))

assoc_rule_2.shape

In [None]:
# One-hot encode the categorized income groups; each indicator column is one "item".
assoc_rule_2_transactions = pd.get_dummies(assoc_rule_2)
frequent_itemsets_2 = fpgrowth(assoc_rule_2_transactions, min_support=0.1, use_colnames=True, verbose=1)
rules_2 = association_rules(frequent_itemsets_2, metric="lift", min_threshold=1)
# Save frequent itemsets and association rules to CSV (written before the label conversion below).
frequent_itemsets_2.to_csv('../itemsets_and_rules/frequent_itemsets_financialFreedom.csv', index=False)
rules_2.to_csv('../itemsets_and_rules/association_rules_financialFreedom.csv', index=False)

# Convert frozensets to strings for plotting. sorted() keeps each label stable across kernel
# restarts, since set iteration order is not guaranteed.
for side in ('antecedents', 'consequents'):
    rules_2[side] = rules_2[side].apply(lambda items: ', '.join(sorted(items)))

# Dropdown choices: "All" followed by every distinct antecedent/consequent label.
unique_values = sorted(set(rules_2["antecedents"]).union(set(rules_2["consequents"])))
unique_values.insert(0, "All")  # Add "All" at the beginning

value_dropdown = widgets.Dropdown(
    options=unique_values,
    value="All",
    description='Filter:'
)

def update_plot(selected_value):
    """Draw a support-vs-confidence scatter of the rules_2 rows matching the dropdown choice.

    Args:
        selected_value: "All" to plot every rule, or an itemset label; a rule is kept
            when its whole antecedent label or whole consequent label equals it.

    Returns:
        None. Shows a plotly figure, or prints a message when no rule matches.
    """
    if selected_value == "All":
        filtered_rules = rules_2
    else:
        # Exact match on the full label (not a substring/containment test).
        filtered_rules = rules_2[
            (rules_2["antecedents"] == selected_value) | (rules_2["consequents"] == selected_value)
        ]

    if filtered_rules.empty:
        print(f"No rules_2 found for '{selected_value}'")
        return

    # Marker size and color both encode lift.
    fig = px.scatter(
        filtered_rules,
        x='support',
        y='confidence',
        size='lift',
        color='lift',
        hover_name='antecedents',
        hover_data=['consequents', 'support', 'confidence', 'lift'],
        title=f'Rules Involving "{selected_value}"' if selected_value != "All" else "All Rules",
        labels={'support': 'Support', 'confidence': 'Confidence'},
    )
    fig.show()

# The dropdown re-invokes update_plot on every change.
interactive_plot = widgets.interactive(update_plot, selected_value=value_dropdown)
display(interactive_plot)

In [None]:
# Broader income groups: new column name -> source columns that are added together.
income_group_members = {
    'Salaries/Wages': [
        'Salaries/Wages from Regular Employment',
        'Salaries/Wages from Seasonal Employment',
    ],
    'Cash Receipts': [
        'Cash Receipts, Support, etc. from Abroad',
        'Cash Receipts, Support, etc. from Domestic Source',
    ],
    'Farming': [
        'Crop Farming and Gardening',
        'Net Share of Crops, Fruits, etc. (Tot. Net Value of Share)',
    ],
    'Logistics and Manufacturing': [
        'Wholesale and Retail',
        'Transportation, Storage Services',
        'Manufacturing',
    ],
    'Entrep. Activities': [
        'Entrep. Activities NEC',
        'Entrep. Activities NEC.1',
        'Entrep. Activities NEC.2',
    ],
    'Passive Income': [
        'Total Received as Gifts',
        'Family Sustenance Activities',
        'Pension and Retirement Benefits',
        'Dividends from Investment',
        'Rentals Received from Non-Agri Lands, etc.',
    ],
    'Livestocks': [
        'Livestock and Poultry Raising',
        'Fishing',
        'Forestry and Hunting',
    ],
    'Other Income NEC': ['Other Sources of Income NEC'],
}

# Income columns plus the region code, taken from the full imputed frame.
categorical_columns = ['Region']
assoc_rule_3 = imputted_column_13[monetary_columns + categorical_columns].copy()

# Combine similar columns to reduce dimensionality.
for group_name, members in income_group_members.items():
    assoc_rule_3[group_name] = sum(assoc_rule_3[member] for member in members)

assoc_rule_3 = assoc_rule_3.drop(columns=monetary_columns)

# Keep a 5% sample stratified by region code; the 95% remainder is discarded into `_`.
assoc_rule_3, _ = train_test_split(
    assoc_rule_3,
    test_size=0.95,
    random_state=42,
    stratify=assoc_rule_3['Region'],
)

def categorize_lmh(value, quantiles):
    """Label one amount relative to a column's 33rd/66th percentile cut-offs.

    Args:
        value: The amount to label.
        quantiles: Mapping with keys 0.33 and 0.66 holding the cut-off values.

    Returns:
        "None" when value equals 0; otherwise "Low" (value <= quantiles[0.33]),
        "Medium" (value <= quantiles[0.66]) or "High" (everything else, which
        includes values that fail both comparisons such as NaN).
    """
    if value == 0:
        return "None"
    for cutoff_key, label in ((0.33, "Low"), (0.66, "Medium")):
        if value <= quantiles[cutoff_key]:
            return label
    return "High"

# Region code -> island group. Built once instead of on every call.
_REGION_TO_ISLAND_GROUP = {
    # Luzon
    1: "Luzon",      # Ilocos Region
    2: "Luzon",      # Cagayan Valley
    3: "Luzon",      # Central Luzon
    4: "Luzon",      # CALABARZON
    5: "Luzon",      # Bicol
    13: "Luzon",     # NCR
    14: "Luzon",     # CAR
    17: "Luzon",     # MIMAROPA
    # Visayas
    6: "Visayas",    # Western Visayas
    7: "Visayas",    # Central Visayas
    8: "Visayas",    # Eastern Visayas
    # Mindanao
    9: "Mindanao",   # Zamboanga Peninsula
    10: "Mindanao",  # Northern Mindanao
    11: "Mindanao",  # Davao Region
    12: "Mindanao",  # Soccsksargen
    16: "Mindanao",  # Caraga
    19: "Mindanao",  # BARMM
}


def categorize_region(value):
    """Map a numeric region code to its island group.

    Args:
        value: Region code (e.g. 13 for NCR). Float codes equal to a listed integer
            (13.0) match as well.

    Returns:
        "Luzon", "Visayas" or "Mindanao" for a listed code, otherwise "Invalid Region".
        Unlisted codes previously fell through and returned None, which the one-hot
        encoding drops silently; the explicit label keeps such rows visible, in line
        with categorize_income's 'Invalid Decile'.
    """
    try:
        return _REGION_TO_ISLAND_GROUP.get(value, "Invalid Region")
    except TypeError:
        # Unhashable input cannot be a region code.
        return "Invalid Region"
# Region codes become island-group labels; every other column becomes a
# None/Low/Medium/High label based on its own 33rd and 66th percentiles.
for col in assoc_rule_3.columns:
    column_values = assoc_rule_3[col]
    if col != 'Region':
        quantiles = column_values.quantile([0.33, 0.66]).to_dict()
        assoc_rule_3[col] = column_values.apply(categorize_lmh, args=(quantiles,))
        continue
    assoc_rule_3[col] = column_values.apply(categorize_region)

assoc_rule_3.shape

In [None]:
# One-hot encode the island group and categorized income groups; each indicator column is one "item".
assoc_rule_3_transactions = pd.get_dummies(assoc_rule_3)
frequent_itemsets_3 = fpgrowth(assoc_rule_3_transactions, min_support=0.1, use_colnames=True, verbose=1)
rules_3 = association_rules(frequent_itemsets_3, metric="lift", min_threshold=1)
# Save frequent itemsets and association rules to CSV (written before the label conversion below).
frequent_itemsets_3.to_csv('../itemsets_and_rules/frequent_itemsets_financialFreedomPerRegion.csv', index=False)
rules_3.to_csv('../itemsets_and_rules/association_rules_financialFreedomPerRegion.csv', index=False)

# Convert frozensets to strings for plotting. sorted() keeps each label stable across kernel
# restarts, since set iteration order is not guaranteed.
for side in ('antecedents', 'consequents'):
    rules_3[side] = rules_3[side].apply(lambda items: ', '.join(sorted(items)))

# Dropdown choices: "All" followed by every distinct antecedent/consequent label.
unique_values = sorted(set(rules_3["antecedents"]).union(set(rules_3["consequents"])))
unique_values.insert(0, "All")  # Add "All" at the beginning

value_dropdown = widgets.Dropdown(
    options=unique_values,
    value="All",
    description='Filter:'
)

def update_plot(selected_value):
    """Draw a support-vs-confidence scatter of the rules_3 rows matching the dropdown choice.

    Args:
        selected_value: "All" to plot every rule, or an itemset label; a rule is kept
            when its whole antecedent label or whole consequent label equals it.

    Returns:
        None. Shows a plotly figure, or prints a message when no rule matches.
    """
    if selected_value == "All":
        filtered_rules = rules_3
    else:
        # Exact match on the full label (not a substring/containment test).
        filtered_rules = rules_3[
            (rules_3["antecedents"] == selected_value) | (rules_3["consequents"] == selected_value)
        ]

    if filtered_rules.empty:
        print(f"No rules_3 found for '{selected_value}'")
        return

    # Marker size and color both encode lift.
    fig = px.scatter(
        filtered_rules,
        x='support',
        y='confidence',
        size='lift',
        color='lift',
        hover_name='antecedents',
        hover_data=['consequents', 'support', 'confidence', 'lift'],
        title=f'Rules Involving "{selected_value}"' if selected_value != "All" else "All Rules",
        labels={'support': 'Support', 'confidence': 'Confidence'},
    )
    fig.show()

# The dropdown re-invokes update_plot on every change.
interactive_plot = widgets.interactive(update_plot, selected_value=value_dropdown)
display(interactive_plot)


In [None]:
# Broader income groups: new column name -> source columns that are added together.
income_group_members = {
    'Salaries/Wages': [
        'Salaries/Wages from Regular Employment',
        'Salaries/Wages from Seasonal Employment',
    ],
    'Cash Receipts': [
        'Cash Receipts, Support, etc. from Abroad',
        'Cash Receipts, Support, etc. from Domestic Source',
    ],
    'Farming': [
        'Crop Farming and Gardening',
        'Net Share of Crops, Fruits, etc. (Tot. Net Value of Share)',
    ],
    'Logistics and Manufacturing': [
        'Wholesale and Retail',
        'Transportation, Storage Services',
        'Manufacturing',
    ],
    'Entrep. Activities': [
        'Entrep. Activities NEC',
        'Entrep. Activities NEC.1',
        'Entrep. Activities NEC.2',
    ],
    'Passive Income': [
        'Total Received as Gifts',
        'Family Sustenance Activities',
        'Pension and Retirement Benefits',
        'Dividends from Investment',
        'Rentals Received from Non-Agri Lands, etc.',
    ],
    'Livestocks': [
        'Livestock and Poultry Raising',
        'Fishing',
        'Forestry and Hunting',
    ],
    'Other Income NEC': ['Other Sources of Income NEC'],
}

# Income columns plus the NPCINC column, taken from the full imputed frame.
categorical_columns = ['NPCINC']
assoc_rule_4 = imputted_column_13[monetary_columns + categorical_columns].copy()

# Combine similar columns to reduce dimensionality.
for group_name, members in income_group_members.items():
    assoc_rule_4[group_name] = sum(assoc_rule_4[member] for member in members)

assoc_rule_4 = assoc_rule_4.drop(columns=monetary_columns)

# Keep a 5% sample stratified by NPCINC; the 95% remainder is discarded into `_`.
assoc_rule_4, _ = train_test_split(
    assoc_rule_4,
    test_size=0.95,
    random_state=42,
    stratify=assoc_rule_4['NPCINC'],
)
def categorize_lmh(value, quantiles):
    """Label one amount relative to a column's 33rd/66th percentile cut-offs.

    Args:
        value: The amount to label.
        quantiles: Mapping with keys 0.33 and 0.66 holding the cut-off values.

    Returns:
        "None" when value equals 0; otherwise "Low" (value <= quantiles[0.33]),
        "Medium" (value <= quantiles[0.66]) or "High" (everything else, which
        includes values that fail both comparisons such as NaN).
    """
    if value == 0:
        return "None"
    for cutoff_key, label in ((0.33, "Low"), (0.66, "Medium")):
        if value <= quantiles[cutoff_key]:
            return label
    return "High"

def categorize_income(decile):
    """Collapse an income decile into a three-level income class.

    Args:
        decile: Decile number; 1-3, 4-7 and 8-10 are the recognized bands (inclusive).

    Returns:
        'Low Income', 'Middle Income' or 'High Income' for a value inside a band,
        otherwise 'Invalid Decile' (out-of-range values, values falling between
        bands such as 3.5, and NaN).
    """
    bands = (
        (1, 3, 'Low Income'),
        (4, 7, 'Middle Income'),
        (8, 10, 'High Income'),
    )
    for lower, upper, label in bands:
        if lower <= decile <= upper:
            return label
    return 'Invalid Decile'
    
# NPCINC values become income-class labels; every other column becomes a
# None/Low/Medium/High label based on its own 33rd and 66th percentiles.
for col in assoc_rule_4.columns:
    column_values = assoc_rule_4[col]
    if col != 'NPCINC':
        quantiles = column_values.quantile([0.33, 0.66]).to_dict()
        assoc_rule_4[col] = column_values.apply(categorize_lmh, args=(quantiles,))
        continue
    assoc_rule_4[col] = column_values.apply(categorize_income)

assoc_rule_4.shape

In [None]:
# One-hot encode the income class and categorized income groups; each indicator column is one "item".
assoc_rule_4_transactions = pd.get_dummies(assoc_rule_4)
frequent_itemsets_4 = fpgrowth(assoc_rule_4_transactions, min_support=0.1, use_colnames=True, verbose=1)
rules_4 = association_rules(frequent_itemsets_4, metric="lift", min_threshold=1)
# Save frequent itemsets and association rules to CSV (written before the label conversion below).
frequent_itemsets_4.to_csv('../itemsets_and_rules/frequent_itemsets_financialFreedomPerIncomeGroup.csv', index=False)
rules_4.to_csv('../itemsets_and_rules/association_rules_financialFreedomPerIncomeGroup.csv', index=False)

# Convert frozensets to strings for plotting. sorted() keeps each label stable across kernel
# restarts, since set iteration order is not guaranteed.
for side in ('antecedents', 'consequents'):
    rules_4[side] = rules_4[side].apply(lambda items: ', '.join(sorted(items)))

# Dropdown choices: "All" followed by every distinct antecedent/consequent label.
unique_values = sorted(set(rules_4["antecedents"]).union(set(rules_4["consequents"])))
unique_values.insert(0, "All")  # Add "All" at the beginning

value_dropdown = widgets.Dropdown(
    options=unique_values,
    value="All",
    description='Filter:'
)

def update_plot(selected_value):
    """Draw a support-vs-confidence scatter of the rules_4 rows matching the dropdown choice.

    Args:
        selected_value: "All" to plot every rule, or an itemset label; a rule is kept
            when its whole antecedent label or whole consequent label equals it.

    Returns:
        None. Shows a plotly figure, or prints a message when no rule matches.
    """
    if selected_value == "All":
        filtered_rules = rules_4
    else:
        # Exact match on the full label (not a substring/containment test).
        filtered_rules = rules_4[
            (rules_4["antecedents"] == selected_value) | (rules_4["consequents"] == selected_value)
        ]

    if filtered_rules.empty:
        print(f"No rules_4 found for '{selected_value}'")
        return

    # Marker size and color both encode lift.
    fig = px.scatter(
        filtered_rules,
        x='support',
        y='confidence',
        size='lift',
        color='lift',
        hover_name='antecedents',
        hover_data=['consequents', 'support', 'confidence', 'lift'],
        title=f'Rules Involving "{selected_value}"' if selected_value != "All" else "All Rules",
        labels={'support': 'Support', 'confidence': 'Confidence'},
    )
    fig.show()

# The dropdown re-invokes update_plot on every change.
interactive_plot = widgets.interactive(update_plot, selected_value=value_dropdown)
display(interactive_plot)