# 1. Data Import & Exploration


## Load the large dataset


In [None]:
import pandas as pd
import numpy as np
# Load the FIES 2023 (volume 1) CSV into a DataFrame and preview 10 random rows.
fies_df = pd.read_csv('../datasets/fies_2023_volume1_494887610821.csv')
fies_df.sample(10)

# 2. Identifying Data and Attributes


## List all column types and data types


In [None]:
# Show the pandas storage dtype (int64, float64, object, ...) of every column.
fies_df.dtypes

# 3. Determining the Type of Dataset


## Check if columns are numerical, categorical, or mixed.


In [None]:
# Report the inferred value type of each column (e.g. "integer", "floating",
# "string", "mixed"), which is more specific than the storage dtype.
for column, values in fies_df.items():
    print(column, ':', pd.api.types.infer_dtype(values))

## Making a Data Dictionary


In [None]:
# Human-readable description for each dataset column, keyed by the exact
# column name (including the spelling and spacing used in the CSV header).
# Columns missing from this mapping fall back to a default text when the data
# dictionary is built.
# NPCINC / RPCINC are described as deciles: the rest of the notebook groups and
# buckets NPCINC as a 1-10 decile, matching the existing pPCINC description.
fies_column_descriptions = {
    'RDMD_ID': 'Unique identifier for the record',
    'Region': 'Region code',
    'Province': 'Province code',
    'Household ID': 'Unique household identifier',
    'RECODED PROVINCE': 'Recoded province information',
    'Family Size': 'Number of people in the household',
    'Salaries/Wages from Regular Employment': 'Income from regular employment',
    'Salaries/Wages from Seasonal Employment': 'Income from seasonal employment',
    'Income from Salaries and Wages': 'Total income from salaries and wages',
    'Net Share of Crops, Fruits, etc. (Tot. Net Value of Share)': 'Net value from crop and fruit share',
    'Cash Receipts, Support, etc. from Abroad': 'Cash support received from abroad',
    'Cash Receipts, Support, etc. from Domestic Source': 'Cash support received domestically',
    'Rentals Received from Non-Agri Lands, etc.': 'Income from land rentals (non-agricultural)',
    'Unnamed: 13': 'Unknown or unnamed column',
    'Pension and Retirement Benefits': 'Income from pensions and retirement',
    'Dividends from Investment': 'Income from dividends',
    'Other Sources of Income NEC': 'Other sources of income not elsewhere classified',
    'Family Sustenance Activities': 'Income from family sustenance activities',
    'Total Received as Gifts': 'Total gifts received by the household',
    'Crop Farming and Gardening': 'Income from crop farming and gardening',
    'Livestock and Poultry Raising': 'Income from livestock and poultry raising',
    'Fishing': 'Income from fishing activities',
    'Forestry and Hunting': 'Income from forestry and hunting',
    'Wholesale and Retail': 'Income from wholesale and retail business',
    'Manufacturing': 'Income from manufacturing activities',
    'Transportation, Storage Services': 'Income from transportation and storage services',
    'Entrep. Activities NEC': 'Income from entrepreneurial activities (not elsewhere classified)',
    'Entrep. Activities NEC.1': 'Income from entrepreneurial activities (additional category 1)',
    'Entrep. Activities NEC.2': 'Income from entrepreneurial activities (additional category 2)',
    'Hhld, Income from Entrepreneurial Activities, Total': 'Total household income from entrepreneurial activities',
    'Losses from EA': 'Losses from entrepreneurial activities',
    'Cereal and Cereal Preparations (Total)': 'Expenditure on cereals and cereal preparations',
    'Meat and Meat Preparations': 'Expenditure on meat and meat preparations',
    'Fish and Marine Products (Total)': 'Expenditure on fish and marine products',
    'Dairy Products and Eggs (Total)': 'Expenditure on dairy products and eggs',
    'Oils and Fats (Total)': 'Expenditure on oils and fats',
    'Fruits and Vegetables': 'Expenditure on fruits and vegetables',
    'Vegetables (Total)': 'Expenditure on vegetables',
    'Sugar, Jam and Honey (Total)': 'Expenditure on sugar, jam, and honey',
    'Food Not Elsewhere Classified (Total)': 'Expenditure on other food items',
    'Fruit and vegetable juices': 'Expenditure on fruit and vegetable juices',
    'Coffee, Cocoa and Tea (Total)': 'Expenditure on coffee, cocoa, and tea',
    'Tea (total)  expenditure': 'Expenditure on tea',
    'Cocoa (total)  expenditure': 'Expenditure on cocoa',
    'Main Source of Water Supply (2nd visit only)': 'Main source of water supply (second visit)',
    'Softdrinks': 'Expenditure on soft drinks',
    'Other Non Alcoholic Beverages': 'Expenditure on other non-alcoholic beverages',
    'Alcoholic Beverages (Total)': 'Expenditure on alcoholic beverages',
    'Tobacco (Total)': 'Expenditure on tobacco products',
    'Other Vegetables (Total)': 'Expenditure on other types of vegetables',
    'Services_Primary_Goods': 'Expenditure on services and primary goods',
    'Alcohol Procduction Services': 'Expenditure on alcohol production services',
    'Total Food Consumed at Home (Total)': 'Total food consumed at home',
    'Food Regularly Consumed Outside The Home (Total)': 'Food consumed outside the home',
    'Hhld, Food': 'Household expenditure on food',
    'Clothing, Footwear and Other Wear': 'Expenditure on clothing, footwear, and other wear',
    'Housing and water (Total)': 'Expenditure on housing and water',
    'Actual House Rent': 'Expenditure on actual house rent',
    'Imputed House Rental Value': 'Imputed value of house rental',
    'Imputed Housing Benefit Rental Value': 'Imputed value of housing benefit rental',
    'House Rent/Rental Value': 'Expenditure on house rent/rental value',
    'Furnishings, Household Equipment & Routine Household Mainte': 'Expenditure on furnishings and household equipment',
    'Health (Total)': 'Expenditure on health services and products',
    'Transportation (Total)': 'Expenditure on transportation',
    'Communication (Total)': 'Expenditure on communication services',
    'Recreation and Culture (Total)': 'Expenditure on recreation and culture',
    'Education (Total)': 'Expenditure on education',
    'Insurance': 'Expenditure on insurance',
    'Miscellaneous Goods and Services (Total)': 'Expenditure on miscellaneous goods and services',
    'Durable Furniture': 'Expenditure on durable furniture',
    'Special Family Occasion': 'Expenditure on special family occasions',
    'Other Expenditure (inc. Value Consumed, Losses)': 'Other expenditures including losses',
    'Other Disbursements': 'Other household disbursements',
    'Accomodation Services': 'Expenditure on accommodation services',
    'Total Non-Food Expenditure': 'Total non-food expenditure',
    'Hhld, Income, Total': 'Total household income',
    'Hhld, Expenditures, Total': 'Total household expenditures',
    'Total Household Disbursements': 'Total household disbursements',
    'Other Receipts': 'Other household receipts',
    'Total Receipts': 'Total receipts',
    'Psu (Recode)': 'Primary Sampling Unit (recoded)',
    'Raising Factor': 'Raising factor for survey results',
    'Final Population Weights': 'Final weights for population data',
    'Urban / Rural': 'Urban or rural classification',
    'Per Capita Income': 'Household per capita income',
    'NPCINC': 'National per capita income decile',
    'RPCINC': 'Regional per capita income decile',
    'Per Capita Income Decile (Province)': 'Per capita income decile in the province',
    'pPCINC': 'Provincial per capita income decile',
    'Per Capita Income Decile (Region with Negros Island Region (NIR))': 'Per capita income decile (region with NIR)',
    'Region (with NIR)': 'Region code including NIR'
}


In [None]:
# Free-text formulas documenting how each aggregate column is built from other
# columns. Keys are exact dataset column names; values are descriptive text
# only (the component names are informal and are never evaluated as code).
fies_derivations = {
    'Total Receipts': 'Total Household Income + Other Receipts',
    'Hhld, Income, Total': 'Net Share of Crops, Fruits, etc. + Cash Receipts, Support, etc. from Abroad + Cash Receipts, Support, etc. from Domestic Source + Unnamed + Pension and Retirement Benefits + Dividends from Investment + Other Sources of Income NEC + Family Sustenance Activities + Total Received as Gifts + Household, Income from Entrep Activities, Total + Imputed House Rental Value',
    'Hhld, Income from Entrepreneurial Activities, Total': 'Crop Farming and Gardening + Livestock and Poultry Raising + Fishing + Forestry and Hunting + Wholesale and Retail + Manufacturing + Transportation, Storage Services + Entrep. Activities NEC + Entrep. Activities NEC 1 + Entrep. Activities NEC 2',
    'Total Household Disbursements': 'Total Household Expenditure + Other Disbursements',
    'Hhld, Expenditures, Total': 'Household Food + Total Non-Food Expenditure',
    'Hhld, Food': 'Total Food Consumed at Home + Food Regularly Consumed Outside The Home',
    'Total Food Consumed at Home (Total)': 'Cereal and Cereal Preparations + Meat and Meat Preparations + Fish and Marine Products + Dairy and Eggs + Oils and Fats + Fruits and Vegetables + Vegetables + Sugar, jam and Honey + Food Not Elsewhere Classified + Fruit and Vegetable Juices + Coffee, Cocoa and Tea + Tea + Cocoa + Main Source of Water Supply + Softdrinks + Other Non Alcoholic Beverages',
    'Total Non-Food Expenditure': 'Alcoholic Beverages + Tobacco + Other Vegetables + Services_Primary_Goods + Alcoholic Production Services + Housing and water (Total) + Furnishings, Household Equipment & Routine Household Maintenance + Health + Transportation + Communication + Recreation and Culture + Education + Insurance + Miscellaneous Goods and Services + Durable Furniture + Special Family Occasion + Other Expenditure + Accommodation Services + Clothing, Footwear and Other Wear',
}


In [None]:
# Build one data-dictionary row per dataset column (dtype, non-null count,
# number of distinct values, description, derivation) and export it to CSV.
# Columns without an entry in the lookup dicts get a default description and
# an empty derivation.
fies_volume1_data_dict = pd.DataFrame({
    'Column Name': fies_df.columns,
    'Data Type': fies_df.dtypes,
    'Non-Null Count': fies_df.notnull().sum(),
    'Unique Values': fies_df.nunique(),
    'Description': [fies_column_descriptions.get(col, 'No description available') for col in fies_df.columns],
    'Derivations from other columns': [fies_derivations.get(col, '') for col in fies_df.columns]
})
fies_volume1_data_dict.to_csv('../fies_volume1_data_dict.csv', index=False)

# 4. Data Quality and Assessment


## Check for missing values, duplicates, outliers, and wrong data.


Check for duplicates


In [None]:
# Compare the row count before and after discarding fully duplicated rows.
number_of_rows = len(fies_df)
print(f"Number of rows: {number_of_rows}")

# drop_duplicates() returns a new DataFrame, so fies_df itself is untouched.
removed_duplicates = fies_df.drop_duplicates()
print(f"Number of rows after dropping duplicates: {len(removed_duplicates)}")

No duplicates are found.


From the data dictionary, the Total Household Disbursements column is the only one with an object datatype, suggesting mixed values of numbers, strings, etc.


In [None]:
# Flag every column that contains at least one null (NaN/None) value.
columns_with_nulls = removed_duplicates.isnull().any()
for column in removed_duplicates.columns:
    if columns_with_nulls[column]:
        print(f"Column {column} has missing values")

The code block above does not show any null values initially, therefore there is the possibility of data with only whitespace values. The code below will strip all whitespaces
to know the true number of missing values.


In [None]:
def has_whitespace(val):
    """Return True when *val* is a string that is empty or only whitespace.

    Non-string values (numbers, None, NaN, ...) always yield False.
    """
    if not isinstance(val, str):
        return False
    return not val.strip()

# Test every cell for blank strings, then mark the rows that contain at least
# one such cell and count them.
blank_cells = removed_duplicates.map(has_whitespace)
whitespace_rows = blank_cells.any(axis="columns")
whitespace_count = whitespace_rows.sum()

print(f"Number of rows with whitespace: {whitespace_count}")

There are whitespaces. Whitespaces could mean that the value for that data is zero. Therefore, a check must be made to ensure that there are zeroes in the dataset as well to know that whitespaces and zeroes are equivalent.


In [None]:
# Count how many 'Total Household Disbursements' entries are numerically zero.
# The notebook reports this column as object dtype (text), so comparing it to 0
# directly never matches; it is converted to numbers first. Entries that cannot
# be parsed (e.g. whitespace-only strings) become NaN and are not counted.
disbursements_numeric = pd.to_numeric(
    removed_duplicates['Total Household Disbursements'], errors='coerce'
)
print((disbursements_numeric == 0).sum())

Now we detect potential outliers using statistical methods.
The main columns to look at are the Total Household Income and Total Household Expenditure columns.


In [None]:
# Mean, median and standard deviation of total household income
# (computed before any outlier handling).
income_series = removed_duplicates['Hhld, Income, Total']
income_mean = income_series.mean()
income_median = income_series.median()
income_std = income_series.std()

for label, value in (
    ("Mean", income_mean),
    ("Median", income_median),
    ("Standard Deviation", income_std),
):
    print(f"Income {label}: {value}")

In [None]:
# Mean, median and standard deviation of total household expenditure
# (computed before any outlier handling).
expenditure_series = removed_duplicates['Hhld, Expenditures, Total']
expenditure_mean = expenditure_series.mean()
expenditure_median = expenditure_series.median()
expenditure_std = expenditure_series.std()

for label, value in (
    ("Mean", expenditure_mean),
    ("Median", expenditure_median),
    ("Standard Deviation", expenditure_std),
):
    print(f"Expenditure {label}: {value}")

From the results, the means of the income and expenditure columns are quite large. To investigate further, a boxplot can be used to visualize each distribution.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Box plot of total household income before outlier removal, to visualise the
# spread of the column and any extreme values.
sns.boxplot(removed_duplicates['Hhld, Income, Total'])
plt.title('Boxplot of Income')
plt.xlabel('Income')
plt.ylabel('Value')

In [None]:
# Box plot of total household expenditure before outlier removal, to visualise
# the spread of the column and any extreme values.
sns.boxplot(removed_duplicates['Hhld, Expenditures, Total'])
plt.title('Boxplot of Expenditures')
plt.xlabel('Expenditures')
plt.ylabel('Value')

In [None]:
# Scatter plot of total income (x) against total expenditure (y) before
# outlier removal.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(
    x=removed_duplicates['Hhld, Income, Total'],
    y=removed_duplicates['Hhld, Expenditures, Total'],
)
ax.set(xlabel='Income', ylabel='Expenditure')
plt.show()

From the boxplots and scatter plot, there definitely are high-value outliers for both Income and Expenditure. Because the other monetary columns feed into these two totals (see the column derivations), addressing the outliers in these two columns also addresses the outliers in the columns they are derived from.


## Impute, discretize and data wrangling


Since there are zeroes present, the Total Household Disbursements column must be addressed. Upon inspection, Total Household Disbursements can be imputed from the sum of Hhld, Expenditures, Total and Other Disbursements.


In [None]:
# Fill blank 'Total Household Disbursements' entries with
# 'Hhld, Expenditures, Total' + 'Other Disbursements', then make the column
# numeric so it takes part in later statistics.
removed_null = removed_duplicates.copy()
disbursement_col = 'Total Household Disbursements'

# Only rows whose disbursement value itself is blank are imputed; a blank cell
# in some other column must not overwrite a valid disbursement value.
blank_disbursements = removed_null[disbursement_col].map(has_whitespace)
removed_null.loc[blank_disbursements, disbursement_col] = (
    removed_null.loc[blank_disbursements, 'Hhld, Expenditures, Total']
    + removed_null.loc[blank_disbursements, 'Other Disbursements']
)

# The column was read as text; convert it so it is no longer a mix of strings
# and numbers. Values that cannot be parsed become NaN and are reported.
removed_null[disbursement_col] = pd.to_numeric(removed_null[disbursement_col], errors='coerce')
print(f"Missing or non-numeric disbursement values after imputation: {removed_null[disbursement_col].isna().sum()}")

Double-check for missing values


In [None]:
# Re-run the blank-string scan on the imputed data to confirm nothing is left.
remaining_blank_cells = removed_null.map(has_whitespace)
whitespace_rows = remaining_blank_cells.any(axis="columns")
whitespace_count = whitespace_rows.sum()
print(f"Number of rows with whitespace: {whitespace_count}")

We now address the outliers using the IQR method


In [None]:
# Quartiles and interquartile range (Q3 - Q1) of total income; these feed the
# 1.5 * IQR outlier fences computed in the next cell.
income_values = removed_null['Hhld, Income, Total']
income_Q1 = income_values.quantile(0.25)
income_Q3 = income_values.quantile(0.75)
income_IQR = income_Q3 - income_Q1
print(f"Income Q1: {income_Q1}")
print(f"Income Q3: {income_Q3}")
print(f"Income IQR: {income_IQR}")

# Same three figures for total expenditure.
expenditure_values = removed_null['Hhld, Expenditures, Total']
expenditure_Q1 = expenditure_values.quantile(0.25)
expenditure_Q3 = expenditure_values.quantile(0.75)
expenditure_IQR = expenditure_Q3 - expenditure_Q1
print(f"Expenditure Q1: {expenditure_Q1}")
print(f"Expenditure Q3: {expenditure_Q3}")
print(f"Expenditure IQR: {expenditure_IQR}")

In [None]:
# Drop rows whose total income or total expenditure lies on or beyond the
# 1.5 * IQR fences.
#
# Rows are selected by index *label* (not by np.where() position), because
# DataFrame.drop(index=...) interprets its argument as labels. The two are only
# interchangeable when the index is an unbroken 0..n-1 range, which is not
# guaranteed after earlier row removal. Upper and lower hits are merged before
# dropping so a row matching both fences (possible when IQR == 0) is removed
# once instead of raising KeyError on the second drop.

# Income
removed_outliers = removed_null.copy()
print('Shape before removing outliers:', removed_outliers.shape)
lower_bound_income = income_Q1 - 1.5 * income_IQR
upper_bound_income = income_Q3 + 1.5 * income_IQR
income_values = removed_outliers['Hhld, Income, Total']
upper_income = removed_outliers.index[income_values >= upper_bound_income]
lower_income = removed_outliers.index[income_values <= lower_bound_income]

removed_outliers.drop(index=upper_income.union(lower_income), inplace=True)
print('Shape after removing outliers for Income:', removed_outliers.shape)

# Expenditure
# The fences come from quartiles computed on the data before the income filter.
lower_bound_expenditure = expenditure_Q1 - 1.5 * expenditure_IQR
upper_bound_expenditure = expenditure_Q3 + 1.5 * expenditure_IQR
removed_outliers.reset_index(drop=True, inplace=True)
expenditure_values = removed_outliers['Hhld, Expenditures, Total']
upper_expenditure = removed_outliers.index[expenditure_values >= upper_bound_expenditure]
lower_expenditure = removed_outliers.index[expenditure_values <= lower_bound_expenditure]

removed_outliers.drop(index=upper_expenditure.union(lower_expenditure), inplace=True)
print('Shape after removing outliers for Expenditure:', removed_outliers.shape)

Time to check using the same methods.


In [None]:
# Recompute the summary statistics on the outlier-filtered data so they can be
# compared with the figures obtained before cleaning.
income_series = removed_outliers['Hhld, Income, Total']
income_mean = income_series.mean()
income_median = income_series.median()
income_std = income_series.std()

for label, value in (
    ("Mean", income_mean),
    ("Median", income_median),
    ("Standard Deviation", income_std),
):
    print(f"Income {label}: {value}")

expenditure_series = removed_outliers['Hhld, Expenditures, Total']
expenditure_mean = expenditure_series.mean()
expenditure_median = expenditure_series.median()
expenditure_std = expenditure_series.std()

for label, value in (
    ("Mean", expenditure_mean),
    ("Median", expenditure_median),
    ("Standard Deviation", expenditure_std),
):
    print(f"Expenditure {label}: {value}")

In [None]:
# Box plot of total household income after outlier removal.
sns.boxplot(removed_outliers['Hhld, Income, Total'])
plt.title('Boxplot of Income')
plt.xlabel('Income')
plt.ylabel('Value')

In [None]:
# Box plot of total household expenditure after outlier removal.
sns.boxplot(removed_outliers['Hhld, Expenditures, Total'])
plt.title('Boxplot of Expenditures')
plt.xlabel('Expenditures')
plt.ylabel('Value')

In [None]:
# Scatter plot of total income (x) against total expenditure (y) after
# outlier removal.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(
    x=removed_outliers['Hhld, Income, Total'],
    y=removed_outliers['Hhld, Expenditures, Total'],
)
ax.set(xlabel='Income', ylabel='Expenditure')
plt.show()

The outliers are now removed.


There is a column named `Unnamed: 13` in the dataset. We opted to add this unnamed column to Other Sources of Income NEC because it contributes to the total income of the household: some of the total income values were inaccurate if the unnamed column wasn't included.


In [None]:
# Fold the unlabeled 'Unnamed: 13' column into 'Other Sources of Income NEC',
# then drop it; the column counts before and after are printed as a check.
imputted_column_13 = removed_outliers.copy()
imputted_column_13['Other Sources of Income NEC'] = imputted_column_13[
    'Other Sources of Income NEC'
].add(imputted_column_13['Unnamed: 13'])
print("Number of columns before dropping:", len(imputted_column_13.columns))
imputted_column_13 = imputted_column_13.drop(columns=['Unnamed: 13'])
print("Number of columns after dropping:", len(imputted_column_13.columns))

# 5. Quantitative Statistics


In [None]:
# Independent copy of the cleaned data used by the statistics and plots below.
cleaned_df = imputted_column_13.copy()

#### Functions

In [None]:
def create_stacked_bar_plot(df, x_col, y_cols, title):
    """Draw a stacked bar chart of summed ``y_cols`` for each ``x_col`` value.

    Args:
        df: DataFrame containing ``x_col`` and every column in ``y_cols``.
        x_col: Column whose distinct values become the bars.
        y_cols: List of columns to sum per group and stack within each bar.
        title: Text inserted into the chart title.
    """
    # Attach the grouping column to the selected measures, then total each
    # measure per group; the group key becomes the index used for the x-axis.
    group_totals = (
        df[y_cols]
        .assign(**{x_col: df[x_col]})
        .groupby(x_col)
        .sum()
    )
    group_totals.plot(kind="bar", stacked=True, figsize=(12, 10))
    plt.title(f"Stacked Bar Plot of {title} by {x_col}")
    plt.xlabel(x_col)
    plt.ylabel("Amount")
    plt.tight_layout()
    plt.show()

In [None]:
def create_boxplot(df, x_col, y_col):
    """Show a box plot of ``y_col`` for each distinct value of ``x_col``.

    Args:
        df: DataFrame holding both columns.
        x_col: Grouping column shown on the x-axis.
        y_col: Numeric column whose distribution is drawn per group.
    """
    plt.figure(figsize=(12, 6))
    axes = sns.boxplot(data=df, x=x_col, y=y_col)
    axes.set_title(f"Box Plot of {y_col} by {x_col}")
    axes.set_xlabel(x_col)
    axes.set_ylabel(y_col)
    plt.xticks(rotation=45)  # tilt tick labels so they do not overlap
    plt.tight_layout()
    plt.show()

In [None]:
def plot_histogram_with_highlight(df, x_axis):
    """Plot a 10-bin histogram of ``df[x_axis]`` with the 3 fullest bins in green.

    After showing the figure, the three most frequent individual values of the
    column (and their counts) are printed.

    Args:
        df: DataFrame containing the column.
        x_axis: Name of the column to plot.
    """
    column = df[x_axis]
    counts, _bin_edges, bars = plt.hist(
        column, bins=10, color="blue", edgecolor="black"
    )

    # argsort is ascending, so the last three positions are the tallest bins.
    for bar_index in np.argsort(counts)[-3:]:
        bars[bar_index].set_facecolor("green")

    plt.title(f"Distribution of {x_axis}")
    plt.xlabel(x_axis)
    plt.ylabel("Frequency")
    plt.show()

    print(column.value_counts().nlargest(3))


In [None]:
def plot_expenditures_and_income_boxplot(df, x_axis):
    """Show side-by-side box plots of total expenditure and income per group.

    Args:
        df: DataFrame with ``x_axis``, "Hhld, Expenditures, Total" and
            "Hhld, Income, Total" columns.
        x_axis: Grouping column shown on the x-axis.
    """
    # Friendlier legend labels for the two measures being compared.
    display_names = {
        "Hhld, Expenditures, Total": "Total Expenditures",
        "Hhld, Income, Total": "Total Income",
    }

    # Relabel first, then reshape to long format (one row per group/measure
    # pair) so seaborn can split the boxes by the "Type" column.
    long_form = (
        df[[x_axis, *display_names]]
        .rename(columns=display_names)
        .melt(id_vars=x_axis, var_name="Type", value_name="Value")
    )

    plt.figure(figsize=(12, 6))
    sns.boxplot(data=long_form, x=x_axis, y="Value", hue="Type", dodge=True)
    plt.title(f"Boxplot of Total Expenditures and Income by {x_axis}")
    plt.xlabel(x_axis)
    plt.ylabel("Amount (Philippine Peso)")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()



## Generate statistics and provide EDA. Provide illustration


In [None]:
# Identifier / code / decile columns to leave out of the numeric summary:
# describing them (mean, std, ...) is not meaningful.
# The names below follow the notebook's data dictionary. The short
# 'Per Capita Income Decile (Region)' entry is kept for compatibility but is
# not a data-dictionary column, so on its own it was silently skipped by
# errors='ignore' and the regional decile column still got summarised.
categorical_columns = [
    'RDMD_ID',
    'Household ID',
    'Region',
    'Region (with NIR)',
    'Province',
    'RECODED PROVINCE',
    'Psu (Recode)',
    'Urban / Rural',
    'NPCINC',
    'RPCINC',
    'Per Capita Income Decile (Province)',
    'pPCINC',
    'Per Capita Income Decile (Region)',
    'Per Capita Income Decile (Region with Negros Island Region (NIR))',
]

# errors='ignore' keeps the cell working if a listed column is absent.
cleaned_df.drop(columns=categorical_columns, errors='ignore').describe()

### Data Visualization

#### *Family Size Distribution*

In [None]:

# Histogram of family size; the three fullest bins are highlighted.
plot_histogram_with_highlight(
    cleaned_df,
    x_axis="Family Size",
)


#### *Region*

In [None]:
# Histogram of the Region code column; the three fullest bins are highlighted.
plot_histogram_with_highlight(
    cleaned_df,
    x_axis="Region",
)

#### *Per Capita Income*

In [None]:
# Histogram of the NPCINC column; the three fullest bins are highlighted.
plot_histogram_with_highlight(
    cleaned_df,
    "NPCINC"
)

### **Income**

#### *Total Income*

In [None]:
# Stacked bar plot of income sources, summed per family size.
# NOTE(review): the data dictionary describes 'Income from Salaries and Wages'
# as the *total* of salaries and wages; stacking it together with the regular
# and seasonal wage columns may double count wage income - confirm.
income_sources = [
    'Salaries/Wages from Regular Employment',
    'Salaries/Wages from Seasonal Employment',
    'Income from Salaries and Wages',
    'Net Share of Crops, Fruits, etc. (Tot. Net Value of Share)',
    'Cash Receipts, Support, etc. from Abroad',
    'Cash Receipts, Support, etc. from Domestic Source',
    'Rentals Received from Non-Agri Lands, etc.',
    'Pension and Retirement Benefits',
    'Dividends from Investment',
    'Other Sources of Income NEC',
]

create_stacked_bar_plot(cleaned_df, 'Family Size', income_sources, 'Income Sources')

In [None]:
# Same income-source breakdown, summed per Region code.
create_stacked_bar_plot(cleaned_df, 'Region', income_sources, 'Income Sources by Region')

In [None]:
# Distribution of total household income for each family size.
total_income = 'Hhld, Income, Total'
create_boxplot(cleaned_df, 'Family Size', total_income)

In [None]:


# Distribution of total household income for each NPCINC value.
create_boxplot(cleaned_df, 'NPCINC', 'Hhld, Income, Total')

In [None]:
# Distribution of total household income for each Region code.
create_boxplot(cleaned_df, 'Region', 'Hhld, Income, Total')

> [Explanation]

#### *Wages and Salaries*

In [None]:
# Distribution of salary and wage income for each family size.
create_boxplot(cleaned_df, 'Family Size', 'Income from Salaries and Wages')

In [None]:
# Distribution of salary and wage income for each NPCINC value.
create_boxplot(cleaned_df, "NPCINC", "Income from Salaries and Wages")

In [None]:
# Distribution of salary and wage income for each Region code.
create_boxplot(cleaned_df, "Region", "Income from Salaries and Wages")

> [Explanation]

### **Expenses**

#### *Total Expenses*

In [None]:
# Boxplot of "Hhld, Expenditures, Total" per "Family Size".
# NOTE: despite its name, `total_disbursements` holds the total *expenditures*
# column, not the "Total Household Disbursements" column. The name is kept
# because the following cells reference it.
total_disbursements = "Hhld, Expenditures, Total"
create_boxplot(cleaned_df, 'Family Size', total_disbursements)

In [None]:
# Distribution of total household expenditure for each NPCINC value.
create_boxplot(cleaned_df, 'NPCINC', total_disbursements)

In [None]:
# Distribution of total household expenditure for each Region code.
create_boxplot(cleaned_df, 'Region', total_disbursements)

In [None]:
# Total expenditure and total income side by side for each family size.
plot_expenditures_and_income_boxplot(cleaned_df, "Family Size")

In [None]:
# Total expenditure and total income side by side for each Region code.
plot_expenditures_and_income_boxplot(cleaned_df, "Region")

In [None]:
# Total expenditure and total income side by side for each NPCINC value.
plot_expenditures_and_income_boxplot(cleaned_df, "NPCINC")

> As seen in the box plots above, families with lower 

#### *Food-related Expenditures*

In [None]:
# Boxplot of "Hhld, Food" (household food expenditure) per "Family Size".
plt.figure(figsize=(12, 6))
sns.boxplot(x="Family Size", y="Hhld, Food", data=cleaned_df)
plt.title("Boxplot of Household Food Expenses by Family Size")
plt.xlabel("Family Size")
plt.ylabel("Total Food Expenses")  # was "Total Food Food Expenses" (duplicated word)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Food-related columns stacked per family size: the food-at-home components
# plus food regularly consumed outside the home.
# NOTE(review): 'Main Source of Water Supply (2nd visit only)' is summed here
# as an amount, but its name suggests a categorical code - confirm it is an
# expenditure before interpreting its share of the bar.
food_expenses = [
    "Cereal and Cereal Preparations (Total)",
    "Meat and Meat Preparations",
    "Fish and Marine Products (Total)",
    "Dairy Products and Eggs (Total)",
    "Oils and Fats (Total)",
    "Fruits and Vegetables",
    "Vegetables (Total)",
    "Sugar, Jam and Honey (Total)",
    "Food Not Elsewhere Classified (Total)",
    "Fruit and vegetable juices",
    "Coffee, Cocoa and Tea (Total)",
    "Tea (total)  expenditure",
    "Cocoa (total)  expenditure",
    "Main Source of Water Supply (2nd visit only)",
    "Softdrinks",
    "Other Non Alcoholic Beverages",
    "Food Regularly Consumed Outside The Home (Total)",
]

create_stacked_bar_plot(cleaned_df, 'Family Size', food_expenses, 'Food Expenses')

> [Explanation]

#### *Non-Food Expenditures*

In [None]:
# Distribution of total non-food expenditure for each family size.
create_boxplot(cleaned_df, 'Family Size', 'Total Non-Food Expenditure')

In [None]:
# Stacked Bar Plot of Non-Food Expenses by Family Size
# NOTE(review): this list differs from the 'Total Non-Food Expenditure'
# derivation recorded in fies_derivations: it adds the individual rent columns
# and 'Other Disbursements', and omits alcoholic beverages, other vegetables,
# alcohol production services and clothing. Bar heights therefore may not
# equal 'Total Non-Food Expenditure' - confirm this is intended.

non_food_expenses = [
    "Tobacco (Total)",
    "Services_Primary_Goods",
    "Housing and water (Total)",
    "Actual House Rent",
    "Imputed House Rental Value",
    "Imputed Housing Benefit Rental Value",
    "House Rent/Rental Value",
    "Furnishings, Household Equipment & Routine Household Mainte",
    "Health (Total)",
    "Transportation (Total)",
    "Communication (Total)",
    "Recreation and Culture (Total)",
    "Education (Total)",
    "Insurance",
    "Miscellaneous Goods and Services (Total)",
    "Durable Furniture",
    "Special Family Occasion",
    "Other Expenditure (inc. Value Consumed, Losses)",
    "Other Disbursements",
    "Accomodation Services",
]

create_stacked_bar_plot(cleaned_df, 'Family Size', non_food_expenses, 'Non-Food Expenses')

In [None]:
# Same non-food breakdown, summed per Region code.
create_stacked_bar_plot(cleaned_df, 'Region', non_food_expenses, 'Non-Food Expenses by Region')

In [None]:
# Same non-food breakdown, summed per NPCINC value.
create_stacked_bar_plot(cleaned_df, 'NPCINC', non_food_expenses, 'Non-Food Expenses by NPCINC')

> [Explanation]

# 6. Application of Proximity (Distance Analysis)


## Check for correlation. Provide illustration.


## Calculate a distance matrix (e.g., Euclidean distance) for numeric data as required. Provide illustration


# 7. Data Mining: Association Rule Mining


## If needed, transform the dataset (one-hot encoding) and apply the Apriori algorithm to extract association rules.


In [None]:
# Minimum and maximum per-capita income within each NPCINC group, to see the
# income range each group covers.
income_column = "Per Capita Income"
decile_column = "NPCINC"

income_stats = imputted_column_13.groupby(decile_column)[income_column].agg(['min', 'max']).reset_index()

income_stats

## Per Capita Income, Total nonfood, total food, family size

In [None]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules


# Columns to bucket into Low / Medium / High by their own percentiles.
lmh_columns = [
    "Total Non-Food Expenditure", "Hhld, Food",
]

# Columns bucketed with fixed thresholds (see the categorize_* helpers).
categorical_columns = ["Family Size", "NPCINC"]

# Work on a copy so the bucketing below does not modify imputted_column_13.
df_filtered = imputted_column_13[lmh_columns + categorical_columns].copy()

df_filtered


In [None]:


def categorize_lmh(value, quantiles):
    """Bucket *value* as "Low", "Medium" or "High" using percentile cut-offs.

    Args:
        value: Number to classify.
        quantiles: Mapping with keys ``0.33`` and ``0.66`` holding the 33rd and
            66th percentile cut-offs.

    Returns:
        "Low" if value <= the 0.33 cut-off, "Medium" if value <= the 0.66
        cut-off, otherwise "High" (this includes values that fail both
        comparisons, such as NaN).
    """
    if value <= quantiles[0.33]:
        return "Low"
    return "Medium" if value <= quantiles[0.66] else "High"

# Replace each amount column with its Low/Medium/High bucket, using that
# column's own 33rd and 66th percentiles as cut-offs.
for col in lmh_columns:
    quantiles = df_filtered[col].quantile([0.33, 0.66]).to_dict()
    df_filtered[col] = df_filtered[col].apply(categorize_lmh, args=(quantiles,))

def categorize_family_size(size):
    """Bucket a household size as "Small" (<= 3), "Medium" (4-6) or "Large".

    Anything that is not <= 6 (including NaN) is reported as "Large".
    """
    if size <= 3:
        return "Small"
    return "Medium" if size <= 6 else "Large"

# Replace the numeric family size with its Small/Medium/Large bucket.
df_filtered["Family Size"] = df_filtered["Family Size"].apply(categorize_family_size)

def categorize_npcinc(decile):
    """Bucket an income decile as "Low" (<= 3), "Medium" (4-7) or "High".

    Anything that is not <= 7 (including NaN) is reported as "High".
    """
    if decile <= 3:
        return "Low"
    return "Medium" if decile <= 7 else "High"

# Replace the NPCINC value with its Low/Medium/High bucket.
df_filtered["NPCINC"] = df_filtered["NPCINC"].apply(categorize_npcinc)

In [None]:
# Step 6: One-hot encode every bucketed column (LMH + Family Size + NPCINC)
columns_to_encode = lmh_columns + categorical_columns
df_encoded = pd.get_dummies(df_filtered, columns=columns_to_encode)

# Step 7: Mine frequent itemsets with FP-Growth
min_support = 0.07  # Adjust as needed
frequent_itemsets = fpgrowth(
    df_encoded,
    min_support=min_support,
    use_colnames=True,
)

# Step 8: Derive association rules, keeping those with lift >= 1
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)

# Step 9: Save results (disabled)
#frequent_itemsets.to_csv("frequent_itemsets_filtered2.csv", index=False)
#rules.to_csv("association_rules_filtered2.csv", index=False)

## Per capita income with food 1

In [None]:


# First group of food expenditure columns to bucket into Low / Medium / High
lmh_columns_food1 = [
    "Cereal and Cereal Preparations (Total)",
    "Meat and Meat Preparations",
    "Fish and Marine Products (Total)",
    "Dairy Products and Eggs (Total)",
    "Oils and Fats (Total)",
    "Fruits and Vegetables",
    "Vegetables (Total)",
    "Sugar, Jam and Honey (Total)",
]

categorical_columns = ["Family Size", "NPCINC"]

selected_columns = lmh_columns_food1 + categorical_columns
df_filtered = imputted_column_13[selected_columns].copy()


def categorize_lmh(value, quantiles):
    """Bucket *value* as Low/Medium/High using the 0.33 and 0.66 cut-offs."""
    if value <= quantiles[0.33]:
        return "Low"
    return "Medium" if value <= quantiles[0.66] else "High"


# Each amount column is bucketed against its own 33rd/66th percentiles.
for col in lmh_columns_food1:
    quantiles = df_filtered[col].quantile([0.33, 0.66]).to_dict()
    df_filtered[col] = df_filtered[col].apply(categorize_lmh, args=(quantiles,))


def categorize_family_size(size):
    """Bucket a household size as Small (<= 3), Medium (<= 6) or Large."""
    if size <= 3:
        return "Small"
    return "Medium" if size <= 6 else "Large"


df_filtered["Family Size"] = df_filtered["Family Size"].map(categorize_family_size)


def categorize_npcinc(decile):
    """Bucket an income decile as Low (<= 3), Medium (<= 7) or High."""
    if decile <= 3:
        return "Low"
    return "Medium" if decile <= 7 else "High"


df_filtered["NPCINC"] = df_filtered["NPCINC"].map(categorize_npcinc)

# Step 6: One-hot encode every bucketed column (LMH + Family Size + NPCINC)
df_encoded = pd.get_dummies(df_filtered, columns=selected_columns)

# Step 7: Mine frequent itemsets with FP-Growth
min_support = 0.05  # Adjust as needed
frequent_itemsets = fpgrowth(
    df_encoded,
    min_support=min_support,
    use_colnames=True,
)

# Step 8: Derive association rules, keeping those with lift >= 1
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)

# Step 9: Save results (disabled)
# frequent_itemsets.to_csv("frequent_itemsets_food1_lower_th.csv", index=False)
# rules.to_csv("association_rules_food1_lower_th.csv", index=False)

## Per capita income with Food 2

In [None]:


# Second group of food/beverage columns to bucket into Low / Medium / High.
# (The variable keeps the `lmh_columns_food1` name used by the original cell.)
lmh_columns_food1 = [
    "Food Not Elsewhere Classified (Total)",
    "Fruit and vegetable juices",
    "Coffee, Cocoa and Tea (Total)",
    "Tea (total)  expenditure",
    "Cocoa (total)  expenditure",
    "Main Source of Water Supply (2nd visit only)",
    "Softdrinks",
    "Other Non Alcoholic Beverages",
]

categorical_columns = ["Family Size", "NPCINC"]

selected_columns = lmh_columns_food1 + categorical_columns
df_filtered = imputted_column_13[selected_columns].copy()


def categorize_lmh(value, quantiles):
    """Bucket *value* as Low/Medium/High using the 0.33 and 0.66 cut-offs."""
    if value <= quantiles[0.33]:
        return "Low"
    return "Medium" if value <= quantiles[0.66] else "High"


# Each amount column is bucketed against its own 33rd/66th percentiles.
for col in lmh_columns_food1:
    quantiles = df_filtered[col].quantile([0.33, 0.66]).to_dict()
    df_filtered[col] = df_filtered[col].apply(categorize_lmh, args=(quantiles,))


def categorize_family_size(size):
    """Bucket a household size as Small (<= 3), Medium (<= 6) or Large."""
    if size <= 3:
        return "Small"
    return "Medium" if size <= 6 else "Large"


df_filtered["Family Size"] = df_filtered["Family Size"].map(categorize_family_size)


def categorize_npcinc(decile):
    """Bucket an income decile as Low (<= 3), Medium (<= 7) or High."""
    if decile <= 3:
        return "Low"
    return "Medium" if decile <= 7 else "High"


df_filtered["NPCINC"] = df_filtered["NPCINC"].map(categorize_npcinc)

# Step 6: One-hot encode every bucketed column (LMH + Family Size + NPCINC)
df_encoded = pd.get_dummies(df_filtered, columns=selected_columns)

# Step 7: Mine frequent itemsets with FP-Growth
min_support = 0.05  # Adjust as needed
frequent_itemsets = fpgrowth(
    df_encoded,
    min_support=min_support,
    use_colnames=True,
)

# Step 8: Derive association rules, keeping those with lift >= 1
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)

# Step 9: Save results to the current working directory
frequent_itemsets.to_csv("frequent_itemsets_food2.csv", index=False)
rules.to_csv("association_rules_food2.csv", index=False)

## home vs outside per capita income

In [None]:



# Food-at-home vs. food-outside-the-home totals to bucket into Low/Medium/High
lmh_columns_home_vs_outside = [
    "Total Food Consumed at Home (Total)",
    "Food Regularly Consumed Outside The Home (Total)",
]

categorical_columns = ["Family Size", "NPCINC"]

selected_columns = lmh_columns_home_vs_outside + categorical_columns
df_filtered = imputted_column_13[selected_columns].copy()


def categorize_lmh(value, quantiles):
    """Bucket *value* as Low/Medium/High using the 0.33 and 0.66 cut-offs."""
    if value <= quantiles[0.33]:
        return "Low"
    return "Medium" if value <= quantiles[0.66] else "High"


# Each amount column is bucketed against its own 33rd/66th percentiles.
for col in lmh_columns_home_vs_outside:
    quantiles = df_filtered[col].quantile([0.33, 0.66]).to_dict()
    df_filtered[col] = df_filtered[col].apply(categorize_lmh, args=(quantiles,))


def categorize_family_size(size):
    """Bucket a household size as Small (<= 3), Medium (<= 6) or Large."""
    if size <= 3:
        return "Small"
    return "Medium" if size <= 6 else "Large"


df_filtered["Family Size"] = df_filtered["Family Size"].map(categorize_family_size)


def categorize_npcinc(decile):
    """Bucket an income decile as Low (<= 3), Medium (<= 7) or High."""
    if decile <= 3:
        return "Low"
    return "Medium" if decile <= 7 else "High"


df_filtered["NPCINC"] = df_filtered["NPCINC"].map(categorize_npcinc)

# Step 6: One-hot encode every bucketed column (LMH + Family Size + NPCINC)
df_encoded = pd.get_dummies(df_filtered, columns=selected_columns)

# Step 7: Mine frequent itemsets with FP-Growth
min_support = 0.05  # Adjust as needed
frequent_itemsets = fpgrowth(
    df_encoded,
    min_support=min_support,
    use_colnames=True,
)

# Step 8: Derive association rules, keeping those with lift >= 1
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)

# Step 9: Save results to the current working directory
frequent_itemsets.to_csv("frequent_itemsets_food_total.csv", index=False)
rules.to_csv("association_rules_food_total.csv", index=False)