# Preliminary Project - Data Analysis and Visualization
**Group Members:**
- Benjamin Francis Abadila
- Angelo Dela Paz
- Carl Mitzchel Padua
- Edjin Jerney Payumo
- Levin Jacob Sta. Cruz

---
## Setup

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight


In [None]:
# 2023 Dataset
fies_23 = pd.read_csv('dataset/PHL-PSA-FIES-2023-V1-PUF/FIES PUF 2023 Volume1.csv')

---
## Data Preprocessing

### Data Inspection

In [None]:
# Show all columns on output
pd.set_option('display.max_columns', None)


In [None]:
fies_23.info()

`'TOTDIS'` is the only column with the `object` dtype, although it should be numeric like the other columns.

In [None]:
fies_23.describe()

In [None]:
# Identify column that are non-numeric
fies_23.describe(include='O')

In [None]:
# Inspect TOTDIS on temporary Series only, so the DataFrame is left untouched.
totdis_raw = fies_23['TOTDIS']

# Count genuine nulls BEFORE any cast to str: astype(str) turns NaN into the
# literal text 'nan', after which isnull() would always report zero.
null_values_count = totdis_raw.isnull().sum()

# An entry is non-numerical when it is present but cannot be parsed as a number
# (whitespace-only strings included). Parsing with pd.to_numeric also accepts
# negative values, which a digits-only check would wrongly flag.
totdis_parsed = pd.to_numeric(totdis_raw.astype(str).str.strip(), errors='coerce')
non_numerical_entries = fies_23[totdis_parsed.isna() & totdis_raw.notna()]

# Display results
print(f"Number of null values: {null_values_count}")
print(f"Number of non-numerical entries: {len(non_numerical_entries)}")

# Display the non-numerical entries for inspection
print(non_numerical_entries[['TOTDIS']])


In [None]:
# Work on TOTDIS as text so the string accessor can be used
fies_23['TOTDIS'] = fies_23['TOTDIS'].astype(str)

# Rows whose TOTDIS is empty once surrounding whitespace is removed
is_blank_totdis = fies_23['TOTDIS'].str.strip().eq('')
whitespace_entries = fies_23.loc[is_blank_totdis]

# Show the TOTDIS values of those rows
whitespace_entries['TOTDIS']


All the non-numeric values of column `'TOTDIS'` are whitespaces, probably caused by an encoding error

In [None]:
# Share of missing values per column, as a percentage rounded to two decimals
missing_counts = fies_23.isnull().sum()
df_null = missing_counts.mul(100).div(len(fies_23)).round(2)
df_null

In [None]:
df_null[df_null > 0] # Show only columns having Missing Values (non-zero)

In [None]:
# Bar chart of the missing-value percentage of every column
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(x=df_null.index, y=df_null.values, alpha=0.8, ax=ax)
ax.set_title('Missing Values (Pre-Cleaning)')
ax.set_ylabel('Missing Values %')
ax.set_xlabel('Columns')
ax.tick_params(axis='x', labelrotation=90)  # column names are long; print them vertically
plt.show()

There are no identified null values except for the empty whitespaces in `'TOTDIS'`

In [None]:
fies_23.shape

In [None]:
fies_23.columns

In [None]:
fies_23

### Data Cleaning

In [None]:
# Flag rows whose TOTDIS is blank (whitespace only). The str cast is done on a
# temporary Series, so the column is never left stored as text.
blank_totdis = fies_23['TOTDIS'].astype(str).str.strip().eq('')

# Recompute the blank entries as TOTEX + OTHER_DISBURSEMENT
fies_23.loc[blank_totdis, 'TOTDIS'] = (
    fies_23.loc[blank_totdis, 'TOTEX']
    + fies_23.loc[blank_totdis, 'OTHER_DISBURSEMENT']
)

# Convert TOTDIS to a numeric dtype; anything unparseable becomes NaN
fies_23['TOTDIS'] = pd.to_numeric(fies_23['TOTDIS'], errors='coerce')

# errors='coerce' hides bad values silently, so report how many are still missing
print(f"Rows recomputed: {int(blank_totdis.sum())}")
print(f"TOTDIS values still missing after cleaning: {int(fies_23['TOTDIS'].isna().sum())}")

# Display the updated columns for verification
fies_23[['TOTDIS', 'TOTEX', 'OTHER_DISBURSEMENT']].head()

Since `TOTDIS` = `TOTEX` + `OTHER_DISBURSEMENT`, we used the data on these two columns to recompute the rows in `TOTDIS` containing whitespaces  

In [None]:
# Verify the cleaning on a temporary parse. The column itself must NOT be cast
# back to str here: it would stay text from this point on, and the copy taken
# for `cleaned_fies` would inherit a non-numeric TOTDIS.
totdis_check = pd.to_numeric(fies_23['TOTDIS'], errors='coerce')

# Entries that cannot be read as a number (missing values included)
non_numerical_entries = fies_23[totdis_check.isna()]

# Display results
print(f"Number of non-numerical entries: {len(non_numerical_entries)}")

In [None]:
# Dropping Unnecessary Columns
cleaned_fies = fies_23.copy()

In [None]:
# Entrepreneurial-activity component columns are removed;
# their total, EAINC, is kept and used instead.
entrep_component_cols = [
    'NET_CFG', 'NET_LPR', 'NET_FISH', 'NET_FOR', 'NET_RET',
    'NET_MFG', 'NET_TRANS', 'NET_NEC_A8', 'NET_NEC_A9', 'NET_NEC_A10',
]
cleaned_fies = cleaned_fies.drop(columns=entrep_component_cols)


In [None]:
# Food component columns are removed;
# their total, FOOD, is kept and used instead.
food_component_cols = [
    'BREAD', 'MEAT', 'FISH', 'MILK', 'OIL', 'FRUIT', 'VEG', 'SUGAR',
    'FOOD_NEC', 'FRUIT_VEG', 'COFFEE', 'TEA', 'COCOA', 'WATER',
    'SOFTDRINKS', 'OTHER_NON_ALCOHOL', 'ALCOHOL', 'TOBACCO', 'OTHER_VEG',
    'SERVICES_PRIMARY_GOODS', 'ALCOHOL_PROCDUCTION_SERVICES',
    'FOOD_HOME', 'FOOD_OUTSIDE',
]
cleaned_fies = cleaned_fies.drop(columns=food_component_cols)

In [None]:
# Non-food component columns are removed;
# their total, NFOOD, is kept and used instead.
non_food_component_cols = [
    'CLOTH', 'HOUSING_WATER', 'FURNISHING', 'HEALTH', 'TRANSPORT',
    'COMMUNICATION', 'RECREATION', 'EDUCATION', 'INSURANCE',
    'MISCELLANEOUS', 'DURABLE', 'OCCASION', 'OTHER_EXPENDITURE',
]
cleaned_fies = cleaned_fies.drop(columns=non_food_component_cols)

In [None]:
# Remaining columns that the analysis does not use
other_unused_cols = [
    'RPROV', 'WAGES', 'LOSSES', 'ACTRENT', 'IMPUTED_RENT', 'BIMPUTED_RENT',
    'FOOD_ACCOM_SRVC', 'RPSU', 'RFACT', 'MEM_RFACT',
    'NPCINC', 'RPCINC', 'PRPCINC', 'PPCINC', 'RPCINC_NIR', 'W_REGN_NIR',
]
cleaned_fies = cleaned_fies.drop(columns=other_unused_cols)


In [None]:
cleaned_fies.columns

In [None]:
# Map the survey's short column codes to descriptive names
column_renames = {
    "W_REGN": "REGION",
    "W_PROV": "PROVINCE",
    "SEQ_NO": "FAMILY_ID",
    "FSIZE": "FAMILY_SIZE",
    "NETSHARE": "NET_SHARE",
    "OTHER_SOURCE": "OTHER_INCOME_SOURCE",
    "REGFT": "GIFT_RECEIPTS",
    "EAINC": "ENTREP_INCOME",
    "FOOD": "FOOD_EXPENSES",
    "NFOOD": "NON_FOOD_EXPENSES",
    "TOINC": "TOTAL_INCOME",
    "TOTEX": "TOTAL_EXPENSES",
    "TOTDIS": "TOTAL_DISBURSEMENTS",
    "OTHREC": "OTHER_RECEIPTS",
    "TOREC": "TOTAL_RECEIPTS",
    "URB": "AREA_CLASS",
    "PERCAPITA": "PCAPITA_INCOME",
}

# Apply the new names by reassignment rather than in place
cleaned_fies = cleaned_fies.rename(columns=column_renames)

In [None]:
cleaned_fies.columns

In [None]:
cleaned_fies.shape

In [None]:
cleaned_fies.info()

### Data Engineering

Classifying incomes into different social classes based on the defined thresholds

*References*
- https://www.moneymax.ph/personal-finance/articles/social-class-philippines
- https://psa.gov.ph/statistics/poverty

In [None]:
poverty_threshold = 12 * 13873  # Official poverty threshold (Annual): 13873 times 12


def classify_income(row):
    """Return the social-class label for one record.

    Parameters
    ----------
    row : mapping
        Any object supporting ``row['TOTAL_INCOME']`` (for example a DataFrame row).

    Returns
    -------
    str
        The label of the first bracket whose upper bound (a multiple of
        ``poverty_threshold``) is strictly greater than the income, or
        ``'Rich'`` when the income is below none of the bounds.
    """
    income = row['TOTAL_INCOME']

    # (multiple of the poverty threshold, label), in ascending order
    brackets = (
        (1, 'Poor'),
        (2, 'Low income'),
        (4, 'Lower-middle income'),
        (7, 'Middle income'),
        (12, 'Upper-middle income'),
        (20, 'Upper income'),
    )
    for multiple, label in brackets:
        if income < multiple * poverty_threshold:
            return label
    return 'Rich'

# Label every row with its social class, then preview income next to the label
social_class_labels = cleaned_fies.apply(classify_income, axis=1)
cleaned_fies = cleaned_fies.assign(SOCIAL_CLASS=social_class_labels)
print(cleaned_fies.loc[:, ['TOTAL_INCOME', 'SOCIAL_CLASS']].head())


Net savings for each family are computed by subtracting total expenses (`TOTAL_EXPENSES`) from total income (`TOTAL_INCOME`); the result is stored in a new column called `NET_SAVINGS` in the `cleaned_fies` DataFrame.

In [None]:
cleaned_fies['NET_SAVINGS'] = cleaned_fies['TOTAL_INCOME'] - cleaned_fies['TOTAL_EXPENSES']

In [None]:
cleaned_fies.columns

In [None]:
cleaned_fies.describe()

## Exploratory Data Analysis

In [None]:
# Income-related columns: the individual components plus their total
income_columns = [
    'REG_SAL',
    'SEASON_SAL',
    'NET_SHARE',
    'CASH_ABROAD',
    'CASH_DOMESTIC',
    'RENTALS_REC',
    'INTEREST',
    'PENSION',
    'DIVIDENDS',
    'OTHER_INCOME_SOURCE',
    'NET_RECEIPT',
    'GIFT_RECEIPTS',
    'ENTREP_INCOME',
    'RENTVAL',
    'TOTAL_INCOME',
]

# Separate frame holding only those columns, used for the income analysis
income_df = cleaned_fies.loc[:, income_columns]

In [None]:
income_df.columns

In [None]:
income_df.shape

In [None]:
# Annotated correlation heatmap of the income components
fig, ax = plt.subplots(figsize=(14, 10))

heatmap_options = dict(
    annot=True,
    fmt='.2f',
    annot_kws={'size': 10},
    cmap='coolwarm',
    linewidths=1,
    linecolor='white',
    square=True,
    cbar_kws={'shrink': 0.8},
)
sns.heatmap(income_df.corr(), ax=ax, **heatmap_options)

ax.set_title('Correlation Heatmap for Income Components', fontsize=18, pad=20)

# Slant the column labels so the long names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=12)
plt.setp(ax.get_yticklabels(), rotation=0, fontsize=12)

# Display the heatmap
fig.tight_layout()
plt.show()

>**Analysis**:
Entrepreneurial income and regular salary show the strongest correlations with total income (0.66 and 0.60 respectively), highlighting their significance in overall household finances.
Rental value moderately correlates with total income and regular salary, suggesting a link between housing value and income levels.
Most other income components show weak correlations, indicating diverse and independent income sources across the population.

In [None]:
# Income component columns to remove; TOTAL_INCOME is deliberately not listed
columns_to_drop = [
    'REG_SAL',
    'SEASON_SAL',
    'NET_SHARE',
    'CASH_ABROAD',
    'CASH_DOMESTIC',
    'RENTALS_REC',
    'INTEREST',
    'PENSION',
    'DIVIDENDS',
    'OTHER_INCOME_SOURCE',
    'NET_RECEIPT',
    'GIFT_RECEIPTS',
    'ENTREP_INCOME',
    'RENTVAL',
]

# Remove the components from cleaned_fies; the total income column stays
cleaned_fies = cleaned_fies.drop(labels=columns_to_drop, axis=1)

In [None]:
cleaned_fies.columns

In [None]:
# Correlation heatmap over all remaining FIES columns

# Ordinal encoding of SOCIAL_CLASS so it can take part in the correlation matrix
social_class_mapping = {
    'Poor': 1,
    'Low income': 2,
    'Lower-middle income': 3,
    'Middle income': 4,
    'Upper-middle income': 5,
    'Upper income': 6,
    'Rich': 7
}

# Encode on a copy so cleaned_fies keeps its text labels for the later plots
corr_fies = cleaned_fies.copy()

# .map() is an explicit label -> code lookup. .replace() on an object column
# relies on implicit downcasting, which recent pandas releases warn about.
corr_fies['SOCIAL_CLASS'] = corr_fies['SOCIAL_CLASS'].map(social_class_mapping)

plt.figure(figsize=(14, 10))
sns.heatmap(
    corr_fies.corr(),
    annot=False,
    cmap='coolwarm',
    linewidths=0,
    square=True,
    cbar_kws={'shrink': 0.8}
)

plt.title('Correlation Heatmap for FIES', fontsize=18, pad=20)

plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(rotation=0, fontsize=12)

# Display the heatmap
plt.tight_layout()
plt.show()

>**Analysis**:
Total income, expenses, and disbursements are highly correlated, as expected in household finances.
Family size positively correlates with food expenses and other disbursements, indicating higher expenses for larger families.
Geographic factors (region and province) show little correlation with financial metrics, suggesting economic status isn't strongly tied to specific areas.
Area classification shows some correlation with financial metrics, hinting at economic differences between urban and rural areas.
Net savings correlate positively with total income and negatively with expenses, but not extremely strongly, suggesting varied saving behaviors across income levels.

In [None]:
# Boxplots to identify outliers in the dataset.
# Each entry: (column, box colour, panel title, x-axis label)
boxplot_specs = [
    ('TOTAL_INCOME', 'skyblue', 'Total Income Boxplot', 'Total Income'),
    ('TOTAL_EXPENSES', 'lightgreen', 'Total Expenses Boxplot', 'Total Expenses'),
    ('PCAPITA_INCOME', 'salmon', 'Per Capita Income Boxplot', 'Per Capita Income'),
]

# One row of three panels, one panel per column
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (column, color, title, xlabel) in zip(axes, boxplot_specs):
    sns.boxplot(data=cleaned_fies[column], color=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)

fig.tight_layout()
plt.show()

In [None]:
# Show floats with two decimals; this is a global pandas option, so it also
# affects every output produced after this cell
pd.options.display.float_format = '{:.2f}'.format

# Descriptive statistics for the NET_SAVINGS column
cleaned_fies.loc[:, 'NET_SAVINGS'].describe()

`NET_SAVINGS` has a minimum value of -5174102.00 and a maximum value of 77005008.00

In [None]:
# Order the rows from the highest to the lowest NET_SAVINGS
sorted_fies = cleaned_fies.sort_values('NET_SAVINGS', ascending=False)

# Show the ten rows with the largest net savings
sorted_fies.iloc[:10]


## Data Visualization

In [None]:
# Social classes from lowest to highest income bracket; passed to the plots
# below as their category order
social_class_order = [
    "Poor", "Low income", "Lower-middle income", "Middle income",
    "Upper-middle income", "Upper income", "Rich",
]

In [None]:
# Number of records in each social class
plt.figure(figsize=(10, 6))
sns.countplot(
    y="SOCIAL_CLASS",
    data=cleaned_fies,
    # A palette needs a hue variable: passing `palette` alone is deprecated in
    # seaborn. Mapping hue to the same column keeps one colour per class, and
    # legend=False hides the redundant legend.
    hue="SOCIAL_CLASS",
    hue_order=social_class_order,
    legend=False,
    palette="viridis",
    order=social_class_order,
)
plt.title('Distribution of Social Classes')
plt.xlabel('Frequency')
plt.ylabel('Social Class')
plt.show()

>**Analysis**:
The majority of households in the Philippines belong to the low-income and poor classes, indicating the prevalence of poverty and related issues such as income inequality.

In [None]:
# Distribution of Social Classes grouped by Region, as stacked bars.
# A countplot with `hue` and dodge=False draws every class bar from zero at the
# same position, so the bars cover one another; stacking keeps each class visible.
class_counts_by_region = (
    pd.crosstab(cleaned_fies["REGION"], cleaned_fies["SOCIAL_CLASS"])
    .reindex(columns=social_class_order, fill_value=0)  # legend follows class order
    .sort_index()
)

ax = class_counts_by_region.plot(
    kind="barh", stacked=True, colormap="viridis", figsize=(10, 6)
)
ax.invert_yaxis()  # barh draws the first row at the bottom; put it at the top
ax.set_title("Distribution of Social Classes by Region")
ax.set_xlabel("Frequency")
ax.set_ylabel("Region")
ax.legend(title="Social Class")
plt.show()

>**Analysis**:
The chart illustrates substantial regional variations in social class distribution, with some areas showing a more diverse mix of income groups while others are dominated by specific social classes.

In [None]:
# Distribution of Social Classes grouped by Area Class.
# The comment and title previously said "by Region", but the grouping
# variable of this plot is AREA_CLASS.
plt.figure(figsize=(10, 10))
sns.countplot(
    y="AREA_CLASS",
    data=cleaned_fies,
    hue="SOCIAL_CLASS",
    hue_order=social_class_order,
    palette="viridis",
)
plt.title("Distribution of Social Classes by Area Class")
plt.xlabel("Frequency")
plt.ylabel("Area Class")
plt.show()

>**Analysis**:
The chart shows differences in social class distribution between urban (`AREA_CLASS = 1`) and rural (`AREA_CLASS = 2`) areas: rural areas have higher proportions of poor and low-income groups, while urban areas show a more balanced distribution across income levels.

In [None]:
# Graph for Total Income by Social Class
plt.figure(figsize=(12, 8))
sns.boxplot(
    x="SOCIAL_CLASS",
    y="TOTAL_INCOME",
    data=cleaned_fies,
    # `palette` without `hue` is deprecated in seaborn; hue on the same column
    # keeps one colour per class and legend=False hides the redundant legend
    hue="SOCIAL_CLASS",
    hue_order=social_class_order,
    legend=False,
    palette="viridis",
    order=social_class_order,
)
plt.title('Total Income by Social Class')
plt.xlabel('Social Class')
plt.ylabel('Total Income')
plt.show()

>**Analysis**:
The plot shows income inequality across social classes, with the "Rich" category having significantly higher and more varied incomes. There is a clear upward trend in income levels from "Poor" to "Rich", and the gap between classes widens further up the social ladder. The lower income classes show very compressed distributions, while the upper classes, especially the "Rich", display wide ranges and numerous high-income outliers.

In [None]:
# Graph for Total Expenses by Social Class
plt.figure(figsize=(12, 8))
sns.boxplot(
    x="SOCIAL_CLASS",
    y="TOTAL_EXPENSES",
    data=cleaned_fies,
    # `palette` without `hue` is deprecated in seaborn; hue on the same column
    # keeps one colour per class and legend=False hides the redundant legend
    hue="SOCIAL_CLASS",
    hue_order=social_class_order,
    legend=False,
    palette="viridis",
    order=social_class_order,
)
plt.title("Total Expenses by Social Class")
plt.xlabel("Social Class")
plt.ylabel("Total Expenses")
plt.show()

>**Analysis**:
The expense plot, while following a similar trend, shows less dramatic differences between classes, suggesting that while income varies greatly, expenses don't increase as sharply across social classes.

In [None]:
# Graph for Net Savings by Social Class
plt.figure(figsize=(12, 8))
sns.boxplot(
    x="SOCIAL_CLASS",
    y="NET_SAVINGS",
    data=cleaned_fies,
    # `palette` without `hue` is deprecated in seaborn; hue on the same column
    # keeps one colour per class and legend=False hides the redundant legend
    hue="SOCIAL_CLASS",
    hue_order=social_class_order,
    legend=False,
    palette="viridis",
    order=social_class_order,
)
plt.title("Net Savings by Social Class")
plt.xlabel("Social Class")
plt.ylabel("Net Savings")
plt.show()

>**Analysis**:
Lower income groups show minimal savings capacity with compressed distributions near zero. As we move up the social ladder, there's a clear upward trend in net savings, with the "Rich" category displaying significantly higher median savings and a much wider distribution. Upper income classes, especially the "Rich", exhibit numerous high-value outliers, indicating substantial savings potential. However, the presence of negative outliers across most classes suggests that instances of debt or negative savings occur at all social levels, though more pronounced in lower income groups.

In [None]:
# Graph for Family Size by Social Class
plt.figure(figsize=(15, 6))
sns.boxplot(
    x="SOCIAL_CLASS",
    y="FAMILY_SIZE",
    data=cleaned_fies,
    hue="SOCIAL_CLASS",
    palette="viridis",
    legend=False,
    hue_order=social_class_order,
    # hue_order only fixes the colours; `order` is what places the boxes along
    # the x-axis, so it is passed too, matching the other class plots
    order=social_class_order,
)

plt.title('Family Size by Social Class')
plt.xlabel('Social Class')
plt.ylabel('Family Size')
plt.show()


>**Insights**:
`FAMILY_SIZE` does not change significantly across the `SOCIAL_CLASS` groups, except for the `Poor` class.

### Demographic Visualization

In [None]:
# Distribution of Regions
plt.figure(figsize=(10, 6))
sns.countplot(
    y="REGION",
    data=cleaned_fies,
    # `palette` without `hue` is deprecated in seaborn; hue on the same column
    # colours the bars per region and legend=False hides the redundant legend
    hue="REGION",
    legend=False,
    palette="viridis",
)
plt.title("Distribution of Regions")
plt.xlabel("Frequency")
plt.ylabel("Region")
plt.show()

>**Insights**:
NCR has the highest number of families, followed by Region 3, Region 6, Region 15, and Region 8; Region 1 has the fewest.

In [None]:
# Savings by Region
plt.figure(figsize=(10, 6))
sns.boxplot(
    x="REGION",
    y="NET_SAVINGS",
    data=cleaned_fies,
    # `palette` without `hue` is deprecated in seaborn; hue on the same column
    # colours the boxes per region and legend=False hides the redundant legend
    hue="REGION",
    legend=False,
    palette="viridis",
)
plt.title("Net Savings by Region")
plt.xlabel("Region")
plt.ylabel("Net Savings")
plt.show()


>**Analysis**: The box plot reveals significant disparities in net savings across different regions. While most regions show median net savings close to zero, there's considerable variation in the spread and outliers. Some regions display higher positive outliers, indicating individuals with substantially higher savings. However, the generally small interquartile ranges suggest that the majority of people in each region have similar, relatively low levels of net savings. The presence of both positive and negative outliers in most regions highlights the economic diversity within each area, with some individuals having high savings and others facing debt.

---

# Overall Insights

## Key Findings

1. **Income Inequality**: The analysis reveals significant income disparities across different social classes in the Philippines. The "Rich" category shows substantially higher and more varied incomes compared to other classes.

2. **Social Class Distribution**: The majority of households in the Philippines fall into the low income and poor classes, indicating prevalent poverty and income inequality issues.

3. **Regional Variations**: There are substantial regional differences in social class distribution. Some areas show a more diverse mix of income groups, while others are dominated by specific social classes.

4. **Urban vs. Rural Divide**: Rural areas tend to have higher proportions of poor and low-income groups, while urban areas demonstrate a more balanced distribution across various income levels.

5. **Expenses and Savings**: While income varies greatly across social classes, expenses don't increase as sharply. This leads to significant differences in net savings capacity, with lower income groups showing minimal savings potential and upper classes, especially the "Rich", displaying much higher median savings.

6. **Family Size**: Family size does not change significantly across social classes, except for the "Poor" category which shows some variation.

7. **Regional Demographics**: The National Capital Region (NCR) has the highest number of families, followed by Regions 3, 6, 15, and 8. Region 1 has the least number of families.

8. **Net Savings by Region**: There are significant disparities in net savings across different regions, with most regions showing median net savings close to zero but considerable variation in outliers.

## Implications

These findings highlight the complex socio-economic landscape of the Philippines, characterized by significant income inequality, regional disparities, and varying capacities for savings across different social classes. The data suggests a need for targeted economic policies and interventions to address poverty, promote more equitable income distribution, and enhance savings capacity, particularly in rural areas and among lower income groups.

## Future Directions

As noted in the project description, future stages of this analysis will include:

- Machine learning model implementation
- More comprehensive data analysis
- Advanced data visualization techniques
- Formulation of conclusions and recommendations based on the extended analysis

These additional steps will likely provide deeper insights into the factors influencing income distribution, savings behavior, and overall economic well-being across different segments of the Philippine population.

In [None]:
cleaned_fies.columns

In [None]:
income_df.columns

In [None]:
%matplotlib inline

# Columns the derived features are built from
total_expenses = cleaned_fies['TOTAL_EXPENSES']
food_expenses = cleaned_fies['FOOD_EXPENSES']
non_food_expenses = cleaned_fies['NON_FOOD_EXPENSES']
total_receipts = cleaned_fies['TOTAL_RECEIPTS']
family_size = cleaned_fies['FAMILY_SIZE']

# Ratio features (relative to receipts or to total expenses) and
# per-capita features (divided by family size)
cleaned_fies = cleaned_fies.assign(
    TOTAL_EXPENSES_RATIO=total_expenses / total_receipts,
    FOOD_EXPENSES_RATIO=food_expenses / total_expenses,
    NON_FOOD_EXPENSES_RATIO=non_food_expenses / total_expenses,
    SAVINGS_RATIO=cleaned_fies['NET_SAVINGS'] / total_receipts,
    PCAPITA_EXPENSES=total_expenses / family_size,
    PCAPITA_FOOD_EXPENSES=food_expenses / family_size,
    PCAPITA_NON_FOOD_EXPENSES=non_food_expenses / family_size,
)

# Model inputs: family size, the derived ratios and per-capita values,
# and the location columns
features = [
    'FAMILY_SIZE',
    'FOOD_EXPENSES_RATIO',
    'NON_FOOD_EXPENSES_RATIO',
    'SAVINGS_RATIO',
    'PCAPITA_EXPENSES',
    'PCAPITA_FOOD_EXPENSES',
    'PCAPITA_NON_FOOD_EXPENSES',
    'AREA_CLASS',
    'REGION',
    'PROVINCE',
]

In [None]:
# Feature matrix and target labels
X = cleaned_fies[features]
y = cleaned_fies['SOCIAL_CLASS']

# Hold out 20% of the rows for testing. The classes are imbalanced (the model
# below is trained with class weights), so the split is stratified on y to
# give both partitions the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit the scaler on the training rows only, then apply the same fitted
# transformation to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# # apply smote for handling class imbalance
# smote = SMOTE(random_state=42)
# X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

In [None]:
y.value_counts()

In [None]:
import optuna
from sklearn.metrics import f1_score

In [None]:
# Balanced class weights, computed from the training labels
class_labels = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=class_labels,
                                     y=y_train)
class_weight_dict = dict(zip(class_labels, class_weights))


def objective(trial):
    """Optuna objective: mean cross-validated weighted F1 of a random forest.

    The score is computed with 3-fold cross-validation on the training data
    only. Scoring trials on the test set would tune the hyperparameters to that
    set and make the final test evaluation optimistic.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample n_estimators, max_depth, min_samples_split and
        min_samples_leaf.

    Returns
    -------
    float
        Mean weighted F1 score over the folds (the study maximises it).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 10, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'class_weight': class_weight_dict,
        'random_state': 42,
        'n_jobs': -1,  # build the trees in parallel; each trial now fits 3 forests
    }

    rf_model = RandomForestClassifier(**params)

    fold_scores = cross_val_score(
        rf_model, X_train_scaled, y_train, cv=3, scoring='f1_weighted'
    )
    return fold_scores.mean()


# The study is persisted in rf_study.db and reloaded when it already exists, so
# every run of this cell ADDS 100 trials to it. It uses its own study name
# because cross-validated scores are not comparable with trials stored by an
# earlier, test-set-scored objective. The seeded sampler makes the sequence of
# suggested hyperparameters reproducible for a fresh study.
study = optuna.create_study(
    direction='maximize',
    study_name='Random Forest Optimization (CV)',
    storage='sqlite:///rf_study.db',
    load_if_exists=True,
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

trial = study.best_trial
print('Cross-validated F1 Score: {}'.format(trial.value))




In [None]:
trial = study.best_trial

# trial.params holds only the values suggested during the search
# (n_estimators, max_depth, min_samples_split, min_samples_leaf). The class
# weights and the seed were fixed inside the objective, so they are passed
# again here; without them the final model is unweighted, unseeded, and not the
# configuration that was tuned.
rf_model = RandomForestClassifier(
    **trial.params,
    class_weight=class_weight_dict,
    random_state=42,
)

rf_model.fit(X_train_scaled, y_train)

# Make predictions on the held-out test set
y_pred = rf_model.predict(X_test_scaled)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print accuracy (share of test rows predicted correctly)
print(f"Accuracy: {np.mean(y_pred == y_test):.4f}")

In [None]:
# 1. Confusion Matrix
# `labels` fixes the row/column order to the class order used throughout the
# notebook, and the same list labels the axes so cells are readable by name.
cm = confusion_matrix(y_test, y_pred, labels=social_class_order)

plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=social_class_order,
    yticklabels=social_class_order,
)
plt.title("Improved Model Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# 2. Feature Importance
# Pair each feature name with the fitted forest's importance, largest first
feature_importance = pd.DataFrame(
    {"feature": features, "importance": rf_model.feature_importances_}
).sort_values("importance", ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x="importance", y="feature", data=feature_importance)
plt.title("Feature Importance (Improved Model)")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()




In [None]:
def _boxplot_by_social_class(column, title, ylabel):
    """Show a box plot of one `cleaned_fies` column for each social class.

    The classes are laid out in `social_class_order`, the same order the
    earlier class plots in this notebook use.

    Parameters
    ----------
    column : str
        Name of the `cleaned_fies` column plotted on the y-axis.
    title : str
        Figure title.
    ylabel : str
        Label of the y-axis.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(
        x="SOCIAL_CLASS",
        y=column,
        data=cleaned_fies,
        order=social_class_order,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel("Social Class")
    ax.set_ylabel(ylabel)
    ax.tick_params(axis="x", labelrotation=45)
    plt.show()


# 3. Distribution of Social Classes (most frequent class first)
class_frequency_order = cleaned_fies["SOCIAL_CLASS"].value_counts().index
plt.figure(figsize=(12, 6))
sns.countplot(y="SOCIAL_CLASS", data=cleaned_fies, order=class_frequency_order)
plt.title("Distribution of Social Classes")
plt.xlabel("Count")
plt.ylabel("Social Class")
plt.show()

# 4. Savings Ratio by Social Class
_boxplot_by_social_class(
    "SAVINGS_RATIO", "Savings Ratio by Social Class", "Savings Ratio"
)

# 5. Food Expenses Ratio by Social Class
_boxplot_by_social_class(
    "FOOD_EXPENSES_RATIO", "Food Expenses Ratio by Social Class", "Food Expenses Ratio"
)

# 5.5 Non-food Expenses Ratio by Social Class
_boxplot_by_social_class(
    "NON_FOOD_EXPENSES_RATIO",
    "Non-Food Expenses Ratio by Social Class",
    "Non-Food Expenses Ratio",
)

# 6. Per Capita Expenses by Region
plt.figure(figsize=(14, 8))
sns.boxplot(x="REGION", y="PCAPITA_EXPENSES", data=cleaned_fies)
plt.title("Per Capita Expenses by Region")
plt.xlabel("Region")
plt.ylabel("Per Capita Expenses")
plt.xticks(rotation=45)
plt.show()

# 7. Correlation Heatmap
correlation_features = [
    "FAMILY_SIZE",
    "FOOD_EXPENSES_RATIO",
    "NON_FOOD_EXPENSES_RATIO",
    "SAVINGS_RATIO",
    "PCAPITA_EXPENSES",
    "PCAPITA_FOOD_EXPENSES",
    "PCAPITA_NON_FOOD_EXPENSES",
]
correlation_matrix = cleaned_fies[correlation_features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1, center=0)
plt.title("Correlation Heatmap of Key Features")
plt.tight_layout()
plt.show()

# 8. Family Size Distribution by Social Class
_boxplot_by_social_class(
    "FAMILY_SIZE", "Family Size Distribution by Social Class", "Family Size"
)

# 9. Area Class Distribution by Social Class
# Share of each AREA_CLASS value within every social class. The codes are
# renamed explicitly (1 -> Urban, 2 -> Rural, the coding stated in this
# notebook's urban/rural analysis) so the legend does not depend on the
# order in which the columns happen to come out of unstack().
area_class_dist = (
    cleaned_fies.groupby('SOCIAL_CLASS')['AREA_CLASS']
    .value_counts(normalize=True)
    .unstack()
    .reindex(social_class_order)
    .rename(columns={1: 'Urban', 2: 'Rural'})
)
ax = area_class_dist.plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_title('Area Class Distribution by Social Class')
ax.set_xlabel('Social Class')
ax.set_ylabel('Proportion')
ax.legend(title='Area Class')
ax.tick_params(axis='x', labelrotation=45)
plt.tight_layout()
plt.show()

# 10. Per Capita Food Expenses by Social Class
_boxplot_by_social_class(
    'PCAPITA_FOOD_EXPENSES',
    'Per Capita Food Expenses by Social Class',
    'Per Capita Food Expenses',
)

# 11. Savings Ratio Distribution
plt.figure(figsize=(12, 6))
sns.histplot(data=cleaned_fies, x='SAVINGS_RATIO', kde=True)
plt.title('Distribution of Savings Ratio')
plt.xlabel('Savings Ratio')
plt.ylabel('Count')
plt.show()