In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import joblib
import numpy as np


In [19]:
# Read the household budget survey spreadsheet into a DataFrame
file_path = '/content/Financial budget system datasets.xlsx'
data = pd.read_excel(io=file_path)


In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,Household Size,Total Monthly Income (â‚¦),Monthly Feeding Expense (â‚¦),Monthly Housing Expense (â‚¦),Monthly Utilities Expense (â‚¦),Monthly Transportation Expense (â‚¦),Monthly Healthcare Expense (â‚¦),Monthly Savings (â‚¦),Other Monthly Expenses (â‚¦)
count,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0
mean,3.0,188520.0,36966.666667,27180.0,15586.666667,12460.0,9973.333333,26066.666667,16260.0
std,1.418951,480069.4,12583.901864,12666.644884,7224.294087,8905.642582,5671.010311,13285.035567,9869.912248
min,1.0,50000.0,15000.0,10000.0,3000.0,4000.0,3000.0,5000.0,2000.0
25%,2.0,88000.0,25000.0,15000.0,10000.0,8000.0,5000.0,15000.0,10000.0
50%,3.0,137000.0,35000.0,25000.0,15000.0,12000.0,10000.0,25000.0,15000.0
75%,4.0,183000.0,45000.0,35000.0,20000.0,15000.0,15000.0,40000.0,25000.0
max,5.0,5926000.0,60000.0,50000.0,30000.0,100000.0,20000.0,50000.0,35000.0


In [21]:
# Column names, non-null counts and dtypes — used to check for missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   Age Range                             150 non-null    object
 1   Household Size                        150 non-null    int64 
 2   Employment Status                     150 non-null    object
 3   Total Monthly Income (â‚¦)            150 non-null    int64 
 4   Monthly Feeding Expense (â‚¦)         150 non-null    int64 
 5   Monthly Housing Expense (â‚¦)         150 non-null    int64 
 6   Monthly Utilities Expense (â‚¦)       150 non-null    int64 
 7   Monthly Transportation Expense (â‚¦)  150 non-null    int64 
 8   Monthly Healthcare Expense (â‚¦)      150 non-null    int64 
 9   Monthly Savings (â‚¦)                 150 non-null    int64 
 10  Other Monthly Expenses (â‚¦)          150 non-null    int64 
dtypes: int64(9), object(2)
memory us

In [22]:
# Preview the first five rows of the raw data
data.head()

Unnamed: 0,Age Range,Household Size,Employment Status,Total Monthly Income (â‚¦),Monthly Feeding Expense (â‚¦),Monthly Housing Expense (â‚¦),Monthly Utilities Expense (â‚¦),Monthly Transportation Expense (â‚¦),Monthly Healthcare Expense (â‚¦),Monthly Savings (â‚¦),Other Monthly Expenses (â‚¦)
0,Under 25,1,Employed full time,100000,30000,20000,10000,5000,5000,20000,10000
1,25 - 34,2,Employed part time,140000,35000,25000,15000,15000,10000,25000,15000
2,35 - 44,3,Self Employed,180000,45000,35000,20000,15000,15000,30000,20000
3,45 - 54,4,Unemployed,260000,60000,50000,30000,20000,20000,50000,30000
4,55 - 64,5,Student,78000,25000,15000,8000,6000,4000,15000,5000


In [23]:
# Distinct age brackets present in the data (categorical, stored as strings)
data['Age Range'].unique()

array(['Under 25', '25 - 34', '35 - 44', '45 - 54', '55 - 64', '65+'],
      dtype=object)

In [24]:
# Distinct employment statuses present in the data (categorical, stored as strings)
data['Employment Status'].unique()

array(['Employed full time', 'Employed part time', 'Self Employed',
       'Unemployed', 'Student'], dtype=object)

In [25]:
# Rename columns for easier access.
# Names are assigned by POSITION, so this list must follow the spreadsheet's
# column order exactly:
#   Age Range, Household Size, Employment Status, Total Monthly Income,
#   Feeding, Housing, Utilities, Transportation, Healthcare, Savings, Other.
# Utilities comes before Transportation in the source data; listing them the
# other way round silently swaps the two expense columns.
# 'Monthly Insurance' is the short name used for the healthcare expense column.
data.columns = ['Age Range', 'Household', 'Employment Status', 'Total Monthly Income',
                'Monthly Food', 'Monthly Housing', 'Monthly Utilities',
                'Monthly Transportation', 'Monthly Insurance', 'Monthly Savings',
                'Monthly Other Expenses']

In [28]:
label_encode_features = ['Age Range', 'Employment Status']

# This cell overwrites the string columns of `data` with integer codes, so it
# must run exactly once per load. Running it again would fit the encoders on the
# integer codes and overwrite 'label_encoder.pkl' with encoders that no longer
# know the original category strings. Check before touching anything.
already_encoded = [
    column for column in label_encode_features
    if pd.api.types.is_numeric_dtype(data[column])
]
if already_encoded:
    raise RuntimeError(
        f"Columns {already_encoded} are already numeric; reload the dataset "
        "before re-running the label-encoding cell."
    )

# Label encode these features, keeping one fitted encoder per column so the
# same string -> integer mapping can be applied at prediction time
label_encoders = {}
for column in label_encode_features:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Save the encoders dictionary for use in the Flask app
joblib.dump(label_encoders, 'label_encoder.pkl')

['label_encoder.pkl']

In [29]:
# Model inputs: household profile and income; model outputs: the seven budget lines
feature_columns = ['Age Range', 'Household', 'Employment Status', 'Total Monthly Income']
target_columns = ['Monthly Food', 'Monthly Housing', 'Monthly Transportation',
                  'Monthly Utilities', 'Monthly Insurance', 'Monthly Savings', 'Monthly Other Expenses']
features = data[feature_columns]
target = data[target_columns]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Random forest, one regressor per budget line, scored with 5-fold CV on the training split
rf_model = MultiOutputRegressor(estimator=RandomForestRegressor(random_state=42))
rf_scores = cross_val_score(
    rf_model, X_train, y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
# The scorer returns negated MSE, so flip the sign before reporting
rf_cv_mse = -np.mean(rf_scores)
print("Random Forest Regressor MSE:", rf_cv_mse)

Random Forest Regressor MSE: 21702475.758928575


In [31]:
# Gradient boosting, one regressor per budget line, scored with 5-fold CV on the training split
gb_model = MultiOutputRegressor(estimator=GradientBoostingRegressor(random_state=42))
gb_scores = cross_val_score(
    gb_model, X_train, y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
# The scorer returns negated MSE, so flip the sign before reporting
gb_cv_mse = -np.mean(gb_scores)
print("Gradient Boosting Regressor MSE:", gb_cv_mse)


Gradient Boosting Regressor MSE: 21060021.75203604


In [32]:
# Single decision tree per budget line, scored with 5-fold CV on the training split
dt_model = MultiOutputRegressor(estimator=DecisionTreeRegressor(random_state=42))
dt_scores = cross_val_score(
    dt_model, X_train, y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
# The scorer returns negated MSE, so flip the sign before reporting
dt_cv_mse = -np.mean(dt_scores)
print("Decision Tree Regressor MSE:", dt_cv_mse)

Decision Tree Regressor MSE: 22822619.04761905


In [33]:
# Ordinary least squares per budget line, scored with 5-fold CV on the training split
lr_model = MultiOutputRegressor(estimator=LinearRegression())
lr_scores = cross_val_score(
    lr_model, X_train, y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
# The scorer returns negated MSE, so flip the sign before reporting
lr_cv_mse = -np.mean(lr_scores)
print("Linear Regression MSE:", lr_cv_mse)

Linear Regression MSE: 783504541.5899264


In [34]:
# Support vector regression (default settings) per budget line, scored with 5-fold CV
svr_model = MultiOutputRegressor(estimator=SVR())
svr_scores = cross_val_score(
    svr_model, X_train, y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
# The scorer returns negated MSE, so flip the sign before reporting
svr_cv_mse = -np.mean(svr_scores)
print("Support Vector Regressor MSE:", svr_cv_mse)

Support Vector Regressor MSE: 108741749.55074616


In [35]:
# Cross-validation scores of every candidate, keyed by display name
cv_scores_by_model = {
    "Random Forest": rf_scores,
    "Gradient Boosting": gb_scores,
    "Decision Tree": dt_scores,
    "Linear Regression": lr_scores,
    "Support Vector Regressor": svr_scores,
}

# Scores are negated MSE, so negate the fold average to get a positive error
models_mse = {
    model_name: -np.mean(fold_scores)
    for model_name, fold_scores in cv_scores_by_model.items()
}

# The best candidate is the one with the smallest mean cross-validated error
best_model_name, best_model_mse = min(models_mse.items(), key=lambda entry: entry[1])

print(f"Best Model: {best_model_name} with MSE of {best_model_mse}")

Best Model: Gradient Boosting with MSE of 21060021.75203604


In [36]:
# Map each display name to its (still unfitted) estimator. A dictionary lookup
# fails immediately with a KeyError on an unknown name, instead of leaving
# `best_model` undefined or pointing at a stale object from an earlier run.
candidate_models = {
    "Random Forest": rf_model,
    "Gradient Boosting": gb_model,
    "Decision Tree": dt_model,
    "Linear Regression": lr_model,
    "Support Vector Regressor": svr_model,
}
best_model = candidate_models[best_model_name]

# Train the best model on the entire training set
best_model.fit(X_train, y_train)

# Report the error on the held-out split, which played no part in model
# selection. This is the squared error averaged over all rows and all budget
# lines, which matches the uniformly averaged multi-output MSE used for the CV scores.
test_predictions = best_model.predict(X_test)
test_mse = np.mean((y_test.to_numpy() - test_predictions) ** 2)
print(f"{best_model_name} held-out test MSE: {test_mse}")

# Save the trained best model
joblib.dump(best_model, 'best_budget_recommender_model.pkl')

['best_budget_recommender_model.pkl']