In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import joblib
import numpy as np


In [2]:
# Load the raw budget survey spreadsheet.
file_path = '/content/Financial budget system datasets.xlsx'
data = pd.read_excel(file_path)

# Rename columns for easier access. The assignment is positional, so the sheet
# is expected to contain exactly these 11 columns in this order.
data.columns = ['Age Range', 'Household', 'Employment Status', 'Total Monthly Income',
                'Monthly Food', 'Monthly Housing', 'Monthly Transportation',
                'Monthly Utilities', 'Monthly Insurance', 'Monthly Savings',
                'Monthly Other Expenses']

# Encode categorical variables.
# Each column gets its own LabelEncoder: an encoder only keeps the classes from
# its most recent fit, so sharing a single instance would make both saved
# encoders carry the 'Employment Status' mapping and the age encoder useless.
age_encoder = LabelEncoder()
employment_encoder = LabelEncoder()
data['Age Range'] = age_encoder.fit_transform(data['Age Range'])
data['Employment Status'] = employment_encoder.fit_transform(data['Employment Status'])

# Save the encoders for use in the Flask app (one file per categorical column).
joblib.dump(age_encoder, 'age_encoder.pkl')
joblib.dump(employment_encoder, 'employment_encoder.pkl')

# Define the features and target variables
features = data[['Age Range', 'Household', 'Employment Status', 'Total Monthly Income']]
target_columns = ['Monthly Food', 'Monthly Housing', 'Monthly Transportation',
                  'Monthly Utilities', 'Monthly Insurance', 'Monthly Savings', 'Monthly Other Expenses']
target = data[target_columns]

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [3]:
# Random forest candidate, wrapped so that each target column gets its own regressor.
rf_model = MultiOutputRegressor(
    RandomForestRegressor(random_state=42)
)

# 5-fold cross-validation on the training split. The scorer returns negated
# MSE (higher is better), so the sign is flipped back for reporting.
rf_scores = cross_val_score(
    rf_model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
print("Random Forest Regressor MSE:", -rf_scores.mean())

Random Forest Regressor MSE: 21702475.758928575


In [4]:
# Gradient boosting candidate, wrapped so that each target column gets its own regressor.
gb_model = MultiOutputRegressor(
    GradientBoostingRegressor(random_state=42)
)

# 5-fold cross-validation on the training split. The scorer returns negated
# MSE (higher is better), so the sign is flipped back for reporting.
gb_scores = cross_val_score(
    gb_model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
print("Gradient Boosting Regressor MSE:", -gb_scores.mean())


Gradient Boosting Regressor MSE: 21060021.75203604


In [5]:
# Single decision tree candidate, wrapped so that each target column gets its own regressor.
dt_model = MultiOutputRegressor(
    DecisionTreeRegressor(random_state=42)
)

# 5-fold cross-validation on the training split. The scorer returns negated
# MSE (higher is better), so the sign is flipped back for reporting.
dt_scores = cross_val_score(
    dt_model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
print("Decision Tree Regressor MSE:", -dt_scores.mean())

Decision Tree Regressor MSE: 22822619.04761905


In [6]:
# Ordinary least squares candidate, wrapped so that each target column gets its own regressor.
lr_model = MultiOutputRegressor(
    LinearRegression()
)

# 5-fold cross-validation on the training split. The scorer returns negated
# MSE (higher is better), so the sign is flipped back for reporting.
lr_scores = cross_val_score(
    lr_model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
print("Linear Regression MSE:", -lr_scores.mean())

Linear Regression MSE: 783504541.5899264


In [7]:
# Support vector regression candidate (default hyper-parameters), wrapped so
# that each target column gets its own regressor.
svr_model = MultiOutputRegressor(
    SVR()
)

# 5-fold cross-validation on the training split. The scorer returns negated
# MSE (higher is better), so the sign is flipped back for reporting.
svr_scores = cross_val_score(
    svr_model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
print("Support Vector Regressor MSE:", -svr_scores.mean())

Support Vector Regressor MSE: 108741749.55074616


In [8]:
# Cross-validation fold scores for every candidate, keyed by display name.
cv_scores_by_model = {
    "Random Forest": rf_scores,
    "Gradient Boosting": gb_scores,
    "Decision Tree": dt_scores,
    "Linear Regression": lr_scores,
    "Support Vector Regressor": svr_scores,
}

# Fold scores are negated MSE values; average them and flip the sign to get a
# positive mean MSE per model.
models_mse = {
    label: -np.mean(fold_scores)
    for label, fold_scores in cv_scores_by_model.items()
}

# The winner is the entry with the smallest mean MSE (first one wins on ties).
best_model_name, best_model_mse = min(models_mse.items(), key=lambda entry: entry[1])

print(f"Best Model: {best_model_name} with MSE of {best_model_mse}")

Best Model: Gradient Boosting with MSE of 21060021.75203604


In [9]:
# Map each display name used in the model comparison to its estimator object.
# A lookup table with an explicit error replaces the open-ended if/elif chain:
# with no fallback branch, an unmatched name would leave `best_model` unbound
# or, in a live kernel, silently reuse a stale `best_model` from an earlier run.
candidate_models = {
    "Random Forest": rf_model,
    "Gradient Boosting": gb_model,
    "Decision Tree": dt_model,
    "Linear Regression": lr_model,
    "Support Vector Regressor": svr_model,
}
if best_model_name not in candidate_models:
    raise ValueError(f"No estimator registered for best model {best_model_name!r}")
best_model = candidate_models[best_model_name]

# Train the best model on the entire training set
best_model.fit(X_train, y_train)

# Report the error on the held-out split before persisting the model.
# The mean over all rows and target columns matches the uniformly averaged
# MSE reported during cross-validation.
holdout_mse = np.mean((y_test.to_numpy() - best_model.predict(X_test)) ** 2)
print(f"{best_model_name} hold-out MSE: {holdout_mse}")

# Save the trained best model
joblib.dump(best_model, 'best_budget_recommender_model.pkl')

['best_budget_recommender_model.pkl']