In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("../data/personal_finance_expenses.csv")

Features shape: (20000, 23)
Target shape: (20000,)


In [2]:
# --- Feature engineering ---
expense_cols = [
    'Rent','Loan_Repayment','Insurance','Groceries','Transport','Eating_Out',
    'Entertainment','Utilities','Healthcare','Education','Miscellaneous'
]

df['total_expenses'] = df[expense_cols].sum(axis=1)
df['estimated_savings'] = df['Income'] - df['total_expenses']

print(df[['Income','total_expenses','estimated_savings',
          'Desired_Savings','Disposable_Income']].head(5))

# Target
y = df['Desired_Savings']

# Features (drop derived columns and target)
X = df.drop(['Desired_Savings'] + 
            [col for col in df.columns if 'Potential_Savings' in col], axis=1)

# One-hot encode categorical columns
categorical_cols = ['Occupation', 'City_Tier']
X = pd.get_dummies(X, columns=categorical_cols)

print("Features shape:", X.shape)
print("Target shape:", y.shape)


          Income  total_expenses  estimated_savings  Desired_Savings  \
0   44637.249636    33371.621929       11265.627707      6200.537192   
1   26858.596592    17181.777859        9676.818733      1923.176434   
2   50367.605084    36476.154459       13891.450624      7050.360422   
3  101455.600247    69837.646632       31617.953615     16694.965136   
4   24875.283548    18609.583016        6265.700532      1874.099434   

   Disposable_Income  
0       11265.627707  
1        9676.818733  
2       13891.450624  
3       31617.953615  
4        6265.700532  
Features shape: (20000, 25)
Target shape: (20000,)


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)


Training set: (16000, 23) (16000,)
Test set: (4000, 23) (4000,)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("RMSE:", rmse)
print("R^2 Score:", r2)
