In [63]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import pandas as pd
from pathlib import Path
import pickle
import os

In [64]:
data_dir = os.path.join(Path.cwd().parent, 'data', 'raw_data', "personal_budget_dataset.csv")
df = pd.read_csv(data_dir)

In [65]:
df["Category"].value_counts()

Category
Groceries         24
Rent              24
Utilities         24
Entertainment     24
Transportation    24
Dining Out        24
Healthcare        24
Savings           24
Name: count, dtype: int64

In [66]:
def encode_categories(data:pd.DataFrame, column:str) -> pd.DataFrame:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    with open("encoder.pickle", 'wb') as file:
        pickle.dump(encoder, file)
    return data

def seperate_data(data, config):
    features = data.iloc[:,:-config["output_features"]]
    target = data.iloc[:, -config["output_features"]:]
    return features, target


def split_data(features, target, config):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=config["test_size"])
    return x_train, x_test, y_train, y_test


In [70]:
config = {"output_features": 1, "test_size":0.2}
#data = encode_categories(df, 'Category')
#category = df.pop(['Category'])
income = df.pop(["Income"])
features, target = seperate_data(df, config)

InvalidIndexError: ['Income']

In [71]:
x_train, x_test, y_train, y_test = split_data(features, target, config)

In [72]:
def get_model():
    params = {
        "gamma": range(0,2),
        "max_depth": range(0,5),
        "lambda": range(0,5)
    }
    model = GridSearchCV(XGBRegressor(), param_grid=params, scoring='neg_mean_squared_error', cv=5,)
    return model


def get_best_param(model, x_train, y_train):
    model.fit(x_train, y_train)
    best_param = model.best_params_
    return best_param


def get_best_estimator(best_param):
    model = XGBRegressor(**best_param)
    return model
    

In [73]:
model = get_model()
best_params = get_best_param(model, x_train.values, y_train.values)

In [74]:
best_params

{'gamma': 0, 'lambda': 1, 'max_depth': 3}

In [75]:
from sklearn.metrics import mean_squared_error

y_pred = model.predict(x_test)
mse = mean_squared_error(y_pred, y_test)
mse

208.7369851145214