In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset


In [None]:
# Fetch the California housing dataset via the `datasets` package and show what came back.
# NOTE(review): at this point `df` is the object returned by `load_dataset`
# (indexed by split name in the merge cell), not a pandas DataFrame; the same
# name is rebound to the merged DataFrame later, so this cell must run first.
df = load_dataset("gvlassis/california_housing")
df

In [None]:
# Merge the train, validation and test splits into one frame; the notebook
# draws its own train/test split further down.
df_train = pd.DataFrame(df['train'])
df_validation = pd.DataFrame(df['validation'])
df_test = pd.DataFrame(df['test'])

# ignore_index=True: every split frame has its own 0..n-1 index, so a plain
# concat would leave duplicate row labels in the merged frame.
df = pd.concat([df_train, df_validation, df_test], ignore_index=True)

In [None]:
# (rows, columns) of the merged frame.
df.shape

In [None]:
# Column dtypes and non-null counts, as a quick check for missing values.
df.info()

In [None]:
# Preview the first rows of the merged frame.
df.head()

In [None]:
# Summary statistics for every numeric column.
df.describe()

In [None]:
# Separate the regression target from the feature columns.
target_column = 'MedHouseVal'
X = df.drop(columns=[target_column])
y = df[target_column]

In [None]:
# Sanity check: features and target should have the same number of rows.
X.shape, y.shape

In [None]:
# Preview the feature matrix; a bounded slice keeps the saved output small.
X.head()

In [None]:
# Preview the target values; a bounded slice keeps the saved output small.
y.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Confirm the train/test partitions have matching row counts.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test data never influences the fit.
scalar = StandardScaler().fit(X_train)
X_train, X_test = scalar.transform(X_train), scalar.transform(X_test)

In [None]:
# Preview the first standardized training rows instead of dumping the whole array.
X_train[:5]

In [None]:
# Preview the first standardized test rows instead of dumping the whole array.
X_test[:5]

In [None]:
# Preview the first training targets instead of dumping the whole series.
y_train.head()

In [None]:
# Preview the first test targets instead of dumping the whole series.
y_test.head()

# XGBoost : Regression

In [None]:
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Baseline XGBoost regressor: squared-error objective, RMSE as evaluation metric,
# every other hyper-parameter left at the library default.
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
)

In [None]:
# Train the baseline XGBoost model on the standardized training split.
xgb_regressor.fit(X_train, y_train)

In [None]:
# Predict on the held-out split with the baseline XGBoost model.
y_pred = xgb_regressor.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Hold-out metrics for the baseline XGBoost model, in the order (MAE, MSE, R^2).
tuple(
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the XGBoost grid search: 3 values per parameter, 3**5 = 243 candidates.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.5],
    n_estimators=[50, 100, 200],
    gamma=[0, 0.1, 0.2],
    min_child_weight=[1, 2, 3],
)

In [None]:
# Exhaustive 3-fold cross-validated search over `param_grid`, run in parallel.
# No `scoring` is passed, so candidates are ranked by the search's default scorer.
grid_search = GridSearchCV(
    estimator=xgb_regressor,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the XGBoost grid search on the training split (this is the slow cell).
grid_search.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination that ranked best in the XGBoost grid search.
grid_search.best_params_

In [None]:
# Mean cross-validated score achieved by the best XGBoost candidate.
grid_search.best_score_

In [None]:
# Fresh XGBoost regressor configured with the winning grid-search parameters.
xgb_regressor_best_params = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    **grid_search.best_params_,
)

In [None]:
# Train the tuned XGBoost model on the full training split.
xgb_regressor_best_params.fit(X_train, y_train)

In [None]:
# Predict on the held-out split with the tuned XGBoost model.
y_pred_best_params = xgb_regressor_best_params.predict(X_test)

In [None]:
# Hold-out metrics for the tuned XGBoost model, in the order (MAE, MSE, R^2).
tuple(
    metric(y_test, y_pred_best_params)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# KNN : Regression

In [None]:
# Baseline k-nearest-neighbours regressor averaging over 5 neighbours.
knn_regressor = KNeighborsRegressor(n_neighbors=5)

In [None]:
# Fit the baseline KNN model on the standardized training split.
knn_regressor.fit(X_train, y_train)

In [None]:
# Predict on the held-out split with the baseline KNN model.
y_pred_knn = knn_regressor.predict(X_test)

In [None]:
# Hold-out metrics for the baseline KNN model, in the order (MAE, MSE, R^2).
tuple(
    metric(y_test, y_pred_knn)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
# Search space for the KNN grid search: 10 * 2 * 3 = 60 candidates.
# Only parameters that change the predictions are searched. `algorithm` and
# `leaf_size` select and tune the neighbour-lookup data structure (speed and
# memory only), so including them multiplied the grid by 20 without being able
# to change the score beyond tie-breaking; they are left at their defaults.
param_grid_knn = {
    'n_neighbors': range(1, 21, 2),       # odd k from 1 to 19
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3]                        # Minkowski exponent (1 = Manhattan, 2 = Euclidean)
}

In [None]:
# Exhaustive 3-fold cross-validated search over `param_grid_knn`, run in parallel.
# No `scoring` is passed, so candidates are ranked by the search's default scorer.
grid_search_knn = GridSearchCV(
    estimator=knn_regressor,
    param_grid=param_grid_knn,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the KNN grid search on the training split (this is the slow cell).
grid_search_knn.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination that ranked best in the KNN grid search.
grid_search_knn.best_params_

In [None]:
# Mean cross-validated score achieved by the best KNN candidate.
grid_search_knn.best_score_

In [None]:
# NOTE(review): this repeats the `best_params_` display from two cells earlier;
# it adds no new information and is a candidate for removal.
grid_search_knn.best_params_

In [None]:
# Fresh KNN regressor configured with the winning grid-search parameters;
# n_jobs=-1 parallelises the neighbour queries.
knn_regressor_best_params = KNeighborsRegressor(
    n_jobs=-1,
    **grid_search_knn.best_params_,
)

In [None]:
# Fit the tuned KNN model on the full training split.
knn_regressor_best_params.fit(X_train, y_train)

In [None]:
# Predict on the held-out split with the tuned KNN model.
y_pred_knn_best_params = knn_regressor_best_params.predict(X_test)

In [None]:
# Hold-out metrics for the tuned KNN model, in the order (MAE, MSE, R^2).
tuple(
    metric(y_test, y_pred_knn_best_params)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
import joblib

In [None]:
# Bundle the fitted scaler and all four regressors under stable keys so they
# can be persisted and reloaded together.
data_to_save = dict(
    scalar=scalar,
    xgb_regressor=xgb_regressor,
    xgb_regressor_best_params=xgb_regressor_best_params,
    knn_regressor=knn_regressor,
    knn_regressor_best_params=knn_regressor_best_params,
)

In [None]:
# Persist the scaler/model bundle to a single file in the working directory.
# NOTE(review): the file name contains spaces and a comma, so it must be quoted
# in shells; changing it would break any existing loader, so it is left as is.
# Only load this file from a trusted source: unpickling can execute arbitrary code.
joblib.dump(data_to_save, 'xgboost regressor, knn regressor and scalar.pkl')