In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime as dt
import xgboost 
import os
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    ExtraTreesClassifier,
)
from sklearn.svm import SVR
from scipy.stats import randint, uniform

## Process Weather Data

In [None]:
# Load the pre-processed Leuven weather table; the first CSV column becomes the index.
weather_data = pd.read_csv("../data/processed_weather_data_leuven.csv", index_col=0)


In [None]:
weather_data.head(4)

In [None]:
weather_data.columns

In [None]:
weather_data.dtypes

In [None]:
np.unique(weather_data.weathercode)

In [None]:
# Parse the timestamp once, then derive the calendar fields from its .dt accessor.
weather_data["time"] = pd.to_datetime(weather_data["time"])
time_accessor = weather_data["time"].dt
weather_data["date"] = time_accessor.date
weather_data["hour"] = time_accessor.hour
weather_data["month"] = time_accessor.month
weather_data["weekday"] = time_accessor.strftime("%a")  # abbreviated weekday name
weather_data.head(5)

In [None]:
# The raw timestamp is no longer needed once date/hour/month/weekday exist.
weather_data.drop(columns=["time"], inplace=True)

In [None]:
weather_data.shape

In [None]:
weather_data.head(3)

## Process air quality data

In [None]:
# Load the pre-processed air-quality table; the first CSV column becomes the index.
air_quality_data = pd.read_csv("../data/processed_air_quality_data.csv", index_col=0)


In [None]:
air_quality_data.head(3)

In [None]:
air_quality_data.columns

In [None]:
# Convert the "date" column to plain datetime.date values.
parsed_dates = pd.to_datetime(air_quality_data["date"])
air_quality_data["date"] = parsed_dates.dt.date
air_quality_data.head(5)



In [None]:
# Average the measurements per (date, hour, month, weekday) slot.
# numeric_only=True is explicit on purpose: pandas >= 2.0 defaults to False and raises a
# TypeError if any remaining column is non-numeric, whereas pandas 1.x averaged numeric
# columns only. Passing it keeps the result the same on both.
air_quality_data = (
    air_quality_data.groupby(["date", "hour", "month", "weekday"])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
air_quality_data

## Process file42 data

In [None]:
# Load the file42 table (first CSV column as index) and keep only complete rows.
file42 = pd.read_csv("../data/processed_file42_data.csv", index_col=0).dropna()
file42.head(5)

In [None]:
file42.columns

In [None]:
# Strip the leading "#" from the identifier column name.
file42 = file42.rename(columns={"#object_id": "object_id"})

In [None]:
# Coerce the identifier to numbers first, then store it as an integer column.
object_id_numeric = pd.to_numeric(file42["object_id"])
file42["object_id"] = object_id_numeric.astype(int)

In [None]:
file42.head(5)

In [None]:
# Parse 'result_timestamp' once, then derive the calendar fields from its .dt accessor.
file42["result_timestamp"] = pd.to_datetime(file42["result_timestamp"])
stamp_accessor = file42["result_timestamp"].dt
file42["date"] = stamp_accessor.date
file42["hour"] = stamp_accessor.hour
file42["month"] = stamp_accessor.month
file42["weekday"] = stamp_accessor.strftime("%a")  # abbreviated weekday name
file42.head(4)

In [None]:
# Aggregate to one row per sensor (object_id) and hour slot by averaging the measurements.
# numeric_only=True is explicit on purpose: pandas >= 2.0 defaults to False, in which case
# the datetime column 'result_timestamp' would be averaged and carried along (and any
# object column would raise a TypeError). pandas 1.x averaged numeric columns only.
file42 = (
    file42.groupby(["object_id", "date", "hour", "month", "weekday"])
    .mean(numeric_only=True)
    .reset_index()
)
file42.head(5)

## Merge data

In [None]:
# Attach the air-quality measurements to each file42 row sharing the same time slot.
slot_keys = ["date", "hour", "month", "weekday"]
data_model = file42.merge(air_quality_data, how="inner", on=slot_keys)

In [None]:
# Attach the weather measurements for the same time slot; unmatched rows are dropped.
slot_keys = ["date", "hour", "month", "weekday"]
data_model = data_model.merge(weather_data, how="inner", on=slot_keys)

In [None]:
data_model.shape

In [None]:
data_model.head(5)

In [None]:
data_model.dtypes

In [None]:
## Split the merged data into train (80%) and validation (20%) sets; random_state fixes the shuffle.
train_df, val_df = train_test_split(data_model, test_size=0.2, random_state=7)

## Process dependent variables

In [None]:
# Columns to predict; each one is modelled separately in the sections below.
target_variable = ["lamax", "laeq", "lceq", "lcpeak"]

In [None]:
# Training targets: one column per name in target_variable.
y_train = train_df[target_variable]
y_train.head(2)

In [None]:
# Validation targets: one column per name in target_variable.
y_val = val_df[target_variable]
y_val.head(2)

## Process independent variables

In [None]:
# Features are everything except the targets and the raw calendar date.
non_feature_columns = target_variable + ["date"]
X_train = train_df.drop(columns=non_feature_columns)
X_val = val_df.drop(columns=non_feature_columns)

In [None]:
X_train.columns

In [None]:
# Columns treated as categorical; every other feature column is scaled as numeric.
one_hot_var = ["hour", "month", "weekday", "object_id", "weathercode"]
categorical_names = set(one_hot_var)
numerical_var = [name for name in X_train.columns if name not in categorical_names]

In [None]:
# One-hot encode the categorical columns and standardise the remaining ones.
one_hot_step = ("OneHot", OneHotEncoder(handle_unknown="ignore"), one_hot_var)
scaling_step = ("StandardScaler", StandardScaler(), numerical_var)
t = ColumnTransformer(transformers=[one_hot_step, scaling_step])

# Learn the categories and scaling statistics from the training split.
t.fit(X_train)

In [None]:
## Persist the fitted ColumnTransformer so it can be reused outside this notebook

encoder_path = "../model/model_noise_level_file42/encoder_model_file42.pkl"
with open(encoder_path, "wb") as file:
    pickle.dump(t, file)

In [None]:
# Read the persisted ColumnTransformer back from disk
encoder_path = "../model/model_noise_level_file42/encoder_model_file42.pkl"
with open(encoder_path, "rb") as file:
    t = pickle.load(file)

In [None]:
# Build the model-ready feature frames from the already-fitted (and persisted) encoder.
# Only `transform` is called, so the in-memory encoder stays identical to the pickled one
# and nothing is re-learned at this stage.
feature_names = t.get_feature_names_out()


def _as_dense(matrix):
    """Return `matrix` as a dense array.

    ColumnTransformer returns a scipy sparse matrix when the stacked output is sparse
    enough (see its `sparse_threshold`); pd.DataFrame needs a dense 2-D array here.
    """
    return matrix.toarray() if hasattr(matrix, "toarray") else matrix


X_train = pd.DataFrame(_as_dense(t.transform(X_train)), columns=feature_names)

X_val = pd.DataFrame(_as_dense(t.transform(X_val)), columns=feature_names)

In [None]:
X_train.columns

In [None]:
y_train.columns

## Predict lamax

In [None]:
# Candidate regressors and the hyper-parameter distributions sampled by RandomizedSearchCV.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) samples [loc, loc + scale],
# so uniform(0.01, 0.5) covers learning rates in [0.01, 0.51].
# Every estimator gets a fixed random_state so the cross-validated scores, and therefore
# the selected parameters, are reproducible between runs.
model_params = {
    "random_forest": {
        "model": RandomForestRegressor(random_state=7),
        "params": {
            "n_estimators": randint(50, 100),
            "max_depth": randint(3, 50),
            # 1.0 (all features) replaces "auto": for regressors it means the same thing,
            # but "auto" was deprecated in scikit-learn 1.1 and is rejected from 1.3 on.
            "max_features": [1.0, "sqrt"],
            "min_samples_split": randint(2, 20),
            "min_samples_leaf": randint(1, 10),
            "bootstrap": [True, False],
        },
    },
    "gradient_boosting": {
        "model": GradientBoostingRegressor(random_state=7),
        "params": {
            "n_estimators": randint(50, 100),
            "learning_rate": uniform(0.01, 0.5),
            "max_depth": randint(1, 10),
            "min_samples_split": randint(2, 20),
            "min_samples_leaf": randint(1, 10),
        },
    },
    "xgboost": {
        "model": xgboost.XGBRegressor(random_state=7),
        "params": {
            "n_estimators": randint(50, 100),
            "learning_rate": uniform(0.01, 0.5),
            "max_depth": randint(1, 10),
            "min_child_weight": randint(1, 10),
            "gamma": uniform(0, 1),
            "reg_alpha": uniform(0, 1),
            "reg_lambda": uniform(0, 1),
        },
    },
}

# Best parameters found per model name
params_dict = {}

# Loop through each model in model_params and run RandomizedSearchCV
for model_name, model_info in model_params.items():
    print("Running RandomizedSearchCV for {}...".format(model_name))

    # 10 sampled configurations, each scored with 5-fold CV (estimator default score: R^2)
    model = model_info["model"]
    param_dist = model_info["params"]
    random_search = RandomizedSearchCV(
        model, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=1, random_state=7
    )

    # Fit the RandomizedSearchCV object to the data
    random_search.fit(X_train, y_train["lamax"])

    # Keep and print the best parameters and score
    params_dict[model_name] = random_search.best_params_
    print("Best parameters for {}: ".format(model_name), random_search.best_params_)
    print("Best score for {}: ".format(model_name), random_search.best_score_)
    print("\n")


In [None]:
# Save optimal param dictionary
# The context manager closes the handle, so the pickle is flushed before it is read back.
with open("../model/model_noise_level_file42/lamax_dict", "wb") as file:
    pickle.dump(params_dict, file)

In [None]:
# Load param dictionary
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("../model/model_noise_level_file42/lamax_dict", "rb") as file:
    lamax_dict = pickle.load(file)
lamax_dict

In [None]:
# Refit gradient boosting with the tuned lamax hyper-parameters and score both splits.
gb_params = lamax_dict["gradient_boosting"]
gb = GradientBoostingRegressor(random_state=7, **gb_params)
gb.fit(X_train, y_train["lamax"])

train_preds, val_preds = gb.predict(X_train), gb.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["lamax"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["lamax"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["lamax"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["lamax"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:
# Refit the random forest with the tuned lamax hyper-parameters and score both splits.
rf_params = lamax_dict["random_forest"]
rf = RandomForestRegressor(random_state=7, **rf_params)
rf.fit(X_train, y_train["lamax"])

train_preds, val_preds = rf.predict(X_train), rf.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["lamax"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["lamax"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["lamax"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["lamax"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:
# Refit XGBoost with the tuned lamax hyper-parameters and score both splits.
xgb_params = lamax_dict["xgboost"]
xgb = xgboost.XGBRegressor(random_state=7, **xgb_params)
xgb.fit(X_train, y_train["lamax"])

train_preds, val_preds = xgb.predict(X_train), xgb.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["lamax"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["lamax"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["lamax"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["lamax"]),
}
for label, value in scores.items():
    print(label, value)


In [None]:
# Validation scatter: current val_preds on x, observed lamax on y.
fig, ax = plt.subplots()
ax.scatter(val_preds, y_val["lamax"])
ax.set_xlabel("y pred")
ax.set_ylabel("y val")

In [None]:
# r2_score expects (y_true, y_pred); R^2 is not symmetric, so the order matters.
r2_score(y_val["lamax"], val_preds)

In [None]:
## save model

# Use a context manager so the file handle is closed (and the pickle flushed) right away.
with open("../model/model_noise_level_file42/xgb_lamax.pkl", "wb") as file:
    pickle.dump(xgb, file)

## Predict laeq

In [None]:
# Candidate regressors and the hyper-parameter distributions sampled by RandomizedSearchCV.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) samples [loc, loc + scale],
# so uniform(0.01, 0.5) covers learning rates in [0.01, 0.51].
# Every estimator gets a fixed random_state so the cross-validated scores, and therefore
# the selected parameters, are reproducible between runs.
model_params = {
    "random_forest": {
        "model": RandomForestRegressor(random_state=7),
        "params": {
            "n_estimators": randint(50, 100),
            "max_depth": randint(3, 50),
            # 1.0 (all features) replaces "auto": for regressors it means the same thing,
            # but "auto" was deprecated in scikit-learn 1.1 and is rejected from 1.3 on.
            "max_features": [1.0, "sqrt"],
            "min_samples_split": randint(2, 20),
            "min_samples_leaf": randint(1, 10),
            "bootstrap": [True, False],
        },
    },
    "gradient_boosting": {
        "model": GradientBoostingRegressor(random_state=7),
        "params": {
            "n_estimators": randint(50, 100),
            "learning_rate": uniform(0.01, 0.5),
            "max_depth": randint(1, 10),
            "min_samples_split": randint(2, 20),
            "min_samples_leaf": randint(1, 10),
        },
    },
    "xgboost": {
        "model": xgboost.XGBRegressor(random_state=7),
        "params": {
            "n_estimators": randint(50, 100),
            "learning_rate": uniform(0.01, 0.5),
            "max_depth": randint(1, 10),
            "min_child_weight": randint(1, 10),
            "gamma": uniform(0, 1),
            "reg_alpha": uniform(0, 1),
            "reg_lambda": uniform(0, 1),
        },
    },
}

# Best parameters found per model name
params_dict = {}

# Loop through each model in model_params and run RandomizedSearchCV
for model_name, model_info in model_params.items():
    print("Running RandomizedSearchCV for {}...".format(model_name))

    # 10 sampled configurations, each scored with 5-fold CV (estimator default score: R^2)
    model = model_info["model"]
    param_dist = model_info["params"]
    random_search = RandomizedSearchCV(
        model, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=1, random_state=7
    )

    # Fit the RandomizedSearchCV object to the data
    random_search.fit(X_train, y_train["laeq"])

    # Keep and print the best parameters and score
    params_dict[model_name] = random_search.best_params_
    print("Best parameters for {}: ".format(model_name), random_search.best_params_)
    print("Best score for {}: ".format(model_name), random_search.best_score_)
    print("\n")

In [None]:
# Save optimal param dictionary
# The context manager closes the handle, so the pickle is flushed before it is read back.
with open("../model/model_noise_level_file42/laeq_dict", "wb") as file:
    pickle.dump(params_dict, file)

In [None]:
# Load param dictionary
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("../model/model_noise_level_file42/laeq_dict", "rb") as file:
    laeq_dict = pickle.load(file)
laeq_dict

In [None]:
# Refit gradient boosting with the tuned laeq hyper-parameters and score both splits.
gb_params = laeq_dict["gradient_boosting"]
gb = GradientBoostingRegressor(random_state=7, **gb_params)
gb.fit(X_train, y_train["laeq"])

train_preds, val_preds = gb.predict(X_train), gb.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["laeq"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["laeq"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["laeq"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["laeq"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:
# Refit the random forest with the tuned laeq hyper-parameters and score both splits.
rf_params = laeq_dict["random_forest"]
rf = RandomForestRegressor(random_state=7, **rf_params)
rf.fit(X_train, y_train["laeq"])

train_preds, val_preds = rf.predict(X_train), rf.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["laeq"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["laeq"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["laeq"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["laeq"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:
# Refit XGBoost with the tuned laeq hyper-parameters and score both splits.
xgb_params = laeq_dict["xgboost"]
xgb = xgboost.XGBRegressor(random_state=7, **xgb_params)
xgb.fit(X_train, y_train["laeq"])

train_preds, val_preds = xgb.predict(X_train), xgb.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["laeq"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["laeq"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["laeq"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["laeq"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:
# Validation scatter: current val_preds on x, observed laeq on y.
fig, ax = plt.subplots()
ax.scatter(val_preds, y_val["laeq"])
ax.set_xlabel("y pred")
ax.set_ylabel("y val")

In [None]:
# r2_score expects (y_true, y_pred); R^2 is not symmetric, so the order matters.
r2_score(y_val["laeq"], val_preds)

In [None]:
## save model

# Use a context manager so the file handle is closed (and the pickle flushed) right away.
with open("../model/model_noise_level_file42/xgb_laeq.pkl", "wb") as file:
    pickle.dump(xgb, file)

## Predict lceq

In [None]:
# Candidate regressors and the hyper-parameter distributions sampled by RandomizedSearchCV.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) samples [loc, loc + scale],
# so uniform(0.01, 0.5) covers learning rates in [0.01, 0.51].
# Every estimator gets a fixed random_state so the cross-validated scores, and therefore
# the selected parameters, are reproducible between runs.
model_params = {
    "random_forest": {
        "model": RandomForestRegressor(random_state=7),
        "params": {
            "n_estimators": randint(50, 100),
            "max_depth": randint(3, 50),
            # 1.0 (all features) replaces "auto": for regressors it means the same thing,
            # but "auto" was deprecated in scikit-learn 1.1 and is rejected from 1.3 on.
            "max_features": [1.0, "sqrt"],
            "min_samples_split": randint(2, 20),
            "min_samples_leaf": randint(1, 10),
            "bootstrap": [True, False],
        },
    },
    "gradient_boosting": {
        "model": GradientBoostingRegressor(random_state=7),
        "params": {
            "n_estimators": randint(50, 100),
            "learning_rate": uniform(0.01, 0.5),
            "max_depth": randint(1, 10),
            "min_samples_split": randint(2, 20),
            "min_samples_leaf": randint(1, 10),
        },
    },
    "xgboost": {
        "model": xgboost.XGBRegressor(random_state=7),
        "params": {
            "n_estimators": randint(50, 100),
            "learning_rate": uniform(0.01, 0.5),
            "max_depth": randint(1, 10),
            "min_child_weight": randint(1, 10),
            "gamma": uniform(0, 1),
            "reg_alpha": uniform(0, 1),
            "reg_lambda": uniform(0, 1),
        },
    },
}

# Best parameters found per model name
params_dict = {}

# Loop through each model in model_params and run RandomizedSearchCV
for model_name, model_info in model_params.items():
    print("Running RandomizedSearchCV for {}...".format(model_name))

    # 10 sampled configurations, each scored with 5-fold CV (estimator default score: R^2)
    model = model_info["model"]
    param_dist = model_info["params"]
    random_search = RandomizedSearchCV(
        model, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=1, random_state=7
    )

    # Fit the RandomizedSearchCV object to the data
    random_search.fit(X_train, y_train["lceq"])

    # Keep and print the best parameters and score
    params_dict[model_name] = random_search.best_params_
    print("Best parameters for {}: ".format(model_name), random_search.best_params_)
    print("Best score for {}: ".format(model_name), random_search.best_score_)
    print("\n")

In [None]:
# Save optimal param dictionary
# The context manager closes the handle, so the pickle is flushed before it is read back.
with open("../model/model_noise_level_file42/lceq_dict", "wb") as file:
    pickle.dump(params_dict, file)

In [None]:
# Load param dictionary
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("../model/model_noise_level_file42/lceq_dict", "rb") as file:
    lceq_dict = pickle.load(file)
lceq_dict

In [None]:
# Refit gradient boosting with the tuned lceq hyper-parameters and score both splits.
gb_params = lceq_dict["gradient_boosting"]
gb = GradientBoostingRegressor(random_state=7, **gb_params)
gb.fit(X_train, y_train["lceq"])

train_preds, val_preds = gb.predict(X_train), gb.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["lceq"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["lceq"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["lceq"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["lceq"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:
# Refit the random forest with the tuned lceq hyper-parameters and score both splits.
rf_params = lceq_dict["random_forest"]
rf = RandomForestRegressor(random_state=7, **rf_params)
rf.fit(X_train, y_train["lceq"])

train_preds, val_preds = rf.predict(X_train), rf.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["lceq"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["lceq"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["lceq"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["lceq"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:

# Refit XGBoost with the tuned lceq hyper-parameters and score both splits.
xgb_params = lceq_dict["xgboost"]
xgb = xgboost.XGBRegressor(random_state=7, **xgb_params)
xgb.fit(X_train, y_train["lceq"])

train_preds, val_preds = xgb.predict(X_train), xgb.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["lceq"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["lceq"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["lceq"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["lceq"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:
# Validation scatter: current val_preds on x, observed lceq on y.
fig, ax = plt.subplots()
ax.scatter(val_preds, y_val["lceq"])
ax.set_xlabel("y pred")
ax.set_ylabel("y val")

In [None]:
# r2_score expects (y_true, y_pred); R^2 is not symmetric, so the order matters.
r2_score(y_val["lceq"], val_preds)

In [None]:
## save model

import pickle

# Use a context manager so the file handle is closed (and the pickle flushed) right away.
with open("../model/model_noise_level_file42/xgb_lceq.pkl", "wb") as file:
    pickle.dump(xgb, file)

## Predict lcpeak

In [None]:
# Candidate regressors and the hyper-parameter distributions sampled by RandomizedSearchCV.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) samples [loc, loc + scale],
# so uniform(0.01, 0.5) covers learning rates in [0.01, 0.51].
# Every estimator gets a fixed random_state so the cross-validated scores, and therefore
# the selected parameters, are reproducible between runs.
model_params = {
    "random_forest": {
        "model": RandomForestRegressor(random_state=7),
        "params": {
            "n_estimators": randint(50, 100),
            "max_depth": randint(3, 50),
            # 1.0 (all features) replaces "auto": for regressors it means the same thing,
            # but "auto" was deprecated in scikit-learn 1.1 and is rejected from 1.3 on.
            "max_features": [1.0, "sqrt"],
            "min_samples_split": randint(2, 20),
            "min_samples_leaf": randint(1, 10),
            "bootstrap": [True, False],
        },
    },
    "gradient_boosting": {
        "model": GradientBoostingRegressor(random_state=7),
        "params": {
            "n_estimators": randint(50, 100),
            "learning_rate": uniform(0.01, 0.5),
            "max_depth": randint(1, 10),
            "min_samples_split": randint(2, 20),
            "min_samples_leaf": randint(1, 10),
        },
    },
    "xgboost": {
        "model": xgboost.XGBRegressor(random_state=7),
        "params": {
            "n_estimators": randint(50, 100),
            "learning_rate": uniform(0.01, 0.5),
            "max_depth": randint(1, 10),
            "min_child_weight": randint(1, 10),
            "gamma": uniform(0, 1),
            "reg_alpha": uniform(0, 1),
            "reg_lambda": uniform(0, 1),
        },
    },
}

# Best parameters found per model name
params_dict = {}

# Loop through each model in model_params and run RandomizedSearchCV
for model_name, model_info in model_params.items():
    print("Running RandomizedSearchCV for {}...".format(model_name))

    # 10 sampled configurations, each scored with 5-fold CV (estimator default score: R^2)
    model = model_info["model"]
    param_dist = model_info["params"]
    random_search = RandomizedSearchCV(
        model, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=1, random_state=7
    )

    # Fit the RandomizedSearchCV object to the data
    random_search.fit(X_train, y_train["lcpeak"])

    # Keep and print the best parameters and score
    params_dict[model_name] = random_search.best_params_
    print("Best parameters for {}: ".format(model_name), random_search.best_params_)
    print("Best score for {}: ".format(model_name), random_search.best_score_)
    print("\n")

In [None]:
# Save optimal param dictionary
# The context manager closes the handle, so the pickle is flushed before it is read back.
with open("../model/model_noise_level_file42/lcpeak_dict", "wb") as file:
    pickle.dump(params_dict, file)

In [None]:
# Load param dictionary
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("../model/model_noise_level_file42/lcpeak_dict", "rb") as file:
    lcpeak_dict = pickle.load(file)
lcpeak_dict

In [None]:
# Refit gradient boosting with the tuned lcpeak hyper-parameters and score both splits.
gb_params = lcpeak_dict["gradient_boosting"]
gb = GradientBoostingRegressor(random_state=7, **gb_params)
gb.fit(X_train, y_train["lcpeak"])

train_preds, val_preds = gb.predict(X_train), gb.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["lcpeak"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["lcpeak"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["lcpeak"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["lcpeak"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:
# Refit the random forest with the tuned lcpeak hyper-parameters and score both splits.
rf_params = lcpeak_dict["random_forest"]
rf = RandomForestRegressor(random_state=7, **rf_params)
rf.fit(X_train, y_train["lcpeak"])

train_preds, val_preds = rf.predict(X_train), rf.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["lcpeak"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["lcpeak"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["lcpeak"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["lcpeak"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:
import xgboost

# Refit XGBoost with the tuned lcpeak hyper-parameters and score both splits.
xgb_params = lcpeak_dict["xgboost"]
xgb = xgboost.XGBRegressor(random_state=7, **xgb_params)
xgb.fit(X_train, y_train["lcpeak"])

train_preds, val_preds = xgb.predict(X_train), xgb.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["lcpeak"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["lcpeak"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["lcpeak"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["lcpeak"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:
# Validation scatter: current val_preds on x, observed lcpeak on y.
fig, ax = plt.subplots()
ax.scatter(val_preds, y_val["lcpeak"])
ax.set_xlabel("y pred")
ax.set_ylabel("y val")

In [None]:
# r2_score expects (y_true, y_pred); R^2 is not symmetric, so the order matters.
r2_score(y_val["lcpeak"], val_preds)

In [None]:
## save model

# Use a context manager so the file handle is closed (and the pickle flushed) right away.
with open("../model/model_noise_level_file42/xgb_lcpeak.pkl", "wb") as file:
    pickle.dump(xgb, file)

## Predict lcpeak using the meteo (LC_2022 quarterly) data


In [None]:
# Stack the four quarterly LC_2022 files into a single frame.
meteo_files = (
    "../data/LC_data/LC_2022Q1.csv",
    "../data/LC_data/LC_2022Q2.csv",
    "../data/LC_data/LC_2022Q3.csv",
    "../data/LC_data/LC_2022Q4.csv",
)
meteo = pd.concat([pd.read_csv(path, delimiter=",") for path in meteo_files])

In [None]:
meteo.columns

In [None]:
meteo.shape

In [None]:
meteo.head(4)

In [None]:
meteo.dtypes

In [None]:
meteo.isna().mean()

In [None]:
# Derive a plain calendar date from the DATEUTC timestamp column.
meteo_timestamps = pd.to_datetime(meteo['DATEUTC'])
meteo['date'] = meteo_timestamps.dt.date


In [None]:
# Keep only the rows that have a humidity reading.
has_humidity = meteo["LC_HUMIDITY"].notna()
meteo = meteo[has_humidity]

In [None]:
meteo.isna().mean()

In [None]:
# Average the readings per (date, Month, Hour) slot.
# numeric_only=True is explicit on purpose: DATEUTC is read from CSV without date parsing,
# and pandas >= 2.0 (default numeric_only=False) raises a TypeError when asked to average
# such a non-numeric column. pandas 1.x averaged numeric columns only.
meteo = meteo.groupby(["date", "Month", "Hour"]).mean(numeric_only=True).reset_index()

In [None]:
meteo.shape

In [None]:
# These time components are not used after the hourly aggregation.
meteo.drop(columns=["Year", "Day", "Minute"], inplace=True)

In [None]:
# Lower-case the key columns so they match the file42 column names used in the merge.
meteo = meteo.rename(columns={'Month': 'month', 'Hour': 'hour'})

In [None]:
# Attach the hourly meteo readings to each file42 row sharing the same date/hour/month.
meteo_keys = ["date", "hour", "month"]
data_model_meteo = file42.merge(meteo, how="inner", on=meteo_keys)

In [None]:
data_model_meteo.head(3)

In [None]:
data_model_meteo.shape

In [None]:
## Split the meteo-merged data into train (80%) and validation (20%) sets with a fixed seed.
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(data_model_meteo, test_size=0.2, random_state = 7)

In [None]:
# Target columns; they are removed from the feature frames built from this split.
target_variable = ["lamax", "laeq", "lceq", "lcpeak"]


In [None]:
# Features are everything except the targets and the raw calendar date.
non_feature_columns = target_variable + ["date"]
X_train = train_df.drop(columns=non_feature_columns)
X_val = val_df.drop(columns=non_feature_columns)

In [None]:
X_train.columns

In [None]:
X_train.dtypes

In [None]:
# Target frames for the meteo-based split: one column per name in target_variable.
y_train = train_df[target_variable]
y_val = val_df[target_variable]

In [None]:
# Columns treated as categorical; every other feature column is scaled as numeric.
one_hot_var = ['object_id', "hour", "month", "weekday"]
categorical_names = set(one_hot_var)
numerical_var = [name for name in X_train.columns if name not in categorical_names]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical columns and standardise the remaining ones.
one_hot_step = ('OneHot', OneHotEncoder(handle_unknown='ignore'), one_hot_var)
scaling_step = ('StandardScaler', StandardScaler(), numerical_var)
t = ColumnTransformer(transformers=[one_hot_step, scaling_step])

# Learn the categories and scaling statistics from the training split.
t.fit(X_train)

In [None]:
## Persist the fitted ColumnTransformer for the meteo feature set

encoder_path = "../model/model_noise_level_file42/encoder_model_file42_meteo.pkl"
with open(encoder_path, 'wb') as file:
    pickle.dump(t, file)

In [None]:
# create pandas DataFrame from sparse matrix

from scipy.sparse import csr_matrix

# The encoder is already fitted on the training split (and pickled), so both splits are
# only *transformed* here. Re-fitting on X_val would learn the one-hot categories and the
# scaling statistics from the validation data, leaving its columns and scales inconsistent
# with the training features and with the saved encoder.
feature_names = t.get_feature_names_out()

X_train = pd.DataFrame.sparse.from_spmatrix(csr_matrix(t.transform(X_train)))
X_train.columns = feature_names

X_val = pd.DataFrame.sparse.from_spmatrix(csr_matrix(t.transform(X_val)))
X_val.columns = feature_names


In [None]:
X_train.columns

In [None]:
# Candidate regressors and the hyper-parameter distributions sampled by RandomizedSearchCV.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) samples [loc, loc + scale],
# so uniform(0.01, 0.5) covers learning rates in [0.01, 0.51].
# Every estimator gets a fixed random_state so the cross-validated scores, and therefore
# the selected parameters, are reproducible between runs.
model_params = {
    'random_forest': {
        'model': RandomForestRegressor(random_state=7),
        'params': {
            'n_estimators': randint(50, 100),
            'max_depth': randint(3, 50),
            # 1.0 (all features) replaces 'auto': for regressors it means the same thing,
            # but 'auto' was deprecated in scikit-learn 1.1 and is rejected from 1.3 on.
            'max_features': [1.0, 'sqrt'],
            'min_samples_split': randint(2, 20),
            'min_samples_leaf': randint(1, 10),
            'bootstrap': [True, False]
        }
    },
    'gradient_boosting': {
        'model': GradientBoostingRegressor(random_state=7),
        'params': {
            'n_estimators': randint(50, 100),
            'learning_rate': uniform(0.01, 0.5),
            'max_depth': randint(1, 10),
            'min_samples_split': randint(2, 20),
            'min_samples_leaf': randint(1, 10)
        }
    },
    'xgboost': {
        'model': xgboost.XGBRegressor(random_state=7),
        'params': {
            'n_estimators': randint(50, 100),
            'learning_rate': uniform(0.01, 0.5),
            'max_depth': randint(1, 10),
            'min_child_weight': randint(1, 10),
            'gamma': uniform(0, 1),
            'reg_alpha': uniform(0, 1),
            'reg_lambda': uniform(0, 1),
        }
    }
}

# Best parameters found per model name
params_dict = {}

# Loop through each model in model_params and run RandomizedSearchCV
for model_name, model_info in model_params.items():
    print("Running RandomizedSearchCV for {}...".format(model_name))

    # 10 sampled configurations, each scored with 5-fold CV (estimator default score: R^2)
    model = model_info['model']
    param_dist = model_info['params']
    random_search = RandomizedSearchCV(model, param_distributions=param_dist,
                                       n_iter=10, cv=5, n_jobs=1, random_state=7)

    # Fit the RandomizedSearchCV object to the data
    random_search.fit(X_train, y_train['lcpeak'])

    # Keep and print the best parameters and score
    params_dict[model_name] = random_search.best_params_
    print("Best parameters for {}: ".format(model_name), random_search.best_params_)
    print("Best score for {}: ".format(model_name), random_search.best_score_)
    print("\n")

In [None]:
# Save optimal param dictionary
# The context manager closes the handle, so the pickle is flushed before it is read back.
with open("../model/model_noise_level_file42/lcpeak_dict_meteo", "wb") as file:
    pickle.dump(params_dict, file)

In [None]:
# Load param dictionary
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("../model/model_noise_level_file42/lcpeak_dict_meteo", "rb") as file:
    lcpeak_dict_meteo = pickle.load(file)
lcpeak_dict_meteo

In [None]:
# Refit gradient boosting with the tuned meteo-based lcpeak parameters; score both splits.
gb_params = lcpeak_dict_meteo["gradient_boosting"]
gb = GradientBoostingRegressor(random_state=7, **gb_params)
gb.fit(X_train, y_train["lcpeak"])

train_preds, val_preds = gb.predict(X_train), gb.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["lcpeak"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["lcpeak"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["lcpeak"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["lcpeak"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:
# Refit the random forest with the tuned meteo-based lcpeak parameters; score both splits.
rf_params = lcpeak_dict_meteo["random_forest"]
rf = RandomForestRegressor(random_state=7, **rf_params)
rf.fit(X_train, y_train["lcpeak"])

train_preds, val_preds = rf.predict(X_train), rf.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["lcpeak"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["lcpeak"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["lcpeak"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["lcpeak"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:
import xgboost

# Refit XGBoost with the tuned meteo-based lcpeak parameters; score both splits.
xgb_params = lcpeak_dict_meteo["xgboost"]
xgb = xgboost.XGBRegressor(random_state=7, **xgb_params)
xgb.fit(X_train, y_train["lcpeak"])

train_preds, val_preds = xgb.predict(X_train), xgb.predict(X_val)

# Error metrics keyed by their printed label (insertion order = print order).
scores = {
    "Train RMSE:": np.sqrt(mean_squared_error(train_preds, y_train["lcpeak"])),
    "Val RMSE:": np.sqrt(mean_squared_error(val_preds, y_val["lcpeak"])),
    "Train MAE:": mean_absolute_error(train_preds, y_train["lcpeak"]),
    "Val MAE:": mean_absolute_error(val_preds, y_val["lcpeak"]),
}
for label, value in scores.items():
    print(label, value)

In [None]:
# Validation scatter: current val_preds on x, observed lcpeak on y.
fig, ax = plt.subplots()
ax.scatter(val_preds, y_val["lcpeak"])
ax.set_xlabel("y pred")
ax.set_ylabel("y val")

In [None]:
# r2_score expects (y_true, y_pred); R^2 is not symmetric, so the order matters.
r2_score(y_val["lcpeak"], val_preds)

In [None]:
## save model

# Use a context manager so the file handle is closed (and the pickle flushed) right away.
with open("../model/model_noise_level_file42/xgb_lcpeak_meteo.pkl", "wb") as file:
    pickle.dump(xgb, file)