In [1]:
!pip install chelo
from chelo import DatasetRegistry

# Look the dataset up in the chelo registry and load it.
dataset = DatasetRegistry.get_dataset("CoalFiredPlantDataset")
dataset.load_data()

# Pull features and target out as arrays; the target is flattened to 1-D so
# it can be passed to the estimators and metrics directly.
X, y = dataset.to_numpy()
y = y.reshape(-1)
print(X.shape, y.shape)

#split data into train and test sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The shuffle is seeded so the
# partition -- and therefore every metric reported below -- is identical on
# each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

from sklearn.preprocessing import MinMaxScaler
import numpy as np
# Min-max scale features and target. Both scalers learn their ranges from the
# training split only; the test split is then mapped with those same ranges.
scaler_X = MinMaxScaler().fit(X_train)
scaler_y = MinMaxScaler().fit(y_train.reshape(-1, 1))

# Features
X_train_normalized = scaler_X.transform(X_train)
X_test_normalized = scaler_X.transform(X_test)

# Target: the scaler expects a column vector, the rest of the notebook a flat one
y_train_normalized = scaler_y.transform(y_train.reshape(-1, 1)).ravel()
y_test_normalized = scaler_y.transform(y_test.reshape(-1, 1)).ravel()

# Sanity check: overall extremes before and after scaling (training split)
print("min X_train & X_train_normalized: ", X_train.min(), X_train_normalized.min())
print("max X_train & X_train_normalized: ", X_train.max(), X_train_normalized.max())
print("min y_train & y_train_normalized: ", y_train.min(), y_train_normalized.min())
print("max y_train & y_train_normalized: ", y_train.max(), y_train_normalized.max())

# Peek at a few raw vs. scaled values
print(f"First 2 rows of X_train:\n{X_train[:2]}")
print(f"First 2 rows of normalized X_train:\n{X_train_normalized[:2]}")
print(f"First 5 values of y_train:\n{y_train[:5]}")
print(f"First 5 values of normalized y_train:\n{y_train_normalized[:5]}")

# Shapes must be unchanged by scaling: raw splits first, scaled splits second
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")
print(X_train_normalized.shape, y_train_normalized.shape, X_test_normalized.shape, y_test_normalized.shape)
print(f"Training set size: {X_train_normalized.shape[0]}, Test set size: {X_test_normalized.shape[0]}")

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

# ---- Linear Regression: fit on the scaled training split ----
linear_model = LinearRegression().fit(X_train_normalized, y_train_normalized)

# ---- Training-set evaluation ----
# Predictions come out in scaled units; they are kept as a column vector so
# scaler_y can map them back to the target's original units.
y_train_pred_normalized = linear_model.predict(X_train_normalized).reshape(-1, 1)
linear_train_predictions = scaler_y.inverse_transform(y_train_pred_normalized).ravel()

# All metrics compare against the original (non-normalized) y_train.
linear_train_mse, linear_train_r2, linear_train_mae, linear_train_mape = (
    mean_squared_error(y_train, linear_train_predictions),
    r2_score(y_train, linear_train_predictions),
    mean_absolute_error(y_train, linear_train_predictions),
    mean_absolute_percentage_error(y_train, linear_train_predictions),
)
print(f"Linear Regression Train MSE: {linear_train_mse:.4f}")
print(f"Linear Regression Train R2: {linear_train_r2:.4f}")
print(f"Linear Regression Train MAE: {linear_train_mae:.4f}")
print(f"Linear Regression Train MAPE: {linear_train_mape:.4f}")

# ---- Test-set evaluation ----
# Same procedure on the held-out split: predict, undo the target scaling,
# then score against the original y_test.
y_test_pred_normalized = linear_model.predict(X_test_normalized).reshape(-1, 1)
linear_test_predictions = scaler_y.inverse_transform(y_test_pred_normalized).ravel()

test_mse, test_r2, test_mae, test_mape = (
    mean_squared_error(y_test, linear_test_predictions),
    r2_score(y_test, linear_test_predictions),
    mean_absolute_error(y_test, linear_test_predictions),
    mean_absolute_percentage_error(y_test, linear_test_predictions),
)

print("\nTest Set Performance (Original Scale):")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test R²: {test_r2:.4f}")
print(f"Test MAE: {test_mae:.4f}")
print(f"Test MAPE: {test_mape:.4f}")

print("Available features: ", dataset.list_features())
print("Available targets: ", dataset.list_targets())
# Select the first 12 features
dataset.select_features(dataset.list_features()[:12])

# Extract features (X) and target (y) for the reduced feature set
X, y = dataset.to_numpy()
y = y.reshape(-1)

# By default, boiler efficiency is only used
print("Selected features: ", dataset.selected_features())
print("Selected targets: ", dataset.selected_targets())
print("Features and target shape: ", X.shape, y.shape)

# Split the reduced data set into its own train/test partitions. Previously the
# regressor was fit on X_train / y_train from the earlier full-feature split, so
# the feature selection above had no effect on the reported score. Dedicated
# *_sel names leave X_train / X_test / y_train / y_test untouched for the
# models evaluated further down.
X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Initialize the regressor
regressor = LinearRegression()

# Train the regressor on the selected features
regressor.fit(X_train_sel, y_train_sel)

# Predict on the matching test partition and report R2
y_pred = regressor.predict(X_test_sel)
print(f"R2 score: {r2_score(y_test_sel, y_pred):.5f}")

from sklearn.tree import DecisionTreeRegressor

# ---- Decision Tree (default hyperparameters): fit on the scaled training split ----
tree_model = DecisionTreeRegressor(random_state=42).fit(X_train_normalized, y_train_normalized)

# ---- Training-set evaluation ----
# Predict in scaled units, then undo the target scaling before scoring.
y_train_pred_normalized = tree_model.predict(X_train_normalized).reshape(-1, 1)
tree_train_predictions = scaler_y.inverse_transform(y_train_pred_normalized).ravel()

# MSE, MAE, MAPE and R² against the original-scale y_train
tree_train_mse, tree_train_mae, tree_train_mape, tree_train_r2 = (
    mean_squared_error(y_train, tree_train_predictions),
    mean_absolute_error(y_train, tree_train_predictions),
    mean_absolute_percentage_error(y_train, tree_train_predictions),
    r2_score(y_train, tree_train_predictions),
)
print(f"Decision Tree Train MSE: {tree_train_mse:.4f}")
print(f"Decision Tree Train MAE: {tree_train_mae:.4f}")
print(f"Decision Tree Train MAPE: {tree_train_mape:.4f}")
print(f"Decision Tree Train R²: {tree_train_r2:.4f}")

# ---- Test-set evaluation ----
y_test_pred_normalized = tree_model.predict(X_test_normalized).reshape(-1, 1)
tree_test_predictions = scaler_y.inverse_transform(y_test_pred_normalized).ravel()

# Same four metrics against the original-scale y_test
tree_test_mse, tree_test_mae, tree_test_mape, tree_test_r2 = (
    mean_squared_error(y_test, tree_test_predictions),
    mean_absolute_error(y_test, tree_test_predictions),
    mean_absolute_percentage_error(y_test, tree_test_predictions),
    r2_score(y_test, tree_test_predictions),
)
print(f"Decision Tree Test MSE: {tree_test_mse:.4f}")
print(f"Decision Tree Test MAE: {tree_test_mae:.4f}")
print(f"Decision Tree Test MAPE: {tree_test_mape:.4f}")
print(f"Decision Tree Test R²: {tree_test_r2:.4f}")

from sklearn.model_selection import GridSearchCV


# Candidate hyperparameters for the decision tree search
param_grid_dt = {
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
}

# Exhaustive 5-fold cross-validated search over the grid, using all cores
grid_search_dt = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid_dt,
    cv=5,
    n_jobs=-1,
).fit(X_train_normalized, y_train_normalized)

print(f"Best hyperparameters: {grid_search_dt.best_params_}")

# Estimator selected by the search
best_dt_model = grid_search_dt.best_estimator_

# Training split: predict in scaled units, map back to original units, score
y_train_pred_dt = best_dt_model.predict(X_train_normalized)
train_pred_dt = scaler_y.inverse_transform(y_train_pred_dt.reshape(-1, 1)).ravel()
train_mse_dt = mean_squared_error(y_train, train_pred_dt)
train_r2_dt = r2_score(y_train, train_pred_dt)

# Test split: same procedure
y_test_pred_dt = best_dt_model.predict(X_test_normalized)
test_pred_dt = scaler_y.inverse_transform(y_test_pred_dt.reshape(-1, 1)).ravel()
test_mse_dt = mean_squared_error(y_test, test_pred_dt)
test_r2_dt = r2_score(y_test, test_pred_dt)

print(f"Decision Tree Train MSE: {train_mse_dt:.4f}")
print(f"Decision Tree Test MSE: {test_mse_dt:.4f}")
print(f"Decision Tree Train R²: {train_r2_dt:.4f}")
print(f"Decision Tree Test R²: {test_r2_dt:.4f}")

from sklearn.ensemble import RandomForestRegressor

# ---- Random Forest (100 trees): fit on the scaled training split ----
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_normalized, y_train_normalized
)

# ---- Training-set evaluation ----
# Predict in scaled units, then undo the target scaling before scoring.
y_train_pred_normalized_rf = rf_model.predict(X_train_normalized).reshape(-1, 1)
rf_train_predictions = scaler_y.inverse_transform(y_train_pred_normalized_rf).ravel()

# MSE, MAE, MAPE and R² against the original-scale y_train
rf_train_mse, rf_train_mae, rf_train_mape, rf_train_r2 = (
    mean_squared_error(y_train, rf_train_predictions),
    mean_absolute_error(y_train, rf_train_predictions),
    mean_absolute_percentage_error(y_train, rf_train_predictions),
    r2_score(y_train, rf_train_predictions),
)
print(f"Random Forest Train MSE: {rf_train_mse:.4f}")
print(f"Random Forest Train MAE: {rf_train_mae:.4f}")
print(f"Random Forest Train MAPE: {rf_train_mape:.4f}")
print(f"Random Forest Train R²: {rf_train_r2:.4f}")

# ---- Test-set evaluation ----
y_test_pred_normalized_rf = rf_model.predict(X_test_normalized).reshape(-1, 1)
rf_test_predictions = scaler_y.inverse_transform(y_test_pred_normalized_rf).ravel()

# Same four metrics against the original-scale y_test
rf_test_mse, rf_test_mae, rf_test_mape, rf_test_r2 = (
    mean_squared_error(y_test, rf_test_predictions),
    mean_absolute_error(y_test, rf_test_predictions),
    mean_absolute_percentage_error(y_test, rf_test_predictions),
    r2_score(y_test, rf_test_predictions),
)
print(f"Random Forest Test MSE: {rf_test_mse:.4f}")
print(f"Random Forest Test MAE: {rf_test_mae:.4f}")
print(f"Random Forest Test MAPE: {rf_test_mape:.4f}")
print(f"Random Forest Test R²: {rf_test_r2:.4f}")

# ---- Random Forest, hand-picked settings (50 trees, larger leaves/splits) ----
# NOTE: this rebinds rf_model and every rf_* result name from the previous
# Random Forest run.
rf_model = RandomForestRegressor(
    n_estimators=50,
    random_state=42,
    min_samples_leaf=3,
    min_samples_split=6,
).fit(X_train_normalized, y_train_normalized)

# ---- Training-set evaluation ----
# Predict in scaled units, then undo the target scaling before scoring.
y_train_pred_normalized_rf = rf_model.predict(X_train_normalized).reshape(-1, 1)
rf_train_predictions = scaler_y.inverse_transform(y_train_pred_normalized_rf).ravel()

# MSE, MAE, MAPE and R² against the original-scale y_train
rf_train_mse, rf_train_mae, rf_train_mape, rf_train_r2 = (
    mean_squared_error(y_train, rf_train_predictions),
    mean_absolute_error(y_train, rf_train_predictions),
    mean_absolute_percentage_error(y_train, rf_train_predictions),
    r2_score(y_train, rf_train_predictions),
)
print(f"Random Forest Train MSE: {rf_train_mse:.4f}")
print(f"Random Forest Train MAE: {rf_train_mae:.4f}")
print(f"Random Forest Train MAPE: {rf_train_mape:.4f}")
print(f"Random Forest Train R²: {rf_train_r2:.4f}")

# ---- Test-set evaluation ----
y_test_pred_normalized_rf = rf_model.predict(X_test_normalized).reshape(-1, 1)
rf_test_predictions = scaler_y.inverse_transform(y_test_pred_normalized_rf).ravel()

# Same four metrics against the original-scale y_test
rf_test_mse, rf_test_mae, rf_test_mape, rf_test_r2 = (
    mean_squared_error(y_test, rf_test_predictions),
    mean_absolute_error(y_test, rf_test_predictions),
    mean_absolute_percentage_error(y_test, rf_test_predictions),
    r2_score(y_test, rf_test_predictions),
)
print(f"Random Forest Test MSE: {rf_test_mse:.4f}")
print(f"Random Forest Test MAE: {rf_test_mae:.4f}")
print(f"Random Forest Test MAPE: {rf_test_mape:.4f}")
print(f"Random Forest Test R²: {rf_test_r2:.4f}")

from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters for the random forest search
param_grid_rf = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
}

# Exhaustive 5-fold cross-validated search over the grid, using all cores
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_rf,
    cv=5,
    n_jobs=-1,
).fit(X_train_normalized, y_train_normalized)

print(f"Best hyperparameters: {grid_search_rf.best_params_}")

# Estimator selected by the search
best_rf_model = grid_search_rf.best_estimator_

# Training split: predict in scaled units, map back to original units, score
y_train_pred_rf = best_rf_model.predict(X_train_normalized)
train_pred_rf = scaler_y.inverse_transform(y_train_pred_rf.reshape(-1, 1)).ravel()
train_mse_rf = mean_squared_error(y_train, train_pred_rf)
train_r2_rf = r2_score(y_train, train_pred_rf)

# Test split: same procedure
y_test_pred_rf = best_rf_model.predict(X_test_normalized)
test_pred_rf = scaler_y.inverse_transform(y_test_pred_rf.reshape(-1, 1)).ravel()
test_mse_rf = mean_squared_error(y_test, test_pred_rf)
test_r2_rf = r2_score(y_test, test_pred_rf)

print(f"Random Forest Train MSE: {train_mse_rf:.4f}")
print(f"Random Forest Test MSE: {test_mse_rf:.4f}")
print(f"Random Forest Train R²: {train_r2_rf:.4f}")
print(f"Random Forest Test R²: {test_r2_rf:.4f}")

from sklearn.neighbors import KNeighborsRegressor

# ---- k-Nearest Neighbours (k=5): fit on the scaled training split ----
knn_model = KNeighborsRegressor(n_neighbors=5).fit(X_train_normalized, y_train_normalized)

# ---- Training-set evaluation ----
# Predict in scaled units, then undo the target scaling before scoring.
y_train_pred_normalized_knn = knn_model.predict(X_train_normalized).reshape(-1, 1)
knn_train_predictions = scaler_y.inverse_transform(y_train_pred_normalized_knn).ravel()

# MSE, MAE, MAPE and R² against the original-scale y_train
knn_train_mse, knn_train_mae, knn_train_mape, knn_train_r2 = (
    mean_squared_error(y_train, knn_train_predictions),
    mean_absolute_error(y_train, knn_train_predictions),
    mean_absolute_percentage_error(y_train, knn_train_predictions),
    r2_score(y_train, knn_train_predictions),
)
print(f"KNN Train MSE: {knn_train_mse:.4f}")
print(f"KNN Train MAE: {knn_train_mae:.4f}")
print(f"KNN Train MAPE: {knn_train_mape:.4f}")
print(f"KNN Train R²: {knn_train_r2:.4f}")

# ---- Test-set evaluation ----
y_test_pred_normalized_knn = knn_model.predict(X_test_normalized).reshape(-1, 1)
knn_test_predictions = scaler_y.inverse_transform(y_test_pred_normalized_knn).ravel()

# Same four metrics against the original-scale y_test
knn_test_mse, knn_test_mae, knn_test_mape, knn_test_r2 = (
    mean_squared_error(y_test, knn_test_predictions),
    mean_absolute_error(y_test, knn_test_predictions),
    mean_absolute_percentage_error(y_test, knn_test_predictions),
    r2_score(y_test, knn_test_predictions),
)
print(f"KNN Test MSE: {knn_test_mse:.4f}")
print(f"KNN Test MAE: {knn_test_mae:.4f}")
print(f"KNN Test MAPE: {knn_test_mape:.4f}")
print(f"KNN Test R²: {knn_test_r2:.4f}")

from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters for the KNN search.
# NOTE(review): 'algorithm' and 'leaf_size' presumably only change how the
# neighbours are looked up, not which ones are found -- confirm against the
# scikit-learn docs before deciding whether they belong in the grid.
param_grid_knn = {
    'n_neighbors': [3, 5, 10, 20],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40],
}

# Exhaustive 5-fold cross-validated search over the grid, using all cores
grid_search_knn = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid_knn,
    cv=5,
    n_jobs=-1,
).fit(X_train_normalized, y_train_normalized)

print(f"Best hyperparameters: {grid_search_knn.best_params_}")

# Estimator selected by the search
best_knn_model = grid_search_knn.best_estimator_

# Training split: predict in scaled units, map back to original units, score
y_train_pred_knn = best_knn_model.predict(X_train_normalized)
train_pred_knn = scaler_y.inverse_transform(y_train_pred_knn.reshape(-1, 1)).ravel()
train_mse_knn = mean_squared_error(y_train, train_pred_knn)
train_r2_knn = r2_score(y_train, train_pred_knn)

# Test split: same procedure
y_test_pred_knn = best_knn_model.predict(X_test_normalized)
test_pred_knn = scaler_y.inverse_transform(y_test_pred_knn.reshape(-1, 1)).ravel()
test_mse_knn = mean_squared_error(y_test, test_pred_knn)
test_r2_knn = r2_score(y_test, test_pred_knn)

print(f"KNN Train MSE: {train_mse_knn:.4f}")
print(f"KNN Test MSE: {test_mse_knn:.4f}")
print(f"KNN Train R²: {train_r2_knn:.4f}")
print(f"KNN Test R²: {test_r2_knn:.4f}")

Collecting chelo
  Downloading chelo-0.0.4-py3-none-any.whl.metadata (7.2 kB)
Downloading chelo-0.0.4-py3-none-any.whl (25 kB)
Installing collected packages: chelo
Successfully installed chelo-0.0.4
Configuration file '/root/.chelo/chelo.json' does not exist. Creating a new one with default settings.
Configuration saved to '/root/.chelo/chelo.json'.
Default configuration file '/root/.chelo/chelo.json' created.
Downloading dataset 'ainalirham/coal-fired-power-plant-thermal-performance-dataset' into '/root/.chelo/kaggle/ainalirham_coal-fired-power-plant-thermal-performance-dataset'...
Dataset URL: https://www.kaggle.com/datasets/ainalirham/coal-fired-power-plant-thermal-performance-dataset
(91, 53) (91,)
(72, 53) (72,) (19, 53) (19,)
Training set size: 72, Test set size: 19
min X_train & X_train_normalized:  -93.6236187087165 0.0
max X_train & X_train_normalized:  742858200.0 1.0
min y_train & y_train_normalized:  93.4 0.0
max y_train & y_train_normalized:  94.0 1.0
First 2 rows of X_tra