In [21]:
# Importing essential libraries for data processing, model building, hyperparameter tuning, and evaluation:
# pandas and numpy for data manipulation and numerical operations.
# train_test_split for splitting data, GridSearchCV for hyperparameter optimization.
# StandardScaler for feature scaling to ensure uniform feature magnitudes.
# accuracy_score and f1_score for model performance evaluation.
# DecisionTreeClassifier, RandomForestClassifier, and XGBClassifier for various tree-based models.
# SVC (Support Vector Classifier) for classification using the Support Vector Machine algorithm.
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC


In [22]:
# Load data
df = pd.read_csv("Realistic_Solar_PV_Fault_100k.csv")

In [23]:
## Displaying the first five rows of the dataset to quickly inspect the data structure, feature names, and sample values.
df.head()

Unnamed: 0,PV_Energy_Generation (kWh),Latitude,Longitude,GHI (W/m²),BNI (W/m²),DHI (W/m²),POA (W/m²),Ambient_Temperature (°C),Sun_Azimuth (degrees),Solar_Elevation (degrees),Panel_Orientation (degrees),Panel_Tilt (degrees),Efficiency_Coefficient (βₚₒₐ),Fault_Flag
0,106.206415,31.389871,66.454018,456.599827,563.732619,173.792209,555.620538,33.404449,111.300709,54.818941,180,20,0.167186,0
1,139.820201,39.012142,72.898388,865.957872,528.415518,76.132223,995.393317,30.48532,261.923262,36.797108,90,40,0.139599,0
2,125.188036,39.517393,74.895873,798.935733,499.015678,421.34713,598.533862,31.117059,338.552379,23.810527,90,10,0.162754,0
3,59.176892,40.501925,80.265979,872.600321,430.916378,386.3186,851.551133,33.470233,38.002933,46.401793,0,10,0.183869,0
4,77.510396,30.412602,77.865124,336.827815,549.973696,283.04306,455.040028,32.824463,17.161166,32.76834,0,10,0.15247,1


In [24]:
# Defining input features (X) by dropping columns that have less than 5% feature importance 
# based on prior feature importance analysis.
# Dropping 'Fault_Flag' (target variable) and low-importance features
X = df.drop(columns=[
    "Fault_Flag", "DHI (W/m²)",
    "Sun_Azimuth (degrees)", "Solar_Elevation (degrees)",
    "Panel_Orientation (degrees)", "Longitude", "Latitude", "Panel_Tilt (degrees)"
])
y = df["Fault_Flag"]

In [None]:
df1 = df.drop(columns=[
    "DHI (W/m²)",
    "Sun_Azimuth (degrees)", "Solar_Elevation (degrees)",
    "Panel_Orientation (degrees)", "Longitude", "Latitude", "Panel_Tilt (degrees)"
])
df1.head()

In [35]:
# Checking for missing values in each column of the DataFrame 'df1'.
# The isnull().sum() function returns the total count of null (NaN) values for each feature.
df1.isnull().sum()

PV_Energy_Generation (kWh)       0
GHI (W/m²)                       0
BNI (W/m²)                       0
POA (W/m²)                       0
Ambient_Temperature (°C)         0
Efficiency_Coefficient (βₚₒₐ)    0
Fault_Flag                       0
dtype: int64

In [34]:
# Generating descriptive statistics of the DataFrame 'df1' to understand feature distributions.
# Provides count, mean, standard deviation, minimum, 25th percentile, median, 75th percentile, and maximum values for each feature.
df1.describe()

Unnamed: 0,PV_Energy_Generation (kWh),GHI (W/m²),BNI (W/m²),POA (W/m²),Ambient_Temperature (°C),Efficiency_Coefficient (βₚₒₐ),Fault_Flag
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,98.059162,679.179951,580.209785,734.779696,28.514296,0.175858,0.1
std,20.8732,161.97608,134.468253,109.845301,5.215495,0.023276,0.300002
min,7.020875,-43.586221,-91.152981,221.621266,6.413313,0.058767,0.0
25%,84.091063,574.516589,494.921737,663.603047,24.965806,0.161664,0.0
50%,98.273257,682.48227,585.537458,738.240262,28.477657,0.177296,0.0
75%,112.182786,789.082407,671.622239,809.035387,31.977727,0.191649,0.0
max,189.581685,1332.904952,1070.182203,1217.89491,52.903864,0.272328,1.0


In [33]:
df1.info()

Unnamed: 0,PV_Energy_Generation (kWh),GHI (W/m²),BNI (W/m²),POA (W/m²),Ambient_Temperature (°C),Efficiency_Coefficient (βₚₒₐ),Fault_Flag
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,98.059162,679.179951,580.209785,734.779696,28.514296,0.175858,0.1
std,20.8732,161.97608,134.468253,109.845301,5.215495,0.023276,0.300002
min,7.020875,-43.586221,-91.152981,221.621266,6.413313,0.058767,0.0
25%,84.091063,574.516589,494.921737,663.603047,24.965806,0.161664,0.0
50%,98.273257,682.48227,585.537458,738.240262,28.477657,0.177296,0.0
75%,112.182786,789.082407,671.622239,809.035387,31.977727,0.191649,0.0
max,189.581685,1332.904952,1070.182203,1217.89491,52.903864,0.272328,1.0


In [26]:
# Splitting the dataset into training and testing sets.
# Using stratified sampling to maintain the class distribution of 'Fault_Flag' in both sets.
# Allocating 80% of the data for training and 20% for testing.
# Setting random_state to 42 to ensure reproducibility of the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [27]:
# Applying feature scaling using StandardScaler to standardize the input features.
# StandardScaler transforms the features to have zero mean and unit variance, improving model performance for algorithms sensitive to feature scales.
# The scaler is fitted on the training data and then applied to both training and testing sets to ensure consistent scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [28]:
# ----- RLS Classifier -----
class RecursiveLeastSquaresClassifier:
    def __init__(self, num_features, lambda_factor=0.99, delta=1.0):
        self.P = np.eye(num_features) * delta # Covariance matrix initialization
        self.theta = np.zeros(num_features) # Coefficient vector initialization
        self.lambda_factor = lambda_factor # Forgetting factor

    def update(self, X, y):
        X = np.array(X).reshape(-1, 1) # Reshape input as column vector
        K = self.P @ X / (self.lambda_factor + X.T @ self.P @ X) # Calculate the Kalman gain
        self.theta += (y - X.T @ self.theta) * K.flatten() # Update model coefficients
        self.P = (self.P - K @ X.T @ self.P) / self.lambda_factor # Update covariance matrix

    def predict(self, X):
        return (np.dot(X, self.theta) >= 0.5).astype(int) # Make predictions: classify as 1 if predicted value >= 0.5, else 0

rls = RecursiveLeastSquaresClassifier(num_features=X_train_scaled.shape[1])
for i in range(len(X_train_scaled)):
    rls.update(X_train_scaled[i], y_train.iloc[i]) #Update the model iteratively using each training sample
rls_preds = rls.predict(X_test_scaled)

In [29]:
# Performing hyperparameter tuning for a Decision Tree Classifier using GridSearchCV.
# Tuning the 'max_depth' parameter to find the best tree depth using 3-fold cross-validation.
dtr = GridSearchCV(DecisionTreeClassifier(random_state=42), {'max_depth': [5, 10, 20, None]}, cv=3)
# Fitting the Decision Tree model with the training data.
dtr.fit(X_train, y_train)
# Making predictions on the test set using the best found model.
dtr_preds = dtr.predict(X_test)

In [30]:
# Performing hyperparameter tuning for a Random Forest Classifier using GridSearchCV.
# Tuning 'max_depth' with a fixed number of trees (n_estimators=100) using 3-fold cross-validation.
rfc = GridSearchCV(RandomForestClassifier(random_state=42), {'n_estimators': [100], 'max_depth': [10, 20, None]}, cv=3)
rfc.fit(X_train, y_train)
rfc_preds = rfc.predict(X_test)

In [31]:
# Initializing a Support Vector Classifier (SVC) with an RBF kernel for handling non-linear classification.
# Setting probability=True to enable probability estimates (useful for future ensemble methods or evaluation).
svc = SVC(kernel='rbf', probability=True, random_state=42)
svc.fit(X_train_scaled, y_train)  # use scaled data for SVC
svc_preds = svc.predict(X_test_scaled)


In [13]:
# Performing hyperparameter tuning for an XGBoost Classifier using GridSearchCV.
# Tuning 'max_depth' while keeping 'n_estimators' fixed at 100 using 3-fold cross-validation.
# Setting 'use_label_encoder=False' and 'eval_metric=logloss' to avoid XGBoost warnings and ensure proper training.
xgb = GridSearchCV(XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'), 
                   {'n_estimators': [100], 'max_depth': [3, 5, 10]}, cv=3)
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [14]:
# Attempting to import TensorFlow and build a simple Artificial Neural Network (ANN) for binary classification.
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense
    
    # Building a Sequential ANN model with:
    # - Input layer: 16 neurons, ReLU activation.
    # - Hidden layer: 8 neurons, ReLU activation.
    # - Output layer: 1 neuron, Sigmoid activation for binary classification.


    model = Sequential()
    model.add(Dense(16, activation='relu', input_dim=X_train_scaled.shape[1]))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compiling the ANN with Adam optimizer and binary cross-entropy loss function for binary classification.

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    # Training the ANN model on scaled training data for 20 epochs with a batch size of 64.
    # verbose=0 suppresses training output.
    model.fit(X_train_scaled, y_train, epochs=20, batch_size=64, verbose=0)
    # Making predictions on the scaled test data and converting probabilities to binary class labels.

    ann_preds = (model.predict(X_test_scaled) > 0.5).astype(int).flatten()
except ModuleNotFoundError:
    ann_preds = None

2025-06-02 15:15:00.568801: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




In [36]:
# Creating a dictionary to store the model names as keys and their corresponding predictions as values.
models = {
 "RLS": rls_preds,
 "Decision Tree": dtr_preds,
 "Random Forest": rfc_preds,
 "XGBoost": xgb_preds,
 "SVC":svc_preds
}

In [37]:
if ann_preds is not None:
    models["ANN"] = ann_preds

In [98]:
# Importing evaluation metrics from sklearn to assess model performance:
# - accuracy_score: Measures the overall correctness of classification models.
# - f1_score: Provides a balance between precision and recall, especially useful for imbalanced datasets.
# - mean_squared_error (MSE): Quantifies the average squared difference between predicted and actual values (mainly for regression but sometimes used for error reporting in classification probabilities).
# - mean_absolute_error (MAE): Measures the average magnitude of prediction errors.
# - r2_score: Indicates how well predictions approximate actual values; mainly used in regression but sometimes reported for overall fit.
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, mean_absolute_error, r2_score

In [99]:
# Initializing a dictionary to store performance metrics for each model.
# The dictionary will hold lists for model names, accuracy, F1 score, Mean Squared Error (MSE),
# Mean Absolute Error (MAE), and R-squared (R2) scores to systematically compare all models.
metrics = {
    "Model": [],
    "Accuracy": [],
    "F1 Score": [],
    "MSE": [],
    "MAE": [],
    "R2 Score": []
}

In [100]:
# Iterating through each model and its predictions to calculate and store performance metrics.
for name, preds in models.items():
    # Appending the model name.
    metrics["Model"].append(name)
    # Calculating and appending the accuracy score,F1 Score,MSE,MAE,R2 Score for the current model.
    metrics["Accuracy"].append(accuracy_score(y_test, preds))
    metrics["F1 Score"].append(f1_score(y_test, preds))
    metrics["MSE"].append(mean_squared_error(y_test, preds))
    metrics["MAE"].append(mean_absolute_error(y_test, preds))
    metrics["R2 Score"].append(r2_score(y_test, preds))

In [101]:
# Converting the collected performance metrics dictionary into a pandas DataFrame for easy tabular display and analysis.
results = pd.DataFrame(metrics)
# Printing the results sorted by F1 Score in descending order to highlight the best-performing model based on F1 Score.
print(results.sort_values(by="F1 Score", ascending=False))

           Model  Accuracy  F1 Score      MSE      MAE  R2 Score
5            ANN   0.97960  0.897642  0.02040  0.02040  0.773333
4            SVC   0.97985  0.894970  0.02015  0.02015  0.776111
3        XGBoost   0.97865  0.890033  0.02135  0.02135  0.762778
2  Random Forest   0.97885  0.889354  0.02115  0.02115  0.765000
1  Decision Tree   0.97025  0.844688  0.02975  0.02975  0.669444
0            RLS   0.93190  0.483700  0.06810  0.06810  0.243333


In [21]:
# Exporting the model performance comparison results to an Excel file named 'model_comparison_results.xlsx'.
# Setting index=False to exclude the DataFrame index from the Excel file.
results.to_excel("model_comparison_results.xlsx", index=False)