In [15]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
import warnings
warnings.filterwarnings("ignore")
from collections import Counter
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
import json 
base_dir = 'data\\LBNL_FDD_Dataset_SDAHU_all_3\\LBNL_FDD_Dataset_SDAHU\\'
import pickle
from sklearn.metrics import precision_recall_curve, auc,confusion_matrix
import json

In [82]:
def load_data(file_name):
    df = pd.read_csv(base_dir+file_name, index_col='Datetime')
    df.index = pd.to_datetime(df.index, format='%Y-%m-%d %H:%M:%S')
    return df

def plot_line_graph(df1, df2, feature):
    plt.plot(df1[feature].values, 'g')
    plt.plot(df2[feature].values, 'r')
    plt.show()

import pandas as pd
from statsmodels.tsa.stattools import adfuller

def check_stationarity(data, column_name, significance_level=0.05):
    """
    Check the stationarity of a time series using the Augmented Dickey-Fuller (ADF) test.

    Parameters:
    - data: pd.DataFrame
        Input DataFrame with time series data.
    - column_name: str
        Name of the column containing the time series data.
    - significance_level: float, optional (default=0.05)
        Significance level for the ADF test.

    Returns:
    - bool
        True if the time series is stationary, False otherwise.
    """

    # Check if the specified column exists in the DataFrame
    if column_name not in data.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    try:
        # Perform the ADF test
        result = adfuller(data[column_name])

        # Extract ADF test statistic and p-value
        adf_statistic = result[0]
        p_value = result[1]
    
        # Compare p-value with significance level
        is_stationary = p_value <= significance_level
        
        if stationary:
            print(f"The time series is stationary. ADF Statistic: {adf_statistic}, p-value: {p_value}")
        else:
            print(f"The time series is not stationary. ADF Statistic: {adf_statistic}, p-value: {p_value}")
            
        #return is_stationary, adf_statistic, p_value
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

import pandas as pd
from sklearn.feature_selection import VarianceThreshold

def drop_low_variance_features(data, threshold=0.01):
    
    # Calculate variance of each feature
    variances = data.var()

    #print("Variances of features:")
    #print(variances)

    # Use VarianceThreshold to identify low variance features
    selector = VarianceThreshold(threshold)
    selector.fit(data)

    # Get indices of features to keep
    keep_indices = selector.get_support(indices=True)

    # Subset the DataFrame with selected features
    selected_data = data.iloc[:, keep_indices]

    return selected_data


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_classification_metrics(y_true, y_pred):
    """
    Evaluate classification metrics such as accuracy, precision, recall, and F1 score.

    Parameters:
    - y_test: list or array-like
        True labels.
    - y_pred: list or array-like
        Predicted labels.

    Returns:
    - dict
        Dictionary containing evaluation metrics.
    """
    # Initialize counts
    TP = TN = FP = FN = 0

    # Calculate confusion matrix elements
    for true, predicted in zip(y_true, y_pred):
        if true == 1 and predicted == 1:
            TP += 1
        elif true == 0 and predicted == 0:
            TN += 1
        elif true == 0 and predicted == 1:
            FP += 1
        elif true == 1 and predicted == 0:
            FN += 1

    print("TP:", TP)
    print("TN:", TN)
    print("FP:", FP)
    print("FN:", FN)

    # Compute classification metrics
    accuracy = round(accuracy_score(y_true, y_pred), 2)
    precision = round(precision_score(y_true, y_pred))
    recall = round(recall_score(y_true, y_pred), 2)
    f1 = round(f1_score(y_true, y_pred), 2)

    # Create a dictionary to store the metrics
    metrics_dict = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

    return metrics_dict

from sklearn.cluster import KMeans
def visualize_optimal_clusters(data, max_clusters=10):
    """
    Visualize the optimal number of clusters using the elbow method.

    Parameters:
    - data: array-like
        The training data for clustering.
    - max_clusters: int, optional (default=10)
        Maximum number of clusters to consider.

    Returns:
    None (plots the elbow curve).
    """

    distortions = []
    
    for i in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=i, random_state=42)
        kmeans.fit(data)
        distortions.append(kmeans.inertia_)

    # Plotting the elbow curve
    plt.figure(figsize=(8, 6))
    plt.plot(range(1, max_clusters + 1), distortions, marker='o')
    plt.title('Elbow Method for Optimal Number of Clusters')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Distortion (Inertia)')
    plt.show()



def predict_and_assign_labels(kmeans_model, test_data):
    """
    Predict cluster labels for test data and assign binary labels.

    Parameters:
    - kmeans_model: KMeans
        Trained KMeans model.
    - test_data: array-like
        Test data for prediction.

    Returns:
    - binary_labels: array-like
        Binary labels (0 or 1) assigned based on cluster membership.
    """

    # Predict cluster labels for the test data
    cluster_labels = kmeans_model.predict(test_data)
    print("cluster labels",cluster_labels) 
    # Assign binary labels (0 if in any cluster, 1 if not in any cluster)
    binary_labels = np.where(cluster_labels >= 0, 0, 1)

    return binary_labels

from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import GridSearchCV

def fine_tune_lof_model(non_faulty_data):
    """
    Fine-tune Local Outlier Factor (LOF) model on non-faulty data.

    Parameters:
    - non_faulty_data: pd.DataFrame or 2D array
        Non-faulty data for training the LOF model.

    Returns:
    - Trained LOF model.
    """

    # Fine-tune LOF model using GridSearchCV
    param_grid = {'n_neighbors': [5, 10, 15, 20], 'contamination': [0.01, 0.05, 0.1, 0.2]}
    lof = LocalOutlierFactor(novelty=True)
    grid_search = GridSearchCV(lof, param_grid, scoring='neg_mean_squared_error', cv=5, verbose=True)
    grid_search.fit(non_faulty_data)

    # Train the best LOF model with optimal hyperparameters
    best_lof_model = LocalOutlierFactor(n_neighbors=grid_search.best_params_['n_neighbors'],
                                        contamination=grid_search.best_params_['contamination'], novelty=True)
    best_lof_model.fit(non_faulty_data)

    return best_lof_model

def evaluate_test_data(dict_fault, scaler, model, model_name, freq):
    
    results = {}
    df_predict_faults_vals = pd.DataFrame()
    df_hold_out_set_exp = pd.DataFrame()

    for file in dict_fault.keys():
        print(file)
        df_test = dict_faulty_df[file][training_start_date_time : training_end_date_time].resample(freq).mean()
        
        df_hold_out_set_exp = pd.concat([df_hold_out_set_exp, dict_faulty_df[file][test_start_data:]])
        
        test_fft = np.fft.fft(df_test)
        
        y_true = [1]*len(df_test)
        
        labels = trained_lof_model.predict(scaler.transform(test_fft.real))
        
        binary_labels = np.where(labels == -1, 1, 0)

        
        metrics = evaluate_classification_metrics(y_true, binary_labels)
        print(metrics)
        results[file] = metrics
        print()

        fault_assignment = np.where(binary_labels == 1, True, False)
        
        fault_indices = np.where(fault_assignment)[0]
        
        df_faults_vals = df_test.iloc[fault_indices]
        
        df_predict_faults_vals = pd.concat([df_predict_faults_vals, df_faults_vals], ignore_index=True)
        
        
    with open('results/'+model_name+'_results'+str(freq)+'.json', "w") as outfile: 
        json.dump(results, outfile, indent=4)
        
    df_predict_faults_vals.to_csv('results/'+model_name+'_predicted_faults_vals'+'_sampling_freq_'+str(freq)+'.csv', sep=',', index=False, encoding='utf-8')

    df_hold_out_set_exp.to_csv('results/hold_out_test_set_explanations_sampling_freq_'+str(freq)+'.csv', sep=',', index=False, encoding='utf-8')

In [3]:
correct_data = load_data('AHU_annual.csv')

In [4]:
dict_faulty_df = {}
for c, file in enumerate(os.listdir(base_dir)):
    if c>0:
        dict_faulty_df[file] = load_data(file)

In [5]:
training_start_date_time = '2018-01-01 01:00:00'
training_end_date_time = '2018-10-31 23:59:00'

test_start_data = '2018-11-01 00:00:00'

In [141]:
resampling_freq = '1H'

In [142]:
df_train = correct_data[training_start_date_time : training_end_date_time]
df_valid = correct_data[test_start_data :]

In [143]:
df_train_resampled = df_train.resample(resampling_freq).mean()
df_valid_resampled = df_valid.resample(resampling_freq).mean()

In [144]:
# transforming data from the time domain to the frequency domain using fast Fourier transform
train_fft = np.fft.fft(df_train_resampled)
valid_fft = np.fft.fft(df_train_resampled)

In [145]:
scaler = MinMaxScaler()
train_scaled_fft = scaler.fit_transform(train_fft.real)

### Local Outlier Factor

In [146]:
trained_lof_model = fine_tune_lof_model(train_scaled_fft)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [147]:
valid_pred_labels = trained_lof_model.predict(scaler.transform(valid_fft.real))
predicted_labels = np.where(valid_pred_labels == -1, 1, 0)
valid_true_labels = [0]*len(valid_fft)

value_counts = Counter(predicted_labels)

#Print the counts
for value, count in value_counts.items():
    print(f"{value}: {count} occurrences")

0: 7262 occurrences
1: 33 occurrences


In [148]:
metrics = evaluate_classification_metrics(valid_true_labels, predicted_labels)

print(metrics)

TP: 0
TN: 7262
FP: 33
FN: 0
{'Accuracy': 1.0, 'Precision': 0, 'Recall': 0.0, 'F1 Score': 0.0}


In [149]:
evaluate_test_data(dict_faulty_df, scaler, trained_lof_model, 'LOF', resampling_freq)

coi_bias_-2_annual.csv
TP: 5989
TN: 0
FP: 0
FN: 1306
{'Accuracy': 0.82, 'Precision': 1, 'Recall': 0.82, 'F1 Score': 0.9}

coi_bias_-4_annual.csv
TP: 5391
TN: 0
FP: 0
FN: 1904
{'Accuracy': 0.74, 'Precision': 1, 'Recall': 0.74, 'F1 Score': 0.85}

coi_bias_2_annual.csv
TP: 6239
TN: 0
FP: 0
FN: 1056
{'Accuracy': 0.86, 'Precision': 1, 'Recall': 0.86, 'F1 Score': 0.92}

coi_bias_4_annual.csv
TP: 6011
TN: 0
FP: 0
FN: 1284
{'Accuracy': 0.82, 'Precision': 1, 'Recall': 0.82, 'F1 Score': 0.9}

coi_leakage_010_annual.csv
TP: 6203
TN: 0
FP: 0
FN: 1092
{'Accuracy': 0.85, 'Precision': 1, 'Recall': 0.85, 'F1 Score': 0.92}

coi_leakage_025_annual.csv
TP: 6203
TN: 0
FP: 0
FN: 1092
{'Accuracy': 0.85, 'Precision': 1, 'Recall': 0.85, 'F1 Score': 0.92}

coi_leakage_040_annual.csv
TP: 6203
TN: 0
FP: 0
FN: 1092
{'Accuracy': 0.85, 'Precision': 1, 'Recall': 0.85, 'F1 Score': 0.92}

coi_leakage_050_annual.csv
TP: 6203
TN: 0
FP: 0
FN: 1092
{'Accuracy': 0.85, 'Precision': 1, 'Recall': 0.85, 'F1 Score': 0.92}

coi_

In [150]:
import joblib

# Save the model to a file
joblib.dump(trained_lof_model, 'models/lof_model_'+str(resampling_freq)+'.pkl')
joblib.dump(scaler, 'models/minmax_model_'+str(resampling_freq)+'.pkl')

['models/minmax_model_1H.pkl']