In [None]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# NOTE(review): with no category/module filter this suppresses every warning
# for the rest of the session, including deprecation notices from the
# plotting and data libraries -- consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Directory the notebook reads its CSV files from, relative to the working
# directory of the kernel.
DATA_DIR = Path('dataset/LBNL_FDD_Dataset_SDAHU_all_3/LBNL_FDD_Dataset_SDAHU')


def load_data(file_name, data_dir=DATA_DIR):
    """Read one CSV file into a DataFrame indexed by its 'Datetime' column.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``data_dir``.
    data_dir : str or pathlib.Path, optional
        Directory that holds the CSV files. Defaults to ``DATA_DIR`` so
        existing one-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Contents of the file with the 'Datetime' column used as index.
        The index values are not parsed into timestamps here.
    """
    # Path joining avoids depending on a trailing slash in the directory name.
    return pd.read_csv(Path(data_dir) / file_name, index_col='Datetime')

In [None]:
# Reference run that the fault runs are compared against later in the notebook
# (presumably the fault-free baseline -- confirm against the dataset docs).
df_correct_data = load_data('AHU_annual.csv')

In [None]:
# Damper-stuck runs, loaded in ascending order of the numeric suffix
# (presumably the stuck position in percent -- confirm).
# NOTE(review): only the 100 file carries the `_short` suffix -- confirm that
# this is the intended file.
(
    df_damper_stuck_010,
    df_damper_stuck_025,
    df_damper_stuck_075,
    df_damper_stuck_100,
) = map(
    load_data,
    (
        'damper_stuck_010_annual.csv',
        'damper_stuck_025_annual.csv',
        'damper_stuck_075_annual.csv',
        'damper_stuck_100_annual_short.csv',
    ),
)

In [None]:
# Sensor-bias runs; each variable name mirrors the signed bias in its file name.
# NOTE(review): 'sa' presumably stands for supply air -- confirm.
sa_bias_minus2 = load_data('sa_bias_-2_annual.csv')
sa_bias_minus4 = load_data('sa_bias_-4_annual.csv')
# The +2 run reads its own file; pointing it at the +4 file would make
# sa_bias_2 an exact duplicate of sa_bias_4.
sa_bias_2 = load_data('sa_bias_2_annual.csv')
sa_bias_4 = load_data('sa_bias_4_annual.csv')

In [None]:
# Mark every row of the damper-stuck runs as faulty (fault == 1).
# Assigning a scalar broadcasts it to all rows, so no per-row Python list has
# to be built, and the loop keeps the four runs from drifting apart.
for faulty_run in (
    df_damper_stuck_010,
    df_damper_stuck_025,
    df_damper_stuck_075,
    df_damper_stuck_100,
):
    faulty_run['fault'] = 1

In [None]:
# Pairwise correlation of the reference run's columns, rendered as an
# annotated heat map (two decimals per cell).
correlation_matrix = df_correct_data.corr()
heatmap_ax = sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f")
heatmap_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
def histplot_helper(cols, df, ncols=3, hue='fault'):
    """Draw one density histogram (with KDE overlay) per column, split by ``hue``.

    Parameters
    ----------
    cols : sequence of str
        Column names of ``df`` to plot; each gets its own subplot.
    df : pandas.DataFrame
        Data containing every column in ``cols`` and the ``hue`` column.
    ncols : int, optional
        Number of subplot columns in the grid (default 3).
    hue : str, optional
        Column used to split and colour the distributions (default 'fault').

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    cols = list(cols)
    if not cols:
        # Nothing requested: do not open an empty figure.
        return

    # Size the grid from the number of requested columns, so that none is
    # silently dropped by zip() when more than one grid's worth is passed.
    nrows = -(-len(cols) // ncols)  # ceiling division
    fig, axes = plt.subplots(
        nrows, ncols, figsize=(5 * ncols, 4 * nrows), squeeze=False
    )
    axes = axes.ravel()  # flat view makes pairing with cols straightforward

    for col, ax in zip(cols, axes):
        sns.histplot(data=df, x=col, kde=True, stat='density', ax=ax, hue=hue)

    # Hide grid cells that did not receive a plot.
    for unused_ax in axes[len(cols):]:
        unused_ax.set_visible(False)

    fig.tight_layout()
    plt.show()

In [None]:
feature = list(set(pd.concat([df_correct_data, df_damper_stuck_010]).keys()))

In [None]:
histplot_helper(feature[0:5], pd.concat([df_correct_data, df_damper_stuck_010]))

In [None]:
histplot_helper(feature[5:10], pd.concat([df_correct_data, df_damper_stuck_010]))

In [None]:
histplot_helper(feature[10:15], pd.concat([df_correct_data, df_damper_stuck_010]))

In [None]:
histplot_helper(feature[15:20], pd.concat([df_correct_data, df_damper_stuck_010]))

In [None]:
histplot_helper(feature[20:25], pd.concat([df_correct_data, df_damper_stuck_010]))

In [None]:
histplot_helper(feature[25:30], pd.concat([df_correct_data, df_damper_stuck_010]))

In [None]:
from sklearn.feature_selection import VarianceThreshold

def remove_low_variance_features(df, threshold=0.1):
    """
    Prints the variance of each feature and drops the low-variance ones.

    The selection is delegated to scikit-learn's ``VarianceThreshold``; the
    printed variances and the returned column labels are taken from the same
    fitted selector, so they always agree with the data that is kept.

    Parameters:
    - df: pandas DataFrame
        The input DataFrame with (numeric) features.
    - threshold: float, optional (default=0.1)
        Features whose variance is not strictly greater than this value
        are removed.

    Returns:
    - df_filtered: pandas DataFrame
        A copy of ``df`` restricted to the retained columns. The original
        index and column dtypes are preserved.

    Raises:
    - ValueError
        Propagated from ``VarianceThreshold.fit`` (for example when no
        feature passes the threshold).
    """
    # Fit the selector once; it is the single source of truth for both the
    # reported variances and the keep/drop decision.
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(df)

    # Print the variance of each feature as seen by the selector
    feature_variances = pd.Series(selector.variances_, index=df.columns)
    print("Feature Variances:")
    print(feature_variances)

    # Boolean mask of retained columns, aligned with df.columns
    keep_mask = selector.get_support()
    selected_features = df.columns[keep_mask]

    # Slice the original frame instead of rebuilding it from a bare array,
    # which would discard the index and could mislabel the columns.
    df_filtered = df.loc[:, keep_mask].copy()

    # Print the selected features
    print(f"\nSelected Features (with variance > {threshold}):")
    print(selected_features)

    return df_filtered

# Drop the low-variance columns of the reference run. 0.1 is the cut-off
# handed to remove_low_variance_features; adjust it as needed.
df_filtered = remove_low_variance_features(df_correct_data, threshold=0.1)

In [None]:
# Overlay every retained signal of the reference run (green) with the same
# signal from the damper_stuck_010 run (red), one figure per column.
# Values are plotted against their row position, not against the index.
for key in df_filtered:
    fig, ax = plt.subplots()
    ax.plot(df_filtered[key].values, 'g', label='AHU_annual (reference)')
    ax.plot(df_damper_stuck_010[key].values, 'r', label='damper_stuck_010')
    ax.set_title(key)
    ax.set_xlabel('Sample index')
    ax.set_ylabel(key)
    ax.legend()
    plt.show()
    # Release the figure so a long column list does not accumulate open figures.
    plt.close(fig)

In [None]:
selected_features = ['MA_TEMP', 'RA_TEMP', 'RF_WAT']

In [None]:
df_filtered_features_selected = df_filtered[selected_features]

In [None]:
df_filtered_features_selected.to_csv('df_correct_features_selected.csv')

In [None]:
# Summary statistics of the selected signals in the reference run ...
df_filtered_features_selected.describe()

In [None]:
# ... and of the same signals in the damper_stuck_025 run, for comparison.
df_damper_stuck_025[selected_features].describe()

In [None]:
# MA_TEMP of the reference run over time.
# load_data does not parse the 'Datetime' index, so it is converted to
# timestamps here; plotting unparsed labels would give a categorical x-axis
# with one tick per row.
# NOTE(review): assumes the 'Datetime' labels are parseable by pd.to_datetime -- confirm.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(pd.to_datetime(df_correct_data.index), df_correct_data['MA_TEMP'].values)
ax.set_title('MA_TEMP - AHU_annual')
ax.set_xlabel('Datetime')
ax.set_ylabel('MA_TEMP')
plt.show()