In [1]:
import pandas as pd
from tsfresh import extract_features
from tsfresh.transformers import RelevantFeatureAugmenter
from tsfresh.utilities.dataframe_functions import impute
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, TimeSeriesSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from xgboost import XGBClassifier

In [9]:
def loading_data():
    # Read each file into a DataFrame
    df_PS2 = pd.read_csv("../data_subset/PS2.txt", sep="\t", header=None)
    df_FS1 = pd.read_csv("../data_subset/FS1.txt", sep="\t", header=None)
    df_profile = pd.read_csv("../data_subset/profile.txt", sep="\t", header=None)
    
    df_profile.columns = ['cooler_condition_%', 'valve_condition_%', 'internal_pump_leakage',
                          'hydraulic_accumulator_bar', 'stable_flag' ]
    return df_PS2, df_FS1, df_profile

In [10]:
def resample_dataframes(df_PS2, df_FS1):
    # Transpose the FS1 DataFrame to have time series as rows
    df_FS1_transposed = df_FS1.T
    
    # Create a datetime index for the transposed FS1 DataFrame
    fs1_time_index = pd.date_range(start='2024-01-01', periods=len(df_FS1_transposed), freq='10ms')
    df_FS1_transposed.index = fs1_time_index
    
    # Resample FS1 to match the frequency of PS2 (from 10 Hz to 100 Hz)
    df_FS1_resampled_transposed = df_FS1_transposed.resample('10ms').interpolate()
    
    # Transpose back to the original format
    df_FS1_resampled = df_FS1_resampled_transposed.T
    
    # Transpose the PS2 DataFrame to have time series as rows
    df_PS2_transposed = df_PS2.T
    
    # Create a datetime index for the transposed PS2 DataFrame
    ps2_time_index = pd.date_range(start='2024-01-01', periods=len(df_PS2_transposed), freq='10ms')
    df_PS2_transposed.index = ps2_time_index
    
    # Resample PS2 to match the frequency of FS1 (from 100 Hz to 10 Hz)
    df_PS2_resampled_transposed = df_PS2_transposed.resample('100ms').interpolate()
    
    # Transpose back to the original format
    df_PS2_resampled = df_PS2_resampled_transposed.T
    return df_PS2_resampled, df_FS1_resampled

In [16]:
def wrangling_for_tsfresh(df):
    # Reset index to convert the timestamps to a regular column
    df = df.reset_index()
    
    # Melt the DataFrame to create the 'id', 'timestamp', and 'value' columns
    df_melted = pd.melt(df, id_vars=['index'], var_name='timestamp', value_name='value')
    
    # Rename the columns
    df_melted.columns = ['id', 'timestamp', 'value']
    
    # Sort by 'id' for better clarity
    df_melted = df_melted.sort_values(by='id').reset_index(drop=True)
    return df_melted

In [None]:
# Fit the pipeline to your data
df["valve_condition_%"].iloc[:2000]


In [17]:
def train_test_split_(df, nb_cycles=2000):
    X = df.drop(columns = ["valve_condition_%"]).copy()
    y = df["valve_condition_%"].copy()
    X_train_test, X_val = X.iloc[:nb_cycles], X.iloc[nb_cycles:]
    y_train_test, y_val = y.iloc[:nb_cycles], y.iloc[nb_cycles:]
    return X_train_test, X_val, y_train_test, y_val
    

In [18]:
def pipeline_creation(X_train, y_train):
    # Define the pipeline
    pipeline = Pipeline([
        ('augmenter', RelevantFeatureAugmenter(column_id='id', column_sort='timestamp', n_jobs=3)),
        ('classifier', XGBClassifier(
            # random_state=42,
            # n_estimators=100,  # Number of trees in the ensemble
            # max_depth=2,  # Maximum depth of each tree
            # learning_rate=0.1,  # Step size shrinkage used in update to prevent overfitting
            # min_child_weight=1,  # Minimum sum of instance weight (hessian) needed in a child
            # subsample=0.8,  # Subsample ratio of the training instance
            # colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
            # reg_alpha=0,  # L1 regularization term on weights
            # reg_lambda=1,  # L2 regularization term on weights
            # gamma=0,  # Minimum loss reduction required to make a further partition on a leaf node of the tree
        ))  # XGBoost classifier
    ])
    
    # Define the evaluation metrics
    scoring = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1',
        'roc_auc': 'roc_auc'
    }
    pipeline.set_params(augmenter__timeseries_container=X_train)
    # Perform time series cross-validation
    cv_results = cross_validate(pipeline, X_train, y_train, cv=TimeSeriesSplit(n_splits=5), scoring=scoring)
    
    # Compute relevant metrics
    metrics_df = pd.DataFrame({
        'Accuracy': cv_results['test_accuracy'],
        'Precision': cv_results['test_precision'],
        'Recall': cv_results['test_recall'],
        'F1': cv_results['test_f1'],
        'AUC': cv_results['test_roc_auc']
    })
    return pipeline, cv_results, metrics_df


In [19]:
def validation_score(pipeline, X_train, y_train, X_test, y_test):
    pipeline.set_params(augmenter__timeseries_container=X_train)
    # Fit the pipeline to the training data
    pipeline.fit(X_train, y_train)
    
    # Predict the target variable on the test data
    y_pred = pipeline.predict(X_test)
    
    # Compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    
    # Create a DataFrame to store the metrics
    metrics_df = pd.DataFrame({
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1': [f1],
        'AUC': [roc_auc]
    })
    return metrics_df

In [21]:
# def main():
# loading data
df_PS2, df_FS1, df_profile = loading_data()
# resampling data with same frequency 
df_PS2_resampled, df_FS1_resampled = resample_dataframes(df_PS2, df_FS1)
# data wrangling - for tsfresh 
df_PS2_melted = wrangling_for_tsfresh(df_PS2_resampled)
df_FS1_melted = wrangling_for_tsfresh(df_FS1_resampled) #[lambda x: x.id==0]
# full dataframe 
df_full = pd.concat([df_FS1_melted.rename(columns={'value': 'fs1'}),
                     df_PS2_melted.rename(columns={'value': 'ps2'})['ps2']], axis=1)
# adding target variable to dataframe 
df_to_model = pd.concat([df_full, df_profile["valve_condition_%"]], axis=1)
# Changing target variable into a binary variable 100=1, the rest=0
df_to_model['valve_condition_%'] = np.where(df_to_model['valve_condition_%'] == 100, 1, 0)
# train-test split
X_train_test, X_val, y_train_test, y_val = train_test_split_(df_to_model, nb_cycles=2000)

In [33]:
df_full

Unnamed: 0,id,timestamp,fs1,ps2
0,0,2024-01-01 00:00:00,8.990,125.50
1,0,2024-01-01 00:00:00.550000,0.000,0.00
2,0,2024-01-01 00:00:03.430000,7.773,140.41
3,0,2024-01-01 00:00:03.440000,7.968,140.60
4,0,2024-01-01 00:00:00.540000,0.001,0.00
...,...,...,...,...
1322995,2204,2024-01-01 00:00:03.970000,8.131,130.86
1322996,2204,2024-01-01 00:00:03.960000,7.908,130.93
1322997,2204,2024-01-01 00:00:03.950000,8.288,130.65
1322998,2204,2024-01-01 00:00:03.930000,9.444,131.94


In [None]:
df_temp = df_profile.copy()
df_temp["valve_condition_%"] = np.where(df_profile["valve_condition_%"]== 100, 1, 0)
df = df_full.copy()
nb_cycles=100

X = df.copy()
y = df_temp["valve_condition_%"].copy()
X_train_test, X_val = X.iloc[:nb_cycles], X.iloc[nb_cycles:]
# Define the pipeline with only the first step
pipeline_first_step = Pipeline([
    ('augmenter', RelevantFeatureAugmenter(column_id='id', column_sort='timestamp', n_jobs=3))
])

# Set the timeseries container
pipeline_first_step.set_params(augmenter__timeseries_container=df)
# Fit the pipeline to your data
pipeline_first_step.fit(X_train_test, y.iloc[:nb_cycles])
# Transform the data using only the first step
X_transformed = pipeline_first_step.transform(X_train_test)

Feature Extraction:  20%|███████████████▊                                                               | 3/15 [00:13<00:37,  3.11s/it]

In [None]:
X_transformed

In [None]:
# cross-validation
print('CROSS-VALIDATION STARTED')
pipeline, cv_results, cv_metrics_df = pipeline_creation(X_train_test, y_train_test)
cv_metrics_df.to_csv("../data/cv_metrics_df.csv")
# validation performance 
print('VALIDATION STARTED')
val_metrics_df = validation_score(pipeline, X_train_test, y_train_test, X_val, y_val)
val_metrics_df.to_csv("../data/val_metrics_df.csv")