In [1]:
import pathlib
import pandas as pd

# Build the data directory from path components rather than a "..\Data" string:
# the backslash form contains the invalid escape sequence "\D" and is only a
# directory separator on Windows (on POSIX it would name a file called "..\Data").
DATA_DIR = pathlib.Path("..") / "Data"

# Load the preprocessed sensor readings and preview the first rows.
df = pd.read_feather(DATA_DIR / "Processed.feather")
df.head(10)

Unnamed: 0,timestamp,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Flowmeter,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Oil_level,Caudal_impulses,gpsSpeed
0,2022-01-01 06:00:00,-0.012001,9.757812,9.757812,-0.028,1.576172,63.34375,19.046875,3.955078,True,False,True,True,False,False,False,0
1,2022-01-01 06:00:01,-0.012001,9.757812,9.757812,-0.028,1.578125,63.25,19.046875,4.027344,True,False,True,True,False,False,False,0
2,2022-01-01 06:00:02,-0.010002,9.757812,9.757812,-0.028,1.578125,63.3125,19.046875,3.945312,True,False,True,True,False,False,False,0
3,2022-01-01 06:00:03,-0.012001,9.757812,9.757812,-0.029999,1.576172,63.1875,19.046875,3.929688,True,False,True,True,False,False,False,0
4,2022-01-01 06:00:04,-0.012001,9.757812,9.757812,-0.029999,1.578125,63.15625,19.046875,3.994141,True,False,True,True,False,False,False,0
5,2022-01-01 06:00:05,-0.012001,9.757812,9.757812,-0.029999,1.576172,63.0625,19.046875,3.994141,True,False,True,True,False,False,False,0
6,2022-01-01 06:00:06,-0.012001,9.757812,9.757812,-0.028,1.578125,63.0,19.046875,3.947266,True,False,True,True,False,False,False,0
7,2022-01-01 06:00:07,-0.010002,9.75,9.757812,-0.028,1.576172,63.0625,19.046875,4.023438,True,False,True,True,False,False,False,0
8,2022-01-01 06:00:08,-0.012001,9.75,9.75,-0.028,1.576172,63.0625,19.046875,3.953125,True,False,True,True,False,False,False,0
9,2022-01-01 06:00:09,-0.010002,9.75,9.75,-0.028,1.576172,63.0,19.046875,3.925781,True,False,True,True,False,False,False,0


## Preparing the Data for Classification

For classification, we treat this as a multi-class problem in which each point is assigned one of three labels:

- If the point is normal operating condition, we will label it as 0
- If the point falls within the 2 hours (`N_Hours`) immediately before the start of a known failure, we label it as 1
- If the point is in the failure interval we label it as 2

We do this because it is hard for an ML model to predict a continuous timestamp, so some discretization is needed. One point to note is the threshold `N_Hours`: accuracy may decrease if this threshold is too high, because the model is then more likely to confuse the normal and pre-failure states.

In [11]:
# Work on a copy so the loaded frame `df` stays untouched for the later sections.
Classification_df = df.copy(deep=True)

# Known failure intervals (start, end), as given in the data description.
failure_periods = [
    ("2022-02-28 21:53", "2022-03-01 02:00"),
    ("2022-03-23 14:54", "2022-03-23 15:24"),
    ("2022-05-30 12:00", "2022-06-02 06:18"),
]

failure_periods = [
    (pd.to_datetime(start), pd.to_datetime(end)) for start, end in failure_periods
]

# Width, in hours, of the "about to fail" window that precedes each failure.
N_Hours = 2

# Every point starts out as normal operation (label 0).
Classification_df["Target"] = 0

timestamps = Classification_df["timestamp"]
pre_failure_window = pd.Timedelta(hours=N_Hours)

# Pass 1: label the N_Hours leading up to each failure start as 1.
for start, _ in failure_periods:
    mask = (timestamps >= start - pre_failure_window) & (timestamps < start)
    Classification_df.loc[mask, "Target"] = 1  # Label as 1 for pre-failure state

# Pass 2: label the failure intervals themselves as 2 (both ends inclusive).
# This runs after pass 1 so that, if N_Hours is raised far enough for a
# pre-failure window to reach back into an earlier failure interval, the
# failure label is not overwritten by the pre-failure label.
for start, end in failure_periods:
    mask = (timestamps >= start) & (timestamps <= end)
    Classification_df.loc[mask, "Target"] = 2  # Label as 2 for failure state

# Show the first few rows of the updated DataFrame
Classification_df.head()

Unnamed: 0,timestamp,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Flowmeter,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Oil_level,Caudal_impulses,gpsSpeed,Target
0,2022-01-01 06:00:00,-0.012001,9.757812,9.757812,-0.028,1.576172,63.34375,19.046875,3.955078,True,False,True,True,False,False,False,0,0
1,2022-01-01 06:00:01,-0.012001,9.757812,9.757812,-0.028,1.578125,63.25,19.046875,4.027344,True,False,True,True,False,False,False,0,0
2,2022-01-01 06:00:02,-0.010002,9.757812,9.757812,-0.028,1.578125,63.3125,19.046875,3.945312,True,False,True,True,False,False,False,0,0
3,2022-01-01 06:00:03,-0.012001,9.757812,9.757812,-0.029999,1.576172,63.1875,19.046875,3.929688,True,False,True,True,False,False,False,0,0
4,2022-01-01 06:00:04,-0.012001,9.757812,9.757812,-0.029999,1.578125,63.15625,19.046875,3.994141,True,False,True,True,False,False,False,0,0


In [12]:
# Persist the labelled classification frame alongside the other datasets.
Classification_df.to_feather(DATA_DIR.joinpath("Classification.feather"))

## Creating a Regression data frame

This data is used for regression; the task is also known as time-to-failure prediction or remaining-useful-life prediction.

In [2]:
# Independent copy of the loaded frame for the regression target.
Reg_df = df.copy(deep=True)

# Known failure intervals as (start, end) strings, parsed to Timestamps below.
failure_periods = [
    ("2022-02-28 21:53", "2022-03-01 02:00"),
    ("2022-03-23 14:54", "2022-03-23 15:24"),
    ("2022-05-30 12:00", "2022-06-02 06:18"),
]

failure_periods = [
    (pd.to_datetime(begin), pd.to_datetime(finish))
    for begin, finish in failure_periods
]

In [4]:
# Order the failure intervals chronologically by their start timestamp (in place).
failure_periods.sort(key=lambda period: period[0])

def find_time_till_failure(curr_time, periods=None):
    """Return the number of hours from ``curr_time`` until the next failure starts.

    Parameters
    ----------
    curr_time : pandas.Timestamp
        The timestamp to evaluate.
    periods : iterable of (start, end) Timestamp pairs, optional
        Failure intervals to consider. Defaults to the notebook-level
        ``failure_periods`` list. The pairs do not need to be sorted.

    Returns
    -------
    float or int
        * ``0.0`` when ``curr_time`` lies inside a failure interval
          (``start <= curr_time <= end``), i.e. the failure is already happening.
        * The hours until the nearest upcoming failure start otherwise.
        * ``0`` when no failure starts after ``curr_time``.
          NOTE(review): this makes "no upcoming failure" indistinguishable from
          "failing now"; consider dropping or masking those rows downstream.
    """
    if periods is None:
        periods = failure_periods

    hours_to_next = None
    for start, end in periods:
        if curr_time < start:
            # Upcoming failure: keep the closest one.
            hours = (start - curr_time).total_seconds() / 3600
            if hours_to_next is None or hours < hours_to_next:
                hours_to_next = hours
        elif curr_time <= end:
            # Already inside a failure interval: no time is left until failure.
            return 0.0

    return 0 if hours_to_next is None else hours_to_next

# Compute the regression target row by row from the timestamp column,
# then attach it to the frame as "Hours_till_Failure".
hours_till_failure = Reg_df["timestamp"].apply(find_time_till_failure)
Reg_df["Hours_till_Failure"] = hours_till_failure


In [8]:
# Persist the regression frame (features plus "Hours_till_Failure") to disk.
Reg_df.to_feather(DATA_DIR.joinpath("Regression.feather"))

## Creating a failure detection Dataset
- 0 on normal conditions
- 1 if the point lies in known failure times

In [60]:
# Independent copy of the loaded frame for the binary failure-detection target.
FD_df = df.copy(deep=True)

# Known failure intervals as (start, end) strings, parsed to Timestamps below.
failure_periods = [
    ("2022-02-28 21:53", "2022-03-01 02:00"),
    ("2022-03-23 14:54", "2022-03-23 15:24"),
    ("2022-05-30 12:00", "2022-06-02 06:18"),
]

failure_periods = [
    (pd.to_datetime(begin), pd.to_datetime(finish))
    for begin, finish in failure_periods
]

# Accumulate one boolean mask that is True wherever the timestamp lies inside
# any failure interval (both endpoints inclusive).
in_failure = pd.Series(False, index=FD_df.index)
for begin, finish in failure_periods:
    in_failure = in_failure | FD_df["timestamp"].between(begin, finish)

# 1 inside a known failure interval, 0 otherwise.
FD_df["Failure"] = in_failure.astype("int64")

FD_df.to_feather(DATA_DIR.joinpath("Failure_detection.feather"))