In [5]:
import pandas as pd
import numpy as np 

# Load the FD001 train / test / RUL files.
#
# The files are read as delimiter-separated text with no header row.
# Splitting on runs of whitespace (r"\s+") instead of on a single space means
# trailing blanks at the end of a line do not produce extra, entirely empty
# columns (presumably the reason the all-NaN columns had to be dropped --
# confirm against the raw files).


def _read_cmaps_table(path):
    """Read one header-less, whitespace-delimited text file into a DataFrame.

    Columns that are NaN in every row are dropped. With the whitespace
    separator there should normally be none; the drop is kept as a guard so
    the result matches what the rest of the notebook expects. No rows are
    removed.
    """
    table = pd.read_csv(path, sep=r"\s+", header=None)
    return table.dropna(axis=1, how="all")


train = _read_cmaps_table('CMaps/train_FD001.txt')
test = _read_cmaps_table('CMaps/test_FD001.txt')

# Only the first column of the RUL file is kept, as a Series.
rul = pd.read_csv('CMaps/RUL_FD001.txt', sep=r"\s+", header=None).iloc[:, 0]

# Label the data: 2 id columns + 3 operating settings + 21 sensors = 26 names.
# Assigning the list raises if a frame does not have exactly 26 columns.
columns = (
    ["engine_id", "time_in_cycles"]
    + [f"op_setting_{i}" for i in range(1, 4)]
    + [f"sensor_measurement_{i}" for i in range(1, 22)]
)
train.columns = columns
test.columns = columns

# Check the data: print the first rows of each object under its heading.
for heading, preview in (
    ("Train DataFrame:", train),
    ("\nTest DataFrame:", test),
    ("\nRUL DataFrame:", rul),
):
    print(heading)
    print(preview.head())

Train DataFrame:
   engine_id  time_in_cycles  op_setting_1  op_setting_2  op_setting_3  \
0          1               1       -0.0007       -0.0004         100.0   
1          1               2        0.0019       -0.0003         100.0   
2          1               3       -0.0043        0.0003         100.0   
3          1               4        0.0007        0.0000         100.0   
4          1               5       -0.0019       -0.0002         100.0   

   sensor_measurement_1  sensor_measurement_2  sensor_measurement_3  \
0                518.67                641.82               1589.70   
1                518.67                642.15               1591.82   
2                518.67                642.35               1587.99   
3                518.67                642.35               1582.79   
4                518.67                642.37               1582.85   

   sensor_measurement_4  sensor_measurement_5  ...  sensor_measurement_12  \
0               1400.60           

In [6]:
# Detect and eliminate outliers with an Isolation Forest

from sklearn.ensemble import IsolationForest

# Only the sensor channels are passed to the detector; the id, cycle and
# operating-setting columns are left out.
sensor_columns = [name for name in train.columns if "sensor_measurement" in name]
sensor_data = train[sensor_columns]

# Fit the model and label every row in one step.
# fit_predict returns 1 for normal rows and -1 for anomalies.
iso = IsolationForest(contamination=0.01, random_state=42)
outlier_flags = iso.fit_predict(sensor_data)

# Store the labels next to the rows they belong to so they can be inspected.
# Note: this adds an "outlier_flag" column to `train` itself.
train["outlier_flag"] = outlier_flags

# Report how many rows were flagged as anomalies.
print(f"Number of outliers detected by Isolation Forest: {(outlier_flags == -1).sum()}")

# Keep the rows labelled normal, then drop the helper column from the copy.
inlier_mask = train["outlier_flag"] == 1
train_cleaned = train.loc[inlier_mask].drop(columns=["outlier_flag"])

# Confirm the size of the cleaned dataset.
print(f"Shape of dataset after outlier removal: {train_cleaned.shape}")

Number of outliers detected by Isolation Forest: 207
Shape of dataset after outlier removal: (20424, 26)


In [7]:
# Summary statistics for the rows the Isolation Forest labelled -1 (anomalies).
# NOTE(review): compare time_in_cycles here against the full training set --
# if the flagged rows cluster late in each engine's run they may reflect
# degradation rather than noise; confirm before discarding them.
flagged_mask = outlier_flags == -1
outliers = train[flagged_mask]
print(outliers.describe())

        engine_id  time_in_cycles  op_setting_1  op_setting_2  op_setting_3  \
count  207.000000      207.000000    207.000000    207.000000         207.0   
mean    46.637681      197.777778     -0.000183     -0.000012         100.0   
std     29.085564       44.567271      0.002160      0.000300           0.0   
min      1.000000        1.000000     -0.005600     -0.000500         100.0   
25%     18.500000      176.000000     -0.001700     -0.000300         100.0   
50%     48.000000      195.000000     -0.000300      0.000000         100.0   
75%     72.000000      213.000000      0.001350      0.000200         100.0   
max     99.000000      362.000000      0.006800      0.000600         100.0   

       sensor_measurement_1  sensor_measurement_2  sensor_measurement_3  \
count                207.00            207.000000            207.000000   
mean                 518.67            643.706329           1602.880000   
std                    0.00              0.349036              