## Project: FactoryGuard-AI
### Dataset: [NASA C-MAPSS-1 Turbofan Engine Degradation](https://www.kaggle.com/datasets/bishals098/nasa-turbofan-engine-degradation-simulation)
#### Week 1: Feature Engineering


In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Column names as defined by NASA
columns = ['engine_id', 'cycle'] + \
          [f'op_setting_{i}' for i in range(1, 4)] + \
          [f'sensor_{i}' for i in range(1, 22)]

# Load training data
train_df = pd.read_csv(
    "/mnt/f/Zaalima Internship/Zaalima Project/factoryguard-ai/data/raw/train_FD001.txt",
    sep=' ',
    header=None
)

# Drop completely empty columns (IMPORTANT FIX)
train_df = train_df.dropna(axis=1, how='all')

# Assign column names
train_df.columns = columns

train_df.head()


Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [17]:
train_df.describe()

Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,...,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,51.506568,108.807862,-9e-06,2e-06,100.0,518.67,642.680934,1590.523119,1408.933782,14.62,...,521.41347,2388.096152,8143.752722,8.442146,0.03,393.210654,2388.0,100.0,38.816271,23.289705
std,29.227633,68.88099,0.002187,0.000293,0.0,6.537152e-11,0.500053,6.13115,9.000605,3.3947e-12,...,0.737553,0.071919,19.076176,0.037505,1.556432e-14,1.548763,0.0,0.0,0.180746,0.108251
min,1.0,1.0,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,1382.25,14.62,...,518.69,2387.88,8099.94,8.3249,0.03,388.0,2388.0,100.0,38.14,22.8942
25%,26.0,52.0,-0.0015,-0.0002,100.0,518.67,642.325,1586.26,1402.36,14.62,...,520.96,2388.04,8133.245,8.4149,0.03,392.0,2388.0,100.0,38.7,23.2218
50%,52.0,104.0,0.0,0.0,100.0,518.67,642.64,1590.1,1408.04,14.62,...,521.48,2388.09,8140.54,8.4389,0.03,393.0,2388.0,100.0,38.83,23.2979
75%,77.0,156.0,0.0015,0.0003,100.0,518.67,643.0,1594.38,1414.555,14.62,...,521.95,2388.14,8148.31,8.4656,0.03,394.0,2388.0,100.0,38.95,23.3668
max,100.0,362.0,0.0087,0.0006,100.0,518.67,644.53,1616.91,1441.49,14.62,...,523.38,2388.56,8293.72,8.5848,0.03,400.0,2388.0,100.0,39.43,23.6184


In [18]:
description=train_df.describe().loc["std"].reset_index()
description[description["std"]<0.01] # filtering almost zero variance columns to drop

Unnamed: 0,index,std
2,op_setting_1,0.002187313
3,op_setting_2,0.0002930621
4,op_setting_3,0.0
5,sensor_1,6.537152e-11
9,sensor_5,3.3947e-12
10,sensor_6,0.001388985
14,sensor_10,4.660829e-13
20,sensor_16,1.556432e-14
22,sensor_18,0.0
23,sensor_19,0.0


In [None]:
col_to_remove=description[description["std"]<0.01]["index"].values # Dropping the above columns
train_df.drop(col_to_remove,axis="columns",inplace=True)

In [20]:
train_df

Unnamed: 0,engine_id,cycle,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_20,sensor_21
0,1,1,641.82,1589.70,1400.60,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.4190
1,1,2,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.00,23.4236
2,1,3,642.35,1587.99,1404.20,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442
3,1,4,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739
4,1,5,642.37,1582.85,1406.22,554.00,2388.06,9055.15,47.28,522.19,2388.04,8133.80,8.4294,393,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,643.49,1597.98,1428.63,551.43,2388.19,9065.52,48.07,519.49,2388.26,8137.60,8.4956,397,38.49,22.9735
20627,100,197,643.54,1604.50,1433.58,550.86,2388.23,9065.11,48.04,519.68,2388.22,8136.50,8.5139,395,38.30,23.1594
20628,100,198,643.42,1602.46,1428.18,550.94,2388.24,9065.90,48.09,520.01,2388.24,8141.05,8.5646,398,38.44,22.9333
20629,100,199,643.23,1605.26,1426.53,550.68,2388.25,9073.72,48.39,519.67,2388.23,8139.29,8.5389,395,38.29,23.0640


In [None]:
max_cycle=train_df.groupby("engine_id")["cycle"].max().reset_index().rename({"cycle":"max_cycle"},axis="columns") # MAX cycle for Remaining Useful Life computation
max_cycle

Unnamed: 0,engine_id,max_cycle
0,1,192
1,2,287
2,3,179
3,4,189
4,5,269
...,...,...
95,96,336
96,97,202
97,98,156
98,99,185


In [22]:
train_df=train_df.merge(max_cycle,on="engine_id")
train_df

Unnamed: 0,engine_id,cycle,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_20,sensor_21,max_cycle
0,1,1,641.82,1589.70,1400.60,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.4190,192
1,1,2,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.00,23.4236,192
2,1,3,642.35,1587.99,1404.20,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442,192
3,1,4,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739,192
4,1,5,642.37,1582.85,1406.22,554.00,2388.06,9055.15,47.28,522.19,2388.04,8133.80,8.4294,393,38.90,23.4044,192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,643.49,1597.98,1428.63,551.43,2388.19,9065.52,48.07,519.49,2388.26,8137.60,8.4956,397,38.49,22.9735,200
20627,100,197,643.54,1604.50,1433.58,550.86,2388.23,9065.11,48.04,519.68,2388.22,8136.50,8.5139,395,38.30,23.1594,200
20628,100,198,643.42,1602.46,1428.18,550.94,2388.24,9065.90,48.09,520.01,2388.24,8141.05,8.5646,398,38.44,22.9333,200
20629,100,199,643.23,1605.26,1426.53,550.68,2388.25,9073.72,48.39,519.67,2388.23,8139.29,8.5389,395,38.29,23.0640,200


In [None]:
train_df["RUL"]=train_df["max_cycle"]-train_df["cycle"] # RUL Computation
train_df

Unnamed: 0,engine_id,cycle,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_20,sensor_21,max_cycle,RUL
0,1,1,641.82,1589.70,1400.60,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.4190,192,191
1,1,2,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.00,23.4236,192,190
2,1,3,642.35,1587.99,1404.20,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442,192,189
3,1,4,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739,192,188
4,1,5,642.37,1582.85,1406.22,554.00,2388.06,9055.15,47.28,522.19,2388.04,8133.80,8.4294,393,38.90,23.4044,192,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,643.49,1597.98,1428.63,551.43,2388.19,9065.52,48.07,519.49,2388.26,8137.60,8.4956,397,38.49,22.9735,200,4
20627,100,197,643.54,1604.50,1433.58,550.86,2388.23,9065.11,48.04,519.68,2388.22,8136.50,8.5139,395,38.30,23.1594,200,3
20628,100,198,643.42,1602.46,1428.18,550.94,2388.24,9065.90,48.09,520.01,2388.24,8141.05,8.5646,398,38.44,22.9333,200,2
20629,100,199,643.23,1605.26,1426.53,550.68,2388.25,9073.72,48.39,519.67,2388.23,8139.29,8.5389,395,38.29,23.0640,200,1


In [24]:
train_df.columns

Index(['engine_id', 'cycle', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_7',
       'sensor_8', 'sensor_9', 'sensor_11', 'sensor_12', 'sensor_13',
       'sensor_14', 'sensor_15', 'sensor_17', 'sensor_20', 'sensor_21',
       'max_cycle', 'RUL'],
      dtype='object')

In [25]:
sensors=['sensor_2', 'sensor_3', 'sensor_4', 'sensor_7',
       'sensor_8', 'sensor_9', 'sensor_11', 'sensor_12', 'sensor_13',
       'sensor_14', 'sensor_15', 'sensor_17', 'sensor_20', 'sensor_21']

In [None]:
# Creating t-1, t-2 lag features

for s in sensors:
    train_df[f"{s}_lag_1"]=train_df.groupby("engine_id")[s].shift(1)
    train_df[f'{s}_lag_2']=train_df.groupby("engine_id")[s].shift(2)

In [27]:
train_df.head()

Unnamed: 0,engine_id,cycle,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,...,sensor_14_lag_1,sensor_14_lag_2,sensor_15_lag_1,sensor_15_lag_2,sensor_17_lag_1,sensor_17_lag_2,sensor_20_lag_1,sensor_20_lag_2,sensor_21_lag_1,sensor_21_lag_2
0,1,1,641.82,1589.7,1400.6,554.36,2388.06,9046.19,47.47,521.66,...,,,,,,,,,,
1,1,2,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,...,8138.62,,8.4195,,392.0,,39.06,,23.419,
2,1,3,642.35,1587.99,1404.2,554.26,2388.08,9052.94,47.27,522.42,...,8131.49,8138.62,8.4318,8.4195,392.0,392.0,39.0,39.06,23.4236,23.419
3,1,4,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,...,8133.23,8131.49,8.4178,8.4318,390.0,392.0,38.95,39.0,23.3442,23.4236
4,1,5,642.37,1582.85,1406.22,554.0,2388.06,9055.15,47.28,522.19,...,8133.83,8133.23,8.3682,8.4178,392.0,390.0,38.88,38.95,23.3739,23.3442


In [28]:
# Creating Rolling mean, Exponential Moving Average and Standard Deviation of Sensor readings on differnt windows
window=[1,6,12]

for w in window:
    for s in sensors:
        train_df[f"{s}_mean_{w}"]=train_df.groupby("engine_id")[s].transform(lambda x: x.rolling(window=w).mean())
        train_df[f'{s}_std_{w}']=train_df.groupby("engine_id")[s].transform(lambda x: x.rolling(window=w).std())

for s in sensors:
    train_df[f"{s}_ema_12"]=train_df.groupby("engine_id")[s].transform(lambda x: x.ewm(span=12).mean())


In [29]:
train_df.columns.to_list()

['engine_id',
 'cycle',
 'sensor_2',
 'sensor_3',
 'sensor_4',
 'sensor_7',
 'sensor_8',
 'sensor_9',
 'sensor_11',
 'sensor_12',
 'sensor_13',
 'sensor_14',
 'sensor_15',
 'sensor_17',
 'sensor_20',
 'sensor_21',
 'max_cycle',
 'RUL',
 'sensor_2_lag_1',
 'sensor_2_lag_2',
 'sensor_3_lag_1',
 'sensor_3_lag_2',
 'sensor_4_lag_1',
 'sensor_4_lag_2',
 'sensor_7_lag_1',
 'sensor_7_lag_2',
 'sensor_8_lag_1',
 'sensor_8_lag_2',
 'sensor_9_lag_1',
 'sensor_9_lag_2',
 'sensor_11_lag_1',
 'sensor_11_lag_2',
 'sensor_12_lag_1',
 'sensor_12_lag_2',
 'sensor_13_lag_1',
 'sensor_13_lag_2',
 'sensor_14_lag_1',
 'sensor_14_lag_2',
 'sensor_15_lag_1',
 'sensor_15_lag_2',
 'sensor_17_lag_1',
 'sensor_17_lag_2',
 'sensor_20_lag_1',
 'sensor_20_lag_2',
 'sensor_21_lag_1',
 'sensor_21_lag_2',
 'sensor_2_mean_1',
 'sensor_2_std_1',
 'sensor_3_mean_1',
 'sensor_3_std_1',
 'sensor_4_mean_1',
 'sensor_4_std_1',
 'sensor_7_mean_1',
 'sensor_7_std_1',
 'sensor_8_mean_1',
 'sensor_8_std_1',
 'sensor_9_mean_1',
 

In [30]:
# Target column indicating failure in 24 hrs ( 1: Failure, 0: Working)
train_df["label_24"]=(train_df["RUL"]<=24).astype(int)


In [33]:
train_df

Unnamed: 0,engine_id,cycle,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,...,sensor_9_ema_12,sensor_11_ema_12,sensor_12_ema_12,sensor_13_ema_12,sensor_14_ema_12,sensor_15_ema_12,sensor_17_ema_12,sensor_20_ema_12,sensor_21_ema_12,label_24
0,1,1,641.82,1589.70,1400.60,554.36,2388.06,9046.19,47.47,521.66,...,9046.190000,47.470000,521.660000,2388.020000,8138.620000,8.419500,392.000000,39.060000,23.419000,0
1,1,2,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,...,9045.041667,47.480833,521.995833,2388.047083,8134.757917,8.426162,392.000000,39.027500,23.421492,0
2,1,3,642.35,1587.99,1404.20,554.26,2388.08,9052.94,47.27,522.42,...,9048.124388,47.398545,522.161386,2388.040416,8134.161570,8.422899,391.219400,38.997252,23.391325,0
3,1,4,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,...,9048.552302,47.313776,522.381911,2388.052911,8134.056907,8.405632,391.465805,38.960240,23.385824,0
4,1,5,642.37,1582.85,1406.22,554.00,2388.06,9055.15,47.28,522.19,...,9050.344873,47.304599,522.329769,2388.049403,8133.987106,8.412090,391.882640,38.943873,23.390871,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,643.49,1597.98,1428.63,551.43,2388.19,9065.52,48.07,519.49,...,9066.884787,48.085490,519.920653,2388.235148,8141.022486,8.510795,395.759712,38.441959,23.096156,1
20627,100,197,643.54,1604.50,1433.58,550.86,2388.23,9065.11,48.04,519.68,...,9066.611742,48.078491,519.883630,2388.232817,8140.326719,8.511273,395.642833,38.420119,23.105886,1
20628,100,198,643.42,1602.46,1428.18,550.94,2388.24,9065.90,48.09,520.01,...,9066.502244,48.080262,519.903071,2388.233922,8140.437993,8.519477,396.005474,38.423177,23.079334,1
20629,100,199,643.23,1605.26,1426.53,550.68,2388.25,9073.72,48.39,519.67,...,9067.612668,48.127914,519.867214,2388.233319,8140.261379,8.522465,395.850786,38.402689,23.076975,1


In [None]:
null_df=train_df.isna().sum().sort_values(ascending=False).reset_index() # Creating dataframe for easier null value filteration
null_df=null_df[null_df[0]>0]
null_df

Unnamed: 0,index,0
0,sensor_2_std_1,20631
1,sensor_13_std_1,20631
2,sensor_14_std_1,20631
3,sensor_12_std_1,20631
4,sensor_15_std_1,20631
...,...,...
93,sensor_11_lag_1,100
94,sensor_7_lag_1,100
95,sensor_4_lag_1,100
96,sensor_3_lag_1,100


In [34]:
null_df[null_df["index"].str.contains("_std_")]

Unnamed: 0,index,0
0,sensor_2_std_1,20631
1,sensor_13_std_1,20631
2,sensor_14_std_1,20631
3,sensor_12_std_1,20631
4,sensor_15_std_1,20631
5,sensor_17_std_1,20631
6,sensor_11_std_1,20631
7,sensor_20_std_1,20631
8,sensor_21_std_1,20631
9,sensor_9_std_1,20631


std_1 of all sensors have null values so dropping those columns

In [35]:
col_to_remove=[col for col in train_df.columns if "_std_1" in col]
train_df.drop(col_to_remove,axis="columns",inplace=True)

In [37]:
(train_df.isna().sum()).sort_values(ascending=False)

sensor_4_mean_12     1100
sensor_14_mean_12    1100
sensor_3_mean_12     1100
sensor_20_mean_12    1100
sensor_21_mean_12    1100
                     ... 
sensor_13_mean_1        0
sensor_14_mean_1        0
sensor_15_mean_1        0
sensor_17_mean_1        0
label_24                0
Length: 117, dtype: int64

In [38]:
# Minor null values compared to the size of dataset so dropping NaN values
train_df.dropna(inplace=True)

In [39]:
train_df

Unnamed: 0,engine_id,cycle,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,...,sensor_9_ema_12,sensor_11_ema_12,sensor_12_ema_12,sensor_13_ema_12,sensor_14_ema_12,sensor_15_ema_12,sensor_17_ema_12,sensor_20_ema_12,sensor_21_ema_12,label_24
11,1,12,642.06,1583.41,1400.15,554.52,2388.09,9049.37,47.18,521.80,...,9049.579221,47.211608,521.938474,2388.035130,8133.076871,8.408378,391.769252,38.992928,23.401434,0
12,1,13,643.07,1582.19,1400.83,553.44,2388.12,9046.82,47.38,521.85,...,9049.100115,47.240847,521.923112,2388.042921,8132.212697,8.409562,391.982957,38.982001,23.379602,0
13,1,14,642.35,1592.95,1399.16,554.48,2388.09,9047.37,47.44,521.67,...,9048.805532,47.274756,521.880015,2388.035613,8132.590233,8.407321,392.156127,39.015714,23.380112,0
14,1,15,642.43,1583.82,1402.13,553.64,2388.11,9052.22,47.30,522.50,...,9049.377514,47.278985,521.983873,2388.043048,8131.747583,8.409428,391.962456,39.011407,23.375068,0
15,1,16,642.13,1587.98,1404.50,553.94,2388.05,9049.34,47.24,521.49,...,9049.371314,47.272543,521.902257,2388.047502,8132.468507,8.406813,391.968660,39.004564,23.388277,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,643.49,1597.98,1428.63,551.43,2388.19,9065.52,48.07,519.49,...,9066.884787,48.085490,519.920653,2388.235148,8141.022486,8.510795,395.759712,38.441959,23.096156,1
20627,100,197,643.54,1604.50,1433.58,550.86,2388.23,9065.11,48.04,519.68,...,9066.611742,48.078491,519.883630,2388.232817,8140.326719,8.511273,395.642833,38.420119,23.105886,1
20628,100,198,643.42,1602.46,1428.18,550.94,2388.24,9065.90,48.09,520.01,...,9066.502244,48.080262,519.903071,2388.233922,8140.437993,8.519477,396.005474,38.423177,23.079334,1
20629,100,199,643.23,1605.26,1426.53,550.68,2388.25,9073.72,48.39,519.67,...,9067.612668,48.127914,519.867214,2388.233319,8140.261379,8.522465,395.850786,38.402689,23.076975,1


In [45]:
import os
os.makedirs("/mnt/f/Zaalima Internship/Zaalima Project/factoryguard-ai/data/processed",exist_ok=True)
train_df.to_csv("/mnt/f/Zaalima Internship/Zaalima Project/factoryguard-ai/data/processed/processed_data.csv",index=False)