In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

---
### 1. Schema (Based on NASA readme.txt)
---
  The documentation tells us:
- Col 1: Unit Number (The Engine ID)
- Col 2: Time in Cycles (The Age of the Engine)
- Col 3-5: Operational Settings (Altitude, Mach, Throttle)
- Col 6-26: Sensor Readings (21 distinct sensors)

In [2]:

# Column layout of the raw files: two index columns, three operational
# settings, then 21 sensor channels (s_1 .. s_21) -- 26 names in total.
index_names = ['unit_nr', 'time_cycles']
setting_names = [f'setting_{k}' for k in (1, 2, 3)]
sensor_names = [f's_{k}' for k in range(1, 22)]
col_names = [*index_names, *setting_names, *sensor_names]



---
### 2. Ingesting the Raw Data.
---


In [3]:

# Load the three whitespace-delimited FD001 text files. They carry no header
# row, so the column names are supplied explicitly.
# The separator is written as a raw string: in a plain string literal '\s' is
# an invalid escape sequence, which newer Python versions warn about.
train = pd.read_csv('../data/raw/train_FD001.txt', sep=r'\s+', header=None, names=col_names)
test = pd.read_csv('../data/raw/test_FD001.txt', sep=r'\s+', header=None, names=col_names)
y_test = pd.read_csv('../data/raw/RUL_FD001.txt', sep=r'\s+', header=None, names=['RUL'])


---
### 3. The "Sanity Check"
---


In [4]:

# Report (rows, columns) for each loaded frame, one line per frame.
for label, frame in (("Train", train), ("Test", test), ("y_test", y_test)):
    print(f"{label} Shape: {frame.shape}")


Train Shape: (20631, 26)
Test Shape: (13096, 26)
y_test Shape: (100, 1)


Inspect the first few rows to verify that the columns are aligned correctly.


In [5]:

# Show the first five rows as the cell's rich output.
train.head(n=5)

Unnamed: 0,unit_nr,time_cycles,setting_1,setting_2,setting_3,s_1,s_2,s_3,s_4,s_5,...,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


Checking for Missing Values (NaN)

In [6]:

# Total number of null cells across the whole training frame
# (per-column null counts, summed again over columns).
missing_total = train.isnull().sum().sum()
print("Missing Values in Train:", missing_total)


Missing Values in Train: 0


Checking Data Types

In [7]:

# Print the heading and the per-column dtypes, each on its own line.
print("\nData Types:", train.dtypes, sep="\n")



Data Types:
unit_nr          int64
time_cycles      int64
setting_1      float64
setting_2      float64
setting_3      float64
s_1            float64
s_2            float64
s_3            float64
s_4            float64
s_5            float64
s_6            float64
s_7            float64
s_8            float64
s_9            float64
s_10           float64
s_11           float64
s_12           float64
s_13           float64
s_14           float64
s_15           float64
s_16           float64
s_17             int64
s_18             int64
s_19           float64
s_20           float64
s_21           float64
dtype: object


Check for Duplicates

In [8]:
# Count rows that exactly repeat an earlier row (first occurrence not counted).
duplicate_total = train.duplicated().sum()
print("\nDuplicate Rows:", duplicate_total)


Duplicate Rows: 0


---
### 4. Calculating the Target Variable (RUL) for Training Data
---
1. Get the maximum cycle number for each engine (unit_nr)
   This represents the total life of that specific engine.

In [9]:
# Largest time_cycles value observed per engine, as a two-column frame
# (unit_nr, max_cycle) that can be merged back onto the per-cycle rows.
max_cycle = (
    train.groupby('unit_nr')['time_cycles']
    .max()
    .rename('max_cycle')
    .reset_index()
)


2. Merging this `max_cycle` back into the main dataframe
   Now every row knows how long that specific engine ended up living.

In [10]:
# Left-join the per-engine maximum onto every row of that engine; all
# original rows of `train` are kept.
train = pd.merge(train, max_cycle, how='left', on='unit_nr')

3. Calculating RUL
   RUL = Total Life - Current Age

In [11]:
# Remaining useful life: cycles left until the engine's last recorded cycle,
# so the final row of each engine gets RUL == 0.
train['RUL'] = train['max_cycle'].sub(train['time_cycles'])

4. Dropping the helper column

In [12]:
# Remove the helper column now that RUL has been derived from it.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone is a no-op
# rather than a KeyError.
train = train.drop(columns='max_cycle', errors='ignore')

5. Verifying the Result

In [13]:
# Preview the engine id, age and derived target for the first ten rows.
preview_cols = ['unit_nr', 'time_cycles', 'RUL']
train.loc[:, preview_cols].head(10)

Unnamed: 0,unit_nr,time_cycles,RUL
0,1,1,191
1,1,2,190
2,1,3,189
3,1,4,188
4,1,5,187
5,1,6,186
6,1,7,185
7,1,8,184
8,1,9,183
9,1,10,182


---
### 5. Checkpoint: Saved to `processed` folder
---


In [14]:
# Persist the labelled training frame as a checkpoint for later notebooks.
processed_path = Path('../data/processed/train_FD001_with_RUL.csv')

# to_csv does not create missing directories, so make sure the target folder
# exists (e.g. on a fresh checkout where data/processed has not been created).
processed_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the default RangeIndex carries no information worth storing.
train.to_csv(processed_path, index=False)

print("Data saved successfully to data/processed/train_FD001_with_RUL.csv")

Data saved successfully to data/processed/train_FD001_with_RUL.csv
