In [1]:
import numpy as np
import pandas as pd

In [2]:
df_raw = pd.read_csv('thermal_data.csv')

In [3]:
display(df_raw.head())
display(df_raw.tail())
df_raw.info()

Unnamed: 0,timestamp,unix_time,cpu_load,ram_usage,ambient_temp,cpu_temp
0,2026-01-31 16:56:43,1769858803,1.5,24.9,22.02,41.88
1,2026-01-31 16:56:45,1769858805,3.0,24.9,22.02,41.38
2,2026-01-31 16:56:46,1769858806,3.5,24.9,22.03,40.75
3,2026-01-31 16:56:48,1769858808,2.0,24.9,22.04,40.25
4,2026-01-31 16:56:49,1769858809,4.0,24.9,22.04,40.0


Unnamed: 0,timestamp,unix_time,cpu_load,ram_usage,ambient_temp,cpu_temp
1158,2026-01-31 17:26:37,1769860597,20.0,25.2,22.56,59.0
1159,2026-01-31 17:26:39,1769860599,1.2,25.1,22.54,57.0
1160,2026-01-31 17:26:40,1769860600,2.7,24.8,22.52,55.38
1161,2026-01-31 17:26:42,1769860602,2.0,24.8,22.5,53.88
1162,2026-01-31 17:26:43,1769860603,0.7,24.7,22.48,52.38


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1163 entries, 0 to 1162
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     1163 non-null   object 
 1   unix_time     1163 non-null   int64  
 2   cpu_load      1163 non-null   float64
 3   ram_usage     1163 non-null   float64
 4   ambient_temp  1163 non-null   float64
 5   cpu_temp      1163 non-null   float64
dtypes: float64(4), int64(1), object(1)
memory usage: 54.6+ KB


In [4]:
df_raw.duplicated().sum()

np.int64(0)

In [5]:
def removeOutliers(df, columns):
    df_clean = df.copy()
    initial_rows = len(df)

    for col in columns:
        if col in df_clean.columns:
            Q1 = df_clean[col].quantile(0.25)
            Q3 = df_clean[col].quantile(0.75)
            IQR = Q3 - Q1

            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            df_clean = df_clean[(df_clean[col]>=lower_bound) & (df_clean[col]<=upper_bound)]

    return df_clean

column_to_clean = ['cpu_load', 'ram_usage', 'cpu_temp', 'ambient_temp']

df_clean = removeOutliers(df = df_raw, columns=column_to_clean)

In [6]:
df_clean = df_clean.sort_values('unix_time').reset_index(drop=True)

In [7]:
df_clean

Unnamed: 0,timestamp,unix_time,cpu_load,ram_usage,ambient_temp,cpu_temp
0,2026-01-31 16:56:43,1769858803,1.5,24.9,22.02,41.88
1,2026-01-31 16:56:45,1769858805,3.0,24.9,22.02,41.38
2,2026-01-31 16:56:46,1769858806,3.5,24.9,22.03,40.75
3,2026-01-31 16:56:48,1769858808,2.0,24.9,22.04,40.25
4,2026-01-31 16:56:49,1769858809,4.0,24.9,22.04,40.00
...,...,...,...,...,...,...
1056,2026-01-31 17:26:32,1769860592,21.6,25.1,22.63,59.00
1057,2026-01-31 17:26:39,1769860599,1.2,25.1,22.54,57.00
1058,2026-01-31 17:26:40,1769860600,2.7,24.8,22.52,55.38
1059,2026-01-31 17:26:42,1769860602,2.0,24.8,22.50,53.88


In [8]:
def addFeatures(df_clean):
    df = df_clean.copy()

    # lag features
    df['cpu_temp_lag1'] = df['cpu_temp'].shift(1)
    df['cpu_temp_lag5'] = df['cpu_temp'].shift(5)
    df['cpu_load_lag1'] = df['cpu_load'].shift(1)
    df['cpu_load_lag5'] = df['cpu_load'].shift(5)
    df['cpu_load_lag10'] = df['cpu_load'].shift(10)

    # rate features
    df['temp_rate'] = df['cpu_temp'].diff()
    df['temp_accelaration'] = df['temp_rate'].diff()
    df['load_rate'] = df['cpu_load'].diff()

    # rolling features
    df['cpu_temp_roll10'] = df['cpu_temp'].rolling(window = 10).mean()
    df['cpu_load_roll10'] = df['cpu_load'].rolling(window = 10).mean()
    df['cpu_load_roll30'] = df['cpu_load'].rolling(window = 30).mean()
    df['cpu_load_std10'] = df['cpu_load'].rolling(window=10).std()

    # interaction features
    df['load_ambient_interaction'] = df['cpu_load'] * df['ambient_temp']
    df['thermal_stress'] = df['cpu_load'] * df['cpu_temp']
    df['temp_above_ambient'] = df['cpu_temp'] - df['ambient_temp']


    df = df.dropna()

    return df

df_features = addFeatures(df_clean)

In [10]:
print("\nSample of Engineered Features:")
print("="*100)

# Show subset of features
sample_cols = [
    'cpu_load', 'cpu_temp',
    'cpu_load_lag1', 'cpu_temp_lag1',
    'temp_rate', 'cpu_load_roll10',
    'thermal_stress'
]

display(df_features[sample_cols].head(20))

print("\nAll Feature Names:")
feature_cols = [col for col in df_features.columns 
                if col not in ['timestamp', 'unix_time', 'cpu_temp']]
for i, col in enumerate(feature_cols, 1):
    print(f"{i:2d}. {col}")
    
print(f"\nTotal features for training: {len(feature_cols)}")


Sample of Engineered Features:


Unnamed: 0,cpu_load,cpu_temp,cpu_load_lag1,cpu_temp_lag1,temp_rate,cpu_load_roll10,thermal_stress
29,11.7,45.38,3.2,45.25,0.13,8.16,530.946
30,1.8,44.88,11.7,45.38,-0.5,7.14,80.784
31,12.7,45.38,1.8,44.88,0.5,8.26,576.326
32,1.3,44.88,12.7,45.38,-0.5,7.21,58.344
33,11.9,45.12,1.3,44.88,0.24,8.13,536.928
34,2.0,44.62,11.9,45.12,-0.5,7.09,89.24
35,11.2,45.0,2.0,44.62,0.38,7.89,504.0
36,2.8,44.5,11.2,45.0,-0.5,7.01,124.6
37,12.0,44.5,2.8,44.5,0.0,7.06,534.0
38,12.7,44.62,12.0,44.5,0.12,8.01,566.674



All Feature Names:
 1. cpu_load
 2. ram_usage
 3. ambient_temp
 4. cpu_temp_lag1
 5. cpu_temp_lag5
 6. cpu_load_lag1
 7. cpu_load_lag5
 8. cpu_load_lag10
 9. temp_rate
10. temp_accelaration
11. load_rate
12. cpu_temp_roll10
13. cpu_load_roll10
14. cpu_load_roll30
15. cpu_load_std10
16. load_ambient_interaction
17. thermal_stress
18. temp_above_ambient

Total features for training: 18
