In [3]:
import numpy as np
import pandas as pd

In [4]:
# Raw sensor log, loaded from a CSV in the notebook's working directory.
raw_csv_path = 'thermal_data.csv'
df_raw = pd.read_csv(raw_csv_path)

In [5]:
# Peek at both ends of the frame, then list dtypes and non-null counts.
for preview in (df_raw.head(), df_raw.tail()):
    display(preview)
df_raw.info()

Unnamed: 0,timestamp,unix_time,cpu_load,ram_usage,ambient_temp,cpu_temp
0,2026-02-14 17:09:36,1771069000.0,3.2,32.6,21.6875,42.875
1,2026-02-14 17:09:37,1771069000.0,3.1,32.6,21.75,42.875
2,2026-02-14 17:09:38,1771069000.0,3.7,32.7,21.75,42.75
3,2026-02-14 17:09:39,1771069000.0,2.9,32.7,21.8125,42.875
4,2026-02-14 17:09:40,1771069000.0,3.4,32.7,21.8125,42.625


Unnamed: 0,timestamp,unix_time,cpu_load,ram_usage,ambient_temp,cpu_temp
15752,2026-02-14 21:32:08,1771085000.0,28.3,33.5,21.5,70.25
15753,2026-02-14 21:32:09,1771085000.0,27.5,33.5,21.5,70.25
15754,2026-02-14 21:32:10,1771085000.0,28.9,33.5,21.5,70.5
15755,2026-02-14 21:32:11,1771085000.0,27.8,33.5,21.5,70.375
15756,2026-02-14 21:32:12,1771085000.0,27.2,33.5,21.5625,70.5


<class 'pandas.DataFrame'>
RangeIndex: 15757 entries, 0 to 15756
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     15757 non-null  str    
 1   unix_time     15757 non-null  float64
 2   cpu_load      15757 non-null  float64
 3   ram_usage     15757 non-null  float64
 4   ambient_temp  15757 non-null  float64
 5   cpu_temp      15757 non-null  float64
dtypes: float64(5), str(1)
memory usage: 738.7 KB


In [6]:
# Number of rows that repeat an earlier row in every column.
duplicate_mask = df_raw.duplicated()
duplicate_mask.sum()

np.int64(0)

In [7]:
def removeOutliers(df, columns, iqr_multiplier=1.5):
    """Drop rows whose values lie outside the IQR fences of the given columns.

    Columns are handled in the order given, and each column's quartiles are
    computed on the rows that survived the filters of the columns before it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, never modified.
    columns : iterable of str
        Column names to filter on. Names that are not in ``df`` are skipped.
    iqr_multiplier : float, default 1.5
        Fence width in IQR units: rows are kept when the value is within
        ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]`` (inclusive).

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``df`` that keeps the original index labels. Rows
        holding NaN in a filtered column are dropped, because NaN fails the
        range comparison.
    """
    df_clean = df.copy()

    for col in columns:
        if col not in df_clean.columns:
            continue

        q1 = df_clean[col].quantile(0.25)
        q3 = df_clean[col].quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr

        # between() is inclusive on both ends, matching `>= lower & <= upper`.
        df_clean = df_clean[df_clean[col].between(lower_bound, upper_bound)]

    return df_clean

# Sensor columns screened for IQR outliers, in the order they are filtered.
column_to_clean = ['cpu_load', 'ram_usage', 'cpu_temp', 'ambient_temp']
df_clean = removeOutliers(df_raw, column_to_clean)

In [8]:
# Order rows by unix_time and renumber the index from 0.
df_clean = df_clean.sort_values(by='unix_time', ignore_index=True)

In [9]:
# Show the cleaned frame after sorting by unix_time; the rendered output elides the middle rows.
df_clean

Unnamed: 0,timestamp,unix_time,cpu_load,ram_usage,ambient_temp,cpu_temp
0,2026-02-14 17:09:36,1.771069e+09,3.2,32.6,21.6875,42.875
1,2026-02-14 17:09:37,1.771069e+09,3.1,32.6,21.7500,42.875
2,2026-02-14 17:09:38,1.771069e+09,3.7,32.7,21.7500,42.750
3,2026-02-14 17:09:39,1.771069e+09,2.9,32.7,21.8125,42.875
4,2026-02-14 17:09:40,1.771069e+09,3.4,32.7,21.8125,42.625
...,...,...,...,...,...,...
15032,2026-02-14 21:32:08,1.771085e+09,28.3,33.5,21.5000,70.250
15033,2026-02-14 21:32:09,1.771085e+09,27.5,33.5,21.5000,70.250
15034,2026-02-14 21:32:10,1.771085e+09,28.9,33.5,21.5000,70.500
15035,2026-02-14 21:32:11,1.771085e+09,27.8,33.5,21.5000,70.375


In [10]:
def addFeatures(df_clean):
    """Return a copy of ``df_clean`` extended with lag, rate, rolling and interaction columns.

    All lags, differences and rolling windows are counted in rows. The input
    must provide ``cpu_load``, ``cpu_temp`` and ``ambient_temp``; it is not
    modified.

    Every row containing a NaN in any column is dropped before returning,
    which removes the leading rows for which the 30-row rolling window is
    not yet full.
    """
    load = df_clean['cpu_load']
    temp = df_clean['cpu_temp']
    ambient = df_clean['ambient_temp']

    temp_delta = temp.diff()
    load_window10 = load.rolling(window=10)

    # Keyword order below fixes the order of the new columns in the result.
    enriched = df_clean.assign(
        # lagged values
        cpu_temp_lag1=temp.shift(1),
        cpu_temp_lag5=temp.shift(5),
        cpu_load_lag1=load.shift(1),
        cpu_load_lag5=load.shift(5),
        cpu_load_lag10=load.shift(10),
        # first and second differences
        temp_rate=temp_delta,
        temp_accelaration=temp_delta.diff(),
        load_rate=load.diff(),
        # rolling statistics
        cpu_temp_roll10=temp.rolling(window=10).mean(),
        cpu_load_roll10=load_window10.mean(),
        cpu_load_roll30=load.rolling(window=30).mean(),
        cpu_load_std10=load_window10.std(),
        # interactions between raw signals
        load_ambient_interaction=load * ambient,
        thermal_stress=load * temp,
        temp_above_ambient=temp - ambient,
    )

    return enriched.dropna()

# Engineered feature table derived from the cleaned frame.
df_features = addFeatures(df_clean=df_clean)

In [11]:
print("\nSample of Engineered Features:")
print("=" * 100)

# A few raw, lagged, rate, rolling and interaction columns to inspect.
sample_cols = [
    'cpu_load',
    'cpu_temp',
    'cpu_load_lag1',
    'cpu_temp_lag1',
    'temp_rate',
    'cpu_load_roll10',
    'thermal_stress',
]
display(df_features[sample_cols].head(20))

print("\nAll Feature Names:")
excluded_cols = ('timestamp', 'unix_time', 'cpu_temp')
feature_cols = [name for name in df_features.columns if name not in excluded_cols]
for position, name in enumerate(feature_cols, start=1):
    print(f"{position:2d}. {name}")

print(f"\nTotal features for training: {len(feature_cols)}")


Sample of Engineered Features:


Unnamed: 0,cpu_load,cpu_temp,cpu_load_lag1,cpu_temp_lag1,temp_rate,cpu_load_roll10,thermal_stress
29,9.2,50.75,7.6,50.875,-0.125,7.8,466.9
30,7.9,50.75,9.2,50.75,0.0,7.88,400.925
31,8.3,50.625,7.9,50.75,-0.125,7.89,420.1875
32,8.1,50.375,8.3,50.625,-0.25,7.95,408.0375
33,7.7,50.5,8.1,50.375,0.125,7.96,388.85
34,8.0,50.375,7.7,50.5,-0.125,7.97,403.0
35,7.6,50.25,8.0,50.375,-0.125,7.98,381.9
36,7.5,50.25,7.6,50.25,0.0,7.99,376.875
37,7.3,50.0,7.5,50.25,-0.25,7.92,365.0
38,7.5,50.0,7.3,50.0,0.0,7.91,375.0



All Feature Names:
 1. cpu_load
 2. ram_usage
 3. ambient_temp
 4. cpu_temp_lag1
 5. cpu_temp_lag5
 6. cpu_load_lag1
 7. cpu_load_lag5
 8. cpu_load_lag10
 9. temp_rate
10. temp_accelaration
11. load_rate
12. cpu_temp_roll10
13. cpu_load_roll10
14. cpu_load_roll30
15. cpu_load_std10
16. load_ambient_interaction
17. thermal_stress
18. temp_above_ambient

Total features for training: 18


In [12]:
# Target: cpu_temp five rows ahead of the current row.
# The frame is rebuilt from addFeatures(df_clean) rather than updated in place,
# so re-running this cell always yields the same rows. Previously each re-run
# shifted an already-trimmed frame and dropped five more rows.
# NOTE(review): shift(-5) counts rows, not seconds. The outlier filter removed
# rows from the recording, so consecutive rows may be more than one sample
# apart; confirm the horizon really is 5 s wherever rows were dropped.
df_features = (
    addFeatures(df_clean)
    .assign(target_temp_5s=lambda frame: frame['cpu_temp'].shift(-5))
    .dropna()
)

In [13]:
# Model inputs are every column except the time columns, the current
# temperature and the label itself.
non_feature_cols = {'timestamp', 'unix_time', 'cpu_temp', 'target_temp_5s'}
feature_cols = [name for name in df_features.columns if name not in non_feature_cols]

y = df_features['target_temp_5s']
X = df_features[feature_cols]


In [14]:
# Ordered 80/20 split without shuffling: the first rows train, the last rows test.
split_index = int(len(X) * 0.8)

X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Linear baseline: fit on the training rows, score on the held-out rows.
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))
print("RMSE:", rmse)

RMSE: 1.9642846674303494


In [19]:
from sklearn.ensemble import RandomForestRegressor

# A fixed random_state makes the forest, and therefore the reported RMSE,
# identical on every run; without it each fit gives a slightly different score.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

rf_predictions = rf_model.predict(X_test)

print("RF RMSE:", np.sqrt(mean_squared_error(y_test, rf_predictions)))


RF RMSE: 1.465869418351369


In [None]:
# TODO: persist the trained model with joblib so it can be loaded later to make predictions and switch the fan on and off.

In [20]:
# Side-by-side RMSE of both models on the same held-out rows.
print("Linear Regression RMSE:", rmse)
rf_mse = mean_squared_error(y_test, rf_predictions)
print("Random Forest RMSE:", np.sqrt(rf_mse))


Linear Regression RMSE: 1.9642846674303494
Random Forest RMSE: 1.465869418351369
