In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [3]:
alarms = pd.read_csv('Alarms_of_the_sensors.csv', sep=None, engine='python')

In [4]:
alarms.head()

Unnamed: 0,time,sensor_id,alarm_type,severity,status,Assignee,atm_pm_1_0,atm_pm_2_5,co2,delta,ma,temperature
0,2025-11-10 16:19:48,Sensor_0057,Уровень CO2,Критический,Сброшенные неподтвержденные,2,0,1,709,38,1,27
1,2025-11-09 05:38:38,Sensor_0013,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,1144,27,0,30
2,2025-11-09 05:38:38,Sensor_0089,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,2474,28,0,27
3,2025-11-09 05:38:38,Sensor_0079,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,402,29,0,27
4,2025-11-09 05:38:38,Sensor_0068,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,606,25,0,27


In [5]:
print(alarms.columns)

Index(['time', 'sensor_id', 'alarm_type', 'severity', 'status', 'Assignee',
       'atm_pm_1_0', 'atm_pm_2_5', 'co2', 'delta', 'ma', 'temperature'],
      dtype='object')


In [6]:
alarms.head()

Unnamed: 0,time,sensor_id,alarm_type,severity,status,Assignee,atm_pm_1_0,atm_pm_2_5,co2,delta,ma,temperature
0,2025-11-10 16:19:48,Sensor_0057,Уровень CO2,Критический,Сброшенные неподтвержденные,2,0,1,709,38,1,27
1,2025-11-09 05:38:38,Sensor_0013,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,1144,27,0,30
2,2025-11-09 05:38:38,Sensor_0089,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,2474,28,0,27
3,2025-11-09 05:38:38,Sensor_0079,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,402,29,0,27
4,2025-11-09 05:38:38,Sensor_0068,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,606,25,0,27


In [7]:
alarms = alarms.drop(columns=['Assignee', 'delta'], errors='ignore')

In [8]:
columns_to_show = ['Timestamp', 'sensor_id', 'alarm_type', 'severity', 'status']
existing_columns = [col for col in columns_to_show if col in alarms.columns]
print(alarms[existing_columns].head())

     sensor_id         alarm_type     severity                       status
0  Sensor_0057        Уровень CO2  Критический  Сброшенные неподтвержденные
1  Sensor_0013  Потеря соединения  Критический  Сброшенные неподтвержденные
2  Sensor_0089  Потеря соединения  Критический  Сброшенные неподтвержденные
3  Sensor_0079  Потеря соединения  Критический  Сброшенные неподтвержденные
4  Sensor_0068  Потеря соединения  Критический  Сброшенные неподтвержденные


In [9]:
alarms['alarm_type'].unique()

array(['Уровень CO2', 'Потеря соединения', 'Курение',
       'Высокая концентрация CO2', 'Высокая температура',
       'Низкая влажность'], dtype=object)

In [10]:
alarms['severity'].unique()

array(['Критический'], dtype=object)

In [11]:
from datetime import datetime, timezone

In [12]:
alarms = pd.read_csv('Alarms_of_the_sensors.csv', sep=None, engine='python')

In [13]:
alarms.head()

Unnamed: 0,time,sensor_id,alarm_type,severity,status,Assignee,atm_pm_1_0,atm_pm_2_5,co2,delta,ma,temperature
0,2025-11-10 16:19:48,Sensor_0057,Уровень CO2,Критический,Сброшенные неподтвержденные,2,0,1,709,38,1,27
1,2025-11-09 05:38:38,Sensor_0013,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,1144,27,0,30
2,2025-11-09 05:38:38,Sensor_0089,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,2474,28,0,27
3,2025-11-09 05:38:38,Sensor_0079,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,402,29,0,27
4,2025-11-09 05:38:38,Sensor_0068,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,606,25,0,27


In [14]:
alarms['time'] = pd.to_datetime(alarms['time'], utc=True)

# Convert from its current timezone to UTC+5 if needed
# If you know the timestamps are already UTC+5 logically but in UTC format:
alarms['time'] = alarms['time'].dt.tz_convert('Etc/GMT-5')

# Then convert to pure UTC for database consistency
alarms['time_utc'] = alarms['time'].dt.tz_convert('UTC')

# Create UNIX timestamp column (seconds)
alarms['timestamp'] = alarms['time_utc'].astype('int64') // 10**9

alarms.head()

Unnamed: 0,time,sensor_id,alarm_type,severity,status,Assignee,atm_pm_1_0,atm_pm_2_5,co2,delta,ma,temperature,time_utc,timestamp
0,2025-11-10 21:19:48+05:00,Sensor_0057,Уровень CO2,Критический,Сброшенные неподтвержденные,2,0,1,709,38,1,27,2025-11-10 16:19:48+00:00,1762791588
1,2025-11-09 10:38:38+05:00,Sensor_0013,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,1144,27,0,30,2025-11-09 05:38:38+00:00,1762666718
2,2025-11-09 10:38:38+05:00,Sensor_0089,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,2474,28,0,27,2025-11-09 05:38:38+00:00,1762666718
3,2025-11-09 10:38:38+05:00,Sensor_0079,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,402,29,0,27,2025-11-09 05:38:38+00:00,1762666718
4,2025-11-09 10:38:38+05:00,Sensor_0068,Потеря соединения,Критический,Сброшенные неподтвержденные,0,0,0,606,25,0,27,2025-11-09 05:38:38+00:00,1762666718


In [18]:
sensor_ids = [
    "Sensor_0013",
    "Sensor_0039",
    "Sensor_0045",
    "Sensor_0057",
    "Sensor_0068",
    "Sensor_0079",
    "Sensor_0080",
    "Sensor_0089"
]

sensor_data = {}

for sensor in sensor_ids:
    filename = f"{sensor}_150_days.csv"
    df = pd.read_csv(filename, sep=None, engine="python")

    # Convert timestamp
    df['time'] = pd.to_datetime(df['time'], utc=True)
    df['time'] = df['time'].dt.tz_convert("Etc/GMT-5")
    df['time_utc'] = df['time'].dt.tz_convert("UTC")
    df['timestamp'] = df['time_utc'].astype("int64") // 10**9

    # Drop unwanted columns if they exist
    for col in ['delta', 'time', 'time_utc', 'ma']:
        if col in df.columns:
            df.drop(columns=[col], inplace=True)

    # Reorder columns
    new_order = ['timestamp', 'atm_pm_1_0', 'atm_pm_2_5', 'co2', 'hum', 'temp']
    # keep only those columns that exist
    new_order = [col for col in new_order if col in df.columns]
    df = df[new_order]

    # Save into dictionary
    sensor_data[sensor] = df

# Example access:
print(sensor_data["Sensor_0057"].head())

    timestamp  atm_pm_1_0  atm_pm_2_5  co2    hum   temp
0  1762611990           3           6  400  36.79  26.99
1  1762612000           3           6  402  36.80  26.98
2  1762612010           3           5  403  36.81  26.99
3  1762612020           3           7  400  36.81  27.01
4  1762612030           4           7  404  36.81  27.00


In [29]:
from math import sqrt
from sklearn.metrics import mean_squared_error

In [30]:
def make_lagged_features(df, target_col="co2", n_lags=10, horizon_steps=1):
    """
    df: DataFrame with time column + numeric sensor columns
    target_col: which column we want to forecast
    n_lags: how many past steps to use as features
    horizon_steps: how many steps ahead to forecast (1 = next record)
    """
    df = df.sort_values("time").reset_index(drop=True)
    
    # Make sure time is datetime (drop tz if needed for simplicity)
    df["time"] = pd.to_datetime(df["time"])
    
    # numeric features
    feature_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if target_col not in feature_cols:
        feature_cols.append(target_col)
    
    # Build lag features based on target_col
    for lag in range(1, n_lags + 1):
        df[f"{target_col}_lag_{lag}"] = df[target_col].shift(lag)
    
    # Target is horizon_steps ahead
    df[f"{target_col}_future_{horizon_steps}"] = df[target_col].shift(-horizon_steps)
    
    # Drop rows with NaNs created by shifts
    df_model = df.dropna().reset_index(drop=True)
    
    # X = lag features (+ optionally other numeric features)
    lag_cols = [f"{target_col}_lag_{lag}" for lag in range(1, n_lags + 1)]
    # optionally add other sensors as current-step features
    extra_cols = [c for c in feature_cols if c != target_col]
    
    X = df_model[lag_cols + extra_cols]
    y = df_model[f"{target_col}_future_{horizon_steps}"]
    
    return df_model, X, y

In [31]:
# 1) Load
sensor_0013_df = pd.read_csv("Sensor_0013_150_days.csv", sep=None, engine="python")

# If your file already has 'timestamp' and 'time_utc' etc, keep at least 'time' and numeric columns
# e.g. sensor_0013_df = sensor_0013_df[["time", "co2", "atm_pm_2_5", "ma", "temp"]]

# 2) Build lagged dataset (using co2 as target, 10 lags, forecast 1 step ahead)
df_0013_model, X, y = make_lagged_features(sensor_0013_df, target_col="co2",
                                           n_lags=10, horizon_steps=1)

# 3) Time-based train/test split (last 20% as test)
split_idx = int(len(df_0013_model) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# 4) Train model
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

# 5) Evaluate
y_pred = rf.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
# rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE (Sensor_0013, forecast 1 step ahead): {rmse:.3f}")


RMSE (Sensor_0013, forecast 1 step ahead): 13.392


In [32]:
# Attach predictions & residuals to the test part of df
df_test = df_0013_model.iloc[split_idx:].copy()
df_test["co2_pred"] = y_pred
df_test["residual"] = df_test["co2_pred"] - df_test["co2"]

# Rolling statistics of residuals
window = 300  # e.g. 300 points (tune to your frequency)
df_test["resid_mean"] = df_test["residual"].rolling(window).mean()
df_test["resid_std"] = df_test["residual"].rolling(window).std()

# Simple drift/anomaly rule:
k = 3  # 3-sigma rule
df_test["drift_flag"] = (
    (df_test["residual"] - df_test["resid_mean"]).abs() > k * df_test["resid_std"]
)

print(df_test[["time", "co2", "co2_pred", "residual", "drift_flag"]].tail())


                    time  co2  co2_pred  residual  drift_flag
8338 2025-11-14 10:15:11  506   504.215    -1.785       False
8339 2025-11-14 10:15:21  502   495.085    -6.915       False
8340 2025-11-14 10:15:31  512   495.750   -16.250       False
8341 2025-11-14 10:15:41  515   505.005    -9.995       False
8342 2025-11-14 10:15:52  530   513.560   -16.440       False


In [34]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

files = [
    "Sensor_0013_150_days.csv",
    "Sensor_0039_150_days.csv",
    "Sensor_0045_150_days.csv",
    "Sensor_0057_150_days.csv",
    "Sensor_0068_150_days.csv",
    "Sensor_0079_150_days.csv",
    "Sensor_0080_150_days.csv",
    "Sensor_0089_150_days.csv",
]

models = {}
metrics = {}

for fname in files:
    df = pd.read_csv(fname, sep=None, engine="python")
    
    # Build supervised data (adjust n_lags, target col as needed)
    df_model, X, y = make_lagged_features(df,
                                          target_col="co2",
                                          n_lags=10,
                                          horizon_steps=1)
    
    split_idx = int(len(df_model) * 0.8)
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    
    model = RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    )
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # FIXED RMSE
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    
    models[fname] = model
    metrics[fname] = rmse

print("RMSE per file:")
for fname, rmse in metrics.items():
    print(f"{fname}: {rmse:.3f}")


ValueError: Found array with 0 sample(s) (shape=(0, 17)) while a minimum of 1 is required by RandomForestRegressor.

In [35]:
print(df_model.shape)
print("Feature rows:", X.shape[0])
print("Target rows:", y.shape[0])
print("Time unique count:", df["time"].nunique())
print("Total rows loaded:", df.shape[0])
print("NaN rows dropped:", df.shape[0] - df_model.shape[0])

(0, 20)
Feature rows: 0
Target rows: 0
Time unique count: 49800
Total rows loaded: 50051
NaN rows dropped: 50051
