In [14]:
# cell 1: imports and load
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.seasonal import STL, seasonal_decompose
from scipy import stats
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf


In [15]:
# Load the cleaned air-quality table from the working directory.
df = pd.read_csv(filepath_or_buffer="air_quality_cleaned.csv")

In [16]:
# Display the column labels of the freshly loaded frame.
df.columns

Index(['datetime', 'Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'C6H6(GT)',
       'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
       'T', 'RH', 'AH', 'hour', 'dayofweek', 'CO(GT)_lag_1', 'CO(GT)_lag_3',
       'CO(GT)_lag_6', 'CO(GT)_lag_12', 'CO(GT)_lag_24', 'CO_roll_std',
       'CO_roll_max', 'datetime.1', 'month', 'hour_sin', 'hour_cos',
       'CO(GT)_rollmean_3', 'CO(GT)_rollstd_3', 'CO(GT)_rollmean_24',
       'CO(GT)_rollstd_24'],
      dtype='object')

In [17]:
# Columns to discard before modelling: the lag and rolling-window features
# named after CO(GT)/CO, plus 'datetime.1' (not a lag/rolling feature, but
# removed in the same pass).
lag_roll_cols = [
    'CO(GT)_lag_1',
    'CO(GT)_lag_3',
    'CO(GT)_lag_6',
    'CO(GT)_lag_12',
    'CO(GT)_lag_24',
    'CO_roll_std',
    'CO_roll_max',
    'CO(GT)_rollmean_3',
    'CO(GT)_rollstd_3',
    'CO(GT)_rollmean_24',
    'CO(GT)_rollstd_24',
    'datetime.1',
]

# errors="ignore" skips any label that is absent, so re-running this cell
# after the columns are already gone is a no-op rather than a KeyError.
df = df.drop(columns=lag_roll_cols, errors="ignore")

print("Remaining columns:", list(df.columns))

Remaining columns: ['datetime', 'Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'C6H6(GT)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH', 'hour', 'dayofweek', 'month', 'hour_sin', 'hour_cos']


In [18]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,datetime,Date,Time,CO(GT),PT08.S1(CO),C6H6(GT),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,hour,dayofweek,month,hour_sin,hour_cos
0,2004-03-10 18:00:00,10/03/2004,18.00.00,2.6,1360.0,11.9,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,18,2,3,-1.0,-1.83697e-16
1,2004-03-10 19:00:00,10/03/2004,19.00.00,2.0,1292.0,9.4,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,19,2,3,-0.965926,0.258819
2,2004-03-10 20:00:00,10/03/2004,20.00.00,2.2,1402.0,9.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,20,2,3,-0.866025,0.5
3,2004-03-10 21:00:00,10/03/2004,21.00.00,2.2,1376.0,9.2,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,21,2,3,-0.707107,0.7071068
4,2004-03-10 22:00:00,10/03/2004,22.00.00,1.6,1272.0,6.5,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,22,2,3,-0.5,0.8660254


In [19]:
# Display the (rows, columns) dimensions of the frame.
df.shape

(9357, 19)

In [20]:
# Positional 90/10 split: the leading rows become the training/testing set and
# the trailing rows become the holdout set. No shuffling or sorting is done
# here, so the split follows the current row order of df.
# NOTE(review): assumes df is already ordered by datetime — confirm upstream.
TRAIN_FRACTION = 0.9
TRAIN_PATH = 'sarima_training_data.csv'
HOLDOUT_PATH = 'sarima_holdout_data.csv'

# Row position where the holdout begins; int() truncates any fractional part.
split_point = int(len(df) * TRAIN_FRACTION)

# The training/testing set is every row before the split position.
training_df = df.iloc[:split_point]

# The holdout set is every row from the split position onward.
holdout_df = df.iloc[split_point:]

# index=True writes the DataFrame index as the first column of each CSV.
training_df.to_csv(TRAIN_PATH, index=True)
holdout_df.to_csv(HOLDOUT_PATH, index=True)

# Report the paths that were actually written (the previous messages named
# files that this cell never creates).
print("\nSplit complete!")
print(f"'{TRAIN_PATH}' created with {len(training_df)} rows ({TRAIN_FRACTION:.0%}).")
print(f"'{HOLDOUT_PATH}' created with {len(holdout_df)} rows ({1 - TRAIN_FRACTION:.0%}).")


Split complete!
'training_data.csv' created with 8421 rows (90%).
'holdout_data.csv' created with 936 rows (10%).
