In [17]:
import pandas as pd
import numpy as np

# Load your preprocessed dataframe
df = pd.read_csv("preprocessed_aqi_data.csv")  # <- change path
target = "us_aqi"

# 1) Drop obvious non-features
drop_cols = {"time", target}

# 2) Drop all scaled_* columns (scaling should happen in the pipeline)
scaled_cols = [c for c in df.columns if c.startswith("scaled_")]
drop_cols.update(scaled_cols)

# 3) Define raw↔log pairs (choose one)
raw_log_pairs = [
    ("pm2_5", "log_pm2_5"),
    ("pm10", "log_pm10"),
    ("carbon_monoxide", "log_carbon_monoxide"),
    ("ozone", "log_ozone"),
    ("sulphur_dioxide", "log_sulphur_dioxide"),
    ("nitrogen_dioxide", "log_nitrogen_dioxide"),
]

# Heuristic: if raw is highly skewed (|skew| > 1), prefer log; else prefer raw
# (Only decide among pairs that exist in your df)
to_keep = set(df.columns) - drop_cols
for raw_col, log_col in raw_log_pairs:
    if raw_col in to_keep and log_col in to_keep:
        raw_skew = df[raw_col].dropna().skew()
        # prefer log if strongly right-skewed
        if abs(raw_skew) > 1:
            to_drop = raw_col
        else:
            to_drop = log_col
        drop_cols.add(to_drop)

# Apply drops
keep_cols = [c for c in df.columns if c not in drop_cols]
clean_df = df[keep_cols + [target]].copy()

print("Dropped columns:")
print(sorted(drop_cols))
print("Kept features:", len(keep_cols))


Dropped columns:
['carbon_monoxide', 'log_ozone', 'nitrogen_dioxide', 'pm10', 'pm2_5', 'scaled_carbon_monoxide', 'scaled_log_carbon_monoxide', 'scaled_log_nitrogen_dioxide', 'scaled_log_ozone', 'scaled_log_pm10', 'scaled_log_pm2_5', 'scaled_log_sulphur_dioxide', 'scaled_nitrogen_dioxide', 'scaled_ozone', 'scaled_ozone_per_humidity', 'scaled_pm10', 'scaled_pm2_5', 'scaled_pm2_5_diff', 'scaled_pm2_5_temp_interaction', 'scaled_relative_humidity_2m', 'scaled_sulphur_dioxide', 'scaled_temperature_2m', 'scaled_us_aqi_diff', 'scaled_us_aqi_lag1', 'scaled_us_aqi_lag24', 'scaled_us_aqi_lag6', 'scaled_us_aqi_roll3', 'scaled_wind_speed_10m', 'sulphur_dioxide', 'time', 'us_aqi']
Kept features: 23


In [18]:
X = clean_df.drop(columns=[target]).copy()

corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
high_corr_cols = [col for col in upper.columns if any(upper[col] > 0.95)]
#X_corr = X.drop(columns=high_corr_cols)

print("high correlation:", high_corr_cols)
#even thought usaqiroll3 has high correlation, it is an imp feature so i wont drop


high correlation: ['us_aqi_roll3']


In [4]:
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb
import numpy as np
import pandas as pd

y = clean_df[target]
X = clean_df.drop(columns=[target])

tscv = TimeSeriesSplit(n_splits=5)
importances = np.zeros(X.shape[1])

for tr, va in tscv.split(X):
    model = lgb.LGBMRegressor(
        n_estimators=400,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42
    )
    model.fit(X.iloc[tr], y.iloc[tr])
    importances += model.feature_importances_

importances /= tscv.get_n_splits()

ranking = (
    pd.DataFrame({"feature": X.columns, "importance": importances})
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)
print(ranking.head(30))

# Choose top-K (start with 25–30, adjust by CV performance)
K = 28
topk_features = ranking["feature"].head(K).tolist()
X_topk = X[topk_features]
X_topk.to_csv("aqi_features_topk.csv", index=False)

# Save the final feature list for inference
pd.Series(topk_features).to_json("final_feature_list.json", orient="values")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000266 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1356
[LightGBM] [Info] Number of data points in the train set: 352, number of used features: 21
[LightGBM] [Info] Start training from score 76.863636
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2187
[LightGBM] [Info] Number of data points in the train set: 704, number of used features: 21
[LightGBM] [Info] Start training from score 76.490057
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000227 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2596
[LightGBM] [Info] Number of data points in the train set: 