In [20]:
%reload_ext autoreload
%autoreload 2

In [21]:
# from sklearn.model_selection import train_test_split
# import torch
# import torch.nn as nn
# from torch.utils.data import DataLoader, TensorDataset
# from torchvision import datasets, transforms
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score

# import wandb
import typing as t

In [22]:
from sklearn.utils._set_output import _SetOutputMixin
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PowerTransformer, RobustScaler, StandardScaler

In [23]:
from models_pmml.beijing.beijing_data import load_beijing_air_quality_data, create_sequences

In [24]:
from kret_sandbox.VIS import dtt
from kret_np_pd.UTILS_np_pd import NP_PD_Utils
from pprint import pprint

## Load Data

In [25]:
from kret_sklearn.custom_transformers import MissingValueRemover, DateTimeSinCosNormalizer
from kret_sklearn.pd_pipeline import PipelinePD
from sklearn.preprocessing import OrdinalEncoder

In [26]:
missing_value_remover = MissingValueRemover(how="any")  # Remove rows with any NaN values
remove_nans_pipeline = PipelinePD(steps=[("remove_nans", missing_value_remover)])

In [27]:
# Cell: Load and split data FIRST (temporal split)
X, y = load_beijing_air_quality_data()
X.shape

(43824, 12)

In [28]:
# dtt([X, y], 10, filter=X.index > 20, how="head")

### Remove Nans

In [29]:
X_no_nans = remove_nans_pipeline.fit_transform_df(X, y)
y_no_nans = y.loc[X_no_nans.index]

In [30]:
dtt([X_no_nans, y_no_nans], 3, how="head")

Unnamed: 0_level_0,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
Unnamed: 0_level_1,int64,int64,int64,int64,float64,int64,float64,float64,object,float64,int64,int64
24,2010,1,2,0,129.0,-16,-4.0,1020.0,SE,1.79,0,0
25,2010,1,2,1,148.0,-15,-4.0,1020.0,SE,2.68,0,0
26,2010,1,2,2,159.0,-11,-5.0,1021.0,SE,3.57,0,0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
24,129.0
25,148.0
26,159.0


In [31]:
# CRITICAL: Split BEFORE normalization to avoid data leakage
split_idx = int(0.8 * len(X_no_nans))
X_train_raw = X_no_nans.iloc[:split_idx]
X_test_raw = X_no_nans.iloc[split_idx:]
y_train_raw = y_no_nans.iloc[:split_idx]
y_test_raw = y_no_nans.iloc[split_idx:]
print(f"Train: {len(X_train_raw)} samples | Test: {len(X_test_raw)} samples")

Train: 33405 samples | Test: 8352 samples


# Redo Pipeline

NOTE Nans already removed

In [None]:
float_cols = ["pm2.5", "year", "DEWP", "TEMP", "PRES", "Iws", "Is", "Ir"]
date_cols = ["month", "day", "hour"]
wind_cols = ["cbwd"]

In [33]:
date_time_normalizer = DateTimeSinCosNormalizer(
    datetime_cols={"month": 12, "day": 31, "hour": 24}
)  # Normalize 'month' and 'hour' columns
power_transformer = PowerTransformer(method="yeo-johnson", standardize=True)

wind_encoder = OrdinalEncoder()

column_transform = ColumnTransformer(
    transformers=[
        ("datetime", date_time_normalizer, date_cols),
        ("scaler", power_transformer, float_cols),
        ("windlabel", wind_encoder, wind_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    verbose=True,
)

In [34]:
pipeline_x = PipelinePD(steps=[("column_transform", column_transform)])
pipeline_y = PipelinePD(steps=[("scaler", power_transformer)])

In [35]:
X_train_cleaned = NP_PD_Utils.move_columns(pipeline_x.fit_transform_df(X_train_raw), ["year"], ["pm2.5", "cbwd"])
X_test_cleaned = NP_PD_Utils.move_columns(pipeline_x.transform_df(X_test_raw), ["year"], ["pm2.5", "cbwd"])

[ColumnTransformer] ...... (1 of 3) Processing datetime, total=   0.0s
[ColumnTransformer] ........ (2 of 3) Processing scaler, total=   0.1s
[ColumnTransformer] ..... (3 of 3) Processing windlabel, total=   0.0s


In [36]:
y_train_cleaned = pipeline_y.fit_transform_df(y_train_raw)
y_test_cleaned = pipeline_y.transform_df(y_test_raw)

In [37]:
dtt([X_train_cleaned, y_train_cleaned])

Unnamed: 0_level_0,year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,DEWP,TEMP,PRES,Iws,Is,Ir,pm2.5,cbwd
Unnamed: 0_level_1,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
7159,-1.358,-0.866,0.5,-0.849,0.529,0.966,-0.259,-0.585,-1.015,0.0,0.814,-0.101,-0.213,-2.121,1.0
17968,0.39,0.5,0.866,-0.651,-0.759,-0.866,-0.5,-0.517,-0.921,0.0,-0.928,-0.101,-0.213,2.032,2.0
19594,0.39,1.0,0.0,-0.725,0.689,0.5,-0.866,-0.517,0.439,0.0,0.994,-0.101,-0.213,0.247,1.0
20266,0.39,0.866,-0.5,-0.988,0.151,0.5,-0.866,0.997,0.363,-0.0,-0.055,-0.101,4.694,-0.131,1.0
27723,1.266,1.0,0.0,0.201,0.98,0.707,0.707,-1.886,-1.113,0.0,1.809,-0.101,-0.213,-2.008,1.0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
7159,-2.121
17968,2.032
19594,0.247
20266,-0.131
27723,-2.008


In [38]:
dtt([X_test_cleaned, y_test_cleaned])

Unnamed: 0_level_0,year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,DEWP,TEMP,PRES,Iws,Is,Ir,pm2.5,cbwd
Unnamed: 0_level_1,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
36051,2.144,0.866,0.5,0.791,-0.612,0.707,0.707,-1.202,-2.169,0.0,-1.714,-0.101,-0.213,0.592,3.0
37264,2.144,0.866,-0.5,0.394,0.919,-0.866,-0.5,-0.174,0.739,-0.0,0.64,-0.101,-0.213,0.235,2.0
38264,2.144,0.5,-0.866,0.299,-0.954,0.866,-0.5,-0.243,0.589,-0.0,1.752,-0.101,-0.213,-1.047,1.0
40622,2.144,-0.866,-0.5,-0.791,-0.612,-0.5,-0.866,1.066,1.618,-0.0,0.585,-0.101,-0.213,0.009,2.0
41301,2.144,-1.0,-0.0,-0.299,-0.954,-0.707,0.707,0.721,0.439,0.0,-1.377,-0.101,-0.213,-0.328,3.0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
36051,0.592
37264,0.235
38264,-1.047
40622,0.009
41301,-0.328
