In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each cell
# runs, so edits to the project packages are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2

In [2]:
# from sklearn.model_selection import train_test_split
# import torch
# import torch.nn as nn
# from torch.utils.data import DataLoader, TensorDataset
# from torchvision import datasets, transforms
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score
from kret_np_pd.np_pd_nb_imports import *

# import wandb
import typing as t

[kret_np_pd.np_pd_nb_imports] Imported kret_np_pd.np_pd_nb_imports in 1.3607 seconds


In [3]:
from sklearn.utils._set_output import _SetOutputMixin
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PowerTransformer, RobustScaler, StandardScaler

In [4]:
from projects.beijing.load_beijing_data import load_beijing_air_quality_data  # project_kretsinger

## Load Data

In [5]:
from kret_sklearn.custom_transformers import MissingValueRemover, DateTimeSinCosNormalizer
from kret_sklearn.pd_pipeline import PipelinePD
from sklearn.preprocessing import OrdinalEncoder

In [6]:
# Row filter: how="any" drops every row that contains at least one NaN value.
missing_value_remover = MissingValueRemover(how="any")

# Wrap the remover in a one-step pipeline so it is applied through the same
# PipelinePD interface as the later preprocessing stages.
remove_nans_pipeline = PipelinePD(
    steps=[
        ("remove_nans", missing_value_remover),
    ]
)

In [7]:
# Load the raw feature frame X and target frame y. Only loading happens here;
# the train/test split is done further down, after the NaN rows are removed.
X, y = load_beijing_air_quality_data()
X.shape

(43824, 12)

In [8]:
# dtt([X, y], 10, filter=X.index > 20, how="head")

### Remove NaNs

In [9]:
# Drop the incomplete rows from X (the pipeline reports how many rows were removed).
X_no_nans = remove_nans_pipeline.fit_transform_df(X, y)
# y is not filtered by the pipeline call above, so select the surviving rows by
# index label to keep features and target aligned.
y_no_nans = y.loc[X_no_nans.index]

Removed 2067 rows, representing 4.72% of the data


In [10]:
# Preview the first 3 rows of the cleaned features and of the matching target
# (dtt is provided by the kret_np_pd star import).
dtt([X_no_nans, y_no_nans], 3, how="head")

Unnamed: 0_level_0,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
Unnamed: 0_level_1,int64,int64,int64,int64,float64,int64,float64,float64,object,float64,int64,int64
24,2010,1,2,0,129.0,-16,-4.0,1020.0,SE,1.79,0,0
25,2010,1,2,1,148.0,-15,-4.0,1020.0,SE,2.68,0,0
26,2010,1,2,2,159.0,-11,-5.0,1021.0,SE,3.57,0,0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
24,129.0
25,148.0
26,159.0


In [11]:
# CRITICAL: split BEFORE fitting any scaler/encoder, so that statistics of the
# test rows never leak into the fitted preprocessing.
# The split is positional and unshuffled: the first TRAIN_FRACTION of the rows
# goes to training, the remainder to testing.
# NOTE(review): this is a temporal split only if the rows are in chronological
# order — confirm against the loader.
TRAIN_FRACTION = 0.8  # share of rows, counted from the top of the frame, used for training

# The slices below are positional, so X and y must have the same number of rows;
# otherwise features and targets would be paired incorrectly.
assert len(X_no_nans) == len(y_no_nans), "X_no_nans and y_no_nans must have the same number of rows"

split_idx = int(TRAIN_FRACTION * len(X_no_nans))
# Guard against a degenerate split (empty train or empty test set).
assert 0 < split_idx < len(X_no_nans), "train/test split produced an empty partition"

X_train_raw = X_no_nans.iloc[:split_idx]
X_test_raw = X_no_nans.iloc[split_idx:]
y_train_raw = y_no_nans.iloc[:split_idx]
y_test_raw = y_no_nans.iloc[split_idx:]
print(f"Train: {len(X_train_raw)} samples | Test: {len(X_test_raw)} samples")

Train: 33405 samples | Test: 8352 samples


# Redo Pipeline

NOTE: NaNs were already removed in the section above, so the pipeline below does not handle missing values.

In [12]:
# Column groups handed to the ColumnTransformer; each group gets its own transformer.

# Numeric columns for the power transform. Despite the name, several of these
# (year, DEWP, Is, Ir) are int64 in the raw frame.
# NOTE(review): "pm2.5" is also the target column of y — confirm that keeping it
# as an input feature is intended (e.g. for lagged/sequence inputs) and not leakage.
float_cols = [
    "pm2.5",
    "year",
    "DEWP",
    "TEMP",
    "PRES",
    "Iws",
    "Is",
    "Ir",
]

# Calendar columns that receive the sin/cos encoding.
date_cols = [
    "month",
    "day",
    "hour",
]

# Categorical wind-direction column.
wind_cols = [
    "cbwd",
]

In [13]:
# Sin/cos encoding of the calendar columns month, day and hour.
# The mapped number is presumably the cycle length of each column — TODO confirm
# against DateTimeSinCosNormalizer.
date_time_normalizer = DateTimeSinCosNormalizer(datetime_cols=dict(month=12, day=31, hour=24))

# Yeo-Johnson power transform; standardize=True also rescales the transformed output.
power_transformer = PowerTransformer(standardize=True, method="yeo-johnson")

# Wind-direction strings -> numeric codes.
# NOTE(review): with default settings OrdinalEncoder raises on categories that
# were not seen during fit — relevant if the test period contains a new direction.
wind_encoder = OrdinalEncoder()

# Route each column group to its transformer. The three groups together cover
# all twelve columns of the raw frame, so remainder="passthrough" currently has
# nothing left to pass through.
column_transform = ColumnTransformer(
    [
        ("datetime", date_time_normalizer, date_cols),
        ("scaler", power_transformer, float_cols),
        ("windlabel", wind_encoder, wind_cols),
    ],
    remainder="passthrough",
    verbose=True,
    verbose_feature_names_out=False,  # keep the plain column names, no "<step>__" prefix
)

In [14]:
# Feature pipeline: a single step that applies the per-column transforms.
pipeline_x = PipelinePD(steps=[("column_transform", column_transform)])
# Target pipeline: power-transform the target.
# NOTE(review): this passes the very same `power_transformer` object that is
# also listed inside `column_transform`. If PipelinePD fits its steps in place
# like sklearn's Pipeline does — TODO confirm — fitting pipeline_y leaves that
# shared object fitted on the target. A dedicated instance for y would avoid
# the aliasing; check downstream uses of `power_transformer` before changing it.
pipeline_y = PipelinePD(steps=[("scaler", power_transformer)])

In [15]:
def _reorder_feature_columns(frame):
    """Apply the shared column layout to a transformed feature frame.

    Calls ``UKS_NP_PD.move_columns`` with the same arguments for every frame so
    that train and test always end up with an identical column order (in the
    displayed result: ``year`` first, ``pm2.5`` and ``cbwd`` last).
    """
    return UKS_NP_PD.move_columns(frame, ["year"], ["pm2.5", "cbwd"])


# Fit the feature pipeline on the training rows only, then reuse the fitted
# pipeline for the test rows.
X_train_cleaned = _reorder_feature_columns(pipeline_x.fit_transform_df(X_train_raw))
X_test_cleaned = _reorder_feature_columns(pipeline_x.transform_df(X_test_raw))

[ColumnTransformer] ...... (1 of 3) Processing datetime, total=   0.0s
[ColumnTransformer] ........ (2 of 3) Processing scaler, total=   0.1s
[ColumnTransformer] ..... (3 of 3) Processing windlabel, total=   0.0s


In [16]:
# Fit the target transform on the training targets only ...
y_train_cleaned = pipeline_y.fit_transform_df(y_train_raw)
# ... and apply the already-fitted transform to the test targets (no refit).
y_test_cleaned = pipeline_y.transform_df(y_test_raw)

In [17]:
# Inspect a few rows of the transformed training features and target.
# NOTE(review): the displayed rows are non-contiguous, so dtt appears to sample
# by default — the shown rows may change between runs.
dtt([X_train_cleaned, y_train_cleaned])

Unnamed: 0_level_0,year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,DEWP,TEMP,PRES,Iws,Is,Ir,pm2.5,cbwd
Unnamed: 0_level_1,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
3056,-1.358,0.5,-0.866,0.999,-0.051,0.866,-0.5,0.583,0.514,-0.0,-0.924,-0.101,-0.213,1.119,1.0
11179,-0.485,0.866,-0.5,0.791,-0.612,-0.966,0.259,-0.928,0.439,0.0,0.849,-0.101,-0.213,-0.905,2.0
23911,0.39,-1.0,-0.0,-0.999,-0.051,0.966,-0.259,0.928,0.363,-0.0,-0.328,-0.101,-0.213,0.373,1.0
30347,1.266,0.0,-1.0,-0.485,-0.874,0.259,-0.966,0.721,1.545,-0.0,0.095,-0.101,-0.213,-0.751,2.0
30840,1.266,-0.5,-0.866,0.968,-0.251,0.0,1.0,1.549,1.035,-0.0,-0.924,-0.101,-0.213,0.841,1.0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
3056,1.119
11179,-0.905
23911,0.373
30347,-0.751
30840,0.841


In [18]:
# Inspect a few rows of the transformed test features and target.
# NOTE(review): the displayed rows are non-contiguous, so dtt appears to sample
# by default — the shown rows may change between runs.
dtt([X_test_cleaned, y_test_cleaned])

Unnamed: 0_level_0,year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,DEWP,TEMP,PRES,Iws,Is,Ir,pm2.5,cbwd
Unnamed: 0_level_1,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
37075,2.144,1.0,0.0,-0.938,0.347,-0.966,0.259,0.101,0.514,-0.0,0.604,-0.101,-0.213,0.949,2.0
38222,2.144,0.5,-0.866,0.651,-0.759,-0.5,-0.866,0.239,1.328,-0.0,0.604,-0.101,-0.213,-1.01,2.0
41368,2.144,-1.0,-0.0,-0.791,-0.612,-0.866,-0.5,0.859,1.181,-0.0,-0.63,-0.101,-0.213,0.34,3.0
41505,2.144,-1.0,-0.0,-0.849,0.529,0.707,-0.707,0.997,0.589,-0.0,0.566,-0.101,-0.213,0.601,2.0
42793,2.144,-0.5,0.866,-0.651,-0.759,0.259,0.966,-0.448,-0.921,0.0,-1.377,-0.101,-0.213,1.475,3.0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
37075,0.949
38222,-1.01
41368,0.34
41505,0.601
42793,1.475
