In [1]:
%reload_ext autoreload
%autoreload 2

# Imports

In [2]:
from kret_notebook import *  # NOTE import first
from kret_matplotlib.mpl_nb_imports import *
from kret_np_pd.np_pd_nb_imports import *
from kret_sklearn.sklearn_nb_imports import *
from kret_torch_utils.torch_nb_imports import *
from kret_lightning.lightning_nb_imports import *
from kret_tqdm.tqdm_nb_imports import *
from kret_type_hints.types_nb_imports import *
from kret_utils.utils_nb_imports import *

# from kret_wandb.wandb_nb_imports import *  # NOTE this is slow to import

Loaded environment variables from /Users/Akseldkw/coding/kretsinger/.env
[kret_matplotlib.mpl_nb_imports] Imported kret_matplotlib.mpl_nb_imports in 0.0785 seconds
[kret_np_pd.np_pd_nb_imports] Imported kret_np_pd.np_pd_nb_imports in 0.0000 seconds
[kret_sklearn.sklearn_nb_imports] Imported kret_sklearn.sklearn_nb_imports in 0.2553 seconds
[kret_torch_utils.torch_nb_imports] Imported kret_torch_utils.torch_nb_imports in 0.3772 seconds
[kret_lightning.lightning_nb_imports] Imported kret_lightning.lightning_nb_imports in 0.0015 seconds
[kret_tqdm.tqdm_nb_imports] Imported kret_tqdm.tqdm_nb_imports in 0.0000 seconds
[kret_type_hints.types_nb_imports] Imported kret_type_hints.types_nb_imports in 0.0007 seconds
[kret_utils.utils_nb_imports] Imported kret_utils.utils_nb_imports in 0.0004 seconds


# LLM Ideas

In [3]:
import pandas as pd

df = pd.DataFrame({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
df  # Beautiful table in Jupyter

Unnamed: 0,feature_1,feature_2,label
0,1,2,0
1,4,5,1
2,7,8,0


In [4]:
import torch
import pandas as pd

data = torch.randn(100, 10)
labels = torch.randint(0, 2, (100,))

df = pd.DataFrame(data.numpy(), columns=[f"feature_{i}" for i in range(10)])
df["label"] = labels.numpy()
df  # Beautiful display!

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,label
0,0.581795,-0.053343,-0.236651,0.220841,-1.785590,0.892059,-1.026192,1.027303,1.862387,-0.862997,0
1,0.356215,-0.802198,-0.565041,1.559830,0.744119,1.758877,-1.797387,-0.372224,0.357132,0.518570,1
2,-0.919906,-2.277210,-0.917966,0.295994,-0.640813,0.077768,1.034509,-1.001516,0.195073,-1.981521,0
3,1.299927,-0.772294,0.087103,0.484107,0.018459,-2.032370,-0.497235,-0.083255,-0.032028,0.429435,1
4,-0.497028,1.862187,-1.277491,0.680105,0.299781,-2.905462,0.427729,-0.333153,-0.868289,1.075813,1
...,...,...,...,...,...,...,...,...,...,...,...
95,-0.086551,0.105955,-0.846023,-1.131786,0.279752,-0.081504,-0.079213,-1.477738,-0.118635,0.489396,0
96,-1.771481,1.360847,0.029661,-1.554391,-0.872021,-1.767359,0.865106,-1.363450,0.341201,-0.911946,1
97,0.510287,-0.186715,-0.582330,1.130483,-0.122299,-0.421901,1.331151,-0.370953,0.273563,-0.660083,1
98,0.765677,-0.136423,0.168052,-0.960741,-2.057658,0.835402,-1.559946,-0.460513,0.383645,0.201873,1


In [5]:
from datasets import Dataset

dataset = Dataset.from_dict({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
dataset  # HTML table with column names!

Dataset({
    features: ['feature_1', 'feature_2', 'label'],
    num_rows: 3
})

In [6]:
from datasets import DatasetDict, Dataset

splits = DatasetDict(
    {
        "train": Dataset.from_dict({"feature_1": [1, 4], "feature_2": [2, 5], "label": [0, 1]}),
        "val": Dataset.from_dict({"feature_1": [7], "feature_2": [8], "label": [0]}),
        "test": Dataset.from_dict({"feature_1": [10], "feature_2": [11], "label": [1]}),
    }
)
splits  # Shows all splits with stats!

DatasetDict({
    train: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 2
    })
    val: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
    test: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
})

In [7]:
import torch
import pandas as pd
from torch.utils.data import DataLoader

# 1. Create DataFrame with column names
df = pd.DataFrame({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
df  # Display in notebook

Unnamed: 0,feature_1,feature_2,label
0,1,2,0
1,4,5,1
2,7,8,0


In [8]:
# 2. Convert to torch for training
data = torch.tensor(df[["feature_1", "feature_2"]].values, dtype=torch.float32)
labels = torch.tensor(df["label"].values)

# 3. Create DataLoader for training
loader = DataLoader(TensorDataset(data, labels), batch_size=2)

In [9]:
from datasets import DatasetDict, Dataset
from torch.utils.data import DataLoader

# 1. Create HuggingFace Dataset with column names
dataset = Dataset.from_dict({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
dataset  # Beautiful display!

Dataset({
    features: ['feature_1', 'feature_2', 'label'],
    num_rows: 3
})

In [10]:
# 2. Split into train/val/test
splits = dataset.train_test_split(test_size=0.2, seed=42)
splits = splits["train"].train_test_split(test_size=0.25, seed=42)

split_dict = DatasetDict({"train": splits["train"], "val": splits["test"], "test": splits})  # simplified for example
split_dict  # Shows all splits!

DatasetDict({
    train: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
    val: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
    test: DatasetDict({
        train: Dataset({
            features: ['feature_1', 'feature_2', 'label'],
            num_rows: 1
        })
        test: Dataset({
            features: ['feature_1', 'feature_2', 'label'],
            num_rows: 1
        })
    })
})

In [11]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# Sample DataFrame
df = pd.DataFrame({"feature_1": [1, 2, 3], "feature_2": [4, 5, 6], "label": [0, 1, 0]})

# Display column names from pandas
print(f"DataFrame columns: {df.columns.tolist()}")


# Create a custom PyTorch Dataset class
class CustomTabularDataset(Dataset):
    def __init__(self, dataframe):
        self.data = torch.tensor(dataframe.drop("label", axis=1).values, dtype=torch.float32)
        self.labels = torch.tensor(dataframe["label"].values, dtype=torch.long)
        # Store column names separately if needed for reference
        self.column_names = list(dataframe.drop("label", axis=1).columns)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]


# Instantiate and access column names
torch_dataset = CustomTabularDataset(df)
print(f"PyTorch Dataset column names: {torch_dataset.column_names}")

DataFrame columns: ['feature_1', 'feature_2', 'label']
PyTorch Dataset column names: ['feature_1', 'feature_2']


In [12]:
UKS_PATHS.HUGGING_FACE_DIR.mkdir(parents=True, exist_ok=True)

# Beijing By Hand

## Load

In [13]:
from projects.beijing.load_beijing_data import load_beijing_air_quality_data  # project_kretsinger

In [14]:
from kret_sklearn.custom_transformers import MissingValueRemover, DateTimeSinCosNormalizer
from kret_sklearn.pd_pipeline import PipelinePD
from sklearn.preprocessing import OrdinalEncoder

In [15]:
missing_value_remover = MissingValueRemover(how="any")  # Remove rows with any NaN values
remove_nans_pipeline = PipelinePD(steps=[("remove_nans", missing_value_remover)])

In [16]:
# Cell: Load and split data FIRST (temporal split)
X, y = load_beijing_air_quality_data()
X.shape

(43824, 12)

In [17]:
# dtt([X, y], 10, filter=X.index > 20, how="head")

### Remove Nans

In [18]:
X_no_nans = remove_nans_pipeline.fit_transform_df(X, y)
y_no_nans = y.loc[X_no_nans.index]

Removed 2067 rows, representing 4.72% of the data


In [19]:
dtt([X_no_nans, y_no_nans], 3, how="head")

Unnamed: 0_level_0,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
Unnamed: 0_level_1,int64,int64,int64,int64,float64,int64,float64,float64,object,float64,int64,int64
24,2010,1,2,0,129.0,-16,-4.0,1020.0,SE,1.79,0,0
25,2010,1,2,1,148.0,-15,-4.0,1020.0,SE,2.68,0,0
26,2010,1,2,2,159.0,-11,-5.0,1021.0,SE,3.57,0,0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
24,129.0
25,148.0
26,159.0


In [20]:
# CRITICAL: Split BEFORE normalization to avoid data leakage
split_idx = int(0.8 * len(X_no_nans))
X_train_raw = X_no_nans.iloc[:split_idx]
X_test_raw = X_no_nans.iloc[split_idx:]
y_train_raw = y_no_nans.iloc[:split_idx]
y_test_raw = y_no_nans.iloc[split_idx:]
print(f"Train: {len(X_train_raw)} samples | Test: {len(X_test_raw)} samples")

Train: 33405 samples | Test: 8352 samples


## Redo Pipeline

NOTE Nans already removed

In [21]:
float_cols = ["pm2.5", "year", "DEWP", "TEMP", "PRES", "Iws", "Is", "Ir"]
date_cols = ["month", "day", "hour"]
wind_cols = ["cbwd"]

In [22]:
date_time_normalizer = DateTimeSinCosNormalizer(
    datetime_cols={"month": 12, "day": 31, "hour": 24}
)  # Normalize 'month' and 'hour' columns
power_transformer = PowerTransformer(method="yeo-johnson", standardize=True)

wind_encoder = OrdinalEncoder()

column_transform = ColumnTransformer(
    transformers=[
        ("datetime", date_time_normalizer, date_cols),
        ("scaler", power_transformer, float_cols),
        ("windlabel", wind_encoder, wind_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    verbose=True,
)

In [23]:
pipeline_x = PipelinePD(steps=[("column_transform", column_transform)])
pipeline_y = PipelinePD(steps=[("scaler", power_transformer)])

In [24]:
X_train_cleaned = UKS_NP_PD.move_columns(pipeline_x.fit_transform_df(X_train_raw), ["year"], ["pm2.5", "cbwd"])
X_test_cleaned = UKS_NP_PD.move_columns(pipeline_x.transform_df(X_test_raw), ["year"], ["pm2.5", "cbwd"])

[ColumnTransformer] ...... (1 of 3) Processing datetime, total=   0.0s
[ColumnTransformer] ........ (2 of 3) Processing scaler, total=   0.1s
[ColumnTransformer] ..... (3 of 3) Processing windlabel, total=   0.0s


In [25]:
y_train_cleaned = pipeline_y.fit_transform_df(y_train_raw)
y_test_cleaned = pipeline_y.transform_df(y_test_raw)

## View

In [26]:
UKS_NP_PD.dtt([X_train_cleaned, y_train_cleaned], how="head")

Unnamed: 0_level_0,year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,DEWP,TEMP,PRES,Iws,Is,Ir,pm2.5,cbwd
Unnamed: 0_level_1,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
24,-1.358,0.5,0.866,0.394,0.919,0.0,1.0,-1.202,-1.315,0.0,-0.924,-0.101,-0.213,0.671,2.0
25,-1.358,0.5,0.866,0.394,0.919,0.259,0.966,-1.133,-1.315,0.0,-0.63,-0.101,-0.213,0.825,2.0
26,-1.358,0.5,0.866,0.394,0.919,0.5,0.866,-0.859,-1.419,0.0,-0.417,-0.101,-0.213,0.907,2.0
27,-1.358,0.5,0.866,0.394,0.919,0.707,0.707,-0.585,-1.419,0.0,-0.114,9.921,-0.213,1.056,2.0
28,-1.358,0.5,0.866,0.394,0.919,0.866,0.5,-0.585,-1.419,0.0,-0.002,9.921,-0.213,0.747,2.0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
24,0.671
25,0.825
26,0.907
27,1.056
28,0.747


In [27]:
UKS_NP_PD.dtt([X_test_cleaned, y_test_cleaned])

Unnamed: 0_level_0,year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,DEWP,TEMP,PRES,Iws,Is,Ir,pm2.5,cbwd
Unnamed: 0_level_1,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
35804,2.144,0.5,0.866,-0.0,1.0,-0.866,0.5,-0.448,-0.745,0.0,-0.419,-0.101,-0.213,0.825,3.0
37028,2.144,1.0,0.0,-0.999,-0.051,-0.866,0.5,-0.243,0.211,0.0,1.102,-0.101,-0.213,0.573,2.0
39291,2.144,0.0,-1.0,-0.849,0.529,0.707,0.707,1.273,0.887,-0.0,1.36,-0.101,-0.213,1.132,2.0
42942,2.144,-0.5,0.866,-0.938,0.347,1.0,0.0,-0.448,-1.113,0.0,-1.377,-0.101,-0.213,0.516,0.0
43132,2.144,-0.0,1.0,0.571,0.821,0.866,0.5,-1.612,-1.315,0.0,-0.328,-0.101,-0.213,-1.592,1.0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
35804,0.825
37028,0.573
39291,1.132
42942,0.516
43132,-1.592


In [28]:
tensor_ds = TensorDataset(
    torch.tensor(X_train_cleaned.values, dtype=torch.float32), torch.tensor(y_train_cleaned.values, dtype=torch.float32)
)

In [29]:
type(tensor_ds[0])

tuple

In [30]:
# """
# Protocol classes for pandas conversion.

# Protocols define a structural interface - any class that implements
# the required methods is considered to satisfy the protocol, without
# explicit inheritance.
# """

# from typing import Protocol, runtime_checkable, Self
# import pandas as pd
# import torch
# from abc import ABC, abstractmethod
# import typing as t

# # ============================================================================
# # PROTOCOL APPROACH (Recommended for typing/duck typing)
# # ============================================================================

# T = t.TypeVar("T", bound="PandasConvertibleWithColumns")


# @runtime_checkable
# class PandasConvertibleWithColumns(Protocol):
#     """Protocol for objects with column information and pandas conversion."""

#     @property
#     def columns(self) -> list[str]:
#         """Get column names."""
#         ...

#     def to_pandas(self) -> pd.DataFrame:
#         """Convert to pandas DataFrame."""
#         ...

#     @staticmethod
#     def from_pd(df: pd.DataFrame, **kwargs) -> "PandasConvertibleWithColumns":
#         """Create from pandas DataFrame."""
#         ...

In [31]:
from kret_torch_utils.tensor_ds_custom import TensorDatasetCustom

from kret_rosetta.conversion_protocols import PandasConvertibleWithColumns

In [32]:
isinstance(TensorDatasetCustom, PandasConvertibleWithColumns)

True