In [1]:
%reload_ext autoreload
%autoreload 2

# Imports

In [2]:
from kret_notebook import *  # NOTE import first
from kret_matplotlib.mpl_nb_imports import *
from kret_np_pd.np_pd_nb_imports import *
from kret_sklearn.sklearn_nb_imports import *
from kret_torch_utils.torch_nb_imports import *
from kret_lightning.lightning_nb_imports import *
from kret_tqdm.tqdm_nb_imports import *
from kret_type_hints.types_nb_imports import *
from kret_utils.utils_nb_imports import *

# from kret_wandb.wandb_nb_imports import *  # NOTE this is slow to import

Loaded environment variables from /Users/Akseldkw/coding/kretsinger/.env
[kret_matplotlib.mpl_nb_imports] Imported kret_matplotlib.mpl_nb_imports in 0.0873 seconds
[kret_np_pd.np_pd_nb_imports] Imported kret_np_pd.np_pd_nb_imports in 0.0000 seconds
[kret_sklearn.sklearn_nb_imports] Imported kret_sklearn.sklearn_nb_imports in 0.2707 seconds
[kret_torch_utils.torch_nb_imports] Imported kret_torch_utils.torch_nb_imports in 0.4042 seconds
[kret_lightning.lightning_nb_imports] Imported kret_lightning.lightning_nb_imports in 0.0018 seconds
[kret_tqdm.tqdm_nb_imports] Imported kret_tqdm.tqdm_nb_imports in 0.0000 seconds
[kret_type_hints.types_nb_imports] Imported kret_type_hints.types_nb_imports in 0.0008 seconds
[kret_utils.utils_nb_imports] Imported kret_utils.utils_nb_imports in 0.0005 seconds


# LLM Ideas

In [3]:
import pandas as pd

df = pd.DataFrame({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
df  # Beautiful table in Jupyter

Unnamed: 0,feature_1,feature_2,label
0,1,2,0
1,4,5,1
2,7,8,0


In [4]:
import torch
import pandas as pd

data = torch.randn(100, 10)
labels = torch.randint(0, 2, (100,))

df = pd.DataFrame(data.numpy(), columns=[f"feature_{i}" for i in range(10)])
df["label"] = labels.numpy()
df  # Beautiful display!

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,label
0,-0.398950,2.047863,1.073255,1.157549,-0.517617,-0.109115,-0.187338,-0.871437,-0.250600,-0.995472,0
1,0.487322,0.041458,0.541525,0.574840,-1.185994,-0.619114,0.150731,1.330778,0.901957,0.581013,0
2,-0.301203,-2.441841,-1.128175,0.886861,-0.399021,0.346630,0.173172,-1.585736,-0.473105,-1.379309,0
3,-0.217556,0.344086,-0.545171,-0.998719,-0.003012,-0.902434,-0.376523,0.954365,0.113175,0.276920,1
4,-0.728448,-0.603899,0.184129,-0.652101,1.011651,0.888038,-1.982775,-0.574989,-0.435532,1.433029,1
...,...,...,...,...,...,...,...,...,...,...,...
95,-1.632505,-0.474345,-0.253609,0.317095,-0.902011,0.056975,-0.715954,-1.704141,-1.312369,-0.221707,0
96,0.639501,0.102288,0.517457,0.976758,0.186014,-0.104582,1.249392,-0.142877,-0.352718,0.049828,0
97,-1.713290,-2.858117,-0.785031,-0.936278,-1.289138,-0.972920,-0.339586,0.562561,-0.309482,-0.604626,0
98,-0.273326,-0.201545,-0.191712,0.927783,2.336765,1.634744,1.136149,-0.118683,-0.003284,0.806591,0


In [5]:
from datasets import Dataset

dataset = Dataset.from_dict({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
dataset  # HTML table with column names!

Dataset({
    features: ['feature_1', 'feature_2', 'label'],
    num_rows: 3
})

In [6]:
from datasets import DatasetDict, Dataset

splits = DatasetDict(
    {
        "train": Dataset.from_dict({"feature_1": [1, 4], "feature_2": [2, 5], "label": [0, 1]}),
        "val": Dataset.from_dict({"feature_1": [7], "feature_2": [8], "label": [0]}),
        "test": Dataset.from_dict({"feature_1": [10], "feature_2": [11], "label": [1]}),
    }
)
splits  # Shows all splits with stats!

DatasetDict({
    train: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 2
    })
    val: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
    test: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
})

In [7]:
import torch
import pandas as pd
from torch.utils.data import DataLoader

# 1. Create DataFrame with column names
df = pd.DataFrame({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
df  # Display in notebook

Unnamed: 0,feature_1,feature_2,label
0,1,2,0
1,4,5,1
2,7,8,0


In [8]:
# 2. Convert to torch for training
data = torch.tensor(df[["feature_1", "feature_2"]].values, dtype=torch.float32)
labels = torch.tensor(df["label"].values)

# 3. Create DataLoader for training
loader = DataLoader(TensorDataset(data, labels), batch_size=2)

In [9]:
from datasets import DatasetDict, Dataset
from torch.utils.data import DataLoader

# 1. Create HuggingFace Dataset with column names
dataset = Dataset.from_dict({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
dataset  # Beautiful display!

Dataset({
    features: ['feature_1', 'feature_2', 'label'],
    num_rows: 3
})

In [10]:
# 2. Split into train/val/test
splits = dataset.train_test_split(test_size=0.2, seed=42)
splits = splits["train"].train_test_split(test_size=0.25, seed=42)

split_dict = DatasetDict({"train": splits["train"], "val": splits["test"], "test": splits})  # simplified for example
split_dict  # Shows all splits!

DatasetDict({
    train: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
    val: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
    test: DatasetDict({
        train: Dataset({
            features: ['feature_1', 'feature_2', 'label'],
            num_rows: 1
        })
        test: Dataset({
            features: ['feature_1', 'feature_2', 'label'],
            num_rows: 1
        })
    })
})

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# Sample DataFrame
df = pd.DataFrame({"feature_1": [1, 2, 3], "feature_2": [4, 5, 6], "label": [0, 1, 0]})

# Display column names from pandas
print(f"DataFrame columns: {df.columns.tolist()}")


# Create a custom PyTorch Dataset class
class CustomTabularDataset(Dataset):
    def __init__(self, dataframe):
        self.data = torch.tensor(dataframe.drop("label", axis=1).values, dtype=torch.float32)
        self.labels = torch.tensor(dataframe["label"].values, dtype=torch.long)
        # Store column names separately if needed for reference
        self.column_names = list(dataframe.drop("label", axis=1).columns)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]


# Instantiate and access column names
torch_dataset = CustomTabularDataset(df)
print(f"PyTorch Dataset column names: {torch_dataset.column_names}")

DataFrame columns: ['feature_1', 'feature_2', 'label']
PyTorch Dataset column names: ['feature_1', 'feature_2']


In [14]:
UKS_PATHS.HUGGING_FACE_DIR.mkdir(parents=True, exist_ok=True)

# Beijing By Hand

## Load

In [25]:
from projects.beijing.load_beijing_data import load_beijing_air_quality_data  # project_kretsinger

In [26]:
from kret_sklearn.custom_transformers import MissingValueRemover, DateTimeSinCosNormalizer
from kret_sklearn.pd_pipeline import PipelinePD
from sklearn.preprocessing import OrdinalEncoder

In [27]:
missing_value_remover = MissingValueRemover(how="any")  # Remove rows with any NaN values
remove_nans_pipeline = PipelinePD(steps=[("remove_nans", missing_value_remover)])

In [28]:
# Cell: Load and split data FIRST (temporal split)
X, y = load_beijing_air_quality_data()
X.shape

(43824, 12)

In [29]:
# dtt([X, y], 10, filter=X.index > 20, how="head")

### Remove Nans

In [30]:
X_no_nans = remove_nans_pipeline.fit_transform_df(X, y)
y_no_nans = y.loc[X_no_nans.index]

Removed 2067 rows, representing 4.72% of the data


In [31]:
dtt([X_no_nans, y_no_nans], 3, how="head")

Unnamed: 0_level_0,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
Unnamed: 0_level_1,int64,int64,int64,int64,float64,int64,float64,float64,object,float64,int64,int64
24,2010,1,2,0,129.0,-16,-4.0,1020.0,SE,1.79,0,0
25,2010,1,2,1,148.0,-15,-4.0,1020.0,SE,2.68,0,0
26,2010,1,2,2,159.0,-11,-5.0,1021.0,SE,3.57,0,0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
24,129.0
25,148.0
26,159.0


In [32]:
# CRITICAL: Split BEFORE normalization to avoid data leakage
split_idx = int(0.8 * len(X_no_nans))
X_train_raw = X_no_nans.iloc[:split_idx]
X_test_raw = X_no_nans.iloc[split_idx:]
y_train_raw = y_no_nans.iloc[:split_idx]
y_test_raw = y_no_nans.iloc[split_idx:]
print(f"Train: {len(X_train_raw)} samples | Test: {len(X_test_raw)} samples")

Train: 33405 samples | Test: 8352 samples


## Redo Pipeline

NOTE Nans already removed

In [37]:
float_cols = ["pm2.5", "year", "DEWP", "TEMP", "PRES", "Iws", "Is", "Ir"]
date_cols = ["month", "day", "hour"]
wind_cols = ["cbwd"]

In [38]:
date_time_normalizer = DateTimeSinCosNormalizer(
    datetime_cols={"month": 12, "day": 31, "hour": 24}
)  # Normalize 'month' and 'hour' columns
power_transformer = PowerTransformer(method="yeo-johnson", standardize=True)

wind_encoder = OrdinalEncoder()

column_transform = ColumnTransformer(
    transformers=[
        ("datetime", date_time_normalizer, date_cols),
        ("scaler", power_transformer, float_cols),
        ("windlabel", wind_encoder, wind_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    verbose=True,
)

In [39]:
pipeline_x = PipelinePD(steps=[("column_transform", column_transform)])
pipeline_y = PipelinePD(steps=[("scaler", power_transformer)])

In [40]:
X_train_cleaned = UKS_NP_PD.move_columns(pipeline_x.fit_transform_df(X_train_raw), ["year"], ["pm2.5", "cbwd"])
X_test_cleaned = UKS_NP_PD.move_columns(pipeline_x.transform_df(X_test_raw), ["year"], ["pm2.5", "cbwd"])

[ColumnTransformer] ...... (1 of 3) Processing datetime, total=   0.0s
[ColumnTransformer] ........ (2 of 3) Processing scaler, total=   0.1s
[ColumnTransformer] ..... (3 of 3) Processing windlabel, total=   0.0s


In [41]:
y_train_cleaned = pipeline_y.fit_transform_df(y_train_raw)
y_test_cleaned = pipeline_y.transform_df(y_test_raw)

## View

In [None]:
UKS_NP_PD.dtt([X_train_cleaned, y_train_cleaned], how="head")

Unnamed: 0_level_0,year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,DEWP,TEMP,PRES,Iws,Is,Ir,pm2.5,cbwd
Unnamed: 0_level_1,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
24,-1.358,0.5,0.866,0.394,0.919,0.0,1.0,-1.202,-1.315,0.0,-0.924,-0.101,-0.213,0.671,2.0
25,-1.358,0.5,0.866,0.394,0.919,0.259,0.966,-1.133,-1.315,0.0,-0.63,-0.101,-0.213,0.825,2.0
26,-1.358,0.5,0.866,0.394,0.919,0.5,0.866,-0.859,-1.419,0.0,-0.417,-0.101,-0.213,0.907,2.0
27,-1.358,0.5,0.866,0.394,0.919,0.707,0.707,-0.585,-1.419,0.0,-0.114,9.921,-0.213,1.056,2.0
28,-1.358,0.5,0.866,0.394,0.919,0.866,0.5,-0.585,-1.419,0.0,-0.002,9.921,-0.213,0.747,2.0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
24,0.671
25,0.825
26,0.907
27,1.056
28,0.747


In [47]:
UKS_NP_PD.dtt([X_test_cleaned, y_test_cleaned])

Unnamed: 0_level_0,year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,DEWP,TEMP,PRES,Iws,Is,Ir,pm2.5,cbwd
Unnamed: 0_level_1,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
36945,2.144,1.0,0.0,-0.791,-0.612,0.707,-0.707,-1.202,0.058,0.0,1.454,-0.101,-0.213,-1.592,1.0
40654,2.144,-0.866,-0.5,-0.898,-0.44,-0.5,0.866,0.997,1.035,-0.0,0.318,-0.101,-0.213,-1.168,2.0
42134,2.144,-0.866,0.5,-0.968,-0.251,-0.5,-0.866,0.307,0.135,0.0,0.05,-0.101,-0.213,0.405,2.0
42147,2.144,-0.866,0.5,-0.999,-0.051,0.707,0.707,0.376,-0.02,0.0,-1.377,-0.101,-0.213,0.477,3.0
43774,2.144,-0.0,1.0,-0.394,0.919,-0.5,0.866,-0.928,-1.213,-0.0,-0.515,-0.101,-0.213,0.942,0.0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
36945,-1.592
40654,-1.168
42134,0.405
42147,0.477
43774,0.942


In [None]:
tensor_ds = TensorDataset(
    torch.tensor(X_train_cleaned.values, dtype=torch.float32), torch.tensor(y_train_cleaned.values, dtype=torch.float32)
)

In [60]:
type(tensor_ds[0])

tuple

In [58]:
UKS_TH_UTILS.func_to_typed_dict(TensorDataset.__init__)

from torch import Tensor
from typing import TypedDict

class TensorDataset___init___TypedDict(TypedDict, total=False):
    tensors: Tensor
