In [1]:
%reload_ext autoreload
%autoreload 2

# Imports

In [2]:
from kret_notebook import *  # NOTE import first
from kret_matplotlib.mpl_nb_imports import *
from kret_np_pd.np_pd_nb_imports import *
from kret_polars.polars_nb_imports import *
from kret_rosetta.rosetta_nb_imports import *
from kret_sklearn.sklearn_nb_imports import *
from kret_torch_utils.torch_nb_imports import *
from kret_lightning.lightning_nb_imports import *
from kret_tqdm.tqdm_nb_imports import *
from kret_type_hints.types_nb_imports import *
from kret_utils.utils_nb_imports import *

# from kret_wandb.wandb_nb_imports import *  # NOTE this is slow to import

Loaded environment variables from /Users/Akseldkw/coding/kretsinger/.env
[kret_matplotlib.mpl_nb_imports] Imported kret_matplotlib.mpl_nb_imports in 0.0829 seconds
[kret_np_pd.np_pd_nb_imports] Imported kret_np_pd.np_pd_nb_imports in 0.0000 seconds
[kret_polars.polars_nb_imports] Imported kret_polars.polars_nb_imports in 0.1070 seconds
[kret_rosetta.rosetta_nb_imports] Imported kret_rosetta.rosetta_nb_imports in 0.0000 seconds
[kret_sklearn.sklearn_nb_imports] Imported kret_sklearn.sklearn_nb_imports in 0.2696 seconds
[kret_torch_utils.torch_nb_imports] Imported kret_torch_utils.torch_nb_imports in 0.3966 seconds
[kret_lightning.lightning_nb_imports] Imported kret_lightning.lightning_nb_imports in 0.0018 seconds
[kret_tqdm.tqdm_nb_imports] Imported kret_tqdm.tqdm_nb_imports in 0.0000 seconds
[kret_type_hints.types_nb_imports] Imported kret_type_hints.types_nb_imports in 0.0022 seconds
[kret_utils.utils_nb_imports] Imported kret_utils.utils_nb_imports in 0.0004 seconds


# LLM Ideas

In [3]:
import pandas as pd

df = pd.DataFrame({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
df  # Beautiful table in Jupyter

Unnamed: 0,feature_1,feature_2,label
0,1,2,0
1,4,5,1
2,7,8,0


In [4]:
import torch
import pandas as pd

data = torch.randn(100, 10)
labels = torch.randint(0, 2, (100,))

df = pd.DataFrame(data.numpy(), columns=[f"feature_{i}" for i in range(10)])
df["label"] = labels.numpy()
df  # Beautiful display!

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,label
0,0.248204,-0.321196,1.266540,0.870059,-1.731091,0.028822,-0.417918,0.854616,-0.218868,-0.051376,1
1,0.099696,0.212655,-0.172532,0.186768,-1.064929,-1.701263,1.014468,0.155657,1.144059,-0.113525,0
2,0.102179,0.535909,0.234977,-0.199973,-1.188235,-0.098059,0.619745,0.656528,-0.619040,-0.325732,0
3,-0.686447,-0.187167,1.365916,0.142631,0.293780,-0.910443,-1.148521,-1.442550,0.257596,-1.522797,1
4,-1.164996,0.142649,-0.609945,-0.399465,-0.900292,-0.252036,-2.310055,1.249159,-0.229365,-0.536043,1
...,...,...,...,...,...,...,...,...,...,...,...
95,-0.364427,-0.453143,-0.674570,-0.080724,-0.160757,1.115006,-0.939578,0.526275,-0.088441,0.937404,0
96,-2.525029,0.013681,0.274931,0.201111,-1.563314,-0.394224,1.126284,0.614789,-1.470680,-1.630264,1
97,0.238477,0.423400,0.086069,-0.152172,0.961755,-1.188749,-0.522850,0.710184,1.818857,-0.843271,1
98,-0.759762,1.811863,-0.270715,1.260630,-2.284228,-0.562269,-1.124799,1.235072,0.240851,-0.757392,0


In [5]:
from datasets import Dataset

dataset = Dataset.from_dict({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
dataset  # HTML table with column names!

Dataset({
    features: ['feature_1', 'feature_2', 'label'],
    num_rows: 3
})

In [6]:
from datasets import DatasetDict, Dataset

splits = DatasetDict(
    {
        "train": Dataset.from_dict({"feature_1": [1, 4], "feature_2": [2, 5], "label": [0, 1]}),
        "val": Dataset.from_dict({"feature_1": [7], "feature_2": [8], "label": [0]}),
        "test": Dataset.from_dict({"feature_1": [10], "feature_2": [11], "label": [1]}),
    }
)
splits  # Shows all splits with stats!

DatasetDict({
    train: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 2
    })
    val: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
    test: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
})

In [7]:
import torch
import pandas as pd
from torch.utils.data import DataLoader

# 1. Create DataFrame with column names
df = pd.DataFrame({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
df  # Display in notebook

Unnamed: 0,feature_1,feature_2,label
0,1,2,0
1,4,5,1
2,7,8,0


In [8]:
# 2. Convert to torch for training
data = torch.tensor(df[["feature_1", "feature_2"]].values, dtype=torch.float32)
labels = torch.tensor(df["label"].values)

# 3. Create DataLoader for training
loader = DataLoader(TensorDataset(data, labels), batch_size=2)

In [9]:
from datasets import DatasetDict, Dataset
from torch.utils.data import DataLoader

# 1. Create HuggingFace Dataset with column names
dataset = Dataset.from_dict({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
dataset  # Beautiful display!

Dataset({
    features: ['feature_1', 'feature_2', 'label'],
    num_rows: 3
})

In [10]:
# 2. Split into train/val/test
splits = dataset.train_test_split(test_size=0.2, seed=42)
splits = splits["train"].train_test_split(test_size=0.25, seed=42)

split_dict = DatasetDict({"train": splits["train"], "val": splits["test"], "test": splits})  # simplified for example
split_dict  # Shows all splits!

DatasetDict({
    train: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
    val: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
    test: DatasetDict({
        train: Dataset({
            features: ['feature_1', 'feature_2', 'label'],
            num_rows: 1
        })
        test: Dataset({
            features: ['feature_1', 'feature_2', 'label'],
            num_rows: 1
        })
    })
})

In [11]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# Sample DataFrame
df = pd.DataFrame({"feature_1": [1, 2, 3], "feature_2": [4, 5, 6], "label": [0, 1, 0]})

# Display column names from pandas
print(f"DataFrame columns: {df.columns.tolist()}")


# Create a custom PyTorch Dataset class
class CustomTabularDataset(Dataset):
    def __init__(self, dataframe):
        self.data = torch.tensor(dataframe.drop("label", axis=1).values, dtype=torch.float32)
        self.labels = torch.tensor(dataframe["label"].values, dtype=torch.long)
        # Store column names separately if needed for reference
        self.column_names = list(dataframe.drop("label", axis=1).columns)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]


# Instantiate and access column names
torch_dataset = CustomTabularDataset(df)
print(f"PyTorch Dataset column names: {torch_dataset.column_names}")

DataFrame columns: ['feature_1', 'feature_2', 'label']
PyTorch Dataset column names: ['feature_1', 'feature_2']


In [12]:
UKS_PATHS.HUGGING_FACE_DIR.mkdir(parents=True, exist_ok=True)

# Beijing By Hand

## Load

In [13]:
from projects.beijing.load_beijing_data import load_beijing_air_quality_data  # project_kretsinger

In [14]:
from kret_sklearn.custom_transformers import MissingValueRemover, DateTimeSinCosNormalizer
from kret_sklearn.pd_pipeline import PipelinePD
from sklearn.preprocessing import OrdinalEncoder

In [15]:
missing_value_remover = MissingValueRemover(how="any")  # Remove rows with any NaN values
remove_nans_pipeline = PipelinePD(steps=[("remove_nans", missing_value_remover)])

In [16]:
# Cell: Load and split data FIRST (temporal split)
X, y = load_beijing_air_quality_data()
X.shape

(43824, 12)

In [17]:
# dtt([X, y], 10, filter=X.index > 20, how="head")

### Remove Nans

In [18]:
X_no_nans = remove_nans_pipeline.fit_transform_df(X, y)
y_no_nans = y.loc[X_no_nans.index]

Removed 2067 rows, representing 4.72% of the data


In [19]:
dtt([X_no_nans, y_no_nans], 3, how="head")

Unnamed: 0_level_0,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
Unnamed: 0_level_1,int64,int64,int64,int64,float64,int64,float64,float64,object,float64,int64,int64
24,2010,1,2,0,129.0,-16,-4.0,1020.0,SE,1.79,0,0
25,2010,1,2,1,148.0,-15,-4.0,1020.0,SE,2.68,0,0
26,2010,1,2,2,159.0,-11,-5.0,1021.0,SE,3.57,0,0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
24,129.0
25,148.0
26,159.0


In [20]:
# CRITICAL: Split BEFORE normalization to avoid data leakage
split_idx = int(0.8 * len(X_no_nans))
X_train_raw = X_no_nans.iloc[:split_idx]
X_test_raw = X_no_nans.iloc[split_idx:]
y_train_raw = y_no_nans.iloc[:split_idx]
y_test_raw = y_no_nans.iloc[split_idx:]
print(f"Train: {len(X_train_raw)} samples | Test: {len(X_test_raw)} samples")

Train: 33405 samples | Test: 8352 samples


## Redo Pipeline

NOTE Nans already removed

In [21]:
float_cols = ["pm2.5", "year", "DEWP", "TEMP", "PRES", "Iws", "Is", "Ir"]
date_cols = ["month", "day", "hour"]
wind_cols = ["cbwd"]

In [22]:
date_time_normalizer = DateTimeSinCosNormalizer(
    datetime_cols={"month": 12, "day": 31, "hour": 24}
)  # Normalize 'month' and 'hour' columns
power_transformer = PowerTransformer(method="yeo-johnson", standardize=True)

wind_encoder = OrdinalEncoder()

column_transform = ColumnTransformer(
    transformers=[
        ("datetime", date_time_normalizer, date_cols),
        ("scaler", power_transformer, float_cols),
        ("windlabel", wind_encoder, wind_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    verbose=True,
)

In [23]:
pipeline_x = PipelinePD(steps=[("column_transform", column_transform)])
pipeline_y = PipelinePD(steps=[("scaler", power_transformer)])

In [24]:
X_train_cleaned = UKS_NP_PD.move_columns(pipeline_x.fit_transform_df(X_train_raw), ["year"], ["pm2.5", "cbwd"])
X_test_cleaned = UKS_NP_PD.move_columns(pipeline_x.transform_df(X_test_raw), ["year"], ["pm2.5", "cbwd"])

[ColumnTransformer] ...... (1 of 3) Processing datetime, total=   0.0s
[ColumnTransformer] ........ (2 of 3) Processing scaler, total=   0.1s
[ColumnTransformer] ..... (3 of 3) Processing windlabel, total=   0.0s


In [25]:
y_train_cleaned = pipeline_y.fit_transform_df(y_train_raw)
y_test_cleaned = pipeline_y.transform_df(y_test_raw)

## View

In [26]:
UKS_NP_PD.dtt([X_train_cleaned, y_train_cleaned], how="head")

Unnamed: 0_level_0,year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,DEWP,TEMP,PRES,Iws,Is,Ir,pm2.5,cbwd
Unnamed: 0_level_1,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
24,-1.358,0.5,0.866,0.394,0.919,0.0,1.0,-1.202,-1.315,0.0,-0.924,-0.101,-0.213,0.671,2.0
25,-1.358,0.5,0.866,0.394,0.919,0.259,0.966,-1.133,-1.315,0.0,-0.63,-0.101,-0.213,0.825,2.0
26,-1.358,0.5,0.866,0.394,0.919,0.5,0.866,-0.859,-1.419,0.0,-0.417,-0.101,-0.213,0.907,2.0
27,-1.358,0.5,0.866,0.394,0.919,0.707,0.707,-0.585,-1.419,0.0,-0.114,9.921,-0.213,1.056,2.0
28,-1.358,0.5,0.866,0.394,0.919,0.866,0.5,-0.585,-1.419,0.0,-0.002,9.921,-0.213,0.747,2.0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
24,0.671
25,0.825
26,0.907
27,1.056
28,0.747


In [27]:
UKS_NP_PD.dtt([X_test_cleaned, y_test_cleaned], how="head")

Unnamed: 0_level_0,year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,DEWP,TEMP,PRES,Iws,Is,Ir,pm2.5,cbwd
Unnamed: 0_level_1,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
35378,2.144,0.5,0.866,0.299,-0.954,0.5,0.866,-1.475,-1.843,0.0,0.05,-0.101,-0.213,0.628,1.0
35379,2.144,0.5,0.866,0.299,-0.954,0.707,0.707,-1.407,-1.736,0.0,0.431,-0.101,-0.213,0.619,1.0
35380,2.144,0.5,0.866,0.299,-0.954,0.866,0.5,-1.544,-1.951,0.0,0.622,-0.101,-0.213,-0.328,1.0
35381,2.144,0.5,0.866,0.299,-0.954,0.966,0.259,-1.407,-1.843,0.0,-1.377,-0.101,-0.213,-0.616,3.0
35382,2.144,0.5,0.866,0.299,-0.954,1.0,0.0,-1.475,-1.843,0.0,-0.928,-0.101,-0.213,-0.147,3.0

Unnamed: 0_level_0,pm2.5
Unnamed: 0_level_1,float64
35378,0.628
35379,0.619
35380,-0.328
35381,-0.616
35382,-0.147


In [28]:
x_test_tensor = TensorDatasetCustom.from_pd(X_test_cleaned)
x_test_tensor

TensorDatasetCustom(shape=(8352, 15), columns=(['year', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'hour_sin', 'hour_cos', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'pm2.5', 'cbwd'],), dtype=[torch.float32])

In [35]:
x_y_test_tensor = TensorDatasetCustom.from_pd_xy(X_test_cleaned, y_test_cleaned)
x_y_test_tensor

TensorDatasetCustom(shape=(8352, 16), columns=(['year', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'hour_sin', 'hour_cos', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'pm2.5', 'cbwd'], ['pm2.5']), dtype=[torch.float32, torch.float32])

In [None]:
test_temporal = UKS_TORCH_UTILS.create_sequence(x_y_test_tensor, sequence_length=24, target_offset=0)

In [37]:
test_temporal

<torch.utils.data.dataset.TensorDataset at 0x17b329310>