In [12]:
%reload_ext autoreload
%autoreload 2

# Imports

In [13]:
from kret_notebook import *  # NOTE import first
from kret_matplotlib.mpl_nb_imports import *
from kret_np_pd.np_pd_nb_imports import *
from kret_sklearn.sklearn_nb_imports import *
from kret_torch_utils.torch_nb_imports import *
from kret_lightning.lightning_nb_imports import *
from kret_tqdm.tqdm_nb_imports import *
from kret_type_hints.types_nb_imports import *
from kret_utils.utils_nb_imports import *

# from kret_wandb.wandb_nb_imports import *  # NOTE this is slow to import

# Load Data

In [14]:
import pandas as pd

df = pd.DataFrame({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
df  # Beautiful table in Jupyter

Unnamed: 0,feature_1,feature_2,label
0,1,2,0
1,4,5,1
2,7,8,0


In [15]:
import torch
import pandas as pd

data = torch.randn(100, 10)
labels = torch.randint(0, 2, (100,))

df = pd.DataFrame(data.numpy(), columns=[f"feature_{i}" for i in range(10)])
df["label"] = labels.numpy()
df  # Beautiful display!

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,label
0,-0.399602,0.689945,-0.868915,-0.232121,-1.117067,-0.185673,-0.732697,-0.160703,0.229922,0.386030,0
1,0.337000,0.371505,-0.467738,-0.163283,0.112193,-1.238934,-1.311196,0.699966,1.191984,0.521376,0
2,0.431370,-1.424756,0.026361,-0.673424,-0.479644,-0.626135,1.318727,-1.382468,2.132013,-0.494565,0
3,0.248537,-0.287013,0.624497,0.537951,-0.052421,-2.066313,-0.934352,-0.208867,-1.821968,-0.520526,0
4,-1.049473,0.415063,0.766794,0.260369,0.732367,1.258776,-0.764856,-0.851808,-0.828112,-0.293630,1
...,...,...,...,...,...,...,...,...,...,...,...
95,-0.155279,0.211626,-0.937500,-0.330971,0.573659,-0.520434,-0.085662,-0.892870,0.899262,1.283864,1
96,0.097616,-0.408814,0.633120,-0.285829,2.103911,-3.103385,-0.366490,-0.786793,-1.772184,0.042490,1
97,-0.184031,-0.450025,0.708117,0.548564,-0.736666,0.119203,0.401039,-0.911862,-0.625530,-0.000527,0
98,-0.027433,0.036193,-1.132896,0.243510,0.334132,-2.875680,1.678606,0.121737,-0.146070,1.196409,1


In [16]:
from datasets import Dataset

dataset = Dataset.from_dict({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
dataset  # HTML table with column names!

Dataset({
    features: ['feature_1', 'feature_2', 'label'],
    num_rows: 3
})

In [17]:
from datasets import DatasetDict, Dataset

splits = DatasetDict(
    {
        "train": Dataset.from_dict({"feature_1": [1, 4], "feature_2": [2, 5], "label": [0, 1]}),
        "val": Dataset.from_dict({"feature_1": [7], "feature_2": [8], "label": [0]}),
        "test": Dataset.from_dict({"feature_1": [10], "feature_2": [11], "label": [1]}),
    }
)
splits  # Shows all splits with stats!

DatasetDict({
    train: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 2
    })
    val: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
    test: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
})

In [18]:
import torch
import pandas as pd
from torch.utils.data import DataLoader

# 1. Create DataFrame with column names
df = pd.DataFrame({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
df  # Display in notebook

Unnamed: 0,feature_1,feature_2,label
0,1,2,0
1,4,5,1
2,7,8,0


In [19]:
# 2. Convert to torch for training
data = torch.tensor(df[["feature_1", "feature_2"]].values, dtype=torch.float32)
labels = torch.tensor(df["label"].values)

# 3. Create DataLoader for training
loader = DataLoader(TensorDataset(data, labels), batch_size=2)

In [20]:
from datasets import DatasetDict, Dataset
from torch.utils.data import DataLoader

# 1. Create HuggingFace Dataset with column names
dataset = Dataset.from_dict({"feature_1": [1, 4, 7], "feature_2": [2, 5, 8], "label": [0, 1, 0]})
dataset  # Beautiful display!

Dataset({
    features: ['feature_1', 'feature_2', 'label'],
    num_rows: 3
})

In [21]:
# 2. Split into train/val/test
splits = dataset.train_test_split(test_size=0.2, seed=42)
splits = splits["train"].train_test_split(test_size=0.25, seed=42)

split_dict = DatasetDict({"train": splits["train"], "val": splits["test"], "test": splits})  # simplified for example
split_dict  # Shows all splits!

DatasetDict({
    train: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
    val: Dataset({
        features: ['feature_1', 'feature_2', 'label'],
        num_rows: 1
    })
    test: DatasetDict({
        train: Dataset({
            features: ['feature_1', 'feature_2', 'label'],
            num_rows: 1
        })
        test: Dataset({
            features: ['feature_1', 'feature_2', 'label'],
            num_rows: 1
        })
    })
})

In [22]:
# 3. Convert to torch format for DataLoader
split_dict = split_dict.with_format("torch")
loader = DataLoader(split_dict["train"], batch_size=32)

TypeError: Values in `DatasetDict` should be of type `Dataset` but got type '<class 'datasets.dataset_dict.DatasetDict'>'

# Implementation

In [None]:
DATA_DIR / "hugging_face"

PosixPath('/Users/Akseldkw/coding/data_kretsinger')

In [None]:
from datasets import load_dataset

# Load a sample dataset (e.g., 'glue', 'cola')
dataset = load_dataset(DATA_DIR, "glue", "cola", split="train")

# Access and print column names
print(f"Column names: {dataset.column_names}")

# Access the data types (features)
print(f"Features: {dataset.features}")

cola/train-00000-of-00001.parquet:   0%|          | 0.00/251k [00:00<?, ?B/s]

cola/validation-00000-of-00001.parquet:   0%|          | 0.00/37.6k [00:00<?, ?B/s]

cola/test-00000-of-00001.parquet:   0%|          | 0.00/37.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

Column names: ['sentence', 'label', 'idx']
Features: {'sentence': Value('string'), 'label': ClassLabel(names=['unacceptable', 'acceptable']), 'idx': Value('int32')}


# Sandbox