In [1]:

import tensorflow as tf
import os

# Sprawdzenie GPU
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

# Pobranie danych
import urllib.request
import gzip

Num GPUs Available: 1


In [2]:
if not os.path.exists("HIGGS.csv"):
    print("Pobieranie danych...")
    urllib.request.urlretrieve(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz", "HIGGS.csv.gz"
    )
    with gzip.open("HIGGS.csv.gz", "rb") as f_in, open("HIGGS.csv", "wb") as f_out:
        f_out.write(f_in.read())



In [8]:
import math

# Pipeline danych
CSV_FILE = "HIGGS.csv"
FEATURE_DIM = 28
BATCH_SIZE = 1024

def parse_csv(line):
    defaults = [[0.0]] * (FEATURE_DIM + 1)
    fields = tf.io.decode_csv(line, record_defaults=defaults)
    label = fields[0]
    features = tf.stack(fields[1:])
    return features, label

# Apply batching before taking/skipping
dataset = (
    tf.data.TextLineDataset(CSV_FILE)
    .map(parse_csv, num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(100_000)
    .batch(BATCH_SIZE) # Batch here
    .prefetch(tf.data.AUTOTUNE)
)

# Now take/skip a specific number of batches
# Let's calculate the number of batches in the original dataset first
# Note: This can be computationally expensive for large datasets without specifying batch size first
# A more efficient way might be to use dataset.cardinality().numpy() if the cardinality is known
# For now, let's just use the specified splits and check their sizes.

train_batches_to_take = 1000
val_batches_to_take = 200
test_batches_to_take = 200
val_batches_to_skip = 1000
test_batches_to_skip = 1200

train_ds = dataset.take(train_batches_to_take)           # take 1000 batches
val_ds   = dataset.skip(val_batches_to_skip).take(val_batches_to_take) # skip 1000 batches, take 200
test_ds  = dataset.skip(test_batches_to_skip).take(test_batches_to_take) # skip 1200 batches, take 200

# Print the size of the datasets to verify they are not empty
# Note: dataset.cardinality().numpy() can return -1 if the size is not known.
# If it returns -1, you might need to iterate through the dataset to get the exact size,
# which can be slow, or estimate based on file size and batch size.
print(f"Train dataset cardinality: {train_ds.cardinality().numpy()}")
print(f"Validation dataset cardinality: {val_ds.cardinality().numpy()}")
print(f"Test dataset cardinality: {test_ds.cardinality().numpy()}")


# %%
# Model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(FEATURE_DIM,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)

# %%
# Callbacki
log_dir = "logs/higgs"
os.makedirs(log_dir, exist_ok=True)

tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
earlystop_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    patience=3,
    restore_best_weights=True,
    mode='max'
)

# %%
# Trening
# Only proceed with training if the datasets are not empty
if train_ds.cardinality().numpy() > 0 and val_ds.cardinality().numpy() > 0:
    print("Starting training...")
    model.fit(
        train_ds,
        epochs=20,
        validation_data=val_ds,
        callbacks=[tensorboard_cb, earlystop_cb]
    )

    # Ewaluacja
    if test_ds.cardinality().numpy() > 0:
        print("Starting evaluation...")
        loss, acc, auc = model.evaluate(test_ds)
        print(f"Test Accuracy: {acc:.4f}, AUC: {auc:.4f}")
    else:
        print("Test dataset is empty, skipping evaluation.")
else:
    print("Train or validation dataset is empty, skipping training and evaluation.")

Train dataset cardinality: -2
Validation dataset cardinality: -2
Test dataset cardinality: -2
Train or validation dataset is empty, skipping training and evaluation.
