In [1]:
# Importing packages
import kagglehub
import matplotlib.pyplot as plt
import os
import polars as pl
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

### Load Dataset

In [2]:
# Download latest version
path = kagglehub.dataset_download("fedesoriano/stellar-classification-dataset-sdss17")
df = pl.read_csv(os.path.join(path, "star_classification.csv"), infer_schema_length=10000)
df.head()

obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,str,f64,i64,i64,i64
1.2377e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.5438e+18,"""GALAXY""",0.6347936,5812,56354,171
1.2377e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176e+19,"""GALAXY""",0.779136,10445,58158,427
1.2377e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,"""GALAXY""",0.6441945,4576,55592,299
1.2377e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.0301e+19,"""GALAXY""",0.9323456,9149,58039,775
1.2377e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.8919e+18,"""GALAXY""",0.1161227,6121,56187,842


### Data Preprocessing

In [3]:
# Separate data and labels, dropping irrelevant/ID features to predicting class
not_features = ["class", "obj_ID", "run_ID", "rerun_ID", "cam_col", "field_ID", "spec_obj_ID", "plate", "MJD", "fiber_ID"]
X_df, y_df_str = df.drop(not_features), df["class"]

# Convert labels to enum for numeric computation
labels = pl.Enum(["GALAXY", "STAR", "QSO"])
y_df = pl.Series(y_df_str, dtype=labels)

# Convert data to 32-bit precision
X_df = X_df.cast(pl.Float32)

# Create train-test-validation split
test_size = 0.10
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_df.to_physical(), test_size=test_size, random_state=42)
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train, y_train, test_size=test_size/(1 - test_size), random_state=42)

# Create a dataloader (with GPU acceleration if availible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X_train_tensor, y_train_tensor = X_train.to_torch().to(device), y_train.to_torch().to(device)
X_test_tensor, y_test_tensor = X_test.to_torch().to(device), y_test.to_torch().to(device)
X_validate_tensor, y_validate_tensor = X_validate.to_torch().to(device), y_validate.to_torch().to(device)
train_loader = DataLoader(
    TensorDataset(X_train_tensor, y_train_tensor),
    batch_size=32, 
    shuffle=True)

# Min max normalization
# X_df = X_df.select((pl.all() - pl.all().min()) / (pl.all().max() - pl.all().min()))

X_df.head()

alpha,delta,u,g,r,i,z,redshift
f32,f32,f32,f32,f32,f32,f32,f32
135.689102,32.494633,23.87882,22.275299,20.39501,19.16573,18.793711,0.634794
144.826096,31.274185,24.77759,22.831881,22.58444,21.168119,21.614269,0.779136
142.188797,35.582443,25.263069,22.663891,20.60976,19.34857,18.948271,0.644194
338.741028,-0.402828,22.13682,23.77656,21.61162,20.504539,19.250099,0.932346
345.282593,21.183867,19.43718,17.58028,16.497471,15.97711,15.54461,0.116123


### Helper Functions

In [32]:
# Create writer to plot training data
writer = SummaryWriter("runs/")

def calc_accuracy(model: nn.Module, X: torch.Tensor, y: torch.Tensor) -> float:
    model.eval()  # set model to evaluation mode
    z = model(X)
    pred = torch.argmax(z, dim=-1)
    acc = (pred == y).float().mean().item()
    return acc

def train_model(
    model: nn.Module, 
    opt: torch.optim.Optimizer,
    dataloader: DataLoader,
    loss_fn: nn.Module = nn.CrossEntropyLoss(),
    epochs: int = 100,
    plot_every: int = 5,
) -> None:
    model.train()  # set model to training mode

    n_total_steps = len(dataloader)
    
    # create metrics to display while training
    running_loss = 0.0
    running_correct = 0
    
    for epoch in range(epochs):
        for (X_batch, y_batch) in dataloader:
            opt.zero_grad()

            z = model(X_batch)
            loss = loss_fn(z,y_batch)

            loss.backward()
            opt.step()

            # calculate metrics
            running_loss = loss.item()
            predicted = torch.argmax(z, dim=-1)
            running_correct += (predicted == y_batch).sum().item()

            if (epoch+1) % plot_every == 0:
                writer.add_scalar('training loss', running_loss / plot_every, epoch * n_total_steps + epoch)
                writer.add_scalar('accuracy', running_correct / plot_every, epoch * n_total_steps + epoch)
                running_loss = 0.0
                running_correct = 0

### Training 

Training a simple linear model on data

In [33]:
# Create and train model
simple_nn = nn.Sequential(
    nn.Linear(8, 100),
    nn.ReLU(),
    nn.Linear(100, 3)
).to(device)

opt = torch.optim.SGD(simple_nn.parameters(), lr=1e-2)

train_model(simple_nn, opt, train_loader)

# Evaluate model
train_acc = calc_accuracy(simple_nn, X_train_tensor, y_train_tensor, y_test_tensor)
test_acc = calc_accuracy(simple_nn, X_test_tensor, y_test_tensor)

print(f'train acc: {train_acc}, test acc: {test_acc}')

TypeError: calc_accuracy() takes 3 positional arguments but 4 were given

Pretty good, but let's try and do better