## Weights and Biases
https://wandb.ai/site

## Helper Functions

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import joblib

def load_raw_titanc():
  '''
  Download the raw Titanic passenger table.

  Reads the CSV hosted in the datasciencedojo/datasets GitHub repository
  and returns it as a pandas DataFrame without any preprocessing.
  Requires network access.
  '''
  source_url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
  raw_df = pd.read_csv(source_url)
  return raw_df

def clean_titanic(titanic_data, fill_missing=True):
  '''
  Return a numeric, model-ready version of the raw Titanic table.

  The identifier-like columns (PassengerId, Name, Ticket, Cabin) are
  dropped, missing values are handled, and the two categorical columns
  are mapped to integer codes. The input frame is not modified.

  Parameters:
    titanic_data: DataFrame with the raw Titanic columns.
    fill_missing: if True, fill missing Age and Fare with the column
      median and missing Embarked with the most frequent value;
      if False, drop every row that still has a missing value.

  Returns:
    A new DataFrame with Sex encoded as male=0/female=1 and Embarked
    encoded as C=0/Q=1/S=2.
  '''
  df = titanic_data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

  if fill_missing:
    # Assign the result of fillna instead of calling it with inplace=True on
    # a selected column: the chained in-place form is deprecated and does
    # not update the frame when pandas copy-on-write is active.
    df = df.fillna({
        'Age': df['Age'].median(),
        'Embarked': df['Embarked'].mode()[0],
        'Fare': df['Fare'].median(),
    })
  else:
    df = df.dropna()

  # Encode categorical features; assign() returns a new frame, so nothing is
  # written into a possibly shared intermediate object.
  return df.assign(
      Sex=df['Sex'].map({'male': 0, 'female': 1}),
      Embarked=df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2}),
  )

def load_titanic(cleaned=True, Xy=True):
  '''
  Fetch the Titanic dataset, optionally cleaned and/or split.

  Parameters:
    cleaned: if True, pass the raw table through clean_titanic (with its
      default settings) before returning it.
    Xy: if True, return a (features, labels) pair where labels is the
      'Survived' column; if False, return the whole DataFrame.

  Returns:
    (X, y) when Xy is True, otherwise a single DataFrame.
  '''
  data = load_raw_titanc()
  if cleaned:
    data = clean_titanic(data)

  # Caller asked for the full table: nothing to split
  if not Xy:
    return data

  features = data.drop('Survived', axis=1)
  labels = data['Survived']
  return features, labels

def get_data_loaders(X_train, X_test, y_train, y_test, batch_size=32):
  '''
  Wrap train/test splits in PyTorch DataLoaders.

  Features are converted with .to_numpy() and labels with .values, both
  to float32 tensors; labels are reshaped to a single column (N, 1).

  Parameters:
    X_train, X_test: feature tables exposing .to_numpy().
    y_train, y_test: label containers exposing .values.
    batch_size: number of samples per batch for both loaders.

  Returns:
    (train_loader, test_loader); only the training loader shuffles.
  '''
  def _to_dataset(features, labels):
    # One float32 tensor per input; labels become a column vector
    feature_tensor = torch.tensor(features.to_numpy(), dtype=torch.float32)
    label_tensor = torch.tensor(labels.values, dtype=torch.float32).view(-1, 1)
    return TensorDataset(feature_tensor, label_tensor)

  train_loader = DataLoader(
      _to_dataset(X_train, y_train), batch_size=batch_size, shuffle=True
  )
  test_loader = DataLoader(
      _to_dataset(X_test, y_test), batch_size=batch_size, shuffle=False
  )
  return train_loader, test_loader

def train_loop(train_loader, model, optimizer, criterion, device):
  '''
  Run one training pass over train_loader and update the model.

  Parameters:
    train_loader: iterable of (inputs, labels) batches with a .dataset.
    model: module to train; switched to train mode.
    optimizer: optimizer stepping the model's parameters.
    criterion: loss function called as criterion(outputs, labels).
    device: device the batches are moved to before the forward pass.

  Returns:
    Sum of (batch loss * batch size) divided by the dataset length, i.e.
    the per-sample average when the criterion returns a batch mean.
  '''
  model.train()
  weighted_loss_sum = 0.0
  for inputs, targets in train_loader:
    inputs = inputs.to(device)
    targets = targets.to(device)

    # Clear gradients left over from the previous step
    optimizer.zero_grad()

    batch_loss = criterion(model(inputs), targets)
    batch_loss.backward()
    optimizer.step()

    # Weight by batch size so a smaller final batch is not over-counted
    weighted_loss_sum += batch_loss.item() * inputs.size(0)

  sample_count = len(train_loader.dataset)
  return weighted_loss_sum / sample_count

def test_loop(test_loader, model, criterion, device):
  model.eval()
  epoch_test_loss = 0.0
  with torch.no_grad():
    for batch, labels in test_loader:
      batch, labels = batch.to(device), labels.to(device)
      # Calculate predictions
      outputs = model(batch)
      # Save loss
      loss = criterion(outputs, labels)
      epoch_test_loss += loss.item() * batch.size(0)
  return epoch_test_loss / len(test_loader.dataset)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

## Setup Weights & Biases

In [None]:
!pip install wandb # Python library for the Weights & Biases API

In [None]:
import wandb
# Log this session in to Weights & Biases; the cells below create runs with wandb.init
wandb.login()

## Simple Scikit-Learn Run

In [None]:
# Hyperparameters of the random forest; also stored in the run config below
n_estimators = 30
max_depth = 2

# Open a run in its project and attach the hyperparameters to it
run = wandb.init(
    project="Simple Scikit-Learn Run",
    notes="commit message for the run",
    config=dict(n_estimators=n_estimators, max_depth=max_depth),
)

# Cleaned Titanic features/labels, with 20% held out for evaluation
X, y = load_titanic()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the classifier, then fit it on the training split
clf = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators)
clf.fit(X_train, y_train)

# Log the held-out accuracy as a run metric
wandb.log({"accuracy": clf.score(X_test, y_test)})

# Close the run
wandb.finish()

## Simple PyTorch Run

In [None]:
# Hyperparameters; run_count only feeds the display name of the run
num_epochs = 10
learning_rate = 0.01
run_count = 3

# Open a named run and attach the hyperparameters to it
run = wandb.init(
    project="Simple PyTorch Run",
    name=f"MyRun{run_count}",
    config=dict(num_epochs=num_epochs, learning_rate=learning_rate),
)

# Cleaned Titanic data, 80/20 split, wrapped in DataLoaders (default batch size)
X, y = load_titanic()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
train_loader, test_loader = get_data_loaders(X_train, X_test, y_train, y_test)

# Small fully connected binary classifier: 7 inputs -> 32 -> 16 -> 1 probability
model = nn.Sequential(
    nn.Linear(7, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 1),
    nn.Sigmoid(),
).to(device)

# BCELoss takes probabilities, which is why the model ends in a Sigmoid
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# One training pass and one evaluation pass per epoch; both losses are logged
for epoch in range(num_epochs):
  train_loss = train_loop(train_loader, model, optimizer, criterion, device)
  test_loss = test_loop(test_loader, model, criterion, device)

  print(f"[Epoch {epoch+1}/{num_epochs}] TrainLoss: {train_loss}; TestLoss: {test_loss}")
  wandb.log(dict(train_loss=train_loss, test_loss=test_loss))

wandb.finish()

## Artifacts (Dataset Registry + Model Registry)

### First Dataset Artifact - Add Raw Titanic Data

In [None]:
# Start a run in the "Artifacts Registry" project, labelled as a data-loading job
run = wandb.init(project="Artifacts Registry", job_type="data-loading")

# Fetch the raw (uncleaned, unsplit) Titanic table and write it to disk,
# because the artifact below is built from a local file
raw_titanic_df = load_titanic(cleaned=False, Xy=False)
raw_titanic_df.to_csv('raw_titanic.csv', index=False)  # index=False: do not write the row index

# Create a dataset-type artifact named 'raw_titanic'
raw_dataset_artifact = wandb.Artifact(name='raw_titanic', type='dataset')

# Attach the CSV to the artifact (only this one file is added here)
raw_dataset_artifact.add_file('raw_titanic.csv')

# Log the artifact with the alias "raw"
wandb.log_artifact(raw_dataset_artifact, aliases=["raw"])

# Close the run; raw_titanic_df stays in memory and is reused by the cleaning cells below
wandb.finish()

### Add Cleaned Dataset Artifact (Version 0)

In [None]:
# Start another data-loading run in the same project
run = wandb.init(project="Artifacts Registry", job_type="data-loading")

# Clean the in-memory raw table (raw_titanic_df is defined in the raw-data cell above).
# fill_missing=False drops rows with missing values instead of filling them.
clean_titanic_df_dropped = clean_titanic(raw_titanic_df, fill_missing=False)
clean_titanic_df_dropped.to_csv('clean_titanic.csv', index=False)

# Shortcut form: pass the file path plus name/type directly to run.log_artifact
# instead of building a wandb.Artifact object first
run.log_artifact(
    artifact_or_path="clean_titanic.csv",
    name="clean_titanic",
    type="dataset",
    aliases=["dropped"]
)

# Close the run
wandb.finish()

### Create New Cleaned Dataset (Version 1)

In [None]:
# Start another data-loading run in the same project
run = wandb.init(project="Artifacts Registry", job_type="data-loading")

# Reference the current ':latest' clean_titanic artifact from this run.
# The 'michael-hammer-ai' entity prefix is hard-coded; change it when running under another account.
artifact_name = "clean_titanic"
artifact_old = run.use_artifact(
    f'michael-hammer-ai/Artifacts Registry/{artifact_name}:latest',
    type='dataset'
)
# New Artifact object reusing the same name and type.
# NOTE(review): logging it is expected to create the next version of 'clean_titanic' - confirm in the W&B artifact docs
artifact_next_version = wandb.Artifact(artifact_name, type='dataset')

# Produce the data for the next version.
# Alternative (left disabled): download the previous version and start from its contents.
## artifact_old_path = artifact_old.download()
## clean_titanic_df_old = pd.read_csv(artifact_old_path)
# Here the raw table is cleaned again, this time filling missing values (fill_missing=True)
clean_titanic_df_filled = clean_titanic(raw_titanic_df, fill_missing=True)
clean_titanic_df_filled.to_csv('clean_titanic.csv', index=False)  # overwrites the CSV written for the previous version

# Attach the new CSV and log the artifact with the alias "filled"
artifact_next_version.add_file('clean_titanic.csv')
wandb.log_artifact(artifact_next_version, aliases =["filled"])

# Close the run
wandb.finish()

### Create Model Artifact

In [None]:
# Start a run labelled as a training job
run = wandb.init(project="Artifacts Registry", job_type="training")

# Reference the ':latest' clean_titanic artifact from this run and download it.
# download() returns a local directory path; the CSV is read from inside it.
artifact = run.use_artifact(artifact_or_name=
                  f"michael-hammer-ai/Artifacts Registry/clean_titanic:latest")
path = artifact.download()
df = pd.read_csv(f"{path}/clean_titanic.csv")

# Separate features from the 'Survived' label and hold out 20% for evaluation
X, y = df.drop('Survived', axis=1), df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Fit a deliberately small forest (a single tree, depth 5) and log its held-out accuracy
clf = RandomForestClassifier(n_estimators=1, max_depth=5)
clf.fit(X_train, y_train)
wandb.log({"accuracy": clf.score(X_test, y_test)})

# Serialize the fitted model to disk, wrap the file in a model-type artifact, and log it
joblib.dump(clf, 'random_forest_model.pkl')
model_artifact = wandb.Artifact(name='random_forest_model', type='model')
model_artifact.add_file('random_forest_model.pkl')
wandb.log_artifact(model_artifact)

# Close the run
wandb.finish()

## Create a Table

In [None]:
run = wandb.init(project='Table')

# Reference the ':latest' clean_titanic artifact from this run and download it.
# download() returns a local directory path; the CSV is read from inside it.
artifact = run.use_artifact(
    artifact_or_name="michael-hammer-ai/Artifacts Registry/clean_titanic:latest"
)
path = artifact.download()
df = pd.read_csv(f"{path}/clean_titanic.csv")

# Separate features from the 'Survived' label and hold out 20% for evaluation
X, y = df.drop('Survived', axis=1), df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Load the previously logged model (version v0) instead of training a new one.
# NOTE: joblib.load unpickles the file, so only load artifacts from a registry you trust.
model_artifact = run.use_artifact(
    'michael-hammer-ai/Artifacts Registry/random_forest_model:v0',
    type='model'
)
model_artifact_dir = model_artifact.download()
clf = joblib.load(f"{model_artifact_dir}/random_forest_model.pkl")
wandb.log({"accuracy": clf.score(X_test, y_test)})

# Table layout: prediction, ground truth, survival probability, then the input features
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
table = wandb.Table(columns=['y_pred', 'y_true', 'prob_survive', *feature_columns])

# Predict for the whole test split in one call each rather than once per row;
# column 1 of predict_proba is taken as the probability of surviving.
y_pred_all = clf.predict(X_test)
y_prob_all = clf.predict_proba(X_test)[:, 1]

# Plain value tuples in feature_columns order, one per test row
feature_rows = X_test[feature_columns].itertuples(index=False, name=None)

# One table row per test sample
for y_pred, y_true, y_prob_1, features in zip(y_pred_all, y_test, y_prob_all,
                                              feature_rows):
  table.add_data(y_pred, y_true, y_prob_1, *features)

# Log the table under the key "test_predictions"
run.log({"test_predictions": table})

wandb.finish()

## Sweep

In [None]:
def main():
  '''
  Train and evaluate one configuration of the sweep.

  Intended to be passed to wandb.agent as the sweep function. Reads
  batch_size, optimizer, learning_rate and num_epochs from wandb.config,
  trains a small fully connected classifier on the clean_titanic
  artifact, and logs train_loss and test_loss once per epoch.
  '''
  run = wandb.init()
  config = wandb.config

  # Download the cleaned dataset artifact and split it 80/20
  artifact = run.use_artifact(
      artifact_or_name="michael-hammer-ai/Artifacts Registry/clean_titanic:latest"
  )
  path = artifact.download()
  df = pd.read_csv(f"{path}/clean_titanic.csv")
  X, y = df.drop('Survived', axis=1), df['Survived']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                      random_state=42)

  # Data loaders use the batch size chosen for this sweep trial
  train_loader, test_loader = get_data_loaders(X_train, X_test, y_train, y_test,
                                               batch_size=config.batch_size)

  # Simple PyTorch model
  model = nn.Sequential(
      nn.Linear(7, 32), # 7 input features -> 32 hidden nodes
      nn.ReLU(),
      nn.Linear(32, 16),
      nn.ReLU(),
      nn.Linear(16, 1),
      nn.Sigmoid()
  ).to(device)

  # Loss function (expects probabilities, hence the final Sigmoid)
  criterion = nn.BCELoss()

  # Optimizer: 'adam' selects Adam, any other value falls back to SGD
  if config.optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
  else:
    optimizer = optim.SGD(model.parameters(), lr=config.learning_rate)

  for _ in range(config.num_epochs):
    train_loss = train_loop(train_loader, model, optimizer, criterion, device)
    # Evaluate with test_loop: it does not step the optimizer, so the test
    # split is never trained on and test_loss stays a held-out metric.
    test_loss = test_loop(test_loader, model, criterion, device)

    wandb.log({
      "train_loss": train_loss,
      "test_loss": test_loss
    })

  wandb.finish()

# Sweep definition: random search that minimizes the logged "test_loss".
# learning_rate is sampled from a range; the other parameters from fixed lists.
sweep_config = dict(
    method="random",
    metric=dict(goal="minimize", name="test_loss"),
    parameters=dict(
        learning_rate=dict(min=0.001, max=0.1),
        batch_size=dict(values=[32, 64, 128]),
        optimizer=dict(values=["adam", "sgd"]),
        num_epochs=dict(values=[5, 10, 15]),
    ),
)

# Register the sweep in the 'Sweep' project and keep its id
sweep_id = wandb.sweep(sweep_config, project='Sweep')

# Start an agent that runs main for this sweep
wandb.agent(sweep_id, function=main)