<a href="https://colab.research.google.com/github/aimlsonika/MLOpsAssignment1_GRP19/blob/main/colabs/intro/Intro_to_Weights_%26_Biases.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/wandb/examples/blob/master/colabs/intro/Intro_to_Weights_&_Biases.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
<!--- @wandbcode{intro-colab} -->

# **Helper Functions**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import joblib

def load_titanic_ds():
  titanic_url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
  return pd.read_csv(titanic_url)

def clean_data(titanic_data, fill_missing = True):
  df = titanic_data.drop(['PassengerId','Name','Ticket','Cabin'],axis=1)
  #Fill missing values
  if fill_missing:
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)
    df['Fare'].fillna(df['Fare'].median(),inplace=True)
  else:
    df = df.dropna()
  #Encode categorical variables to numerical
  df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
  df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
  return df

def load_titanic(cleaned=True, Xy=True):
  '''
  Returns the titanic dataset
  '''
  # Get data
  df = load_titanic_ds()
  # Clean data
  if cleaned:
    # Drop useless features
    df = clean_data(df)
  # Split Data
  if Xy:
    return df.drop('Survived', axis=1), df['Survived'] # X, y
  return df

def get_data_loaders(X_train, X_test, y_train, y_test, batch_size=32):
  # Convert to Tensors
  X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
  y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
  X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
  y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

  # Create Datasets
  train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
  test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

  # Create DataLoaders
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  return train_loader, test_loader

def train_loop(train_loader, model, optimizer, criterion, device):
  model.train()
  epoch_train_loss = 0.0
  for batch, labels in train_loader:
    batch, labels = batch.to(device), labels.to(device)
    optimizer.zero_grad()
    # Forward Pass
    outputs = model(batch)
    loss = criterion(outputs, labels)
    # Update Parameters
    loss.backward()
    optimizer.step()
    # Save loss
    epoch_train_loss += loss.item() * batch.size(0)
  return epoch_train_loss / len(train_loader.dataset)

def test_loop(test_loader, model, criterion, device):
  model.eval()
  epoch_test_loss = 0.0
  with torch.no_grad():
    for batch, labels in test_loader:
      batch, labels = batch.to(device), labels.to(device)
      # Calculate predictions
      outputs = model(batch)
      # Save loss
      loss = criterion(outputs, labels)
      epoch_test_loss += loss.item() * batch.size(0)
  return epoch_test_loss / len(test_loader.dataset)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


<img src="http://wandb.me/logo-im-png" width="400" alt="Weights & Biases" />
<!--- @wandbcode{intro-colab} -->

Use [W&B](https://wandb.ai/site?utm_source=intro_colab&utm_medium=code&utm_campaign=intro) for machine learning experiment tracking, model checkpointing, collaboration with your team and more. See the full W&B Documentation [here](https://docs.wandb.ai/).

In this notebook, you will create and track a machine learning experiment using a simple PyTorch model. By the end of the notebook, you will have an interactive project dashboard that you can share and customize with other members of your team. [View an example dashboard here](https://wandb.ai/wandb/wandb_example).

## Prerequisites

Install the W&B Python SDK and log in:

In [2]:
!pip install wandb -qU

In [3]:
# Log in to your W&B account
import wandb
import random
import math

In [4]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmasteraimlinbits[0m ([33mmasteraimlinbits-bits-pilani[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# **Simple Scikit-Learn Run**

In [5]:
# Define hyperparameters
n_estimators = 40
max_depth = 5

# Create a new run in a project
run = wandb.init(
    project="Simple Scikit-Learn Runnnnn",
    notes="commit message for the run",
    config={
        "n_estimators": n_estimators,
        "max_depth": max_depth
    }
)

# Get Data
X, y = load_titanic()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Define Random Forest Classifier
clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
clf.fit(X_train, y_train)

# Log metrics
wandb.log({"accuracy": clf.score(X_test, y_test)})

# Finish the run
wandb.finish()



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(df['Fare'].median(),inplace=True)


0,1
accuracy,▁

0,1
accuracy,0.81006


# **Simple PyTorch Run**

In [6]:
'''
##Define hyperparameters
num_epochs = 10
learning_rate = 0.01
run_count = 3

# Start a run
run = wandb.init(
    project="Simple PyTorch Run",
    name=f"MyRun{run_count}",
    config={
        "num_epochs": num_epochs,
        "learning_rate": learning_rate
    }
)

# Get Data
X, y = load_titanic()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
train_loader, test_loader = get_data_loaders(X_train, X_test, y_train, y_test)

# Define Simple PyTorch Model
model = nn.Sequential(
    nn.Linear(7, 32), # 7 input features -> 32 hidden nodes
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 1),
    nn.Sigmoid())

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
  train_loss = train_loop(train_loader, model, optimizer, criterion, device)
  test_loss = test_loop(test_loader, model, criterion, device)

  print(f"[Epoch {epoch+1}/{num_epochs}] TrainLoss: {train_loss:.4f}; TestLoss: {test_loss:.4f}")
  wandb.log({
    "train_loss": train_loss,
    "test_loss": test_loss
  })

wandb.finish()
'''

'\n##Define hyperparameters\nnum_epochs = 10\nlearning_rate = 0.01\nrun_count = 3\n\n# Start a run\nrun = wandb.init(\n    project="Simple PyTorch Run",\n    name=f"MyRun{run_count}",\n    config={\n        "num_epochs": num_epochs,\n        "learning_rate": learning_rate\n    }\n)\n\n# Get Data\nX, y = load_titanic()\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n                                                    random_state=42)\ntrain_loader, test_loader = get_data_loaders(X_train, X_test, y_train, y_test)\n\n# Define Simple PyTorch Model\nmodel = nn.Sequential(\n    nn.Linear(7, 32), # 7 input features -> 32 hidden nodes\n    nn.ReLU(),\n    nn.Linear(32, 16),\n    nn.ReLU(),\n    nn.Linear(16, 1),\n    nn.Sigmoid())\n\ncriterion = nn.BCELoss()\noptimizer = optim.Adam(model.parameters(), lr=learning_rate)\n\nfor epoch in range(num_epochs):\n  train_loss = train_loop(train_loader, model, optimizer, criterion, device)\n  test_loss = test_loop(test_loader,

# **Artifacts (Dataset Registry + Model Registry)**
# **First Dataset Artifact - Raw Titanic Dataset**

In [7]:
run = wandb.init(project="Artifacts Registry", job_type="data-loading")

# Save data locally without cleaning and splitting
raw_titanic_df = load_titanic(cleaned=False, Xy=False)
raw_titanic_df.to_csv('raw_titanic.csv', index=False)  # Save as CSV

# Create Artifact object
raw_dataset_artifact = wandb.Artifact(name='raw_titanic', type='dataset')

# Add files to the artifact (multiple)
raw_dataset_artifact.add_file('raw_titanic.csv')

# Log the artifact
wandb.log_artifact(raw_dataset_artifact, aliases=["raw"])

wandb.finish()

# **Add Cleaned Dataset Artifact - Cleaned Titanic Dataset**

In [8]:
run = wandb.init(project="Artifacts Registry", job_type='Cleaning')

clean_titanic_df_dropped = clean_data(raw_titanic_df, fill_missing = False)
clean_titanic_df_dropped.to_csv("clean_titanic.csv", index=False)

#Create and Log new Artifact
run.log_artifact(artifact_or_path = "clean_titanic.csv",
                 name = "clean_titanic",
                 type = "dataset",
                 aliases = ["dropped"])

wandb.finish()

SyntaxError: invalid syntax. Perhaps you forgot a comma? (<ipython-input-8-a19edd953d09>, line 8)