# Setup

In [None]:
# Install the notebook's dependencies; -qqq keeps pip's output to a minimum.
!pip install datasets -qqq
# transformers is installed straight from its GitHub repository rather than from a PyPI release.
!pip install git+https://github.com/huggingface/transformers -qqq
!pip install wandb -qqq
!pip install accelerate -qqq

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/536.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/536.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     

In [None]:
import json
import pandas as pd

import torch
import wandb
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

# Logging MMLU College Chemistry Validation Split

In [None]:
# Log the MMLU college_chemistry validation split (7 instances) as a W&B Table.
# The resulting table artifact is the evaluation set for our models, and we'll
# use it again later during our automation.
# The context manager finishes the run even if loading or logging raises.
with wandb.init(project="enterprise_model_management_wandb", name="log_val_dataset") as run:
    dataset = load_dataset("lukaemon/mmlu", "college_chemistry", split="validation")
    table = wandb.Table(
        data=dataset.to_pandas().to_numpy().tolist(),
        columns=list(dataset.features.keys()),
    )
    run.log({"my_table": table})

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Downloading data:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.63k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/99 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/4 [00:00<?, ? examples/s]

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

# Loading the Dataset/Table

In [None]:
# Download the validation table artifact logged above. `artifact_dir` is the
# local folder the files were written to.
# The run is attached to the tutorial's project (like the other cells) and is
# finished as soon as the download completes, even if it raises.
with wandb.init(project="enterprise_model_management_wandb") as run:
    artifact = run.use_artifact('vincenttu/enterprise_model_management_wandb/run-vhp36175-my_table:v0', type='run_table')
    artifact_dir = artifact.download()

VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [None]:
# Show the local directory returned by `artifact.download()`.
artifact_dir

'/content/artifacts/run-vhp36175-my_table:v0'

In [None]:
# To load the saved table, read it from the directory returned by
# `artifact.download()` rather than a hard-coded Colab path, so the cell keeps
# working wherever the artifact was downloaded.
with open(f"{artifact_dir}/my_table.table.json", "r") as file:
    data = json.load(file)

# The table JSON keeps its rows under "data" and its column names under "columns".
df = pd.DataFrame(data["data"], columns=data["columns"])

In [None]:
# Display the validation set loaded from the table artifact.
df

Unnamed: 0,input,A,B,C,D,target
0,The 13C spectrum of which isomer of C6H14 has ...,hexane,2-methylpentane,3-methylpentane,"2,3-dimethylbutane",A
1,The Rydberg equation v = R_H(1/n_1^2 - 1/n_2^2...,"hydride ion, H−","deuterium atom, D","tritium atom, T","helium cation, He+",A
2,Reduction of D-xylose with NaBH4 yields a prod...,racemic mixture,single pure enantiomer,mixture of two diastereomers in equal amounts,meso compound,D
3,Calculate the polarization of a proton in a ma...,6.345 x 10^-4 at 0.335 T; 9.871 x 10^-5 at 10.5 T,0.793 x 10^-4 at 0.335 T; 6.931 x 10^-7 at 10.5 T,1.148 x 10^-6 at 0.335 T; 3.598 x 10^-5 at 10.5 T,4.126 x 10^-3 at 0.335 T; 2.142 x 10^-6 at 10.5 T,D
4,The anhydride of Ba(OH)2 is,BaH2,BaOH,Ba,BaO,D
5,A 1 M solution of TEMPO in chloroform at room ...,"in the more concentrated solution, the lines a...","in the more concentrated solution, the lines a...","in the more concentrated solution, the lines a...","in the more concentrated solution, the lines a...",C
6,A set of hybrid sp^3 orbitals for a carbon ato...,The orbitals are degenerate.,The set of orbitals has a tetrahedral geometry.,These orbitals are constructed from a linear c...,Each hybrid orbital may hold four electrons.,D


# First Model: Populating our Model Registry

In [None]:
# Load the base Gemma-2B tokenizer and model (weights in float16).
baseline_checkpoint = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(baseline_checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    baseline_checkpoint,
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
# Pretend the baseline has just been fine-tuned: write the model to disk and
# put the tokenizer files in a "tokenizer" subfolder of the same directory.
baseline_dir = "./models/baseline"
model.save_pretrained(baseline_dir)
tokenizer.save_pretrained(f"{baseline_dir}/tokenizer")

('./models/baseline/tokenizer/tokenizer_config.json',
 './models/baseline/tokenizer/special_tokens_map.json',
 './models/baseline/tokenizer/tokenizer.model',
 './models/baseline/tokenizer/added_tokens.json',
 './models/baseline/tokenizer/tokenizer.json')

In [None]:
# Let's now register our model. There are 2 programmatic options:

# ======================= OPTION 1 =======================

# # Log model as an artifact first (aliases must be a list of strings).
# run = wandb.init(project="enterprise_model_management_wandb")
# run.log_model(path="./models/baseline", name="baseline", aliases=["baseline"])
# run.finish()

# # In a new, active run: clone down the artifact (returns a local path) and
# # link it to the model registry.
# run = wandb.init(project="enterprise_model_management_wandb")
# artifact_name = "baseline:v0"
# model_path = run.use_model(artifact_name)
# run.link_model(path=model_path, registered_model_name="gemma-2b")
# run.finish()

# ======================= OPTION 2 =======================

# Logs as artifact + registers the model to a model registry as a model version.
# The context manager finishes the run even if the upload fails part-way.
model_path = "./models/baseline"
with wandb.init(project="enterprise_model_management_wandb") as run:
    run.link_model(path=model_path, registered_model_name="gemma-2b")

[34m[1mwandb[0m: Currently logged in as: [33mvincenttu[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (./models/baseline)... Done. 42.3s
[34m[1mwandb[0m: Waiting for artifact run-jbed3y5a-baseline to be committed...


VBox(children=(Label(value='53.135 MB of 4800.891 MB uploaded\r'), FloatProgress(value=0.01106775587369641, ma…


[34m[1mwandb[0m: Committed artifact vincenttu/enterprise_model_management_wandb/run-jbed3y5a-baseline:v0


# Adding More Models: Populating our Model Registry

In [None]:
# Load the instruction-tuned Gemma-2B checkpoint (float16), then write the
# model to disk with the tokenizer in a "tokenizer" subfolder.
it_checkpoint = "google/gemma-2b-it"
it_dir = "./models/2B_IT"

tokenizer = AutoTokenizer.from_pretrained(it_checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    it_checkpoint,
    torch_dtype=torch.float16,
    device_map="auto",
)

model.save_pretrained(it_dir)
tokenizer.save_pretrained(f"{it_dir}/tokenizer")

tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/888 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

('./models/2B_IT/tokenizer/tokenizer_config.json',
 './models/2B_IT/tokenizer/special_tokens_map.json',
 './models/2B_IT/tokenizer/tokenizer.model',
 './models/2B_IT/tokenizer/added_tokens.json',
 './models/2B_IT/tokenizer/tokenizer.json')

In [None]:
# Let's pretend we just finished instruction-tuning and we are ready to link our models!
# Logs as artifact + registers the model to a model registry as a model version.
model_path = "./models/2B_IT"

# "staging" is a common alias; another one is "production".
# Ref: https://docs.wandb.ai/ref/python/run#link_model.
# The context manager finishes the run even if the upload fails part-way.
with wandb.init(project="enterprise_model_management_wandb") as run:
    run.link_model(path=model_path, registered_model_name="gemma-2b", aliases=["staging"])

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


[34m[1mwandb[0m: Adding directory to artifact (./models/2B_IT)... Done. 62.8s
[34m[1mwandb[0m: Waiting for artifact run-z09uoy4z-2B_IT to be committed...


VBox(children=(Label(value='0.001 MB of 4800.893 MB uploaded\r'), FloatProgress(value=2.500943075042238e-07, m…


[34m[1mwandb[0m: Committed artifact vincenttu/enterprise_model_management_wandb/run-z09uoy4z-2B_IT:v0


# Loading the Model

In [None]:
# Download version v1 of the registered "gemma-2b" model. `artifact_dir` is the
# local folder holding the model files.
# The run is attached to the tutorial's project (like the other cells) and is
# finished as soon as the download completes, even if it raises.
with wandb.init(project="enterprise_model_management_wandb") as run:
    artifact = run.use_artifact('vincenttu/model-registry/gemma-2b:v1', type='model')
    artifact_dir = artifact.download()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


[34m[1mwandb[0m: Downloading large artifact gemma-2b:v1, 4800.89MB. 9 files... 
[34m[1mwandb[0m:   9 of 9 files downloaded.  
Done. 0:1:37.9


In [None]:
# Load from the directory returned by `artifact.download()` instead of a
# hard-coded Colab path tied to one specific run id.
# The tokenizer was saved in a "tokenizer" subfolder next to the model weights.
tokenizer = AutoTokenizer.from_pretrained(f"{artifact_dir}/tokenizer")
model = AutoModelForCausalLM.from_pretrained(artifact_dir, device_map="auto", torch_dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Show the local directory the model artifact was downloaded into.
artifact_dir

'/content/artifacts/run-z09uoy4z-2B_IT:v0'

# Validation

In [None]:
# Load the dataset.
# The run only exists to record that this artifact was used; the context manager
# finishes it even if the download fails.
with wandb.init(project="enterprise_model_management_wandb") as run:
    artifact = run.use_artifact('vincenttu/enterprise_model_management_wandb/run-vhp36175-my_table:v0', type='run_table')
    artifact_dir = artifact.download()

# Read the table from the download directory rather than a hard-coded Colab path.
with open(f"{artifact_dir}/my_table.table.json", "r") as file:
    data = json.load(file)

# The table JSON keeps its rows under "data" and its column names under "columns".
df = pd.DataFrame(data["data"], columns=data["columns"])

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [None]:
# Load the model.
# The context manager finishes the run even if the download fails.
with wandb.init(project="enterprise_model_management_wandb") as run:
    artifact = run.use_artifact('vincenttu/model-registry/gemma-2b:v1', type='model')
    artifact_dir = artifact.download()

# Load from the download directory rather than a hard-coded Colab path.
# torch_dtype matches every other model load in this notebook, so this
# validation run uses the same precision as the later one.
tokenizer = AutoTokenizer.from_pretrained(f"{artifact_dir}/tokenizer")
model = AutoModelForCausalLM.from_pretrained(artifact_dir, device_map="auto", torch_dtype=torch.float16)

VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Downloading large artifact gemma-2b:v1, 4800.89MB. 9 files... 
[34m[1mwandb[0m:   9 of 9 files downloaded.  
Done. 0:0:29.2


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# Validate model against dataset.

# Registry version of "gemma-2b" being evaluated (gemma-2b:v1 is what the
# previous cell loads). It names the logged table; without this definition the
# `run.log` call below raised a NameError.
version = "v1"

req = "Please select one option: (A), (B), (C), or (D).\n"

# The context manager finishes the run even if generation or logging raises.
with wandb.init(project="enterprise_model_management_wandb") as run:
    data = []
    for idx, row in df.iterrows():
        # Prompt layout: question, instruction, then the four labelled options.
        input_text = "\n".join([
            row["input"],
            req,
            "(A) " + row["A"],
            "(B) " + row["B"],
            "(C) " + row["C"],
            "(D) " + row["D"],
        ])
        encoded = tokenizer(input_text, return_tensors="pt")
        # Move every input tensor to the model's device before generating.
        inputs = {key: tensor.to(model.device) for key, tensor in encoded.items()}
        generated = model.generate(**inputs, max_new_tokens=10)
        output_text = tokenizer.decode(generated[0])
        data.append([str(idx), row["input"], row["A"], row["B"], row["C"], row["D"], row["target"], input_text, output_text])

    table = wandb.Table(data=data, columns=["id", "input", "A", "B", "C", "D", "target", "input_text", "output"])
    run.log({f"val_table_{version}": table})

# Adding Another Gemma-2B-IT + Validation

In [None]:
# Load the dataset.
# The run only exists to record that this artifact was used; the context manager
# finishes it even if the download fails.
with wandb.init(project="enterprise_model_management_wandb") as run:
    artifact = run.use_artifact('vincenttu/enterprise_model_management_wandb/run-vhp36175-my_table:v0', type='run_table')
    artifact_dir = artifact.download()

# Read the table from the download directory rather than a hard-coded Colab path.
with open(f"{artifact_dir}/my_table.table.json", "r") as file:
    data = json.load(file)

# The table JSON keeps its rows under "data" and its column names under "columns".
df = pd.DataFrame(data["data"], columns=data["columns"])

In [None]:
# Load the instruction-tuned Gemma-2B tokenizer and model (weights in float16).
it_checkpoint = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(it_checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    it_checkpoint,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
# Write the model to disk, with the tokenizer files in a "tokenizer" subfolder.
new_it_dir = "./models/2B_IT_new"
model.save_pretrained(new_it_dir)
tokenizer.save_pretrained(f"{new_it_dir}/tokenizer")

In [None]:
# Validate the model against the dataset.
# Unlike the earlier validation, nothing is logged here: the results are kept
# in a DataFrame so they can be logged in the same run as the model artifact,
# instead of living as a W&B Table artifact in a separate run.

req = "Please select one option: (A), (B), (C), or (D).\n"
result_columns = ["id", "input", "A", "B", "C", "D", "target", "input_text", "output"]

data = []
for idx, row in df.iterrows():
    # Prompt layout: question, instruction, then the four labelled options.
    prompt_parts = [row["input"], req]
    prompt_parts += ["(A) " + row["A"], "(B) " + row["B"], "(C) " + row["C"], "(D) " + row["D"]]
    input_text = "\n".join(prompt_parts)

    encoded = tokenizer(input_text, return_tensors="pt")
    # Move every input tensor to the model's device before generating.
    inputs = {key: tensor.to(model.device) for key, tensor in encoded.items()}
    generated = model.generate(**inputs, max_new_tokens=10)
    output_text = tokenizer.decode(generated[0])

    data.append([str(idx), row["input"], row["A"], row["B"], row["C"], row["D"], row["target"], input_text, output_text])

# Note: `df` now refers to the results table, replacing the dataset frame it was built from.
df = pd.DataFrame(data=data, columns=result_columns)

In [None]:
# Link the local model directory to the model registry and log the validation
# results in the same run, so the table can later be found through the run that
# produced the model artifact.
# The context manager finishes the run even if the upload or logging raises.
with wandb.init(project="enterprise_model_management_wandb") as run:
    run.link_model(path="./models/2B_IT_new/", registered_model_name="gemma-2b")
    table = wandb.Table(dataframe=df)
    run.log({"val_table": table})

# Loading the Gemma-2B-IT Validation Results Table

In [None]:
# Find the validation table that was logged in the same run as gemma-2b:v2:
# look up the run that produced the model artifact, then fetch that run's
# "val_table" table artifact.
# The context manager finishes the run even if a lookup fails.
with wandb.init(project="enterprise_model_management_wandb") as run:
    artifact = run.use_artifact('vincenttu/model-registry/gemma-2b:v2', type='model')

    # logged_by() may return None when no producer run is recorded; fail with a
    # clear message instead of an AttributeError on `.id`.
    producer_run = artifact.logged_by()
    if producer_run is None:
        raise RuntimeError("gemma-2b:v2 has no producer run; cannot locate its val_table artifact")
    producer_run_id = producer_run.id

    # Use the run's own method, consistent with the use_artifact call above.
    table_artifact = run.use_artifact(f"run-{producer_run_id}-val_table:v0")
    table = table_artifact.get("val_table")