In [None]:
import base64
import enum
import json
import math
import os
import tempfile
import textwrap
import warnings
from collections import Counter
from pathlib import Path
from pprint import pprint

import anthropic
import fasttext
import google.generativeai as genai
import lightgbm
import matplotlib.pyplot as plt
import numpy as np
import open_clip
import pandas as pd
import torch
import torch.nn.functional as F  # noqa: N812
import vertexai
from openai import OpenAI
from PIL import Image
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import (
    TfidfVectorizer,
)
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
)
from torchvision import transforms
from torchvision.models import ConvNeXt_Base_Weights, convnext_base
from transformers import pipeline
from typing_extensions import TypedDict
from ultralytics import YOLO
from vertexai.generative_models import GenerativeModel, Part
from vertexai.generative_models import Image as VImage

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Never wrap printed NumPy arrays across several lines.
np.set_printoptions(linewidth=np.inf)

# Available models

links:
- https://cloud.google.com/vertex-ai/generative-ai
- https://aistudio.google.com/
- https://platform.openai.com/playground/
- https://console.anthropic.com/workbench
- https://huggingface.co/models

practical approach
- start with closed models for a POC, for ease of use and performance
- consider open source options if closed options don't perform on niche tasks or meet production requirements (cost, security etc)
- HuggingFace is a good place to start for open source models; however, some best-in-class open source models are not hosted on HuggingFace

By task and open/closed:

- image classification
    - open
        - TIMM (ViT, ResNet, ConvNext)
        - HuggingFaceVision
    - closed
        - gemini 2
        - openai
        - claude 3
- text to text
    - open
        - llama 3
    - closed
        - gemini 2
        - openai
- text+image to text
    - open
        - OpenCLIP
        - llama 3
    - closed
        - gemini 2
        - openai
        - claude 3
- image embeddings/similarity
    - open
        - TIMM (ViT, ResNet, ConvNext)
        - [OpenCLIP](https://github.com/mlfoundations/open_clip)
    - closed
        - gemini 2
        - openai
- image captioning
    - open
        - Salesforce (BLIP)
    - closed
        - gemini 2
        - openai
        - claude vision
- text embeddings
    - open
        - hugging face (sentence transformer, all-MiniLM-L6-v2)
    - closed
        - gemini 2
        - openai
- object detection/image segmentation
    - open
        - YOLO (You Only Look Once)
        - Meta AI (Detectron2, Segment Anything Model i.e. SAM)
        - MMDetection
    - closed
        - gemini 2

# Tasks

## text to text

### Gemini

In [None]:
# Configure the google.generativeai SDK; the lookup raises KeyError when
# GEMINI_API_KEY is not set in the environment.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Send a plain text prompt and print the text of the reply.
prompt: str = "What is a cocker spaniel?"
model = genai.GenerativeModel(model_name="models/gemini-2.0-flash-exp")
r = model.generate_content(contents=prompt)
print(r.text)

### OpenAI

In [None]:
# The key lookup raises KeyError when OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Two-message chat: an instruction message followed by the user's request.
completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "developer", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a haiku about recursion in programming."},
    ],
)
# Print the text of the first returned choice.
print(completion.choices[0].message.content)

Structured output

In [None]:
# No api_key is passed here — presumably the SDK picks it up from the
# environment; confirm against the OpenAI client documentation.
client = OpenAI()


# Target schema for the structured response.  Documented with comments rather
# than a docstring so that nothing extra ends up in the generated schema.
class CalendarEvent(BaseModel):
    name: str
    date: str
    participants: list[str]


# response_format=CalendarEvent asks the API to return data matching the model.
completion = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",
    messages=[
        {"role": "system", "content": "Extract the event information."},
        {
            "role": "user",
            "content": "Alice and Bob are going to a science fair on Friday.",
        },
    ],
    response_format=CalendarEvent,
)

# `.parsed` holds the first choice already converted by the SDK
# (presumably a CalendarEvent instance — confirm).
event = completion.choices[0].message.parsed
event

### Claude

In [None]:
# The key lookup raises KeyError when ANTHROPIC_API_KEY is not set.
client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
# Single-turn request; max_tokens caps the length of the reply.
message = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=1024,
    messages=[{"role": "user", "content": "Hello, Claude"}],
)
# The reply is a list of content blocks; print the text of the first one.
print(message.content[0].text)

### VertexAI

In [None]:
# init() is called without arguments — project and location presumably come
# from the ambient gcloud configuration; confirm for your environment.
vertexai.init()
model = GenerativeModel("gemini-1.5-flash-002")
# Open a chat session and send a single message through it.
chat_session = model.start_chat()
r = chat_session.send_message("What is a cocker spaniel?")
# Take the text of the first part of the first candidate.
text = r.candidates[0].content.parts[0].text
# Re-wrap the answer to 88-character lines for display.
print("\n".join(textwrap.wrap(text, 88)))

## text+image to text

### Gemini

In [None]:
# NOTE(review): this cell reads the picture from ~/Downloads while the other
# cells use ../../images/dogs.jpg — confirm which location is intended.
img = Image.open(Path.home() / "Downloads" / "dogs.jpg")
# Shrink in place to fit within 1024x1024 before uploading.
img.thumbnail((1024,) * 2)
prompt = "What is this a picture of?"
# system_instruction sets the persona used for the answer.
model = genai.GenerativeModel(
    model_name="models/gemini-2.0-flash-exp",
    system_instruction="Answer in the style of Arnold Schwarzenegger",
)
# Text prompt and image are sent together as one multi-part request.
r = model.generate_content([prompt, img])
print("\n".join(textwrap.wrap(r.text, width=88)))

In [None]:
# Closed set of breeds the model is allowed to answer with.
class Choice(enum.Enum):
    COCKER_SPANIEL = "Cocker Spaniel"
    LABRADOR = "Labrador"
    SPRINGER_SPANIEL = "Springer Spaniel"
    PUG = "Pug"
    GREAT_DANE = "Great Dane"


# Response schema for one dog.  `breed` is typed as `Choice` rather than a
# free-form `str` so the schema restricts the answer to the breeds listed in
# the enum.  Comments are used instead of docstrings so no extra description
# text ends up in the schema.
class Dog(TypedDict):
    breed: Choice


model = genai.GenerativeModel(
    model_name="models/gemini-2.0-flash-exp",
)
# Request JSON output shaped as a list of Dog objects.
config = genai.GenerationConfig(
    response_mime_type="application/json", response_schema=list[Dog]
)
# `img` is the picture loaded in the previous Gemini cell.
r = model.generate_content(
    ["What breed of dog is this?", img], generation_config=config
)
# The reply text is a JSON document; decode it before pretty-printing.
pprint(json.loads(r.text))

### llama

In [None]:
# url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
# images = Image.open(requests.get(url, stream=True).raw)
# images.thumbnail((800,)*2)

# import requests
# import torch
# from PIL import Image
# from transformers import MllamaForConditionalGeneration, AutoProcessor,
# BitsAndBytesConfig


# model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True,  # or load_in_4bit=True for 4-bit quantization
# )
# model = MllamaForConditionalGeneration.from_pretrained(
#     model_id,
#         quantization_config=quantization_config,
#     # torch_dtype=torch.bfloat16,
#     device_map="auto",
# )
# processor = AutoProcessor.from_pretrained(model_id)

# messages = [
#     {"role": "user", "content": [
#         {"type": "image"},
#      {"type": "text", "text": "If I had to write a haiku for this one, it would be: "}
#     ]}
# ]
# input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
# inputs = processor(
#     images,
#     input_text,
#     add_special_tokens=False,
#     return_tensors="pt"
# ).to(model.device)

# output = model.generate(**inputs, max_new_tokens=30)
# print(processor.decode(output[0]))


### OpenCLIP

text + image to embeddings

In [None]:
# Load the CLIP model together with its image preprocessing pipeline; the
# second return value is not needed here.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
model.eval()
tokenizer = open_clip.get_tokenizer("ViT-B-32")

img_path = "../../images/dogs.jpg"
images = Image.open(img_path)

# `images` is rebound from the PIL image to a tensor with a leading batch axis.
images = preprocess(images)[None,]
# Candidate captions the image is scored against.
text = tokenizer(["a diagram", "a dog", "a cat"])

with torch.no_grad(), torch.cuda.amp.autocast():
    emb = model.encode_image(images)
    embs = model.encode_text(text)
    # L2-normalise both sides so the matrix product below is cosine similarity.
    emb /= emb.norm(dim=-1, keepdim=True)
    embs /= embs.norm(dim=-1, keepdim=True)

    # The factor 1 is the logit scale.  Cosine similarities lie in [-1, 1], so
    # with this scale the softmax stays soft.
    # NOTE(review): the upstream open_clip example appears to use 100.0 —
    # confirm which scale is intended.
    text_probs = (1 * emb @ embs.T).softmax(dim=-1)

print("Label probs:", text_probs)  # one probability per caption, in the order above

## image embeddings


### Vertex AI

### ConvNext

In [None]:
def get_batch_embeddings(image_paths, batch_size=32):
    """Embed images in batches and return one L2-normalised vector per image.

    Relies on the notebook-level names ``model`` (the feature extractor) and
    ``transform`` (PIL image -> tensor preprocessing) being defined before the
    call.

    Args:
        image_paths: Sequence of paths readable by ``PIL.Image.open``.
        batch_size: Number of images pushed through ``model`` per forward pass.

    Returns:
        Tensor with one row per input path, in input order.  An empty
        ``image_paths`` yields an empty tensor instead of raising.
    """
    batches = []

    for start in range(0, len(image_paths), batch_size):
        batch_images = []
        for path in image_paths[start : start + batch_size]:
            # The context manager releases the file handle as soon as the
            # pixels have been converted.
            with Image.open(path) as image:
                batch_images.append(transform(image.convert("RGB")))

        with torch.no_grad():
            features = model(torch.stack(batch_images))
            # Flatten everything after the batch axis so each image is one
            # vector, then scale it to unit length.
            batches.append(F.normalize(features.flatten(1), dim=1, p=2))

    if not batches:
        # torch.cat / torch.stack reject an empty list of tensors.
        return torch.empty(0)
    return torch.cat(batches)


img_path = "../../images/dogs.jpg"
img_path_1 = "../../images/dogs_1.jpg"

model = convnext_base(weights=ConvNeXt_Base_Weights.DEFAULT)
# Keep every child module except the last one (presumably the classification
# head) so the network outputs features instead of class scores.
model = torch.nn.Sequential(*list(model.children())[:-1])
model.eval()

# Preprocessing applied to every image before it reaches the model.
# NOTE(review): mean/std look like the usual ImageNet statistics — confirm.
transform = transforms.Compose(
    [
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)
embedding = get_batch_embeddings([img_path, img_path_1])
# Rows are L2-normalised by get_batch_embeddings, so this dot product is the
# cosine similarity between the two images.
cosine_similarity = embedding[[0]] @ embedding[[1]].T

print(cosine_similarity)

### OpenCLIP

In [None]:
# Load the CLIP model together with its image preprocessing pipeline.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
model.eval()

img_path = "../../images/dogs.jpg"
img_path_1 = "../../images/dogs_1.jpg"

# Preprocess both pictures and stack them into one batch.
xs = [preprocess(Image.open(path)) for path in [img_path, img_path_1]]
xs = torch.stack(xs)

with torch.no_grad(), torch.cuda.amp.autocast():
    emb = model.encode_image(xs)
    # Unit-length rows make the dot product below a cosine similarity.
    emb /= emb.norm(dim=-1, keepdim=True)

# Compare the first image with the second one; indexing row 0 on both sides
# would compare an image with itself and always give ~1.
cosine_similarity = emb[[0]] @ emb[[1]].T
cosine_similarity

## image captioning


### HuggingFace (Salesforce)

In [None]:
# Hugging Face image-captioning pipeline backed by the BLIP large checkpoint.
pipe = pipeline(
    "image-to-text", model="Salesforce/blip-image-captioning-large", use_fast=True
)
# `img_path` is whatever an earlier cell last assigned.
images = Image.open(img_path)
r = pipe(images)

# The pipeline returns a list of results; show the caption of the first one.
print(r[0]["generated_text"])

### Gemini

In [None]:
# The key lookup raises KeyError when GEMINI_API_KEY is not set.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

img_path = "../../images/dogs.jpg"
img = Image.open(img_path)
# Shrink in place to fit within 1024x1024 before uploading.
img.thumbnail((1024,) * 2)

prompt = "Describe the contents of the image"
model = genai.GenerativeModel(
    model_name="models/gemini-2.0-flash-exp",
)
# Prompt and image are sent together as one multi-part request.
r = model.generate_content([prompt, img])
print("\n".join(textwrap.wrap(r.text, width=88)))

### OpenAI

In [None]:
image_path = "../../images/dogs.jpg"
# Read the raw file bytes and base64-encode them so they can be embedded in
# the request as a data URL.
with open(image_path, "rb") as image_file:
    base64_image = base64.b64encode(image_file.read()).decode("utf-8")

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# One user message carrying two content parts: the instruction and the image.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Describe the contents of the image",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                },
            ],
        }
    ],
)
content = response.choices[0].message.content
# Re-wrap the answer to 88-character lines for display.
print("\n".join(textwrap.wrap(content, 88)))

### Claude

In [None]:
image_path = "../../images/dogs.jpg"
# Read the raw file bytes and base64-encode them for the request payload.
with open(image_path, "rb") as image_file:
    base64_image = base64.b64encode(image_file.read()).decode("utf-8")

# No api_key is passed here — presumably the SDK reads it from the
# environment; confirm against the Anthropic client documentation.
client = anthropic.Anthropic()
# One user message carrying the image block followed by the instruction.
message = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": base64_image,
                    },
                },
                {"type": "text", "text": "Describe the contents of the image"},
            ],
        }
    ],
)
text = message.content[0].text
# Re-wrap the answer to 88-character lines for display.
print("\n".join(textwrap.wrap(text, 88)))

### Vertex AI

In [None]:
# Wrap the image file as a request part.
# NOTE(review): no vertexai.init() in this cell — it relies on the earlier
# Vertex AI cell having been run.
image = Part.from_image(VImage.load_from_file("../../images/dogs.jpg"))
model = GenerativeModel("gemini-1.5-flash-002")
r = model.generate_content([image, "Describe the contents of the image"])
print("\n".join(textwrap.wrap(r.text, 88)))

## text embeddings


In [None]:
# Two short sentences shared by the text-embedding cells that follow.
texts = ["Hello Mole!", "Hey Ted!!!"]

### HuggingFace (SentenceTransformer)

In [None]:
model = SentenceTransformer("all-MiniLM-L6-v2")
# Encode both sentences and wrap the result in a tensor for the matrix product.
embs = torch.tensor(model.encode(texts))

# Dot product between the two sentence vectors, plus the embedding shape.
# NOTE(review): this equals cosine similarity only if the model returns
# unit-length vectors — confirm.
(embs[[0]] @ embs[[1]].T).item(), embs.shape

In [None]:
model = SentenceTransformer("BAAI/bge-large-en")
# Encode both sentences and wrap the result in a tensor for the matrix product.
embs = torch.tensor(model.encode(texts))

# Dot product between the two sentence vectors, plus the embedding shape.
# NOTE(review): this equals cosine similarity only if the model returns
# unit-length vectors — confirm.
(embs[[0]] @ embs[[1]].T).item(), embs.shape

### OpenCLIP

In [None]:
# Only the text tower is used in this cell; `preprocess` is unused here.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
model.eval()
tokenizer = open_clip.get_tokenizer("ViT-B-32")
text = tokenizer(texts)

with torch.no_grad(), torch.cuda.amp.autocast():
    embs = model.encode_text(text)
    # Unit-length rows make the dot product below a cosine similarity.
    embs /= embs.norm(dim=-1, keepdim=True)

# Similarity between the two sentences, plus the embedding shape.
(embs[[0]] @ embs[[1]].T).item(), embs.shape

### Gemini

In [None]:
# The key lookup raises KeyError when GEMINI_API_KEY is not set.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Embed both sentences in one call; the vectors are read from r["embedding"].
r = genai.embed_content(model="models/text-embedding-004", content=texts)
embs = torch.tensor(r["embedding"])
# Dot product between the two vectors (cosine similarity only if the API
# returns unit-length vectors — confirm), plus the embedding shape.
(embs[[0]] @ embs[[1]].T).item(), embs.shape

### OpenAI

In [None]:
# The key lookup raises KeyError when OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Embed both sentences in one request, asking for plain float vectors.
r = client.embeddings.create(
    model="text-embedding-3-small", input=texts, encoding_format="float"
)
# One entry in r.data per input sentence.
embs = torch.tensor([emb.embedding for emb in r.data])
# Dot product between the two vectors (cosine similarity only if the API
# returns unit-length vectors — confirm), plus the embedding shape.
(embs[[0]] @ embs[[1]].T).item(), embs.shape

## text Classification

In [None]:
def pca_explained_var(pca: PCA, fmt: bool = True) -> pd.DataFrame:
    """Tabulate the variance explained by each component of a fitted PCA.

    Args:
        pca: Fitted estimator exposing ``explained_variance_`` and
            ``explained_variance_ratio_``.
        fmt: When true, render the two ratio columns as percentage strings
            with one decimal place; the ``var`` column is left untouched.

    Returns:
        Frame with the columns ``var``, ``var_ratio`` and ``var_ratio_cum``,
        one row per component.
    """
    ratio = pca.explained_variance_ratio_
    table = pd.DataFrame(
        {
            "var": pca.explained_variance_,
            "var_ratio": ratio,
            "var_ratio_cum": ratio.cumsum(),
        }
    )
    if not fmt:
        return table

    # The first column stays as is; the remaining ones become "12.3%" strings.
    raw_var = table.iloc[:, 0]
    pct_cols = table.iloc[:, 1:].map("{:.1%}".format)
    return pd.concat([raw_var, pct_cols], axis=1)


# def show_confusion_matrix(y_true, y_pred):
#     cm = confusion_matrix(y_true, y_pred)
#     disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=categories)
#     _, ax = plt.subplots(figsize=(7, 7))
#     disp.plot(ax=ax, xticks_rotation=90)
#     plt.show()


def shannon_entropy(probabilities):
    """Return the Shannon entropy, in bits, of the given probabilities.

    Entries that are not strictly positive are skipped, so zero-probability
    outcomes contribute nothing.
    """
    terms = [p * math.log2(p) for p in probabilities if p > 0]
    return -sum(terms)


def entropy(labels):
    """Return the Shannon entropy of the empirical distribution of ``labels``.

    Each distinct label's probability is its count divided by ``len(labels)``.
    """
    frequencies = Counter(labels).values()
    n_items = len(labels)
    return shannon_entropy([freq / n_items for freq in frequencies])


def feature_importance(names, importances) -> pd.DataFrame:
    """Rank features by importance score.

    Args:
        names: Feature names.
        importances: One score per feature, aligned with ``names``.

    Returns:
        Frame sorted by descending ``score`` with the columns ``feature``,
        ``score`` and ``pct`` (each score divided by the sum of all scores).
    """
    table = pd.DataFrame({"feature": names, "score": importances})
    ranked = table.sort_values(by="score", ascending=False)
    ranked["pct"] = ranked["score"] / ranked["score"].sum()
    return ranked

### load data

In [None]:
root_dir = Path.cwd().parent
# !kaggle datasets download -d deepshah16/song-lyrics-dataset
# !unzip song-lyrics-dataset.zip
# Sort the files: iterdir() order depends on the OS, and the row order feeds
# the seeded train/test splits further down.
paths = sorted((root_dir / "tmp" / "csv").iterdir())
# Concatenate all CSVs and drop the first column of each file.
df = pd.concat(map(pd.read_csv, paths)).iloc[:, 1:]
df.columns = df.columns.str.lower()
# Keep rows that have lyrics; .copy() so the column assignments below write
# to an independent frame rather than a filtered selection.
df = df[df.lyric.notna()].copy()

# Tokenise once on whitespace and reuse the result for every word-level feature.
words = df.lyric.str.split()
df["n_char"] = df.lyric.str.len()
df["n_word"] = words.str.len()
df["n_unique_word"] = words.apply(set).str.len()
df["word_entropy"] = words.apply(entropy)
df["avg_word_len"] = words.apply(lambda x: np.mean([len(i) for i in x]))

# Integer label per artist; `categories` maps each code back to the name.
artist = df["artist"].astype("category")
df["label"] = artist.cat.codes
categories = artist.cat.categories.tolist()
df.head(5)

### sentence transformers

In [None]:
# 80/20 split of raw lyric strings and integer artist labels, seeded for
# repeatability.
x_train, x_valid, y_train, y_valid = train_test_split(
    df.lyric.tolist(), df.label, test_size=0.2, random_state=42
)
st = SentenceTransformer("all-MiniLM-L6-v2")
# Wrap the encoder so it can be used as a scikit-learn transformer step.
st_transform = FunctionTransformer(st.encode)

# Pipeline: sentence embeddings -> gradient-boosted classifier.
pipe = make_pipeline(
    st_transform,
    lightgbm.LGBMClassifier(verbose=-100),
)
pipe.fit(x_train, y_train)
yhat = pipe.predict(x_valid)
# Accuracy and micro-averaged F1 on the held-out 20%.
print(accuracy_score(y_valid, yhat), f1_score(y_valid, yhat, average="micro"))
# show_confusion_matrix(y_valid, yhat)

### tfidf

In [None]:
# Hand-crafted numeric columns computed in the data-loading cell.
manual_features = [
    "n_char",
    "n_word",
    "n_unique_word",
    "word_entropy",
    "avg_word_len",
]

# Split the whole frame so both the text column and the numeric columns are
# available to the pipeline; seeded for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    df, df.label, test_size=0.2, random_state=42
)

# Two feature branches concatenated side by side: tf-idf over the "lyric"
# column, and the manual numeric columns passed through unchanged.
feature_union = FeatureUnion(
    [
        (
            "tfidf",
            make_pipeline(
                FunctionTransformer(lambda x: x["lyric"], validate=False),
                TfidfVectorizer(),
            ),
        ),
        ("manual", FunctionTransformer(lambda x: x[manual_features], validate=False)),
    ]
)
pipe = make_pipeline(feature_union, lightgbm.LGBMClassifier(verbose=-100, n_jobs=-1))
pipe.fit(x_train, y_train)
yhat = pipe.predict(x_test)

accuracy = accuracy_score(y_test, yhat)
f1 = f1_score(y_test, yhat, average="micro")

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
# show_confusion_matrix(y_valid, yhat)

In [None]:
# Pull the fitted classifier and the tf-idf vocabulary out of the pipeline.
m = pipe.named_steps["lgbmclassifier"]
vec_columns = (
    pipe.named_steps["featureunion"]
    .transformer_list[0][1]
    .named_steps["tfidfvectorizer"]
    .get_feature_names_out()
)
# Same order as the FeatureUnion: tf-idf terms first, then the manual columns.
features = vec_columns.tolist() + manual_features
fi = feature_importance(features, m.feature_importances_)
fig, ax = plt.subplots(figsize=(13, 5.5))
# Top 40 features, reversed before plotting (presumably so the highest score
# ends up at the top of the horizontal bar chart).
fi.head(40).iloc[::-1].plot(kind="barh", x="feature", y="pct", ax=ax)
ax.tick_params(labelsize=8)
plt.show()

### fasttext

In [None]:
x = df["lyric"]
# Labels are cast to strings because they get embedded in the training file.
y = df["label"].astype(str)
# 80/20 split, seeded for repeatability.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.2, random_state=42
)


def prepare_fasttext_data(texts, labels, file_path):
    with open(file_path, "w", encoding="utf-8") as f:
        for text, label in zip(texts, labels, strict=False):
            f.write(f"__label__{label} {text}\n")


# delete=False keeps the file on disk after the with-block so it can be read
# by path below; it is removed by hand with os.unlink at the end of the cell.
# The handle itself is never written to — prepare_fasttext_data reopens the
# file by name.
with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".txt") as temp_train:
    prepare_fasttext_data(x_train, y_train, temp_train.name)
    train_file = temp_train.name

# NOTE: this rebinds the notebook-level `model` to the fastText classifier.
model = fasttext.train_supervised(input=train_file, lr=1.0, epoch=25, wordNgrams=2)


def predict(model, texts):
    """Return the top predicted label for every text, without its prefix.

    Args:
        model: Object whose ``predict(text)`` returns a
            ``(labels, probabilities)`` pair, e.g. a trained fastText model.
        texts: Iterable of texts to classify.

    Returns:
        List of label strings in input order, with the leading ``__label__``
        marker removed.
    """
    # fastText's predict() rejects text containing "\n", so whitespace runs
    # are collapsed to single spaces before each call.
    return [
        model.predict(" ".join(str(text).split()))[0][0].removeprefix("__label__")
        for text in texts
    ]


y_pred = predict(model, x_valid)
# Both sides are compared as strings (the labels were already cast to str
# before the split, so these two conversions are defensive).
y_valid = y_valid.astype(str)
y_pred = [str(pred) for pred in y_pred]

accuracy = accuracy_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred, average="micro")

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

# Remove the temporary training file created with delete=False.
# NOTE(review): not reached if anything above raises — the file is then left behind.
os.unlink(train_file)

## object detection/image segmentation


### Gemini

In [None]:
# Build the Gemini model and load the picture inside this cell.  When the
# notebook runs top to bottom, `model` has been rebound to the fastText
# classifier by this point and `images` is whatever an earlier cell left behind.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = genai.GenerativeModel(model_name="models/gemini-2.0-flash-exp")
images = Image.open("../../images/dogs.jpg")

# Ask for one [ymin, xmin, ymax, xmax] box per object, returned as JSON.
# NOTE(review): Gemini is documented to return box coordinates on a
# normalised scale rather than in pixels — confirm before drawing them.
r = model.generate_content(
    [
        (
            "Return a bounding box for each of the objects in this image in "
            "[ymin, xmin, ymax, xmax] format."
        ),
        images,
    ],
    generation_config=genai.GenerationConfig(
        response_mime_type="application/json", response_schema=list[list[int]]
    ),
)
json.loads(r.text)

### YOLO (ultralytics)

In [None]:
yolo = YOLO("yolo11n.pt")
img_path = "../../images/dogs.jpg"
images = Image.open(img_path)
# Downscale the in-memory copy only, to fit within 512x512.
images.thumbnail((512,) * 2)
# Run detection on the PIL image itself.  Saving the thumbnail back to
# img_path would permanently shrink and recompress the source file that the
# other cells read.  Box coordinates refer to the resized `images`.
r = yolo(images)
r[0].show()
images

In [None]:
# (class name, confidence) for every detected box: column 5 of each row is
# the class id and column 4 the confidence.
[(yolo.names[i[5]], i[4]) for i in r[0].boxes.data.tolist()]

In [None]:
# Crop every detected box out of `images`.  Each row of boxes.data starts
# with the corner coordinates x1, y1, x2, y2 (followed by confidence and
# class); the coordinates are truncated to integers for cropping.
crops = [
    images.crop(tuple(int(v) for v in box[:4]))
    for box in r[0].boxes.data.tolist()
]
# Display the last crop; None when nothing was detected, where the plain loop
# would have left `cropped` undefined.
cropped = crops[-1] if crops else None
cropped

## text to image

In [None]:
# No api_key is passed here — presumably the SDK reads it from the
# environment; confirm against the OpenAI client documentation.
client = OpenAI()

# Generate a single 1024x1024 image from the text prompt.
response = client.images.generate(
    model="dall-e-3",
    prompt="A chocolate brown short-haired cocker spaniel",
    size="1024x1024",
    quality="standard",
    n=1,
)

# The response carries a URL to the generated image rather than the bytes.
print(response.data[0].url)

# Evaluation

## Text Generation

### Perplexity

In [None]:
# probs: 10 rows, each a random probability distribution over 5 entries
# (one row per prediction, one column per vocabulary item).
# y: for each row, the index of its most probable entry.
probs = np.random.dirichlet(np.ones(5), 10)
y = probs.argmax(1)
probs, y

In [None]:
# complexity if all predictions are correct
# exp of the mean negative log-probability assigned to the target entry of
# each row (the quantity usually called perplexity); here the target is the
# arg-max, so this is the lowest value these probabilities can give.
complexity = np.exp(-np.log(probs[range(len(probs)), y]).mean())
complexity

In [None]:
# complexity if actual labels are random
# Draw one random target per row, bounded by the number of columns in probs.
# A single scalar from randint(10) would pick the same column for every row
# and fall outside the 5-column axis about half the time.
y_rand = np.random.randint(probs.shape[1], size=len(probs))
complexity = np.exp(-np.log(probs[range(len(probs)), y_rand]).mean())
complexity

# Fine-tuning

## Parameter efficient methods

### LoRA

In [None]:
# Toy low-rank decomposition: a 10x10 weight matrix and two thin factors
# whose product has the same shape.
rank = 1
weights = np.random.random((10, 10))
a = np.random.random((10, rank))
b = np.random.random((rank, 10))
# (10, rank) @ (rank, 10) -> (10, 10), matching the full weight matrix.
assert (a @ b).shape == weights.shape
weights, a, b

In [None]:
# total parameters, decomposed parameters
# .size counts every element.  len() only returns the first dimension, so
# len(a) + len(b) would report 10 + rank instead of the 2 * 10 * rank values
# actually stored in the two factors.
weights.size, a.size + b.size

In [None]:
# Add the scaled low-rank product to the original weights; `weights` itself
# is left unchanged.
scaling = 0.1
weights_new = weights + (a @ b) * scaling

### P-tuning

In [None]:
# Toy vocabulary: each word maps to a 4-dimensional one-hot vector.
vocab = {
    "cat": np.array([1, 0, 0, 0]),
    "dog": np.array([0, 1, 0, 0]),
    "is": np.array([0, 0, 1, 0]),
    "black": np.array([0, 0, 0, 1]),
}
sentence = ["cat", "is", "black"]
# One row per word of the sentence, in order.
input_embeddings = np.array([vocab[word] for word in sentence])
input_embeddings

In [None]:
# Two extra prompt rows, randomly initialised with the same width (4) as the
# word vectors.
num_prompt_tokens = 2
prompt_embeddings = np.random.randn(num_prompt_tokens, 4)
# Prompt rows go first, followed by the sentence's own rows.
combined = np.vstack([prompt_embeddings, input_embeddings])
combined

In [None]:
# 2**32 expressed in millions.
2**32 / 1_000_000

In [None]:
# 2 raised to the 8th power.
2**8

In [None]:
# forward pass and update the virtual tokens