In [1]:
# Import necessary libraries
from typing import List

import pandas as pd
from pydantic import BaseModel

### Create a DataFrame

We define a list of English entities and create a Pandas DataFrame from this list.

In [2]:
# English fruit names that serve as the input column for every example below
fruits: List[str] = [
    "apple",
    "banana",
    "orange",
    "grape",
    "kiwi",
    "mango",
    "peach",
    "pear",
    "pineapple",
    "strawberry",
]
# Single-column frame; the column is called "name"
fruits_df = pd.DataFrame(fruits, columns=["name"])
fruits_df

Unnamed: 0,name
0,apple
1,banana
2,orange
3,grape
4,kiwi
5,mango
6,peach
7,pear
8,pineapple
9,strawberry


# import openaivec.pandas_ext

This example demonstrates how to integrate the `openaivec.pandas_ext` module with Pandas for text translation tasks. Follow the examples below for single and multi-language translations.

If the environment variable `OPENAI_API_KEY` is set, `pandas_ext` automatically uses an `openai.OpenAI` client.

If the environment variables `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_ENDPOINT` and `AZURE_OPENAI_API_VERSION` are set, `pandas_ext` automatically uses an `openai.AzureOpenAI` client.

If you need to use a specific instance of `openai.OpenAI`, register it with `pandas_ext.use`.

In [3]:
import openai

from openaivec import pandas_ext

# Register an explicit OpenAI client with pandas_ext. This is only needed when
# a specific client instance must be used.
# NOTE(review): openai.OpenAI() with no arguments presumably picks up its
# credentials from the environment (OPENAI_API_KEY) — confirm.
pandas_ext.use(openai.OpenAI())

# Choose the model names for responses and for embeddings
# (presumably consumed by `.ai.response` and `.ai.embed` respectively — confirm)
pandas_ext.responses_model("gpt-4o-mini")
pandas_ext.embedding_model("text-embedding-3-small")

# Process the columns with OpenAI
Once `pandas_ext` is loaded, a series can be processed simply by calling the accessor method `pd.Series.ai.response`.

In [4]:
# Translate every fruit name to French. The result is bound to `s` as a
# separate Series; it is not added to fruits_df as a column.
s: pd.Series = fruits_df["name"].ai.response("translate to French")

s

0     pomme
1    banane
2    orange
3    raisin
4      kiwi
5    mangue
6     pêche
7     poire
8    ananas
9    fraise
Name: name, dtype: object

Embeddings are also available through the `pd.Series.ai.embed` method.

In [5]:
# Compute one embedding vector per fruit name
e: pd.Series = fruits_df["name"].ai.embed()

e

0    [0.017667118, -0.016819485, -0.041800473, 0.01...
1    [0.013411593, -0.020545648, -0.033350088, -0.0...
2    [-0.025922043, -0.0055465647, -0.006110964, 0....
3    [-0.038692072, 0.009548252, -0.020608373, -0.0...
4    [-0.0057260627, -0.021419084, -0.02602446, 0.0...
5    [0.0554931, -0.008836438, -0.01999861, -0.0065...
6    [0.030673496, -0.041959558, -0.013912023, 0.03...
7    [0.023664422, -0.022354774, -0.008752595, 0.03...
8    [0.020983547, -0.060567692, -0.002925918, 0.02...
9    [0.02012112, -0.014351289, -0.040668648, -0.02...
Name: name, dtype: object

Count tokens with `pd.Series.ai.count_tokens`

In [6]:
# Count the tokens in each fruit name
num_tokens: pd.Series = fruits_df["name"].ai.count_tokens()

num_tokens

0    1
1    1
2    1
3    2
4    2
5    2
6    2
7    1
8    2
9    3
Name: num_tokens, dtype: int64

# Structured Output with pandas_ext

Structured output is also available through `pd.Series.ai.response`: pass a Pydantic model as `response_format`.

In [7]:
# Structured-output model for translations (uses Pydantic): one string field
# per target language, each named by its two-letter language code.
# NOTE(review): documented with comments rather than a class docstring on
# purpose — pydantic includes a model's docstring in its generated JSON schema,
# so adding one could change the schema passed as `response_format`.
class Translation(BaseModel):
    en: str  # English
    fr: str  # French
    ja: str  # Japanese
    es: str  # Spanish
    de: str  # German
    it: str  # Italian
    pt: str  # Portuguese
    ru: str  # Russian

# Request all languages in one call; with the pydantic class given as
# `response_format`, each element of the result is a Translation instance
translations: pd.Series = fruits_df["name"].ai.response(
    response_format=Translation,
    instructions="translate to multiple languages",
)

translations

0    en='apple' fr='pomme' ja='リンゴ' es='manzana' de...
1    en='banana' fr='banane' ja='バナナ' es='plátano' ...
2    en='orange' fr='orange' ja='オレンジ' es='naranja'...
3    en='grape' fr='raisin' ja='ぶどう' es='uva' de='T...
4    en='kiwi' fr='kiwi' ja='キウイ' es='kiwi' de='Kiw...
5    en='mango' fr='mangue' ja='マンゴー' es='mango' de...
6    en='peach' fr='pêche' ja='桃' es='durazno' de='...
7    en='pear' fr='poire' ja='梨' es='pera' de='Birn...
8    en='pineapple' fr='ananas' ja='パイナップル' es='piñ...
9    en='strawberry' fr='fraise' ja='いちご' es='fresa...
Name: name, dtype: object

The values of this `pd.Series` are instances of `pydantic.BaseModel`.

The `pd.Series.ai.extract` method expands each element into the columns of a `pd.DataFrame`.

In [8]:
# Expand each Translation object into one column per field; the resulting
# columns are prefixed with the Series name ("name_en", "name_fr", ...)
translations.ai.extract()

Unnamed: 0,name_en,name_fr,name_ja,name_es,name_de,name_it,name_pt,name_ru
0,apple,pomme,リンゴ,manzana,Apfel,mela,maçã,яблоко
1,banana,banane,バナナ,plátano,Banane,banana,banana,банан
2,orange,orange,オレンジ,naranja,Orange,arancia,laranja,апельсин
3,grape,raisin,ぶどう,uva,Traube,uva,uva,виноград
4,kiwi,kiwi,キウイ,kiwi,Kiwi,kiwi,kiwi,киви
5,mango,mangue,マンゴー,mango,Mango,mango,manga,манго
6,peach,pêche,桃,durazno,Pfirsich,pesca,pêssego,персик
7,pear,poire,梨,pera,Birne,pera,pera,груша
8,pineapple,ananas,パイナップル,piña,Ananas,ananas,abacaxi,ананас
9,strawberry,fraise,いちご,fresa,Erdbeere,fragola,morango,клубника


# Example of Data Enrichment of fruit table

These interfaces can be seamlessly integrated with the `pd.DataFrame` APIs.

Let's enrich your data with the power of LLMs!

In [9]:
(
    fruits_df
    # Add three model-derived columns computed from the "name" column
    .assign(
        # Colour of each fruit, as answered by the OpenAI model
        color=lambda frame: frame["name"].ai.response("Return the color of given fruit"),
        # Embedding vector of each fruit name
        embedding=lambda frame: frame["name"].ai.embed(),
        # Multilingual translation as structured output (pydantic.BaseModel)
        translation=lambda frame: frame["name"].ai.response(
            instructions="translate to multiple languages",
            response_format=Translation,
        ),
    )
    # Flatten the structured "translation" column into one column per field
    .ai.extract(column="translation")
)

Unnamed: 0,name,color,embedding,translation_en,translation_fr,translation_ja,translation_es,translation_de,translation_it,translation_pt,translation_ru
0,apple,red,"[0.01764064, -0.016817328, -0.041843545, 0.019...",apple,pomme,リンゴ,manzana,Apfel,mela,maçã,яблоко
1,banana,yellow,"[0.013411593, -0.020545648, -0.033350088, -0.0...",banana,banane,バナナ,plátano,Banane,banana,banana,банан
2,orange,orange,"[-0.025922043, -0.0055465647, -0.006110964, 0....",orange,orange,オレンジ,naranja,Orange,arancia,laranja,апельсин
3,grape,purple,"[-0.038692072, 0.009548252, -0.020608373, -0.0...",grape,raisin,ぶどう,uva,Traube,uva,uva,виноград
4,kiwi,green,"[-0.0057398607, -0.021460608, -0.026025245, 0....",kiwi,kiwi,キウイ,kiwi,Kiwi,kiwi,kiwi,киви
5,mango,yellow/orange,"[0.055455774, -0.008839109, -0.019977605, -0.0...",mango,mangue,マンゴー,mango,Mango,mango,manga,манго
6,peach,pink/white,"[0.030673496, -0.041959558, -0.013912023, 0.03...",peach,pêche,桃,durazno,Pfirsich,pesca,pêssego,персик
7,pear,green,"[0.023664422, -0.022354774, -0.008752595, 0.03...",pear,poire,梨,pera,Birne,pera,pera,груша
8,pineapple,brown/yellow,"[0.020983547, -0.060567692, -0.002925918, 0.02...",pineapple,ananas,パイナップル,piña,Ananas,ananas,abacaxi,ананас
9,strawberry,red,"[0.020106195, -0.014350146, -0.040745355, -0.0...",strawberry,fraise,いちご,fresa,Erdbeere,fragola,morango,клубника
