# Step 1 - Install the required dependencies and make sure the Python version is 3.10 or above

In [1]:
!pip install zeno-client
!pip install --upgrade pip
!pip install --upgrade bottleneck
!pip install langdetect


Collecting zeno-client
  Downloading zeno_client-0.1.16-py3-none-any.whl.metadata (2.1 kB)
Collecting arrow-json<0.10.0,>=0.9.0 (from zeno-client)
  Downloading arrow_json-0.9.0-cp37-abi3-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl.metadata (1.5 kB)
Collecting outdated>=0.2.0 (from zeno-client)
  Downloading outdated-0.2.2-py2.py3-none-any.whl (7.5 kB)
Collecting pandas>=1.4.3 (from zeno-client)
  Downloading pandas-2.2.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting pyarrow>=6.0.0 (from zeno-client)
  Downloading pyarrow-15.0.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.0 kB)
Collecting pydantic>=1.0 (from zeno-client)
  Downloading pydantic-2.6.1-py3-none-any.whl.metadata (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.5/83.5 kB[0m [31m939.2 kB/s[0m eta [36m0:00:00[0m0:01[0m00:01[0m
[?25hCollecting requests>=2.20.0 (from zeno-client)
  Using cached requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting tq

In [2]:
!python --version

Python 3.12.1


In [1]:
import os
from getpass import getpass

import pandas as pd
from zeno_client import ZenoClient, ZenoMetric

# Never embed the API key in the notebook: anything committed or shared with
# the notebook is readable by everyone who receives it. The key is taken from
# the ZENO_API_KEY environment variable, with an interactive prompt as a
# fallback so the notebook still runs when the variable is not set.
# NOTE(review): a key was previously hardcoded in this cell; treat it as
# leaked and revoke/rotate it.
ZENO_API_KEY = os.environ.get("ZENO_API_KEY") or getpass("Zeno API key: ")

# Initialize a client with our API key.
client = ZenoClient(ZENO_API_KEY)


  from .autonotebook import tqdm as notebook_tqdm


# Step 2 - Create a project

In [4]:
# Metrics registered with the project. Both are of type "mean":
#   - "accuracy" is computed over the `correct` column,
#   - "avg length ratio" is computed over the `avg_length_ratio` column.
project_metrics = [
    ZenoMetric(name="accuracy", type="mean", columns=["correct"]),
    ZenoMetric(name="avg length ratio", type="mean", columns=["avg_length_ratio"]),
]

# Create the text-classification project on Zeno using the client built earlier.
project = client.create_project(
    name="Biased Movies Analysis",
    view="text-classification",
    metrics=project_metrics,
)

Successfully created project.
Access your project at  https://hub.zenoml.com/project/5b34bdf9-f4f6-4df0-bb1a-0180ac01acbe/Biased%20Movies%20Analysis


# Step 3 - Create dataset

In [8]:
import pandas as pd
from langdetect import detect
from collections import Counter
import random

# Sample language detection function
def detect_language(text):
    """Return the result of langdetect's ``detect(text)``, or "Unknown" on failure.

    Args:
        text: The text whose language should be detected.

    Returns:
        Whatever ``detect(text)`` returns, or the string "Unknown" when the
        detection raises an exception.
    """
    try:
        return detect(text)
    except Exception:
        # Best-effort fallback. `Exception` (instead of a bare `except:`) keeps
        # KeyboardInterrupt and SystemExit propagating, so interrupting the
        # kernel is not silently turned into an "Unknown" label.
        return "Unknown"

# Reproducibility: without fixed seeds the `year` column (and possibly the
# detected languages) change on every run of this cell.
from langdetect import DetectorFactory

SEED = 42
# Private generator so the global `random` state is left untouched.
year_rng = random.Random(SEED)
# langdetect is non-deterministic on short or ambiguous text unless seeded.
DetectorFactory.seed = 0

# Create a Pandas DataFrame for the biased dataset of movies
df = pd.DataFrame(
    {
        "id": range(1, 11),  # Using only 10 instances for demonstration
        "text": [
            "I love this movie!",
            "¡Odio esta película!",
            "Ce film est ok.",
            "The acting was great!",
            "The plot was confusing.",
            "I really enjoyed it!",
            "Worst movie ever!",
            "Not bad, but could be better.",
            "A masterpiece!",
            "I didn't like it.",
        ],
        "genre": ["action", "drama", "comedy", "action", "drama", "action", "drama", "comedy", "action", "drama"],
        # Seeded random year per row (same values on every run).
        "year": [year_rng.choice([2020, 2021, 2022]) for _ in range(10)],
        "label": ["positive", "negative", "neutral", "positive", "negative", "positive", "negative", "neutral", "positive", "negative"],
    }
)

# Introduce biases in genre
biased_genre = ["action", "action", "drama", "drama", "comedy", "comedy", "comedy", "comedy", "drama", "drama"]
df["biased_genre"] = biased_genre

# Introduce biases in year
biased_year = [2020, 2020, 2020, 2021, 2021, 2021, 2022, 2022, 2022, 2022]
df["biased_year"] = biased_year

# Add language detection to create a new column
df["detected_language"] = df["text"].apply(detect_language)

# Add additional columns for analysis
df["input_length"] = df["text"].str.len()

# Length of each text relative to the mean text length of the dataset;
# this is the column the "avg length ratio" project metric refers to.
df["avg_length_ratio"] = df["input_length"] / (df["input_length"].mean())



# Step 4 - Upload the Base Dataset


In [9]:

# Upload the base dataset: `id` identifies each row, `text` is the data shown
# for the instance, and `label` is the ground-truth label.
project.upload_dataset(
    df,
    id_column="id",
    data_column="text",
    label_column="label",
)



  df.loc[:, id_column] = df[id_column].astype(str)
100%|██████████| 1/1 [00:00<00:00,  1.65it/s]

Successfully uploaded data





# Step 5 - Prepare the AI System Outputs


In [11]:
# Hypothetical predictions standing in for a real system, one per dataset row.
system_outputs = [
    "positive", "negative", "negative", "positive", "negative",
    "positive", "negative", "neutral", "positive", "negative",
]

# Build the system frame in one pass:
#   - `id` is the 0-based row index shifted by one, matching the ids of the
#     base dataset;
#   - `correct` is a 0/1 flag, 1 when the prediction equals the label at the
#     same row index of `df`. It is the column the "accuracy" metric averages.
df_system = pd.DataFrame({"output": system_outputs}).assign(
    id=lambda frame: frame.index + 1,
    correct=lambda frame: (frame["output"] == df["label"]).astype(int),
)



# Step 6 - Upload the system outputs


In [12]:
# Upload the predictions as "System A"; rows are matched to the dataset via `id`.
project.upload_system(
    df_system,
    name="System A",
    id_column="id",
    output_column="output",
)

  df.loc[:, id_column] = df[id_column].astype(str)
100%|██████████| 1/1 [00:00<00:00,  1.80it/s]

Successfully uploaded system



