In [2]:
from datasets import load_dataset
import os
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
ds = load_dataset("stable-bias/professions")
train = ds["train"].to_pandas()

In [12]:
# Save the first 300 images for testing
os.makedirs("imgs", exist_ok=True)
for i, img in enumerate(train["image"].loc[:300]):
    with open(f"imgs/{i}.png", "wb") as f:
        f.write(img['bytes'])

In [23]:
answers = pd.DataFrame({
    "image": [f"imgs/{i}.png" for i in range(200)]
})
answers.loc[:37, "gender"] = "male"
answers.loc[37, "gender"] = "female"
answers.loc[38:71, "gender"] = "male"
answers.loc[72, "gender"] = "female"
answers.loc[73:74, "gender"] = "male"
answers.loc[75, "gender"] = "female"
answers.loc[76, "gender"] = "male"
answers.loc[77, "gender"] = "female"
answers.loc[78:79, "gender"] = "male"
answers.loc[80, "gender"] = "female"
answers.loc[81, "gender"] = "male"
answers.loc[82, "gender"] = "female"
answers.loc[83:88, "gender"] = "male"
answers.loc[89, "gender"] = "female"
answers.loc[90:92, "gender"] = "male"
answers.loc[93:95, "gender"] = "female"
answers.loc[96, "gender"] = "male"
answers.loc[97, "gender"] = "female"
answers.loc[98:100, "gender"] = "male"
answers.loc[101, "gender"] = "female"
answers.loc[102:107, "gender"] = "male"
answers.loc[108, "gender"] = "female"
answers.loc[109:111, "gender"] = "male"
answers.loc[112, "gender"] = "female"
answers.loc[114:117, "gender"] = "male"
answers.loc[118, "gender"] = "female"
answers.loc[119:129, "gender"] = "male"
answers.loc[130:139, "gender"] = "female"
answers.loc[140:159, "gender"] = "male"
answers.loc[160:173, "gender"] = "female"
answers.loc[174, "gender"] = "male"
answers.loc[175:177, "gender"] = "female"
answers.loc[178:,"gender"] = "male"


In [24]:
answers.to_csv("answers.csv", index=False)

In [13]:
# Obtain the prompt used

prompts = []
for path in train['image_path']:
  prompt = path.split('/')[0].replace('_', ' ')
  if prompt not in prompts:
    prompts.append(prompt)

In [20]:
pd.DataFrame(prompts).to_csv("prompts.csv", index=False, header=False)

# Evaluate BLIP VQA

In [29]:
from transformers import AutoProcessor, BlipForQuestionAnswering
from PIL import Image

## Gender

In [46]:
processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

question = "What is the gender of the person in the image?"

male


In [57]:
for i in range(200):
    img = Image.open(f"imgs/{i}.png")
    inputs = processor(images=img, text=question, return_tensors="pt")
    outputs = model.generate(**inputs)
    answer = processor.decode(outputs[0], skip_special_tokens=True)
    answers.loc[i, "prediction"] = answer



In [70]:
answers.to_csv("answers.csv", index=False)

In [60]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
answers['gender'] = le.fit_transform(answers['gender'])
answers['prediction'] = le.transform(answers['prediction'])

accuracy_score(answers['gender'], answers['prediction'])

0.94

## Ethnicity

In [65]:
question = "What is the race of the person in the image?"
img = Image.open(f"imgs/164.png")
inputs = processor(images=img, text=question, return_tensors="pt")
outputs = model.generate(**inputs)
answer = processor.decode(outputs[0], skip_special_tokens=True)
print(answer)

white


# Evaluate ViT GPT2

In [66]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch

In [67]:
model = VisionEncoderDecoderModel.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning"
)
feature_extractor = ViTImageProcessor.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning"
)
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "mps")
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [68]:
image = Image.open("imgs/0.png")
if image.mode != "RGB":
    image = image.convert("RGB")

features = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
outputs = model.generate(features)
preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


['a man in a suit and tie smiling ']