In [1]:

# Use the %pip magic (not a `!pip` shell call) so the packages are installed
# into the environment of the kernel that is actually running this notebook.
%pip install -q --upgrade transformers torch

from transformers import pipeline

# Question-answering pipeline shared by the helper functions defined below.
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

def ask(context: str, question: str):
    """Answer a single question from a single passage.

    Args:
        context: Passage handed to the pipeline as the text to search.
        question: Question handed to the pipeline.

    Returns:
        Tuple ``(answer, score)`` read from the pipeline's result dict.

    Raises:
        TypeError: If either argument is not a ``str``.
    """
    for value in (context, question):
        if not isinstance(value, str):
            raise TypeError("Both context and question must be strings.")
    prediction = qa_model(question=question, context=context)
    return prediction["answer"], prediction["score"]

def ask_chunked(context: str, question: str, chunk_size_chars: int = 2500, overlap: int = 200):
    """Answer a question over a long passage by scanning overlapping windows.

    The context is cut into character windows of at most ``chunk_size_chars``
    characters; consecutive windows share ``overlap`` characters. The pipeline
    is run on each window and the highest-scoring result is kept.

    Args:
        context: Full passage to search.
        question: Question passed to the pipeline for every window.
        chunk_size_chars: Maximum window length in characters (must be > 0).
        overlap: Characters shared by consecutive windows
            (must satisfy ``0 <= overlap < chunk_size_chars``).

    Returns:
        Tuple ``(answer, score)`` of the best window, or ``(None, -1.0)`` when
        ``context`` is empty.

    Raises:
        ValueError: If ``chunk_size_chars`` or ``overlap`` would prevent the
            window from advancing through the text.
    """
    if chunk_size_chars <= 0:
        raise ValueError("chunk_size_chars must be a positive integer.")
    if not 0 <= overlap < chunk_size_chars:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size_chars.")

    best = {"answer": None, "score": -1.0}
    total = len(context)
    # Each window starts `step` characters after the previous one; validation
    # above guarantees step >= 1, so the scan always advances.
    step = chunk_size_chars - overlap
    for start in range(0, total, step):
        end = min(total, start + chunk_size_chars)
        out = qa_model(question=question, context=context[start:end])
        if out["score"] > best["score"]:
            best = {"answer": out["answer"], "score": out["score"]}
        # Stop once a window reaches the end of the text. Stepping back by
        # `overlap` from the final position would re-scan the same tail forever.
        if end >= total:
            break
    return best["answer"], best["score"]


# Passage the model answers from.
context = """Nigerian Pidgin, also known simply as Pidgin or as Naijá in scholarship,
is an English-based creole language spoken as a lingua franca across Nigeria.
The language is sometimes referred to as Pijin or Vernacular. Coming into existence
during the 17th and 18th centuries as a result of contact between Britons and Africans
involved in the Atlantic slave trade, in the 2010s, a common orthography was developed
for Pidgin which has been gaining significant popularity in giving the language a harmonized writing system.
"""  # Extracted from Wikipedia


# Questions posed against the passage above.
questions = [
    "What is Nigerian Pidgin known as?",
    "When did Nigerian Pidgin come into existence?"
]


# Print each question with the extracted answer and its rounded confidence.
for q in questions:
    answer, score = ask(context, q)
    print(
        f"Q: {q}",
        f"Answer: {answer}",
        f"Confidence: {round(score, 3)}",
        "-" * 50,
        sep="\n",
    )

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use cpu


Q: What is Nigerian Pidgin known as?
Answer: Naijá
Confidence: 0.966
--------------------------------------------------
Q: When did Nigerian Pidgin come into existence?
Answer: 17th and 18th centuries
Confidence: 0.434
--------------------------------------------------


In [2]:

from transformers import pipeline


# Question-answering pipeline used by `ask` in this cell.
qa_model = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
)

def ask(context: str, question: str):
    """Run the question-answering pipeline on one (context, question) pair.

    This definition replaces any ``ask`` defined by an earlier cell, so it
    keeps the same type check rather than silently dropping it.

    Args:
        context: Passage handed to the pipeline as the text to search.
        question: Question handed to the pipeline.

    Returns:
        Tuple ``(answer, score)`` read from the pipeline's result dict.

    Raises:
        TypeError: If ``context`` or ``question`` is not a ``str``.
    """
    if not isinstance(context, str) or not isinstance(question, str):
        raise TypeError("Both context and question must be strings.")
    out = qa_model(question=question, context=context)
    return out["answer"], out["score"]


# Passage the model answers from.
context = """The Yoruba people (/ˈjɒrʊbə/ YORR-ub-ə; Yoruba: Ìran Yorùbá, Ọmọ Odùduwà, Ọmọ Káàárọ̀-oòjíire)
are a West African ethnic group who inhabit parts of Nigeria, Benin, and Togo, which are collectively referred to as Yorubaland.
The Yoruba constitute more than 50 million people in Africa, are over a million outside the continent, and bear further representation among the African diaspora.
The vast majority of Yoruba are within Nigeria, where they make up 20.7 % of the country's population according to Ethnologue estimations,
making them one of the largest ethnic groups in Africa. Most Yoruba people speak the Yoruba language, which is the Niger-Congo language with the largest number of native or L1 speakers.
""" # Extracted from Wikipedia


# Questions posed against the passage above.
questions = [
    "Which countries make up Yorubaland?",
    "How many Yoruba people are there in Africa?",
    "What language do most Yoruba people speak?"
]


# Print each question with the extracted answer and its rounded confidence.
for q in questions:
    answer, score = ask(context, q)
    print(
        f"Q: {q}",
        f"Answer: {answer}",
        f"Confidence: {round(score, 3)}",
        "-" * 50,
        sep="\n",
    )

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use cpu


Q: Which countries make up Yorubaland?
Answer: Nigeria, Benin, and Togo
Confidence: 0.978
--------------------------------------------------
Q: How many Yoruba people are there in Africa?
Answer: 50 million
Confidence: 0.452
--------------------------------------------------
Q: What language do most Yoruba people speak?
Answer: Yoruba
Confidence: 0.847
--------------------------------------------------


In [3]:

from transformers import pipeline


# Question-answering pipeline used by `ask` in this cell.
qa_model = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
)

def ask(context: str, question: str):
    """Run the question-answering pipeline on one (context, question) pair.

    This definition replaces any ``ask`` defined by an earlier cell, so it
    keeps the same type check rather than silently dropping it.

    Args:
        context: Passage handed to the pipeline as the text to search.
        question: Question handed to the pipeline.

    Returns:
        Tuple ``(answer, score)`` read from the pipeline's result dict.

    Raises:
        TypeError: If ``context`` or ``question`` is not a ``str``.
    """
    if not isinstance(context, str) or not isinstance(question, str):
        raise TypeError("Both context and question must be strings.")
    out = qa_model(question=question, context=context)
    return out["answer"], out["score"]


# Short story the model answers from.
context = """Arà the Owl is a wise bird that lives in a large forest.
She is known among the animals for her intelligence and fairness in settling disputes.
One day, two young animals came to her with a quarrel about food.
Arà listened carefully to both sides and then gave a judgment that satisfied them both.
From that day, Arà was respected as the voice of wisdom in the forest.
"""

# Questions posed against the story above.
questions = [
    "Who is Arà?",
    "What kind of animal is Arà?",
    "Why is Arà respected in the forest?"
]

# Print each question with the extracted answer and its rounded confidence.
for q in questions:
    answer, score = ask(context, q)
    print(
        f"Q: {q}",
        f"Answer: {answer}",
        f"Confidence: {round(score, 3)}",
        "-" * 50,
        sep="\n",
    )

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use cpu


Q: Who is Arà?
Answer: a wise bird
Confidence: 0.496
--------------------------------------------------
Q: What kind of animal is Arà?
Answer: bird
Confidence: 0.353
--------------------------------------------------
Q: Why is Arà respected in the forest?
Answer: voice of wisdom
Confidence: 0.356
--------------------------------------------------


In [4]:
# First of the two checkpoints being compared.
qa_model_roberta = pipeline(task="question-answering", model="deepset/tinyroberta-squad2")

# Passage both models answer from.
context = """The Yoruba people (/ˈjɒrʊbə/ YORR-ub-ə; Yoruba: Ìran Yorùbá, Ọmọ Odùduwà, Ọmọ Káàárọ̀-oòjíire)
are a West African ethnic group who inhabit parts of Nigeria, Benin, and Togo, which are collectively referred to as Yorubaland.
The Yoruba constitute more than 50 million people in Africa, are over a million outside the continent, and bear further representation among the African diaspora.
The vast majority of Yoruba are within Nigeria, where they make up 20.7 % of the country's population according to Ethnologue estimations,
making them one of the largest ethnic groups in Africa. Most Yoruba people speak the Yoruba language, which is the Niger-Congo language with the largest number of native or L1 speakers.
"""

questions = [
    "Which countries make up Yorubaland?",
    "How many Yoruba people are there in Africa?",
    "What language do most Yoruba people speak?"
]


# Second checkpoint being compared.
qa_model_distilbert = pipeline(task="question-answering", model="distilbert-base-cased-distilled-squad")

# Ask every question of each model in turn, printing one titled section per model.
for header, model in (
    ("---- DistilBERT Answers ----", qa_model_distilbert),
    ("\n---- TinyRoBERTa Answers ----", qa_model_roberta),
):
    print(header)
    for q in questions:
        result = model(question=q, context=context)
        print(f"Q: {q}")
        print("Answer:", result['answer'])
        print("Confidence:", round(result['score'], 3))
        print("-" * 40)

config.json:   0%|          | 0.00/835 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/326M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use cpu


Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use cpu


---- DistilBERT Answers ----
Q: Which countries make up Yorubaland?
Answer: Nigeria, Benin, and Togo
Confidence: 0.978
----------------------------------------
Q: How many Yoruba people are there in Africa?
Answer: 50 million
Confidence: 0.452
----------------------------------------
Q: What language do most Yoruba people speak?
Answer: Yoruba
Confidence: 0.847
----------------------------------------

---- TinyRoBERTa Answers ----
Q: Which countries make up Yorubaland?
Answer: Nigeria, Benin, and Togo
Confidence: 0.978
----------------------------------------
Q: How many Yoruba people are there in Africa?
Answer: 50 million
Confidence: 0.461
----------------------------------------
Q: What language do most Yoruba people speak?
Answer: Yoruba
Confidence: 0.66
----------------------------------------


In [5]:
# Use the %pip magic (not a `!pip` shell call) so the package is installed into
# the running kernel's environment. The pin matches the version that was
# installed when this notebook's recorded output was produced.
%pip install -q evaluate==0.4.5

import evaluate


# SQuAD metric; its result dict carries `exact_match` and `f1`.
squad_metric = evaluate.load("squad")


# Kept for reference only: predictions/references below line up with these by id.
questions = [
    "Which countries make up Yorubaland?",
    "How many Yoruba people are there in Africa?",
    "What language do most Yoruba people speak?"
]

# Hard-coded answer strings (not recomputed from a model in this cell).
predictions = [
    {"id": "1", "prediction_text": "Nigeria, Benin, and Togo"},
    {"id": "2", "prediction_text": "50 million"},
    {"id": "3", "prediction_text": "Yoruba"}
]

# Hand-written gold answers. `answer_start` is a placeholder 0 for every entry
# and is not a real character offset -- presumably unused by the metric; verify.
references = [
    {"id": "1", "answers": {"text": ["Nigeria, Benin, and Togo"], "answer_start": [0]}},
    {"id": "2", "answers": {"text": ["More than 50 million"], "answer_start": [0]}},
    {"id": "3", "answers": {"text": ["Yoruba"], "answer_start": [0]}}
]

results = squad_metric.compute(predictions=predictions, references=references)
print(results)

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

{'exact_match': 66.66666666666667, 'f1': 88.88888888888887}
