In [None]:
from datasets import load_dataset


# Identifier of the dataset to download from the Hugging Face Hub.
DATASET_NAME = "squad_v2"

# No split is requested, so the call yields a DatasetDict holding every split.
dataset = load_dataset(DATASET_NAME)

# Show the available splits together with their features and row counts.
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})


In [None]:
import pandas as pd


In [None]:
# Gather the training split into per-column lists in a single pass.
# Walking the split once (instead of once per column) avoids materialising
# every row five times while producing the same dict of lists.
train_split = dataset['train']
data = {"id": [], "title": [], "context": [], "question": [], "answers": []}
for entry in train_split:
    data["id"].append(entry['id'])
    data["title"].append(entry['title'])
    data["context"].append(entry['context'])
    data["question"].append(entry['question'])
    # Keep only the answer strings (a list, possibly empty).
    data["answers"].append(entry['answers']['text'])

In [None]:

LENGTH_COLUMNS = ['context_length', 'question_length', 'answer_length']


def _first_answer_length(answer_texts):
    """Return the character count of the first answer, or 0 when there is none."""
    if not answer_texts:
        return 0
    return len(answer_texts[0])


df = pd.DataFrame(data)

# Draw a reproducible random sample of 15 rows (fixed seed).
sample_df = df.sample(n=15, random_state=1)

# Character lengths of the context, the question and the first listed answer.
sample_df['context_length'] = sample_df['context'].map(len)
sample_df['question_length'] = sample_df['question'].map(len)
sample_df['answer_length'] = sample_df['answers'].map(_first_answer_length)

# Summarise the three length columns: one row per column, one column per statistic.
lengths = sample_df[LENGTH_COLUMNS]
stats = {
    'Max Length': lengths.max(),
    'Min Length': lengths.min(),
    'Average Length': lengths.mean(),
}
stats_df = pd.DataFrame(stats)
print(stats_df)


print(sample_df[['context', 'question', 'answers']])

                 Max Length  Min Length  Average Length
context_length         1196         228      641.666667
question_length          93          33       51.400000
answer_length           115           0       19.266667
                                                  context  \
51829   In general, avian influenza is a disease of bi...   
19377   The National Historical Publications and Recor...   
123528  Starting with Republican Nicholas Longworth in...   
25403   122nd Street is mentioned in the movie Taxi Dr...   
1519    Wang and Nyima argue that the Ming emperor sen...   
61309   After the President signs a bill into law (or ...   
105754  Buddhism first entered China during the Easter...   
80122   The Sichuan government raised the minimum wage...   
14744   Network hardware, software and specifications,...   
4194    New York City's commuter rail network is the l...   
121888  Constantine's nephew Julian rejected the "Gali...   
31484   Houston has sports teams for every m

### Load the question answering pipeline with RoBERTa


In [None]:
from transformers import pipeline


# Build the question-answering pipeline from the `deepset/roberta-base-squad2` checkpoint.
QA_MODEL_NAME = 'deepset/roberta-base-squad2'
qa_pipeline = pipeline(task='question-answering', model=QA_MODEL_NAME)




### Apply the QA pipeline to each sample

In [None]:

def get_answers(row):
    """Run the question-answering pipeline on a single sample.

    Args:
        row: Mapping-like object (for example a DataFrame row) that exposes
            'context' and 'question' entries.

    Returns:
        Whatever `qa_pipeline` returns for that question/context pair; the
        following cells read its 'start', 'end' and 'answer' entries.
    """
    # Use the explicit `question` / `context` keywords rather than a single
    # positional dict: the positional SQuAD-example form is the legacy call
    # style and triggers a deprecation warning on recent transformers releases.
    return qa_pipeline(question=row['question'], context=row['context'])

# Run the QA model once per sampled row and store the raw result next to the sample.
predictions = sample_df.apply(get_answers, axis=1)
sample_df['prediction'] = predictions
print(sample_df.loc[:, ['question', 'prediction']])


                                                 question  \
51829   What is the danger to humans in regards to the...   
19377                   Which arm of NARA handles grants?   
123528  What republican majority leader died in a plan...   
25403          Which neighborhood surrounds 122nd Street?   
1519                What did the lamas called themselves?   
61309   Once the slip laws are placed into the United ...   
105754          When did Emperor Ming of Han's reign end?   
80122   What is the national minimum monthly wage in C...   
14744               What sort of route does data follow?    
4194    About how many stations does New York City's c...   
121888          What building did Julian want to rebuild?   
31484   When were the Houston Astros in the World Series?   
24334       Which poet was more famous, Lucan or Statius?   
19272   What institution has none of materials from th...   
95519                  Where was the CTR Business office?   

                       

In [None]:
# Flatten each prediction into plain columns: target column -> entry read from the prediction.
# Note that `answer_tokens` receives the prediction's 'answer' entry as-is.
PREDICTION_FIELDS = {
    'start_index': 'start',
    'end_index': 'end',
    'answer_tokens': 'answer',
}
for column_name, field in PREDICTION_FIELDS.items():
    # Bind `field` as a default so each lambda reads its own entry.
    sample_df[column_name] = sample_df['prediction'].map(lambda pred, field=field: pred[field])
print(sample_df[list(PREDICTION_FIELDS)])


        start_index  end_index  \
51829           408        480   
19377             4         59   
123528          434        444   
25403           199        205   
1519            691        698   
61309           518        530   
105754          152        157   
80122           489        507   
14744           157        166   
4194            318        321   
121888          363        381   
31484           264        268   
24334           156        160   
19272           360        386   
95519           187        195   

                                            answer_tokens  
51829   The virus possibly could mutate to become high...  
19377   National Historical Publications and Records C...  
123528                                         Hale Boggs  
25403                                              Harlem  
1519                                              princes  
61309                                        session laws  
105754                             

In [None]:
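# NOTE: upgrading installs openai>=1.0, which removes the `openai.ChatCompletion` API used in this notebook.
#!pip install --upgrade openai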

In [None]:
import openai
# Never commit the API key to the notebook: read it from the OPENAI_API_KEY
# environment variable and fall back to an interactive, non-echoing prompt.
import os
from getpass import getpass

openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")



In [None]:
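# This notebook calls the legacy `openai.ChatCompletion` API, which needs openai==0.28.
# Run this install (and restart the kernel) before the `import openai` cell.
#!pip install openai==0.28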

In [None]:


def ask_openai(context, question, model="gpt-3.5-turbo", max_tokens=150):
    """Ask an OpenAI chat model to answer a question about a context passage.

    Uses the legacy (openai==0.28) `openai.ChatCompletion` interface.

    Args:
        context: Passage of text the answer should be drawn from.
        question: Question to ask about the passage.
        model: Name of the chat model to query. The API rejects requests that
            do not name a model, so it is always sent.
        max_tokens: Upper bound on the length of the generated reply; replies
            that hit the bound are cut off.

    Returns:
        The text of the first returned choice, stripped of surrounding
        whitespace.
    """
    prompt = f"Given the following context: {context} Can you answer the question: {question}? After providing your answer, please explain how you arrived at that conclusion."
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens
    )
    return response['choices'][0]['message']['content'].strip()


In [None]:
# Smoke-test the OpenAI helper on the first 15 rows of the full frame.
# NOTE(review): this reads df.head(15), not the random `sample_df` used by the
# RoBERTa cells, so the two models see different questions — confirm intended.
subset = df.head(15)
results = [
    (question, ask_openai(context, question))
    for context, question in zip(subset['context'], subset['question'])
]

# Tabulate the (question, answer) pairs for display.
results_df = pd.DataFrame(results, columns=['Question', 'OpenAI Answer'])
print(results_df)


                                             Question  \
0            When did Beyonce start becoming popular?   
1   What areas did Beyonce compete in when she was...   
2   When did Beyonce leave Destiny's Child and bec...   
3       In what city and state did Beyonce  grow up?    
4          In which decade did Beyonce become famous?   
5          In what R&B group was she the lead singer?   
6       What album made her a worldwide known artist?   
7              Who managed the Destiny's Child group?   
8                      When did Beyoncé rise to fame?   
9      What role did Beyoncé have in Destiny's Child?   
10  What was the first album Beyoncé released as a...   
11      When did Beyoncé release Dangerously in Love?   
12  How many Grammy awards did Beyoncé win for her...   
13        What was Beyoncé's role in Destiny's Child?   
14   What was the name of Beyoncé's first solo album?   

                                        OpenAI Answer  
0   Beyoncé started becoming po