In [None]:
!pip install datasets
from datasets import get_dataset_config_names
domains = get_dataset_config_names("subjqa")

domains


In [None]:
from datasets import load_dataset

# Load the "electronics" configuration of the "subjqa" dataset.
subjqa = load_dataset(path="subjqa", name="electronics")


In [None]:
# Inspect the "answers" field of the training example at index 1.
train_answers = subjqa["train"]["answers"]
print(train_answers[1])

In [None]:
import pandas as pd

# Flatten the dataset first, then convert every split to a DataFrame.
dfs = {}
for split, dset in subjqa.flatten().items():
    dfs[split] = dset.to_pandas()

# Report how many distinct question ids each split contains.
for split, df in dfs.items():
    n_unique_ids = df["id"].nunique()
    print(f"Number of questions in {split}: {n_unique_ids}")

In [None]:
# Columns needed to look at a question together with its answer span.
qa_cols = [
    "title",
    "question",
    "answers.text",
    "answers.answer_start",
    "context",
]
# Fixed random_state keeps the two sampled rows stable across runs.
sample_df = dfs["train"].loc[:, qa_cols].sample(n=2, random_state=7)
sample_df

In [None]:
# Use the first sampled row and the first answer recorded for it.
first_row = sample_df.iloc[0]
start_idx = first_row["answers.answer_start"][0]
end_idx = start_idx + len(first_row["answers.text"][0])
# Slicing the context with the character offsets should reproduce the answer text.
first_row["context"][start_idx:end_idx]


In [None]:
import matplotlib.pyplot as plt

# Count how many training questions start with each prefix.
# Summing the boolean mask yields one scalar per prefix; value_counts() would
# return a whole True/False Series, which cannot be sorted or plotted as a bar.
# Note: prefixes are matched literally, so "Do" also matches "Does".
counts = {}
question_types = ["What", "How", "Is", "Does", "Do", "Was", "Where", "Why"]
for q in question_types:
    # na=False treats missing questions as non-matching instead of NaN.
    counts[q] = dfs["train"]["question"].str.startswith(q, na=False).sum()

pd.Series(counts).sort_values().plot.barh()
plt.title("Frequency of Question Types")
plt.show()


In [None]:
# Print three sampled training questions for each of a few question types.
train_df = dfs["train"]
for question_type in ["How", "What", "Is"]:
    type_mask = train_df.question.str.startswith(question_type)
    sampled_rows = train_df[type_mask].sample(n=3, random_state=42)
    for question in sampled_rows["question"]:
        print(question)

In [None]:
from transformers import AutoTokenizer
# Checkpoint identifier shared by the tokenizer and the QA model loaded later.
model_ckpt = "deepset/minilm-uncased-squad2"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
question = "How much music can this hold?"
# Adjacent string literals are concatenated; this replaces a broken "\ " line
# continuation that left a stray backslash (an invalid escape) in the context.
context = (
    "An MP3 is about 1 MB/minute, so about 6000 hours depending on "
    "file size."
)
# Encode question and context together as one pair, returning PyTorch tensors.
inputs = tokenizer(question, context, return_tensors="pt")


In [None]:
# Display the tokenizer output built for the question/context pair.
inputs

In [None]:
# Decode the first (only) encoded sequence back to text to see how the pair was joined.
encoded_ids = inputs["input_ids"][0]
print(tokenizer.decode(encoded_ids))

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering

# Load the question-answering model from the same checkpoint as the tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)

# Inference only: run the forward pass without tracking gradients.
with torch.no_grad():
    outputs = model(**inputs)

print(outputs)

In [None]:
# Pull the start and end logits out of the model output.
start_logits, end_logits = outputs.start_logits, outputs.end_logits

In [None]:
# Compare the shape of the input ids with the shapes of both logit tensors.
for label, tensor in (
    ("Input IDs", inputs.input_ids),
    ("Start logits", start_logits),
    ("End logits", end_logits),
):
    print(f"{label} shape: {tensor.size()}")


In [None]:
import torch

# Highest-scoring start position, and one past the highest-scoring end
# position (+1 because the slice end is exclusive).
start_idx = start_logits.argmax()
end_idx = end_logits.argmax() + 1

# Take the token ids in that span from the first sequence and decode them.
answer_span = inputs["input_ids"][0, start_idx:end_idx]
answer = tokenizer.decode(answer_span)

print(f"Question: {question}")
print(f"Answer: {answer}")


In [None]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a question-answering pipeline.
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)

# `top_k` replaces the deprecated `topk` spelling. Print the result explicitly:
# only the last expression of a cell is displayed automatically, so this
# call's output would otherwise be discarded.
print(pipe(question=question, context=context, top_k=3))

# Ask a question the context does not address, with handle_impossible_answer enabled.
pipe(question="Why is there no data?", context=context, handle_impossible_answer=True)

超越抽取式 QA：直接生成答案，而非从上下文中提取。
