In [37]:
# external
import datasets as ds
from transformers import pipeline
import importlib
import os
import torch
import transformers
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from functools import partial
import numpy as np

# internal
from redditqa.data.smart_filter import question_grading_map, question_filter, answer_grading_map, answer_filter
from redditqa.data import qa_generation
from redditqa.data.util import mask_links

# Re-import qa_generation so edits made to the module on disk take effect
# without restarting the kernel. Names bound earlier with
# `from ... import ...` are not refreshed by this call.
importlib.reload(qa_generation)

<module 'redditqa.data.qa_generation' from '/workspaces/reddit_qa/redditqa/data/qa_generation.py'>

In [2]:
# Check that a CUDA device is visible to PyTorch before loading any model.
torch.cuda.is_available()

True

In [3]:
# Hugging Face Hub identifier of the model loaded below.
model_id = "HuggingFaceH4/zephyr-7b-beta"
# Passed as `cache_dir` to `from_pretrained` for both the model and the tokenizer.
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
cache_dir = "/scratch1/ssawicki/cache"

### Load model

In [4]:
# 4-bit quantization settings, later handed to `from_pretrained`
# as `quantization_config`.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

In [5]:
# Load the causal LM identified by `model_id` with the quantization config;
# `cache_dir` is where the weights are stored / looked up.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
)

Loading checkpoint shards: 100%|██████████| 8/8 [01:03<00:00,  7.90s/it]


In [6]:
# Tokenizer matching `model_id`, using the same cache directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
# Use the end-of-sequence token id as the padding id.
# NOTE(review): presumably the tokenizer ships without a pad token — confirm.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [7]:
# Text-generation pipeline around the loaded model and tokenizer; it is handed
# to the question/answer grading functions further down via `pipeline=pipe`.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

### Load dataset

In [8]:
# Load the cached AskHistorians dataset from disk.
# NOTE(review): despite the ".jsonl" suffix, `load_from_disk` reads a directory
# written by `save_to_disk`, not a JSON-lines file — confirm the path is one.
dataset = ds.load_from_disk("/scratch1/redditqa/cached_datasets/AskHistorians.jsonl")
# Preprocessing: apply `mask_links` to every row.
# NOTE(review): presumably replaces URLs with a "[LINK]" placeholder (answer
# bodies displayed later contain "[LINK]") — confirm in redditqa.data.util.
dataset = dataset.map(mask_links)

In [9]:
# Display the dataset summary (column names and row count).
dataset

Dataset({
    features: ['question_created_utc', 'question_retrieved_on', 'question_deleted', 'question_title', 'question_selftext', 'question_score', 'question_char_length', 'question_selftext_char_length', 'answers'],
    num_rows: 116452
})

In [10]:
# Drop questions whose title is shorter than the minimum character count.
MIN_TITLE_CHARS = 55
dataset = dataset.filter(lambda row: len(row['question_title']) >= MIN_TITLE_CHARS)

In [11]:
# Keep only questions whose score reaches the minimum threshold.
MIN_QUESTION_SCORE = 4
dataset = dataset.filter(lambda row: row['question_score'] >= MIN_QUESTION_SCORE)

In [10]:
# Replace `dataset` with the cached copy stored under the "question_filtered" path.
# NOTE(review): this discards the length/score-filtered dataset built above, and
# the execution count shows the cell was run out of order — confirm the
# intended flow on a fresh kernel.
dataset = ds.load_from_disk("/scratch1/redditqa/cached_datasets/AskHistorians_question_filtered.jsonl")
# Rebinds the imported `question_filter` to a partial with the accepted tokens fixed.
question_filter = partial(question_filter, accepted_token_str=['y', 'yes'])
# NOTE(review): presumably relies on a grading column already stored in the
# cached dataset — confirm against redditqa.data.smart_filter.
dataset = dataset.filter(question_filter)
# Preview 100 question titles (rows 1000-1099) from the filtered dataset.
dataset[1000:1100]['question_title']

['In pre-modern warfare, how did opposing armies handle battle field casualties in multi-day engagements?',
 'Anybody familiar with how black troops were treated by the American military in WWII?',
 'Did King Richard III murder the princes in the Tower of London?',
 'How were weapon control laws implemented throughout history? Did strict laws ever really exist banning civilian use of things like swords, bows, and gunpowder?',
 'A few questions about the end of the civil war and slavery, inspired by the movie *Lincoln*',
 'The Polar Bear Expedition of 1918 was basically an allied invasion of Russia, why did it have such limited ramifications?',
 'Is there any truth to the Soviet Union and China having a "location ambiguity" pact about nuclear weapons during the Cold War?',
 'Why did the Russians take Prussia from the Germans after WW2?',
 'How does a society typically recover from a state of poor law enforcement?',
 'During WW2, how would non-combatant aircraft and ships travel safely t

In [5]:
# Persist the current `dataset` so it can be reloaded later with
# `ds.load_from_disk`.
# NOTE(review): the path is the same one the preceding cell loads from, and the
# execution counts are out of order — confirm which cell is meant to run first.
dataset.save_to_disk("/scratch1/redditqa/cached_datasets/AskHistorians_question_filtered.jsonl")

Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 177.16 examples/s]


In [15]:
# Show the column schema, including the nested fields of each answer.
dataset.info.features

{'question_created_utc': Value(dtype='int64', id=None),
 'question_retrieved_on': Value(dtype='int64', id=None),
 'question_deleted': Value(dtype='bool', id=None),
 'question_title': Value(dtype='string', id=None),
 'question_selftext': Value(dtype='string', id=None),
 'question_score': Value(dtype='int64', id=None),
 'question_char_length': Value(dtype='int64', id=None),
 'question_selftext_char_length': Value(dtype='int64', id=None),
 'answers': [{'answer_body': Value(dtype='string', id=None),
   'answer_char_length': Value(dtype='int64', id=None),
   'answer_created_utc': Value(dtype='int64', id=None),
   'answer_deleted': Value(dtype='bool', id=None),
   'answer_id': Value(dtype='string', id=None),
   'answer_retrieved_on': Value(dtype='int64', id=None),
   'answer_score': Value(dtype='int64', id=None)}]}

### LLM Filtering

Question filter:
- Few-shot classification: define what a well-written question is and show some good and bad examples.

Answer filter:
- Few-shot classification: define what a well-written answer is and show some good and bad examples.

#### Question filter

In [44]:
# Bind the generation pipeline to the question grader so the result can be
# passed to `Dataset.map`. With verbose=True the grader appears to print each
# example's `graded_output` (see the output of the grading cell).
question_grading = partial(question_grading_map, pipeline=pipe, verbose=True)
# Fix the tokens that count as acceptance for the question filter; this
# rebinds the imported name `question_filter` to the partial.
# NOTE(review): the grader output mostly shows a capitalised "Yes" token;
# presumably matching is case-insensitive — confirm in redditqa.data.smart_filter.
question_filter = partial(question_filter, accepted_token_str=['y', 'yes'])

In [45]:
# Take rows 50-79 of the dataset as a small sample for trying out the filters.
sample_indices = list(range(50, 80))
example_ds = dataset.select(sample_indices)
example_ds

Dataset({
    features: ['question_created_utc', 'question_retrieved_on', 'question_deleted', 'question_title', 'question_selftext', 'question_score', 'question_char_length', 'question_selftext_char_length', 'answers'],
    num_rows: 30
})

In [48]:
# Grade every sampled question with the LLM.
# NOTE(review): judging by the printed output, this adds a `graded_output`
# entry with candidate-token probabilities per row — confirm in smart_filter.
example_ds = example_ds.map(question_grading)

Map:   3%|▎         | 1/30 [00:00<00:06,  4.51 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.0010224399}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 4.9931226e-10}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.85968566}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 1.7314133e-07}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 4.893228e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 1.4886707e-09}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.85968566}]}


Map:   7%|▋         | 2/30 [00:00<00:06,  4.59 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.012283739}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 1.866186e-09}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.7902418}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 8.2891866e-08}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 4.1281754e-07}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 3.214851e-09}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.7902418}]}


Map:  10%|█         | 3/30 [00:00<00:05,  4.63 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 4.089529e-08}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 0.000702899}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 7.4800837e-06}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.8599144}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 1.3765256e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 7.882774e-08}, {'token_id': tensor(2501), 'token_str': 'generated: No', 'probability': 0.8599144}]}


Map:  13%|█▎        | 4/30 [00:00<00:05,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.0012329542}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 1.6048263e-10}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.7948575}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 5.467929e-08}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 4.787879e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 1.4472266e-09}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.7948575}]}


Map:  17%|█▋        | 5/30 [00:01<00:05,  4.63 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.00049505144}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 8.405353e-10}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.53866506}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 4.4097135e-07}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 5.1647806e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 7.554054e-09}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.53866506}]}


Map:  20%|██        | 6/30 [00:01<00:05,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.00011417482}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 1.4840994e-07}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.01890357}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 8.385902e-05}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 1.1717284e-07}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 2.024565e-07}, {'token_id': tensor(2301), 'token_str': 'generated: Is', 'probability': 0.35390842}]}


Map:  23%|██▎       | 7/30 [00:01<00:04,  4.63 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 3.2598038e-08}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 5.446129e-06}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 1.1849063e-05}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.003038514}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 5.9215065e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 4.3961943e-08}, {'token_id': tensor(2198), 'token_str': 'generated: As', 'probability': 0.5745321}]}


Map:  27%|██▋       | 8/30 [00:01<00:04,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 1.0989602e-05}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 9.0915665e-07}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.014480293}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.0015143345}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 5.9093434e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 4.4410474e-08}, {'token_id': tensor(3260), 'token_str': 'generated: This', 'probability': 0.53914034}]}


Map:  30%|███       | 9/30 [00:01<00:04,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 1.6456577e-08}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 7.995751e-05}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 7.3793294e-06}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.1401187}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 1.8955282e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 5.8700817e-08}, {'token_id': tensor(3260), 'token_str': 'generated: This', 'probability': 0.4890623}]}


Map:  33%|███▎      | 10/30 [00:02<00:04,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 1.7728684e-05}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 0.012166407}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 2.4613839e-05}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.09644673}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 8.8771365e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 7.9430725e-07}, {'token_id': tensor(2301), 'token_str': 'generated: Is', 'probability': 0.87315744}]}


Map:  37%|███▋      | 11/30 [00:02<00:04,  4.63 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.0011352679}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 7.421579e-09}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.8423905}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 2.3975986e-06}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 8.5810875e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 4.8010995e-09}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.8423905}]}


Map:  40%|████      | 12/30 [00:02<00:03,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 9.502734e-05}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 8.46212e-10}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.9658147}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 1.442914e-06}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 5.093354e-09}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 7.526366e-10}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.9658147}]}


Map:  43%|████▎     | 13/30 [00:02<00:03,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 3.0360818e-06}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 2.4446947e-05}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.00093545835}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.023938036}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 4.1913513e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 3.5833025e-08}, {'token_id': tensor(28737), 'token_str': 'generated: I', 'probability': 0.570974}]}


Map:  47%|████▋     | 14/30 [00:03<00:03,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.00064305746}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 3.8501398e-09}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.4771608}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 4.0545015e-06}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 7.8434944e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 8.841073e-09}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.4771608}]}


Map:  50%|█████     | 15/30 [00:03<00:03,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.003972109}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 1.3726309e-10}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.9568744}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 2.14538e-08}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 3.2522024e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 7.8436957e-10}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.9568744}]}


Map:  53%|█████▎    | 16/30 [00:03<00:03,  4.65 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.0021188662}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 2.3973123e-10}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.76624817}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 4.8182017e-08}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 1.8648528e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 6.301445e-10}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.76624817}]}


Map:  57%|█████▋    | 17/30 [00:03<00:02,  4.66 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.00066310493}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 8.599975e-09}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.3226854}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 3.7771283e-06}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 1.12950964e-07}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 1.845677e-08}, {'token_id': tensor(2301), 'token_str': 'generated: Is', 'probability': 0.4410586}]}


Map:  60%|██████    | 18/30 [00:03<00:02,  4.66 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.0013605129}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 1.5193675e-09}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.8501067}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 3.9652826e-07}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 9.928372e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 3.7632257e-09}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.8501067}]}


Map:  63%|██████▎   | 19/30 [00:04<00:02,  4.65 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 7.415732e-06}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 8.604032e-08}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.0035113648}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 5.810185e-05}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 6.297919e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 5.388211e-08}, {'token_id': tensor(1014), 'token_str': 'generated: The', 'probability': 0.3933715}]}


Map:  67%|██████▋   | 20/30 [00:04<00:02,  4.65 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.0020061184}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 9.815169e-09}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.5266385}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 9.791769e-07}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 2.1848389e-07}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 2.2915865e-08}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.5266385}]}


Map:  70%|███████   | 21/30 [00:04<00:01,  4.65 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.00010251865}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 0.00020229605}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.0066991253}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.08824369}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 3.1013843e-07}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 1.0553505e-06}, {'token_id': tensor(2301), 'token_str': 'generated: Is', 'probability': 0.69953424}]}


Map:  73%|███████▎  | 22/30 [00:04<00:01,  4.63 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.0012128709}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 2.4012425e-07}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.48172104}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.00015785589}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 7.2948254e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 1.1481663e-08}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.48172104}]}


Map:  77%|███████▋  | 23/30 [00:04<00:01,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.00010871565}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 1.8198293e-07}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.03159046}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.00011572721}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 4.3266922e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 1.2758493e-08}, {'token_id': tensor(2301), 'token_str': 'generated: Is', 'probability': 0.90889555}]}


Map:  80%|████████  | 24/30 [00:05<00:01,  4.65 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.002046508}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 1.513315e-08}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.21369985}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 1.3181556e-06}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 1.5651149e-07}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 2.7131327e-08}, {'token_id': tensor(2301), 'token_str': 'generated: Is', 'probability': 0.48916382}]}


Map:  83%|████████▎ | 25/30 [00:05<00:01,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.0016672731}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 3.7717032e-10}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.66739166}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 1.114324e-07}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 2.0253752e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 2.5699511e-09}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.66739166}]}


Map:  87%|████████▋ | 26/30 [00:05<00:00,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 8.231417e-05}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 4.895039e-08}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.019752033}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 1.794131e-05}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 2.756356e-07}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 6.265437e-08}, {'token_id': tensor(1014), 'token_str': 'generated: The', 'probability': 0.2835303}]}


Map:  90%|█████████ | 27/30 [00:05<00:00,  4.65 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 7.081951e-05}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 7.0029547e-09}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.03314251}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 1.595466e-06}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 3.0267582e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 2.3457597e-08}, {'token_id': tensor(15423), 'token_str': 'generated: Here', 'probability': 0.36476526}]}


Map:  93%|█████████▎| 28/30 [00:06<00:00,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.0012166302}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 2.367969e-10}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.86142224}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 1.4723997e-07}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 2.2980423e-08}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 1.1642753e-09}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.86142224}]}


Map:  97%|█████████▋| 29/30 [00:06<00:00,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 1.27161675e-05}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 1.1087928e-07}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.009927167}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 5.0886258e-05}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 1.5514799e-07}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 4.4624613e-08}, {'token_id': tensor(2198), 'token_str': 'generated: As', 'probability': 0.5091666}]}


Map: 100%|██████████| 30/30 [00:06<00:00,  4.62 examples/s]

{'graded_output': [{'token_id': tensor(9780), 'token_str': 'yes', 'probability': 0.0006323905}, {'token_id': tensor(1510), 'token_str': 'no', 'probability': 9.781082e-09}, {'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.5660184}, {'token_id': tensor(2501), 'token_str': 'No', 'probability': 1.6621306e-06}, {'token_id': tensor(28724), 'token_str': 'y', 'probability': 1.6297389e-07}, {'token_id': tensor(28711), 'token_str': 'n', 'probability': 1.3815863e-08}, {'token_id': tensor(5613), 'token_str': 'generated: Yes', 'probability': 0.5660184}]}





In [13]:
# Filter the questions: keep only the rows accepted by `question_filter`.
example_ds = example_ds.filter(question_filter)

Filter: 100%|██████████| 30/30 [00:00<00:00, 5564.46 examples/s]


In [14]:
# Titles of the questions that survived the filter.
example_ds["question_title"]

['Was Dwight Eisenhower consulted by JFK during the Cuban Missile Crisis? ',
 'Religious sects of Reformation England: Who were they and what did they believe?',
 'Reddit, what were the perspectives of the British and the Boers during the Second Boer War ?',
 'Were the fire bombings on Dresden from Allied forces during WWII necessary?',
 'What was the role of dogs in the Roman military?',
 'What did pre-modern racism look like?',
 'How did the Roman government react to the eruption of Mt. Vesuvius in AD 79?',
 'Do you guys know of any instance when a revolution has succeeded without foreign intervention?',
 'Is there any other instance in history where a different race of people were imported into a land as slaves and eventually became fully assimilated as equals into society? If so how long did it take? ',
 'What was the longest period of time where the US was not at war, or actively involved in combat?']

#### Answer filter

In [15]:
# Bind the generation pipeline to the answer grader; with verbose=True it
# appears to print each grading result (see the output of the grading cell).
answer_grading = partial(answer_grading_map, pipeline=pipe, verbose=True)
# Fix the accepted tokens for the answer filter; this rebinds the imported
# name `answer_filter` to the partial.
answer_filter = partial(answer_filter, accepted_token_str=['y', 'yes'])

In [16]:
# Grade the answers of each remaining question with the LLM.
# NOTE(review): the output shows several gradings per row, presumably one per
# answer — confirm in redditqa.data.smart_filter.
example_ds = example_ds.map(answer_grading)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.7641037}]}


Map:  10%|█         | 1/10 [00:00<00:05,  1.72 examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.7315472}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.46591264}]}


Map:  20%|██        | 2/10 [00:01<00:05,  1.58 examples/s]

{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.35863084}]}
{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.95299494}]}


Map:  30%|███       | 3/10 [00:01<00:04,  1.62 examples/s]

{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.9207272}]}
{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.76148134}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.55770934}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.562034}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.55572397}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.7264219}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.2522639}]}


Map:  40%|████      | 4/10 [00:04<00:08,  1.38s/ examples]

{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.9220224}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.3299616}]}


Map:  50%|█████     | 5/10 [00:04<00:05,  1.09s/ examples]

{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.779684}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.57565033}]}
{'graded_output': [{'token_id': tensor(1014), 'token_str': 'The', 'probability': 0.28809276}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.25955176}]}


Map:  60%|██████    | 6/10 [00:06<00:04,  1.14s/ examples]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.3975593}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.90252525}]}
{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.89953196}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.6132235}]}
{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.70639944}]}


Map:  70%|███████   | 7/10 [00:07<00:03,  1.27s/ examples]

{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.84103245}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.87321466}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.22599277}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.5481237}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.42372894}]}
{'graded_output': [{'token_id': tensor(28777), 'token_str': 'G', 'probability': 0.25677797}]}


Map:  80%|████████  | 8/10 [00:09<00:02,  1.38s/ examples]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.94298965}]}
{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.6223856}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.38676727}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.91877407}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.44995272}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.5059207}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.81613564}]}
{'graded_output': [{'token_id': tensor(1014), 'token_str': 'The', 'probability': 0.35310444}]}
{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.49348438}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.91818935}]}
{'graded_output': [{'token_id': tensor(2301), 'token_st

Map:  90%|█████████ | 9/10 [00:18<00:03,  3.66s/ examples]

{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.9745897}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.43321145}]}


Map: 100%|██████████| 10/10 [00:18<00:00,  1.86s/ examples]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.89880574}]}





In [17]:
# Filter the answers. `map` (not `filter`) is used, so every question row is
# kept. NOTE(review): presumably `answer_filter` returns the row with rejected
# answers removed — confirm in redditqa.data.smart_filter.
example_ds = example_ds.map(answer_filter)

Map: 100%|██████████| 10/10 [00:00<00:00, 1067.09 examples/s]


In [18]:
# Answers remaining for each question after filtering.
example_ds["answers"]

[[{'answer_body': "Generally speaking, yes. Presidents often meet with with ex-presidents, regardless of political alignment.\n\nSpecifically for this case, yes. A lot. Eisenhower was President during the build-up to the Cuban Missile Crisis and widely renowned as an excellent general. Here is a recording of one such phone call:\n\n[LINK]\n\nHere's another:\n\n[LINK]",
   'answer_char_length': 407,
   'answer_created_utc': 1347938144000,
   'answer_deleted': False,
   'answer_id': 'c69qhes',
   'answer_retrieved_on': 1429926087000,
   'answer_score': 2,
   'graded_output': [{'probability': 0.7641037106513977,
     'token_id': 5613,
     'token_str': 'Yes'}]},
  {'answer_body': "Interestingly enough, Eisenhower was sharply criticized shortly before the Crisis because he made a statement criticizing Kennedy's foreign policy, thus breaking the informal rule against past presidents criticizing sitting ones. As Barrel-rider stated, however, Eisenhower offered his support during the Crisis."

### Toxicity

Currently not in use.

In [5]:
# Text-classification pipeline used for toxicity scoring, placed on device 0.
toxicity_pipe = pipeline(
    task="text-classification",
    model="tomh/toxigen_roberta",
    device=0,
)

In [13]:
# Smoke test on a benign sentence; with top_k=None the output lists a score
# for every label.
toxicity_pipe('I love ML', top_k=None)

[{'label': 'LABEL_0', 'score': 0.9992640614509583},
 {'label': 'LABEL_1', 'score': 0.0007359233568422496}]

In [6]:
def run_toxicity_pipe(text):
    """Return the score `toxicity_pipe` assigns to label ``LABEL_1`` for `text`.

    NOTE(review): ``LABEL_1`` is presumably the "toxic" class of the
    classifier — confirm against the model card.

    Args:
        text: Input string passed to the module-level `toxicity_pipe`.

    Returns:
        The ``score`` of the first ``LABEL_1`` entry, or the neutral fallback
        ``0.5`` when the pipeline raises an error or returns no ``LABEL_1``
        entry.
    """
    try:
        predictions = toxicity_pipe(text, top_k=None)
        label_entries = [p for p in predictions if p['label'] == 'LABEL_1']
    except Exception:
        # Best-effort scoring: a failing input yields the neutral score instead
        # of aborting the run. Catching Exception (not a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate so the cell can be stopped.
        return 0.5

    if not label_entries:
        # The pipeline answered but reported no LABEL_1 entry.
        return 0.5

    return label_entries[0]['score']

In [11]:
# Print the toxicity score of every answer to one example question.
example_answers = dataset[65]['answers']
for entry in example_answers:
    toxicity_score = run_toxicity_pipe(entry["answer_body"])
    print(toxicity_score)

0.0006564322975464165
0.0007360000745393336
0.0007385569042526186
0.000814662838820368
