In [1]:
# external
import datasets as ds
from transformers import pipeline
import importlib
import os
import torch
import transformers
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from functools import partial

# internal
from redditqa.data.smart_filter import question_grading_map, question_filter, answer_grading_map, answer_filter
from redditqa.data import qa_generation
from redditqa.data.util import mask_links

# Reload `qa_generation` so that edits made to the module on disk are picked up
# without restarting the kernel. Only this module object is reloaded; the names
# pulled in above with `from ... import ...` are not refreshed by this call.
importlib.reload(qa_generation)

<module 'redditqa.data.qa_generation' from '/workspaces/reddit_qa/redditqa/data/qa_generation.py'>

In [2]:
# Sanity check: the cells below load a quantized model and a pipeline on
# `device=0`, so a CUDA device has to be visible to PyTorch.
torch.cuda.is_available()

True

In [3]:
# Hugging Face Hub identifier of the checkpoint used for grading.
model_id = "HuggingFaceH4/zephyr-7b-beta"
# Directory where downloaded model/tokenizer files are cached.
# NOTE(review): absolute, user-specific path — adjust when running on another machine.
cache_dir = "/scratch1/ssawicki/cache"

### Load model

In [4]:
# Quantization config passed to `from_pretrained` below (bitsandbytes backend).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_use_double_quant=True,  # enable nested (double) quantization
    bnb_4bit_quant_type="nf4",  # use the NF4 4-bit data type
    bnb_4bit_compute_dtype=torch.bfloat16  # run computations in bfloat16
)

In [5]:
# Load the causal language model identified by `model_id`, applying the
# quantization settings from `bnb_config` and reusing the shared cache directory.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
)

Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.45it/s]


In [6]:
# Load the tokenizer for `model_id`, using the same cache directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
# Reuse the end-of-sequence token id as the padding token id.
# NOTE(review): presumably needed because this tokenizer defines no pad token — confirm.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [7]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline;
# this `pipe` object is what the grading functions below receive.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

### Load dataset

In [8]:
# Load the cached AskHistorians dataset.
# NOTE(review): the path ends in ".jsonl" but is read with `load_from_disk`, which
# expects a dataset previously written with `save_to_disk` — confirm the on-disk format.
dataset = ds.load_from_disk("/scratch1/redditqa/cached_datasets/AskHistorians.jsonl")
# Preprocessing: apply `mask_links` to every example (answers shown further down
# contain "[LINK]" placeholders, presumably produced by this step).
dataset = dataset.map(mask_links)

In [9]:
# Show the dataset schema: question-level columns plus a nested list of answers.
dataset.info.features

{'question_created_utc': Value(dtype='int64', id=None),
 'question_retrieved_on': Value(dtype='int64', id=None),
 'question_deleted': Value(dtype='bool', id=None),
 'question_title': Value(dtype='string', id=None),
 'question_selftext': Value(dtype='string', id=None),
 'question_score': Value(dtype='int64', id=None),
 'question_char_length': Value(dtype='int64', id=None),
 'question_selftext_char_length': Value(dtype='int64', id=None),
 'answers': [{'answer_body': Value(dtype='string', id=None),
   'answer_char_length': Value(dtype='int64', id=None),
   'answer_created_utc': Value(dtype='int64', id=None),
   'answer_deleted': Value(dtype='bool', id=None),
   'answer_id': Value(dtype='string', id=None),
   'answer_retrieved_on': Value(dtype='int64', id=None),
   'answer_score': Value(dtype='int64', id=None)}]}

### LLM Filtering

Question filter:
- Few-shot classification: define what a well-written question is and show some good and bad examples.

Answer filter:
- Few-shot classification: define what a well-written answer is and show some good and bad examples.

#### Question filter

In [10]:
# Bind the generation pipeline to the question grading function so it can be
# passed directly to `Dataset.map`.
question_grading = partial(question_grading_map, pipeline=pipe, verbose=True)
# Bind the accepted answer tokens for the filter step.
# NOTE: this rebinds the imported name `question_filter` to the partial, so the
# original function is no longer reachable under that name after this cell runs.
question_filter = partial(question_filter, accepted_token_str=['y', 'yes'])

In [11]:
# Take a small 30-row slice (rows 50..79) to try the filters on.
example_ds = dataset.select(list(range(50,80)))
example_ds

Dataset({
    features: ['question_created_utc', 'question_retrieved_on', 'question_deleted', 'question_title', 'question_selftext', 'question_score', 'question_char_length', 'question_selftext_char_length', 'answers'],
    num_rows: 30
})

In [12]:
# Grade the questions: run `question_grading` on every example of the slice.
# With verbose=True a `graded_output` entry (token id, token string, probability)
# is printed per example.
# NOTE(review): in the recorded output many top tokens are neither 'Yes' nor 'No'
# (e.g. 'Is', 'The', 'Based'); only the 'Yes' rows survive the filter step.
example_ds = example_ds.map(question_grading)

Map:   3%|▎         | 1/30 [00:00<00:22,  1.29 examples/s]

{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.650636}]}


Map:   7%|▋         | 2/30 [00:00<00:12,  2.24 examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.61765474}]}


Map:  10%|█         | 3/30 [00:01<00:09,  2.95 examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.8637139}]}


Map:  13%|█▎        | 4/30 [00:01<00:07,  3.43 examples/s]

{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.8291705}]}


Map:  17%|█▋        | 5/30 [00:01<00:06,  3.80 examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.72948694}]}


Map:  20%|██        | 6/30 [00:01<00:05,  4.06 examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.66851276}]}


Map:  23%|██▎       | 7/30 [00:02<00:05,  4.25 examples/s]

{'graded_output': [{'token_id': tensor(1014), 'token_str': 'The', 'probability': 0.39586458}]}


Map:  27%|██▋       | 8/30 [00:02<00:05,  4.38 examples/s]

{'graded_output': [{'token_id': tensor(1014), 'token_str': 'The', 'probability': 0.339123}]}


Map:  30%|███       | 9/30 [00:02<00:04,  4.47 examples/s]

{'graded_output': [{'token_id': tensor(24207), 'token_str': 'Based', 'probability': 0.30783248}]}


Map:  33%|███▎      | 10/30 [00:02<00:04,  4.54 examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.8851687}]}


Map:  37%|███▋      | 11/30 [00:02<00:04,  4.57 examples/s]

{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.60324824}]}


Map:  40%|████      | 12/30 [00:03<00:03,  4.59 examples/s]

{'graded_output': [{'token_id': tensor(3260), 'token_str': 'This', 'probability': 0.4616516}]}


Map:  43%|████▎     | 13/30 [00:03<00:03,  4.61 examples/s]

{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.6763611}]}


Map:  47%|████▋     | 14/30 [00:03<00:03,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.39937752}]}


Map:  50%|█████     | 15/30 [00:03<00:03,  4.66 examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.85479397}]}


Map:  53%|█████▎    | 16/30 [00:03<00:03,  4.66 examples/s]

{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.7841807}]}


Map:  57%|█████▋    | 17/30 [00:04<00:02,  4.66 examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.8281972}]}


Map:  60%|██████    | 18/30 [00:04<00:02,  4.68 examples/s]

{'graded_output': [{'token_id': tensor(1014), 'token_str': 'The', 'probability': 0.4351434}]}


Map:  63%|██████▎   | 19/30 [00:04<00:02,  4.68 examples/s]

{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.5663763}]}


Map:  67%|██████▋   | 20/30 [00:04<00:02,  4.68 examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.9522475}]}


Map:  70%|███████   | 21/30 [00:05<00:01,  4.68 examples/s]

{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.93276036}]}


Map:  73%|███████▎  | 22/30 [00:05<00:01,  4.68 examples/s]

{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.4009727}]}


Map:  77%|███████▋  | 23/30 [00:05<00:01,  4.66 examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.9776138}]}


Map:  80%|████████  | 24/30 [00:05<00:01,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(1014), 'token_str': 'The', 'probability': 0.38265973}]}


Map:  83%|████████▎ | 25/30 [00:05<00:01,  4.63 examples/s]

{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.32117096}]}


Map:  87%|████████▋ | 26/30 [00:06<00:00,  4.64 examples/s]

{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.98222184}]}


Map:  90%|█████████ | 27/30 [00:06<00:00,  4.65 examples/s]

{'graded_output': [{'token_id': tensor(28737), 'token_str': 'I', 'probability': 0.70708704}]}


Map:  93%|█████████▎| 28/30 [00:06<00:00,  4.66 examples/s]

{'graded_output': [{'token_id': tensor(1014), 'token_str': 'The', 'probability': 0.59088075}]}


Map:  97%|█████████▋| 29/30 [00:06<00:00,  4.68 examples/s]

{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.6315164}]}


Map: 100%|██████████| 30/30 [00:06<00:00,  4.29 examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.3377912}]}





In [13]:
# Filter the questions: keep only the rows accepted by `question_filter`.
example_ds = example_ds.filter(question_filter)

Filter: 100%|██████████| 30/30 [00:00<00:00, 5564.46 examples/s]


In [14]:
# Titles of the questions that remain after filtering.
example_ds["question_title"]

['Was Dwight Eisenhower consulted by JFK during the Cuban Missile Crisis? ',
 'Religious sects of Reformation England: Who were they and what did they believe?',
 'Reddit, what were the perspectives of the British and the Boers during the Second Boer War ?',
 'Were the fire bombings on Dresden from Allied forces during WWII necessary?',
 'What was the role of dogs in the Roman military?',
 'What did pre-modern racism look like?',
 'How did the Roman government react to the eruption of Mt. Vesuvius in AD 79?',
 'Do you guys know of any instance when a revolution has succeeded without foreign intervention?',
 'Is there any other instance in history where a different race of people were imported into a land as slaves and eventually became fully assimilated as equals into society? If so how long did it take? ',
 'What was the longest period of time where the US was not at war, or actively involved in combat?']

#### Answer filter

In [15]:
# Bind the generation pipeline to the answer grading function for use with `Dataset.map`.
answer_grading = partial(answer_grading_map, pipeline=pipe, verbose=True)
# Bind the accepted answer tokens for the answer filter.
# NOTE: this rebinds the imported name `answer_filter` to the partial.
answer_filter = partial(answer_filter, accepted_token_str=['y', 'yes'])

In [16]:
# Grade the answers of every remaining question; the recorded output shows one
# `graded_output` line per answer, so examples with many answers take longer.
example_ds = example_ds.map(answer_grading)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.7641037}]}


Map:  10%|█         | 1/10 [00:00<00:05,  1.72 examples/s]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.7315472}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.46591264}]}


Map:  20%|██        | 2/10 [00:01<00:05,  1.58 examples/s]

{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.35863084}]}
{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.95299494}]}


Map:  30%|███       | 3/10 [00:01<00:04,  1.62 examples/s]

{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.9207272}]}
{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.76148134}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.55770934}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.562034}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.55572397}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.7264219}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.2522639}]}


Map:  40%|████      | 4/10 [00:04<00:08,  1.38s/ examples]

{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.9220224}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.3299616}]}


Map:  50%|█████     | 5/10 [00:04<00:05,  1.09s/ examples]

{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.779684}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.57565033}]}
{'graded_output': [{'token_id': tensor(1014), 'token_str': 'The', 'probability': 0.28809276}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.25955176}]}


Map:  60%|██████    | 6/10 [00:06<00:04,  1.14s/ examples]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.3975593}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.90252525}]}
{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.89953196}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.6132235}]}
{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.70639944}]}


Map:  70%|███████   | 7/10 [00:07<00:03,  1.27s/ examples]

{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.84103245}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.87321466}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.22599277}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.5481237}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.42372894}]}
{'graded_output': [{'token_id': tensor(28777), 'token_str': 'G', 'probability': 0.25677797}]}


Map:  80%|████████  | 8/10 [00:09<00:02,  1.38s/ examples]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.94298965}]}
{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.6223856}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.38676727}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.91877407}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.44995272}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.5059207}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.81613564}]}
{'graded_output': [{'token_id': tensor(1014), 'token_str': 'The', 'probability': 0.35310444}]}
{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.49348438}]}
{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.91818935}]}
{'graded_output': [{'token_id': tensor(2301), 'token_st

Map:  90%|█████████ | 9/10 [00:18<00:03,  3.66s/ examples]

{'graded_output': [{'token_id': tensor(2501), 'token_str': 'No', 'probability': 0.9745897}]}
{'graded_output': [{'token_id': tensor(2301), 'token_str': 'Is', 'probability': 0.43321145}]}


Map: 100%|██████████| 10/10 [00:18<00:00,  1.86s/ examples]

{'graded_output': [{'token_id': tensor(5613), 'token_str': 'Yes', 'probability': 0.89880574}]}





In [17]:
# Filter the answers.
# `map` (not `filter`) is used here: the row count stays at 10, so
# `answer_filter` presumably rewrites each example's nested `answers` list
# rather than dropping whole examples — confirm against its implementation.
example_ds = example_ds.map(answer_filter)

Map: 100%|██████████| 10/10 [00:00<00:00, 1067.09 examples/s]


In [18]:
# Answers that remain after filtering, each carrying its `graded_output`.
example_ds["answers"]

[[{'answer_body': "Generally speaking, yes. Presidents often meet with with ex-presidents, regardless of political alignment.\n\nSpecifically for this case, yes. A lot. Eisenhower was President during the build-up to the Cuban Missile Crisis and widely renowned as an excellent general. Here is a recording of one such phone call:\n\n[LINK]\n\nHere's another:\n\n[LINK]",
   'answer_char_length': 407,
   'answer_created_utc': 1347938144000,
   'answer_deleted': False,
   'answer_id': 'c69qhes',
   'answer_retrieved_on': 1429926087000,
   'answer_score': 2,
   'graded_output': [{'probability': 0.7641037106513977,
     'token_id': 5613,
     'token_str': 'Yes'}]},
  {'answer_body': "Interestingly enough, Eisenhower was sharply criticized shortly before the Crisis because he made a statement criticizing Kennedy's foreign policy, thus breaking the informal rule against past presidents criticizing sitting ones. As Barrel-rider stated, however, Eisenhower offered his support during the Crisis."

### Toxicity

Currently not in use.

In [5]:
# Text-classification pipeline backed by the `tomh/toxigen_roberta` checkpoint,
# placed on the first CUDA device (device=0).
toxicity_pipe = pipeline("text-classification", model="tomh/toxigen_roberta", device=0)

In [13]:
# Smoke test: with top_k=None the pipeline returns a score for every label
# (LABEL_0 and LABEL_1) instead of only the best one.
toxicity_pipe('I love ML', top_k=None)

[{'label': 'LABEL_0', 'score': 0.9992640614509583},
 {'label': 'LABEL_1', 'score': 0.0007359233568422496}]

In [6]:
def run_toxicity_pipe(text):
    """Return the `LABEL_1` score that `toxicity_pipe` assigns to `text`.

    `LABEL_1` is presumably the "toxic" class of the classifier (a benign
    sentence scores close to 0 on it) — confirm against the model card.

    Args:
        text: The string to classify.

    Returns:
        The score of the `LABEL_1` entry, or the neutral fallback 0.5 when the
        pipeline call fails or its output contains no `LABEL_1` entry.

    Only `Exception` subclasses trigger the fallback; `KeyboardInterrupt` and
    `SystemExit` propagate so that a long-running job can still be interrupted.
    """
    try:
        label_scores = toxicity_pipe(text, top_k=None)
        toxic_entries = [entry for entry in label_scores if entry['label'] == 'LABEL_1']
    except Exception:
        # Best effort: a failed classification yields the neutral score.
        return 0.5

    if not toxic_entries:
        # The pipeline answered, but without the label we are looking for.
        return 0.5

    return toxic_entries[0]['score']

In [11]:
# Score every answer attached to example 65 and print one toxicity value per line.
example_answers = dataset[65]['answers']
for entry in example_answers:
    toxicity_score = run_toxicity_pipe(entry["answer_body"])
    print(toxicity_score)

0.0006564322975464165
0.0007360000745393336
0.0007385569042526186
0.000814662838820368
