In [178]:
sys.path.insert(0, '../tasks/quote_attribution/other_platforms/span-detection-approaches/')
from qa_dataset import fix_quote_type
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import glob
import os 
import json
import pandas as pd 
import re
import numpy as np
import string

In [659]:
def clean(x):
    if pd.isnull(x):
        return x
    x = x.lower()
    words_to_remove = ['the']
    for w in words_to_remove:
        x = (' %s ' % x).replace(' %s ' % w, ' ')
    x = re.sub('\s+', ' ', x)
    x = re.sub('\d+', '', x)
    for p in string.punctuation:
        x = x.replace(p, '')
    return x.strip()

def test_in(true_label, gpt3_guess):
    if pd.isnull(true_label) or pd.isnull(gpt3_guess):
        return np.nan
    
    true_label, gpt3_guess = clean(true_label), clean(gpt3_guess)
    if true_label == gpt3_guess:
        return True
    if true_label in gpt3_guess:
        return True
    if gpt3_guess in true_label:
        return True
    return False


def read_results(f):
    callback_files = glob.glob(os.path.join(f, '*'))

    all_results = []
    for r in callback_files:
        callback_results = json.load(open(r))
        all_results.append(callback_results)

    all_results_df = pd.DataFrame(all_results)

    _e_cols = list(filter(lambda x: x.endswith('_e'), all_results_df.columns))
    # span-prediction and token classification
    if len(_e_cols) > 0:
        all_results_df = all_results_df[_e_cols]

    return all_results_df.loc[lambda df: df.full_e.idxmax()]

def get_openai_results(detection_df, attribution_df, results_type='true_label'):
    '''
    detection_df has columns  =   'doc_id', 'sent_idx', 'sent', 'is_quote_true_label', 'detection_prob'
    attribution_df has columns=   'doc_id', 'sent_idx', 'head', 'quote_type', 'attribution'
    '''
    
    attribution_df = attribution_df.rename(columns={'y_pred': 'attribution'})
    for col_to_drop in ['sent', 'detection_prob']:
        if col_to_drop in attribution_df.columns:
            attribution_df = attribution_df.drop(col_to_drop, axis=1)
    
    docs_too_long = (
        attribution_df
        .assign(a_null=lambda df: df['attribution'].notnull())
        .groupby('doc_id')['a_null']
        .any()
        .loc[lambda df: df == False].index
    )
    detection_df = detection_df.loc[lambda df: ~df['doc_id'].isin(docs_too_long)]
    merged_df = detection_df.merge(attribution_df, how='left', left_on=['doc_id', 'sent_idx'], right_on=['doc_id', 'sent_idx'])
    if results_type == 'true_label':
#         merged_df = merged_df.loc[lambda df: df['is_quote_true_label'] == True]
        merged_df = merged_df.loc[lambda df: ~df['quote_type'].fillna('NO QUOTE').isin(to_exclude)]
    else:
        merged_df = merged_df.loc[lambda df: df['detection_prob'] > .5]
        
    merged_df = merged_df.loc[lambda df: df['sent'].str.len() > 2]#.fillna('')
    merged_df['is_match'] = merged_df.apply(lambda x: test_in(x['head'], x['attribution']), axis=1)#.fillna(False)
    
    if results_type != 'true_label':
        merged_df['is_match'] = merged_df['is_match'].fillna(False)
    
    output_results = {}
    output_results['full'] = merged_df['is_match'].mean()
    category_results = (merged_df
     .assign(quote_type=lambda df: df.apply(fix_quote_type, axis=1))
     .groupby('quote_type')['is_match']
     .mean()
     .to_dict()
    )
    output_results.update(category_results)
    return output_results

# Detection Results

In [686]:
trials = glob.glob('../tasks/quote_detection/other_platforms/results/*')

In [711]:
all_trials_res = {}
for trial in trials:
    checkpoint_metrics = glob.glob(os.path.join(trial, 'call*'))
    trial_name = os.path.basename(trial)
    
    all_results = []
    for c in checkpoint_metrics:
        res = json.load(open(c))
        all_results.append(pd.Series(res))
    res_df = pd.concat(all_results, axis=1).T
    top_res = res_df.loc[lambda df: df['full_f1'].idxmax()]
    
    all_trials_res[trial_name] = top_res

In [712]:
all_trials_res_df = pd.DataFrame(all_trials_res).T

In [718]:
all_res_final = (all_trials_res_df
 .rename(columns=lambda x: x.replace('_f1', ''))
 .pipe(lambda df: 
   pd.concat([df] + [
       merge_cluster(df, quote_type_counts, c, m) 
       for c,m in to_merge_clusters
   ], axis=1))
 [col_order]
 .pipe(lambda s: s * 100).round(1)
)

In [721]:
print(all_res_final.to_latex())

\begin{tabular}{lrrrrrrr}
\toprule
{} &  full &  DIRECT QUOTE &  INDIRECT QUOTE &  Statement/Public Speech &  Email/Social Media &  Published Work/Press Report &  Other \\
\midrule
roberta-base\_\_sentence-model\_\_background-excluded  &  87.0 &          92.4 &            99.1 &                     95.6 &                94.1 &                         90.0 &   66.6 \\
roberta-base\_\_sentence-model\_\_background-exclud... &  87.1 &          91.0 &            98.7 &                     94.1 &                92.7 &                         85.4 &   61.4 \\
big-bird\_\_full-sequence-model\_\_background-excluded &  88.2 &          92.0 &            98.7 &                     96.4 &                89.8 &                         86.4 &   65.1 \\
\bottomrule
\end{tabular}



  print(all_res_final.to_latex())


# Attribution Results

In [660]:
trials = glob.glob('../tasks/quote_attribution/other_platforms/span-detection-approaches/results/*')

In [661]:
results_across_trials = {}
for trial in trials:
    trial_name = os.path.basename(trial)
    results_across_trials[trial_name] = read_results(trial)

In [662]:
results_across_trials_df = pd.DataFrame(results_across_trials).T
results_across_trials_df = results_across_trials_df.rename(columns=lambda x: x.replace('_e', ''))

In [663]:
index_order = [
    'big-bird__token-classification-model',
    'big-bird__token-classification-model__coref_resolved',
    ## 
    'big-bird__qa-model',
    'big-bird__qa-model__coref-resolved',
    'big-bird__salience-model__augmented-data', 
    'big-bird__loss-window-2',
    'big-bird__qa-model__roberta-large',
]

In [664]:
model_name_and_path = [
    ('babbage_with_nones', 'cache/2023-01-12__detection-and-attributions__attribution-model.csv'),
    ('babbage_with_coref', 'cache/2023-01-11__babbage__with-coref-evaluation.csv'),
    ('babbage_final', 'cache/2023-01-19__babbage-without-nones-all-training.csv'), # to rerun
    ('curie_final', 'cache/2023-01-19__curie-without-nones-all-training.csv'), # to rerun
]

In [665]:
detection_df = pd.read_csv('cache/2023-01-12__detection-and-attributions__attribution-model.csv')
detection_df= (
    detection_df[['doc_id', 'sent_idx', 'sent', 'detection_label', 'detection_prob']]
    .rename(columns={'detection_label': 'is_quote_true_label'})
)

In [666]:
open_ai_result_dicts = {}
for model_name, path in model_name_and_path:
    attributions_df = pd.read_csv(path)
    results_true = get_openai_results(detection_df, attributions_df, results_type='true_label')
    results_detect = get_openai_results(detection_df, attributions_df, results_type='detection')    
    open_ai_result_dicts[model_name + '_true_label'] = results_true
    open_ai_result_dicts[model_name + '_detect'] = results_detect

In [667]:
open_ai_results_df = pd.DataFrame(open_ai_result_dicts).T

In [668]:
all_results_df = pd.concat([
    results_across_trials_df,
    open_ai_results_df
])

In [669]:
index_order_negative_results = [
# 
    'big-bird__token-classification-model',
        'big-bird__token-classification-model__coref_resolved',
# 
    'big-bird__qa-model',
        'big-bird__qa-model__coref-resolved',
        'big-bird__loss-window-2',
        'big-bird__salience-model__augmented-data',
        'big-bird__qa-model__roberta-large',
# 
    'babbage_final_true_label',
        'babbage_with_nones_true_label',
        'babbage_with_coref_true_label',    
]

In [670]:
positive_results = [
    # 
    'big-bird__token-classification-model',
    # 
    'big-bird__qa-model',
        'big-bird__qa-model__coref-resolved',    
    'babbage_final_true_label',
    'babbage_with_nones_true_label',
        'babbage_with_coref_true_label',
    'curie_final_true_label',
    
    # 
    'babbage_with_nones_detect',
    'babbage_final_detect',
    'curie_final_detect'
]

In [671]:
positive_results_rename = {
        # 
    'big-bird__token-classification-model': 'Seq. Labeling',
    # 
    'big-bird__qa-model': 'Span Detection',
        'big-bird__qa-model__coref-resolved': 'SD+Coref Resolved',
    'babbage_final_true_label': 'GPT3 1.3B',
    'babbage_with_nones_true_label': 'GPT3 1.3B +Nones',
        'babbage_with_coref_true_label': 'GPT3 1.3B + Coref',
    'curie_final_true_label': 'GPT3 6.7B',
    
    # 
    'babbage_with_nones_detect': 'GPT3 1.3B +Nones, Detection',
    'babbage_final_detect': 'GPT3 1.3B, Detection',
    'curie_final_detect': 'GPT3 6.7B, Detection'
}

In [672]:
to_merge_clusters = [
    ('Statement/Public Speech', ['STATEMENT', 'PUBLIC SPEECH'],),
    ('Email/Social Media', ['COMMUNICATION', 'SOCIAL MEDIA POST',],),
    ('Published Work/Press Report', [ 'PUBLISHED WORK', 'PRESS REPORT',]),
    ('Other', [ 'VOTE/POLL', 'DECLINED COMMENT', 'DIRECT OBSERVATION', 'PRICE SIGNAL'])
]

In [673]:
def merge_cluster(res_df, quote_type_counts, output_cluster_name, to_merge):
    res = res_df[to_merge]
    counts = quote_type_counts[to_merge]
    summation = (res * counts).sum(axis=1)
    avg = summation / counts.sum()
    return avg.to_frame(output_cluster_name)

In [674]:
col_order = [
    'full',
    'DIRECT QUOTE',
    'INDIRECT QUOTE',
    # 
    'Statement/Public Speech',
    #
    'Email/Social Media',
    # 
    'Published Work/Press Report',
    # 
    'Other'    
]

In [675]:
quote_type_counts = pd.concat(list(map(pd.DataFrame, data_to_attribute))).apply(fix_quote_type, axis=1).value_counts()

In [676]:
final_all_results_df = (all_results_df
 .loc[positive_results]
 .rename(index=positive_results_rename)
 .pipe(lambda df: 
   pd.concat([df] + [
       merge_cluster(df, quote_type_counts, c, m) 
       for c,m in to_merge_clusters
   ], axis=1))
 [col_order]
 .pipe(lambda df: df*100).round(1)
 .fillna('-')
 .rename(columns=lambda x: x.title())
)

In [677]:
final_all_results_df

Unnamed: 0,Full,Direct Quote,Indirect Quote,Statement/Public Speech,Email/Social Media,Published Work/Press Report,Other
Seq. Labeling,38.5,37.2,43.4,39.7,33.8,31.9,13.2
Span Detection,59.5,61.1,59.5,67.5,48.9,51.1,34.1
SD+Coref Resolved,53.6,51.2,56.8,61.4,73.5,54.1,37.2
GPT3 1.3B,78.9,80.9,86.9,85.0,71.9,57.9,38.3
GPT3 1.3B +Nones,79.6,81.9,87.1,86.2,69.7,60.5,33.2
GPT3 1.3B + Coref,73.2,78.7,82.5,76.3,56.1,54.4,31.2
GPT3 6.7B,91.4,94.0,95.5,91.1,91.0,81.6,57.3
"GPT3 1.3B +Nones, Detection",73.1,82.4,84.8,85.9,73.4,61.0,64.5
"GPT3 1.3B, Detection",70.9,79.5,82.9,82.9,73.4,60.5,53.0
"GPT3 6.7B, Detection",80.0,90.4,90.7,89.9,91.1,78.0,68.9


In [678]:
import pyperclip
pyperclip.copy(final_all_results_df.to_latex())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  pyperclip.copy(final_all_results_df.to_latex())


# run openai attribution model

In [133]:
ls ../

README.md             requirements-old.txt  setup.py
[34mapp[m[m/                  requirements.in       [34mtasks[m[m/
[34mmodels_other[m[m/         requirements.txt      [34mtmp[m[m/
[34mnotebooks[m[m/            [34mresources[m[m/
pluslab_setup.sh      [34mscripts[m[m/


In [134]:
import jsonlines
import sys
sys.path.insert(0, '../tasks/quote_detection_and_attribution/')

In [135]:
from score_new_articles import OpenAIAttributionDataset, OpenAIModel 

In [136]:
from transformers import AutoTokenizer

In [138]:
gpt_tok = AutoTokenizer.from_pretrained('gpt2')

In [154]:
openai_dataset = OpenAIAttributionDataset(tokenizer=gpt_tok)

In [294]:
data_to_attribute = list(jsonlines.open('../tasks/data_split_annotated_sources.jsonl'))
data_to_attribute = list(filter(lambda x: x['split'] == 'test', data_to_attribute))
data_to_attribute = list(map(lambda x: x['data'], data_to_attribute))

In [141]:
from tqdm.auto import tqdm

In [142]:
import openai

In [290]:
openai.api_key = "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"

In [291]:
CLEANR = re.compile('<.*?>')
def cleanhtml(raw_html):
    cleantext = re.sub(CLEANR, '', raw_html)
    return cleantext

class OpenAIAttributionDataset():
    def __init__(self, tokenizer, max_len=2040):
        self.prompt_template = '"""%s""".\n\nTo which source can we attribute this sentence:\n\n"""%s"""\n\n##\n\n'
        self.tokenizer = tokenizer
        self.max_len = max_len

    def make_prompts(self, one_doc):
        """
        Makes a prompt for OpenAI fine-tuned model. Here, we're not training. We expect `one_doc_df`
        to have the following columns:

        * `sent`     if detection has not been run, and
        * `sent`, `is_quote` if detection has been run.
        """

        doc_sents = list(map(lambda x: cleanhtml(x['sent']), one_doc))
        article = ' '.join(doc_sents)

        all_prompts = []
        for sent in one_doc:
            sent_text = cleanhtml(sent['sent'])
            num_toks = len(self.tokenizer.encode(sent_text))
            if (len(sent_text) > 2) and (num_toks < self.max_len):
                prompt = self.prompt_template % (article, sent_text)
            else:
                prompt = None
            all_prompts.append(prompt)

        return all_prompts

In [468]:
# model = OpenAIModel(model_name='babbage:ft-isi-nlp-2023-01-12-06-58-08')
model = OpenAIModel(model_name='curie:ft-isi-nlp:sep-training-set-base-2022-12-02-01-29-12')

In [473]:
all_attributions = []
for d in tqdm(data_to_attribute):
    doc_output = []
    prompts = openai_dataset.make_prompts(d)
    for packet, prompt in zip(d, prompts):
        if prompt is not None:
            attribution = model.query_openai_model(prompt)
            packet['attribution'] = attribution
        else:
            packet['attribution'] = 'None'
        doc_output.append(packet)
    all_attributions.append(doc_output)

  0%|          | 0/86 [00:00<?, ?it/s]

attribution error: This model's maximum context length is 2049 tokens, however you requested 2079 tokens (2069 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2100 tokens (2090 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2072 tokens (2062 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2086 tokens (2076 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2067 tokens (2057 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.


attribution error: This model's maximum context length is 2049 tokens, however you requested 2060 tokens (2050 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2078 tokens (2068 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2075 tokens (2065 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2080 tokens (2070 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2094 tokens (2084 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.


attribution error: This model's maximum context length is 2049 tokens, however you requested 2070 tokens (2060 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2060 tokens (2050 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2052 tokens (2042 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2059 tokens (2049 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2058 tokens (2048 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.


attribution error: This model's maximum context length is 2049 tokens, however you requested 2054 tokens (2044 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2067 tokens (2057 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2071 tokens (2061 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2053 tokens (2043 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2054 tokens (2044 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.


attribution error: This model's maximum context length is 2049 tokens, however you requested 2073 tokens (2063 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2058 tokens (2048 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2065 tokens (2055 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2099 tokens (2089 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2083 tokens (2073 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.


attribution error: This model's maximum context length is 2049 tokens, however you requested 2085 tokens (2075 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2072 tokens (2062 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2118 tokens (2108 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2099 tokens (2089 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2080 tokens (2070 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.


attribution error: This model's maximum context length is 2049 tokens, however you requested 2083 tokens (2073 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2075 tokens (2065 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2117 tokens (2107 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2082 tokens (2072 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.
attribution error: This model's maximum context length is 2049 tokens, however you requested 2104 tokens (2094 in your prompt; 10 for the completion). Please reduce your prompt; or completion length.


In [474]:
curie_results = pd.concat(list(map(pd.DataFrame, all_attributions)))
curie_results.to_csv('cache/2023-01-19__curie-without-nones-all-training.csv')

In [299]:
babbage_results = pd.concat(list(map(pd.DataFrame, all_attributions)))
babbage_results.to_csv('cache/2023-01-19__babbage-without-nones-all-training.csv')

In [370]:
babbage_results

Unnamed: 0,sent,sent_idx,head,quote_type,source_type,doc_id,attribution,match
0,"In March , as small businesses across the coun...",0,Shanel Fields,QUOTE,Named Individual,905,MD Ally,False
1,"For Ms. Fields , the timing could n’t have bee...",1,Shanel Fields,BACKGROUND,,905,Shanel Fields,True
2,"Her company , MD Ally , allows 911 dispatchers...",2,Shanel Fields,BACKGROUND,,905,MD Ally,False
3,“,3,,,,905,,True
4,Something that a lot of people do n’t know is ...,4,Shanel Fields,QUOTE,,905,Shanel Fields,True
...,...,...,...,...,...,...,...,...
75,"The wind sounded like an airplane engine , he ...",75,Jorge Gutierrez,QUOTE,,215,Paul Block,False
76,And the fire was running across hillsides agai...,76,,,,215,Paul Block,True
77,“,77,,,,215,,True
78,"It looked like someone was throwing sand , ” h...",78,Jorge Gutierrez,QUOTE,,215,Paul Block,False


In [321]:
babbage_results['match'] = babbage_results.apply(lambda x: test_in(x['head'], x['attribution']), axis=1)

In [371]:
detection_df

Unnamed: 0,doc_id,sent_idx,sent,is_quote_true_label,detection_prob
0,905,0,"In March , as small businesses across the coun...",True,0.001217
1,905,1,"For Ms. Fields , the timing could n’t have bee...",False,0.006014
2,905,2,"Her company , MD Ally , allows 911 dispatchers...",False,0.000058
3,905,3,“,False,0.220838
4,905,4,Something that a lot of people do n’t know is ...,True,0.999502
...,...,...,...,...,...
6716,215,75,"The wind sounded like an airplane engine , he ...",True,0.917762
6717,215,76,And the fire was running across hillsides agai...,False,0.568652
6718,215,77,“,False,0.282584
6719,215,78,"It looked like someone was throwing sand , ” h...",True,0.983908


In [323]:
to_exclude = [
    'NO QUOTE', 
    'NARRATIVE',
    'BACKGROUND',
    'DIRECT OBSERVATION'
]

In [393]:
babbage_results.loc[lambda df: ~df['quote_type'].isin(to_exclude)]['match'].mean()#.fillna(False).mean()

0.8437928669410151

In [407]:
babbage_results = babbage_results.rename(columns={'y_pred': 'attribution'})
for col_to_drop in ['sent', 'detection_prob']:
    if col_to_drop in babbage_results.columns:
        babbage_results = babbage_results.drop(col_to_drop, axis=1)
babbage_results = babbage_results.assign(doc_id=lambda df: df['doc_id'].astype(int))

merged_df = detection_df.merge(babbage_results, how='left', left_on=['doc_id', 'sent_idx'], right_on=['doc_id', 'sent_idx'])

In [410]:
merged_df.loc[lambda df: ~df['quote_type'].isin(to_exclude)]['match'].mean()#.fillna(False).mean()

0.8437928669410151

In [414]:
babbage_results

Unnamed: 0,sent_idx,head,quote_type,source_type,doc_id,attribution,match
0,0,Shanel Fields,QUOTE,Named Individual,905,MD Ally,False
1,1,Shanel Fields,BACKGROUND,,905,Shanel Fields,True
2,2,Shanel Fields,BACKGROUND,,905,MD Ally,False
3,3,,,,905,,True
4,4,Shanel Fields,QUOTE,,905,Shanel Fields,True
...,...,...,...,...,...,...,...
75,75,Jorge Gutierrez,QUOTE,,215,Paul Block,False
76,76,,,,215,Paul Block,True
77,77,,,,215,,True
78,78,Jorge Gutierrez,QUOTE,,215,Paul Block,False


In [416]:
test_babbage_results = pd.read_csv('cache/2023-01-19__babbage-without-nones-all-training.csv').assign(head=lambda df: df['head'].fillna(''))

In [427]:
t = test_babbage_results.loc[lambda df: ~df['quote_type'].fillna('NO QUOTE').isin(to_exclude)]

In [442]:
babbage_results['quote_type'].value_counts()

                                2958
QUOTE                           2373
BACKGROUND                       447
STATEMENT                        161
NARRATIVE                        134
PUBLISHED WORK                   103
PROPOSAL/ORDER/LAW                71
PRESS REPORT                      68
COMMUNICATION, NOT TO JOURNO      63
PUBLIC SPEECH, NOT TO JOURNO      60
DIRECT OBSERVATION                50
LAWSUIT                           45
PROPOSAL                          29
VOTE/POLL                         23
PRICE SIGNAL                      21
DECLINED COMMENT                  19
DOCUMENT                          19
Other: LAWSUIT                    17
SOCIAL MEDIA POST                 16
TWEET                             10
Other: Evaluation                  9
Other: DIRECT OBSERVATION          8
Other: Campaign filing             7
Other: PROPOSAL                    5
Other: Campaign Filing             4
Other: Data Analysis               1
Name: quote_type, dtype: int64

In [445]:
(test_babbage_results
#  .assign(quote_type=lambda df: df['quote_type'].fillna('NO QUOTE'))
#  .loc[lambda df: ~df['quote_type'].isin(to_exclude)]
#  .loc[lambda df: df['sent'].str.len() > 2]
 ['quote_type'].value_counts()
)

QUOTE                           2373
BACKGROUND                       447
STATEMENT                        161
NARRATIVE                        134
PUBLISHED WORK                   103
PROPOSAL/ORDER/LAW                71
PRESS REPORT                      68
COMMUNICATION, NOT TO JOURNO      63
PUBLIC SPEECH, NOT TO JOURNO      60
DIRECT OBSERVATION                50
LAWSUIT                           45
PROPOSAL                          29
VOTE/POLL                         23
PRICE SIGNAL                      21
DECLINED COMMENT                  19
DOCUMENT                          19
Other: LAWSUIT                    17
SOCIAL MEDIA POST                 16
TWEET                             10
Other: Evaluation                  9
Other: DIRECT OBSERVATION          8
Other: Campaign filing             7
Other: PROPOSAL                    5
Other: Campaign Filing             4
Other: Data Analysis               1
Name: quote_type, dtype: int64

In [507]:
get_openai_results(detection_df, test_babbage_results)

{'full': 0.888160508541915,
 '': 1.0,
 'COMMUNICATION': 0.746031746031746,
 'COURT PROCEEDING': 0.5967741935483871,
 'DECLINED COMMENT': 0.631578947368421,
 'DIRECT OBSERVATION': 0.2222222222222222,
 'DIRECT QUOTE': 0.8173701298701299,
 'INDIRECT QUOTE': 0.8759305210918115,
 'PRESS REPORT': 0.5373134328358209,
 'PRICE SIGNAL': 0.5238095238095238,
 'PROPOSAL/ORDER/LAW': 0.6190476190476191,
 'PUBLIC SPEECH': 0.8983050847457628,
 'PUBLISHED WORK': 0.6287878787878788,
 'SOCIAL MEDIA POST': 0.6538461538461539,
 'STATEMENT': 0.8375,
 'VOTE/POLL': 0.4782608695652174}