In [30]:
import re
import ast
import pyperclip
import pprint
import os
from together import Together
from openai import OpenAI
import pandas as pd 
import glob

def _read_key_file(path):
    """Return the contents of the key file at ``path`` with surrounding whitespace removed."""
    # A context manager closes the handle deterministically; a bare
    # ``open(path).read()`` leaves that to the garbage collector.
    with open(path) as key_file:
        return key_file.read().strip()


# NOTE(review): both key paths are absolute and user-specific, so this cell
# only runs on the original author's machine unless they are edited.
together_client = Together(api_key=_read_key_file('/Users/spangher/.togetherai-usc-key.txt'))
# OpenAI() is constructed without an explicit key below, so the key is exported
# to the environment first (presumably where the client looks it up - confirm).
os.environ['OPENAI_API_KEY'] = _read_key_file('/Users/spangher/.openai-isi-project-key.txt')
openai_client = OpenAI()

def query_together(prompt, client=together_client):
    """Ask the Llama 3.1 70B Instruct Turbo chat model one question and return its reply text.

    Parameters:
        prompt (str): Text sent as the user turn.
        client: Object exposing ``chat.completions.create``; defaults to the
            module-level ``together_client``.

    Returns:
        The ``content`` of the first choice's message.
    """
    system_turn = {"role": "system", "content": "You are an experienced journalist."}
    user_turn = {"role": "user", "content": prompt}
    # Sampling settings are passed through to the API unchanged.
    sampling_options = dict(
        max_tokens=1048,
        temperature=0.1,
        top_p=0.7,
        top_k=50,
        repetition_penalty=1,
        stop=["<|eot_id|>","<|eom_id|>"],
    )
    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        messages=[system_turn, user_turn],
        **sampling_options,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def query_openai(prompt, client=openai_client):
    """Ask ``gpt-4o-mini`` one question and return the first choice's message.

    Parameters:
        prompt (str): Text sent as the user turn.
        client: Object exposing ``chat.completions.create``; defaults to the
            module-level ``openai_client``.

    Returns:
        The message object of the first choice. Note this is the message
        itself, not its ``.content`` string.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful journalist assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(model="gpt-4o-mini", messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message

def parse_sources(input_string):
    """Parse a free-text list of sources into a list of ``{field: value}`` dicts.

    The text is expected to contain blocks separated by blank lines, each block
    made of ``Field Name: Value`` lines. Everything before the first ``Name:``
    is discarded. Lines without a colon are appended to the preceding field.

    Parameters:
        input_string: The raw text. Anything that is not a ``str`` (e.g. ``None``
            or ``NaN`` from a missing response) is treated as "no sources".

    Returns:
        list[dict]: One dict per non-empty block, keyed by cleaned field name.
        Returns ``[]`` when there is no ``Name:`` marker or the input is not text.
    """
    # A missing response cannot be searched with a regex; treat it as empty.
    if not isinstance(input_string, str):
        return []

    # Remove any starting text before the first 'Name:'
    match = re.search(r'\bName:', input_string)
    if not match:
        return []
    input_string = input_string[match.start():]

    source_list = []
    # Blocks are separated by at least one blank (or whitespace-only) line.
    for block in re.split(r'\n\s*\n', input_string):
        block = block.strip()
        if not block:
            continue

        source_dict = {}
        current_field = None
        for line in block.split('\n'):
            line = line.strip()
            if not line:
                continue

            # Match any line that looks like 'Field Name: Value'
            m = re.match(r'^([^:]+):\s*(.*)', line)
            if m:
                field_name = m.group(1).strip()
                # Strip list numbering such as '1.', '(2)' or '3:' from the field name.
                field_name = re.sub(r'\s*(\(?\d+\)?[:.\s]*)', '', field_name).strip()
                # Strip hyphens, markdown asterisks and the 'Grouped' prefix.
                # '\*+' (not '\**') is required: a pattern that can match the empty
                # string always wins the alternation, so 'Grouped' was never reached.
                field_name = re.sub(r'-|\*+|Grouped', '', field_name).strip()
                field_value = m.group(2).strip()
                # Greedy on purpose-as-written: removes everything from the first
                # '(' to the last ')' on the line.
                field_value = re.sub(r'\(.*\)', '', field_value).strip()
                source_dict[field_name] = field_value
                current_field = field_name
            elif current_field:
                # No field name on this line: it continues the previous field.
                source_dict[current_field] += ' ' + line

        # Only add the source if it contains at least one field
        if source_dict:
            source_list.append(source_dict)
    return source_list

# Set before sentence_transformers is imported further down in this cell.
# NOTE(review): presumably silences the HuggingFace tokenizers fork/parallelism warning - confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

from tqdm.auto import tqdm
def batchify(iterable, n=1):
    """Yield successive slices of ``iterable``, each holding at most ``n`` items.

    ``iterable`` must support ``len()`` and slicing; the final slice may be
    shorter than ``n``.
    """
    total = len(iterable)
    starts = range(0, total, n)
    yield from (iterable[start:min(start + n, total)] for start in starts)

import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import jsonlines
from io import StringIO
import glob
import json
import re 

def format_batch_prompt(
    prompts,
    model="gpt-4o-mini",
    max_tokens=1000,
    system_prompt="You are a helpful journalist's assistant.",
):
    """Wrap each prompt in a chat-completions request record for a batch file.

    Parameters:
        prompts (iterable of str): User-turn texts, one request per prompt.
        model (str): Model name placed in each request body.
        max_tokens (int): ``max_tokens`` placed in each request body.
        system_prompt (str): Content of the system turn.

    Returns:
        list[dict]: One record per prompt with keys ``custom_id``, ``method``,
        ``url`` and ``body``. ``custom_id`` is ``request-<position>``, counted
        from 0 within this call, so ids are only unique per call.
    """
    return [
        {
            "custom_id": f"request-{i}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": p},
                ],
                "max_tokens": max_tokens,
            },
        }
        for i, p in enumerate(prompts)
    ]

import random
def create_triplets(full_data_exp_df, max_negatives_per_positive=10, max_positives=10, seed=None):
    """
    Build ``{'anchor', 'positive', 'negative'}`` triplets from labelled source pairs.

    Rows are grouped by ``source_1`` (the anchor). Within a group, every
    ``source_2`` labelled "yes" is a positive and every one labelled "no" is a
    negative; groups lacking either kind are skipped. Labels are compared after
    stripping whitespace and lower-casing, on a copy: the caller's DataFrame is
    left untouched.

    Parameters:
        full_data_exp_df (pd.DataFrame): DataFrame containing 'source_1', 'source_2', and 'output' columns.
        max_negatives_per_positive (int or None): Cap on negatives sampled per anchor.
            One sample is drawn per anchor and shared by all of its positives.
            A falsy value keeps every negative.
        max_positives (int or None): Cap on positives sampled per anchor.
            A falsy value keeps every positive.
        seed (int or None): When given, sampling uses a private ``random.Random(seed)``
            so the result is reproducible. When ``None``, the global ``random``
            state is used.

    Returns:
        triplets (list): List of dictionaries containing 'anchor', 'positive', and 'negative' keys.
    """
    sampler = random.Random(seed) if seed is not None else random

    # Normalise labels without writing back into the caller's frame.
    labels = full_data_exp_df['output'].str.strip().str.lower()
    labelled = full_data_exp_df[['source_1', 'source_2']].assign(output=labels)

    triplets = []
    for anchor, group in labelled.groupby('source_1'):
        positives = group.loc[group['output'] == 'yes', 'source_2'].tolist()
        negatives = group.loc[group['output'] == 'no', 'source_2'].tolist()

        # A triplet needs at least one example of each kind.
        if not (positives and negatives):
            continue

        if max_positives:
            positives = sampler.sample(positives, min(max_positives, len(positives)))
        if max_negatives_per_positive:
            negatives = sampler.sample(
                negatives, min(max_negatives_per_positive, len(negatives))
            )

        # Cross product of the kept positives and negatives for this anchor.
        triplets.extend(
            {'anchor': anchor, 'positive': pos, 'negative': neg}
            for pos in positives
            for neg in negatives
        )

    return triplets

# Read in Keyword Files

In [2]:
# Read every file under ../data/v2_narr_keywords/ as JSON-lines and stack them
# into a single frame. glob does not guarantee any particular file order.
files = glob.glob('../data/v2_narr_keywords/*')
dfs = []
for f in files:
    dfs.append(pd.read_json(f, lines=True))
df = pd.concat(dfs)

In [3]:
# Parse each 'response' into a list of source dicts, then explode so there is
# one row per parsed source. dropna() removes any row with a missing value in
# any column, which includes rows whose response produced no sources.
source_df = (
    df
    .assign(parsed_sources=lambda df: df['response'].apply(parse_sources))
    .explode('parsed_sources')
    .dropna()
)

# Expand the per-source dicts into columns and place them next to 'url'.
# 'url' is re-indexed from 0 so it lines up positionally with the new frame.
source_df = (source_df[['url', 'parsed_sources']]
 .pipe(lambda df: pd.concat([
     df['url'].reset_index(drop=True),
     pd.DataFrame(df['parsed_sources'].tolist())
 ], axis=1)
    )
)

In [5]:
# Keep only the fields used downstream; rows with missing values are retained
# (the trailing dropna is commented out).
cols_to_keep = ['url', 'Name', 'Original Name', 'Narrative Function', 'Is_Error']
source_df = source_df[cols_to_keep]#.dropna()

In [5]:
# The frame built above is replaced by a previously saved CSV, so from here on
# the notebook works from that file rather than from the parsing cells.
# NOTE(review): no index_col is passed here, whereas the Evaluate section reads
# the same file with index_col=0 - confirm which index downstream code expects.
# source_df.to_csv('../make_source_label_hierarchy/similarity_training_data/source-df-to-label.csv')
source_df = pd.read_csv('../make_source_label_hierarchy/similarity_training_data/source-df-to-label.csv')

In [244]:
# Random pairs of narrative-function descriptions: two independent samples
# (with replacement, 10x the number of non-error rows) are placed side by side
# as 'source_1' / 'source_2'. Pairs with a missing side are dropped.
# No random_state is set, so the pairs differ on every run.
pairwise_samples_to_evaluate = (
    source_df
     .loc[lambda df: df['Is_Error'] == 'No']['Narrative Function']
     .pipe(lambda s:
          pd.concat([
                s.sample(frac=10, replace=True).reset_index(drop=True).rename('source_1'),
                s.sample(frac=10, replace=True).reset_index(drop=True).rename('source_2')
            ], axis=1).dropna()
          )
)

In [None]:
# Embed the non-error, non-null narrative functions. Only the text before the
# first ':' of each description is encoded (split(':') then element 0).
model = SentenceTransformer("all-MiniLM-L6-v2")
embs = (
    source_df
        .loc[lambda df: df['Is_Error'] == 'No']['Narrative Function']
        .dropna()
        .pipe(lambda s: model.encode(s.str.split(':').str.get(0).tolist(), show_progress_bar=True))
)

In [735]:
# Index labels of the rows that were embedded: the same filter and dropna as
# used to build `embs`, so position i in `embs` corresponds to idx_of_df[i].
idx_of_df = source_df.loc[lambda df: df['Is_Error'] == 'No']['Narrative Function'].dropna().index

In [369]:
# All-pairs cosine similarity, keeping the lower triangle only (np.tril zeroes
# the upper triangle, which the > .3 filter then discards) so each pair appears
# once. Pairs at or above .99999 and self-pairs are removed.
# After reset_index the two index levels are 'level_0' and 'level_1'.
# NOTE(review): this materialises a dense n x n matrix - confirm memory is acceptable.
high_sim_pairs = (
    pd.DataFrame(np.tril(cosine_similarity(embs, embs)), index=idx_of_df, columns=idx_of_df)
     .unstack()
     .loc[lambda s: s > .3]
     .loc[lambda s: s < .99999]
     .reset_index()
     .loc[lambda df: df['level_0'] != df['level_1']]
)

In [433]:
# Draw 2,000,000 pairs without replacement. No random_state is set, and
# sample() raises if fewer rows than that are available.
high_sim_sample = high_sim_pairs.sample(2_000_000)

In [434]:
# Look up the description text for both members of each sampled pair (by index
# label), align them positionally as 'source_1' / 'source_2', drop pairs with a
# missing side, then drop exact duplicate text pairs.
high_sim_pairwise_samples_to_evaluate = (
    source_df['Narrative Function']
     .pipe(lambda s:
          pd.concat([
                s.loc[high_sim_sample['level_0']].rename('source_1').reset_index(drop=True),
                s.loc[high_sim_sample['level_1']].rename('source_2').reset_index(drop=True),
            ], axis=1).dropna()
          )
    .drop_duplicates()
)

In [456]:
# k is the number of source pairs shown per prompt.
# `prompt` is a str.format template: it contains the literal placeholder {k}
# plus {source_1_1}, {source_2_1}, ... {source_1_k}, {source_2_k}. The doubled
# braces in the f-string below are what leave those placeholders unformatted.
# The template always expects exactly k pairs.
k = 5
prompt = ("""I will show you {k} pairs of sources, all from different news articles.

Are the two sources in each pair playing similar narrative roles in their respective articles?
Think broadly about the role the source is playing, given the description. Don't pay attention to the specific events of each story.
Answer with "Yes" or "No".

""" + 
'\n'.join(list(map(lambda x: f'{x}. Source 1: {{source_1_{x}}}, Source 2: {{source_2_{x}}}', range(1, k+1))))

+"""

Answer each sequentially and number them with 1., 2., 3., etc.""")
# Explain your decision and then answer with "Yes" or "No".

In [None]:
# Fill the prompt template once per group of k pairs.
all_prompts = []
# Ceiling division by k (not a hard-coded 5) so the progress bar total is right
# even when the number of pairs is not a multiple of k.
n_batches = -(-len(high_sim_pairwise_samples_to_evaluate) // k)
for t in tqdm(batchify(high_sim_pairwise_samples_to_evaluate, k), total=n_batches):
    if len(t) < k:
        # The template has placeholders for exactly k pairs; formatting a short
        # final batch would raise KeyError, so it is skipped.
        continue
    input_dict = {}
    for i, (_, row) in enumerate(t.iterrows()):
        # Rename 'source_1'/'source_2' to 'source_1_<n>'/'source_2_<n>' (1-based)
        # to match the placeholder names in the template.
        r = row.rename(lambda x: f"{x}_{i+1}")
        input_dict.update(r.to_dict())
    p = prompt.format(k=k, **input_dict)
    all_prompts.append(p)

In [658]:
# p_idx = 1
# resp = query_together(all_prompts[p_idx])
# # print(resp)
# resp = query_openai(all_prompts[p_idx])
# # print(resp)
# print(all_prompts[p_idx])

In [496]:
# Split the prompts into groups of at most 40,000 and convert each group into
# batch request records; each group becomes one batch file in the next cell.
# NOTE(review): 40_000 is presumably chosen to stay under a per-batch request
# limit of the API - confirm.
prompt_batches = []
for prompt_batch in batchify(all_prompts, 40_000):
    batch_to_write = format_batch_prompt(prompt_batch)
    prompt_batches.append(batch_to_write)

In [498]:
# ! mkdir -p ../data/openai-batches/narr-role-similarity/

In [499]:
# Write each group of request records to its own numbered .jsonl file.
# The target directory must already exist (the mkdir above is commented out).
for i, b in enumerate(prompt_batches):
    with jsonlines.open(f'../data/openai-batches/narr-role-similarity/narr-role-similarity-batch-{i}.jsonl', 'w') as f:
        f.write_all(b)

In [1]:
import pandas as pd 

In [7]:
# Reload the first written batch file to inspect it. Note `t` is also used as
# a loop variable and a token count elsewhere in this notebook.
t = pd.read_json("../data/openai-batches/narr-role-similarity/narr-role-similarity-batch-0.jsonl", lines=True)

In [18]:
# Show the user-turn text (second message) of the second request in the file.
print(t['body'][1]['messages'][1]['content'])

I will show you 5 pairs of sources, all from different news articles.

Are the two sources in each pair playing similar narrative roles in their respective articles?
Think broadly about the role the source is playing, given the description. Don't pay attention to the specific events of each story.
Answer with "Yes" or "No".

1. Source 1: "Supporting Organization": This source is an organization that is supporting the efforts of the main actor and providing additional context., Source 2: "Government Response": This source is a government official, providing a response to the breach and calling for regulatory action.
2. Source 1: "Example": This source is used to provide an example or anecdote to illustrate a point., Source 2: "Explainer": This source is used to explain how the software works and its benefits.
3. Source 1: "Example": This source is used as an example to illustrate the mobile payment solution., Source 2: "Evidence": This source is used to provide evidence for the effectiv

In [500]:
# rm ../data/openai-batches/narr-role-similarity/b*

In [501]:
# Upload every batch input file and keep the returned file records.
all_batch_files = []
# The glob pattern also matches downloaded '<name>__output.jsonl' files once
# they exist in this directory; exclude them so results are never re-uploaded
# as inputs. Sorting makes the upload order deterministic.
batch_input_paths = sorted(
    path
    for path in glob.glob('../data/openai-batches/narr-role-similarity/narr-role-similarity-batch-*')
    if '__output' not in path
)
for f in tqdm(batch_input_paths):
    # Close the handle as soon as the upload call returns.
    with open(f, "rb") as batch_fh:
        batch_input_file = openai_client.files.create(file=batch_fh, purpose="batch")
    all_batch_files.append(batch_input_file)

  0%|          | 0/10 [00:00<?, ?it/s]

In [541]:
# One row per uploaded file record; the download cell later reads its 'id' and
# 'filename' columns.
all_batch_file_df = pd.DataFrame(list(map(lambda x: x.to_dict(), all_batch_files)))

In [493]:
# for b in all_batch_files:
    # openai_client.files.delete(b.id)

In [509]:
# Create one batch job per uploaded file and remember the job ids.
# NOTE(review): the slice [1:] skips the first uploaded file - confirm this is
# intentional (e.g. it was submitted separately) and not a leftover.
# NOTE(review): the metadata description looks like placeholder text - confirm.
all_batch_ids = []
for b in all_batch_files[1:]:
    batch_input_file_id = b.id
    b_record = openai_client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
          "description": "nightly eval job"
        }
    )
    all_batch_ids.append(b_record.id)

In [None]:
## download data from OpenAI

# Fetch the current record of every submitted batch job.
all_batch_info = list(map(openai_client.batches.retrieve, all_batch_ids))
all_batch_info_df = pd.DataFrame(list(map(lambda x: x.to_dict(), all_batch_info)))

# Pair each job's output file id with the original input filename by joining
# on the input file id. This rebinds `df`, which earlier held the keyword data.
df = (
    all_batch_info_df
         [['input_file_id', 'output_file_id']]
         .merge(all_batch_file_df[['id', 'filename']], left_on='input_file_id', right_on='id')
         [['output_file_id', 'filename']]
)
# Save each output next to its input as '<input name>__output<ext>'.
# NOTE(review): presumably a job that has not finished has no output_file_id,
# in which case the content call below would fail - confirm and guard if needed.
for _, (openai_completed_file_id, orig_filename) in df.iterrows():
    fname, fext = os.path.splitext(orig_filename)
    fout = f'../data/openai-batches/narr-role-similarity/{fname}__output{fext}'
    file_response = openai_client.files.content(openai_completed_file_id)
    with open(fout, 'w') as f:
        f.write(file_response.text)

In [None]:
import glob, os, re
import jsonlines
## read input/output files

# Number of source pairs asked about in each prompt; a request is only usable
# if both its prompt and its answer contain exactly this many numbered lines.
n_pairs_per_prompt = 5

output_files = glob.glob('../data/openai-batches/narr-role-similarity/*__output.jsonl')
input_files = glob.glob('../data/openai-batches/narr-role-similarity/*.jsonl')
# '*.jsonl' also matches the output files, so filter them back out.
input_files = list(filter(lambda x: '__output' not in x, input_files))

all_data = []
for f in output_files:
    # Context manager closes the reader once the file has been consumed.
    with jsonlines.open(f) as reader:
        data = list(reader)
    data_df = pd.DataFrame(data)
    data_df['filename'] = os.path.basename(f)
    all_data.append(data_df)

all_input_data = []
for f in input_files:
    with jsonlines.open(f) as reader:
        data = list(reader)
    data_df = pd.DataFrame(data)
    data_df['filename'] = os.path.basename(f)
    all_input_data.append(data_df)

all_output_data_df = pd.concat(all_data)
all_input_data_df = pd.concat(all_input_data)

## Inputs: pull out the user-turn text and derive the join key from the filename.
all_input_data_df['input'] = all_input_data_df['body'].str.get('messages').str.get(1).str.get('content')
# regex=False: '.jsonl' is a literal suffix. The default for `regex` differs
# between pandas versions, so it is stated explicitly.
all_input_data_df['f_key'] = all_input_data_df['filename'].str.replace('.jsonl', '', regex=False)
## Outputs: pull out the reply text; the join key is the part before '__output'.
all_output_data_df['output'] = all_output_data_df['response'].str.get('body').str.get('choices').str.get(0).str.get('message').str.get('content')
all_output_data_df['f_key'] = all_output_data_df['filename'].str.split('__').str.get(0)

# custom_id restarts in every file, so the file key is part of the join.
full_data_df = all_input_data_df[['custom_id', 'f_key', 'input']].merge(
    all_output_data_df[['custom_id', 'f_key', 'output']],
    on=['custom_id', 'f_key'],
)
# Keep only the numbered pair lines of each prompt, and split each reply into lines.
full_data_df['input'] = full_data_df['input'].str.split('\n').apply(lambda x: list(filter(lambda y: re.search(r'^\d\.', y), x)))
full_data_df['output'] = full_data_df['output'].str.split('\n')
# explode() on two columns requires equal list lengths per row, so both sides
# are checked; rows with a malformed prompt or reply are dropped.
full_data_df = full_data_df.loc[
    lambda df: (df['output'].str.len() == n_pairs_per_prompt)
    & (df['input'].str.len() == n_pairs_per_prompt)
]
full_data_exp_df = full_data_df.explode(['input', 'output'])
# Splitting on 'Source <digit>:' yields [numbering, source 1 text, source 2 text].
full_data_exp_df['input_chunks'] = full_data_exp_df['input'].str.split(r'Source \d\:', regex=True)

full_data_exp_df = (
    full_data_exp_df
         .assign(source_1=lambda df: df['input_chunks'].str.get(1).str.strip())
         .assign(source_2=lambda df: df['input_chunks'].str.get(2).str.strip())
         .drop(columns=['input', 'input_chunks'])
)

# Remove the '1.' style numbering from each answer line.
full_data_exp_df['output'] = full_data_exp_df['output'].str.replace(r'\d\.', '', regex=True).str.strip()
full_data_exp_df = full_data_exp_df.reset_index(drop=True)
# Source 1 text still carries the ',' that separated it from 'Source 2:'.
full_data_exp_df['source_1'] = full_data_exp_df['source_1'].str.rstrip(',')

In [95]:
# full_data_exp_df.to_csv('../make_source_label_hierarchy/similarity_training_data/batch-processed-openai-labeled-data.csv.gz', compression='gzip')

In [719]:
# Build training triplets, sampling at most 8 positives and 8 negatives per anchor.
triplets = create_triplets(full_data_exp_df, max_positives=8, max_negatives_per_positive=8)

In [724]:
# Persist the triplets as JSON-lines, one triplet per line.
# NOTE(review): this writes under ../data/similarity_training_data/, while other
# cells use ../make_source_label_hierarchy/similarity_training_data/ - confirm.
with jsonlines.open('../data/similarity_training_data/source-triplets.jsonl', 'w') as f:
    f.write_all(triplets)

In [725]:
# Peek at the first two triplets.
triplets[:2]

[{'anchor': '"ADDITIONAL_INFO": This source is used to provide additional information and tips about credit scores.,',
  'positive': '"Additional Information": The source is used to provide additional information about the investigation.',
  'negative': '"Policy Announcement": This source is used to announce and explain the new policy.'},
 {'anchor': '"ADDITIONAL_INFO": This source is used to provide additional information and tips about credit scores.,',
  'positive': '"Additional Information": The source is used to provide additional information about the investigation.',
  'negative': '"Data Point": This source is used to provide data and statistics to support the article\'s claims about small business owners\' plans and concerns regarding cybersecurity and IoT investments.'}]

# Evaluate

In [15]:
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import pandas as pd 
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity

# Reload the labelled pairs from the gzipped CSV, using its first column as the
# index. This rebinds full_data_exp_df.
# NOTE(review): scratch_proj_dir is an absolute, machine-specific path.
scratch_proj_dir = '/scratch1/spangher/conditional-information-retrieval'
full_data_exp_df = pd.read_csv(
    f'{scratch_proj_dir}/make_source_label_hierarchy/similarity_training_data/batch-processed-openai-labeled-data.csv.gz', 
    index_col=0
)

In [3]:
# Load a model from a local directory instead of the named pretrained one
# (kept commented out for comparison).
# NOTE(review): the embeddings used below are loaded from a text file, not
# computed with `model` in the cells shown - confirm `model` is still needed.
# model = SentenceTransformer("all-MiniLM-L6-v2")
model = SentenceTransformer("../make_source_label_hierarchy/trained-models/mpnet-trained-model")

In [7]:
# Reload the source table (first CSV column as index) and keep only rows that
# are not flagged as errors and have a narrative function.
source_df = pd.read_csv(f'{scratch_proj_dir}/make_source_label_hierarchy/similarity_training_data/source-df-to-label.csv', index_col=0)
source_df = source_df.loc[lambda df: df['Is_Error'] == 'No']
source_df = source_df.loc[lambda df: df['Narrative Function'].notnull()]

In [61]:
# Load precomputed embeddings from a text file.
# NOTE(review): the next cells label the rows of `embs` with source_df.index,
# so the file must hold one row per filtered source_df row, in the same order - confirm.
embs = np.loadtxt('../make_source_label_hierarchy/similarity_training_data/custom-embs.txt')
# embs = np.load('/scratch1/spangher/conditional-information-retrieval/make_source_label_hierarchy/similarity_training_data/source_label_hierarchy_train_embeddings.txt.npy')

In [62]:
# Sanity check: (number of embedded rows, embedding width).
embs.shape

(35903, 768)

In [63]:
# All-pairs cosine similarity labelled by source_df's index, lower triangle
# only, keeping entries above .1. The result is a Series with a two-level
# index (one level per member of the pair). Unlike the earlier cell of the
# same name, self-pairs on the diagonal are not removed here.
high_sim_pairs = (
    pd.DataFrame(np.tril(cosine_similarity(embs, embs)), index=source_df.index, columns=source_df.index)
     .unstack()
     .loc[lambda s: s > .1]
)

In [64]:
# For every retained pair, look up both description texts and attach the
# score, aligned positionally. The column is named 'distance' but holds the
# cosine similarity computed above (higher means more similar).
input_sent_df = pd.concat([
    source_df.loc[high_sim_pairs.reset_index()['level_0']]['Narrative Function'].to_frame('sent_1').reset_index(drop=True),
    source_df.loc[high_sim_pairs.reset_index()['level_1']]['Narrative Function'].to_frame('sent_2').reset_index(drop=True),
    high_sim_pairs.to_frame('distance').reset_index(drop=True)
], axis=1)
# high_sim_pairs.reset_index()

In [82]:
# Attach the model's similarity score to each labelled pair by matching on the
# two description texts. It is a left join, so labelled pairs with no match
# keep NaN in 'distance'.
# NOTE(review): if the same text pair occurs more than once in input_sent_df,
# the join duplicates the labelled row - confirm this is acceptable.
sbert_eval_data = (
    full_data_exp_df
        .merge(input_sent_df, left_on=['source_1', 'source_2'], right_on=['sent_1', 'sent_2'], how='left')
        # [['output', 'distance']]
)

In [66]:
# Mean similarity score per label. fillna(0) counts unmatched pairs as score 0
# (and also fills missing values in every other column with 0).
sbert_eval_data.fillna(0).groupby('output')['distance'].mean()

output
No     0.348549
Yes    0.517865
Name: distance, dtype: float64

In [816]:
# Same expression as the previous cell, kept with the output of a different
# run (its labels are lower-case and its dtype is float32).
sbert_eval_data.fillna(0).groupby('output')['distance'].mean()

output
no     0.280045
yes    0.365856
Name: distance, dtype: float32

In [67]:
from sklearn.metrics import precision_score, recall_score, f1_score
# Sweep a decision threshold over the similarity score from 0 to 0.95 in steps
# of 0.05 and record F1, precision and recall at each value.
f1s = []
for i in tqdm(np.arange(0, 1, .05)):
    # y_true: label mapped to 1/0 (both capitalisations accepted; any other
    # value maps to NaN). y_pred: 1 when the score is >= the threshold i.
    # NOTE(review): fillna and the y_true mapping do not depend on i and could
    # be computed once outside the loop.
    binarized_data = (
        sbert_eval_data
             .fillna(0)
             .assign(y_true=lambda df: df['output'].map({'yes': 1, 'no': 0, 'Yes': 1, 'No': 0}))
             .assign(y_pred=lambda df: df['distance'].fillna(0).pipe(lambda s: pd.Series(np.where(s < i, 0, 1))))
    )

    f1s.append({
        'threshold': i,
        'f1_score': f1_score(binarized_data['y_true'], binarized_data['y_pred']),
        'prec_score': precision_score(binarized_data['y_true'], binarized_data['y_pred']),
        'recall_score': recall_score(binarized_data['y_true'], binarized_data['y_pred']),
    })

  0%|          | 0/20 [00:00<?, ?it/s]

In [84]:
# Candidate false positives: pairs scored above .7 but labelled 'No'.
# The label comparison is case-sensitive.
prec_errors = sbert_eval_data.loc[lambda df: df['distance'] > .7].loc[lambda df: df['output'] == 'No']

In [93]:
# One example of a candidate false negative: scored below .3 but labelled
# 'Yes' (third such row, shown as a dict).
sbert_eval_data.loc[lambda df: df['distance'] < .3].loc[lambda df: df['output'] == 'Yes'].iloc[2].to_dict()

{'custom_id': 'request-36',
 'f_key': 'narr-role-similarity-batch-9',
 'output': 'Yes',
 'source_1': '"Alternative Perspective": This source is used to provide an alternative perspective on the labor dispute, and to offer a different analysis of the implications of the agreement.',
 'source_2': '"Counterpoint": This source provides a contrasting view to the main actor\'s plans and actions.',
 'sent_1': '"Alternative Perspective": This source is used to provide an alternative perspective on the labor dispute, and to offer a different analysis of the implications of the agreement.',
 'sent_2': '"Counterpoint": This source provides a contrasting view to the main actor\'s plans and actions.',
 'distance': 0.2797901793754877}

In [80]:
# Share of pairs per score bucket (scores rounded to one decimal place).
sbert_eval_data['distance'].round(1).value_counts(normalize=True).sort_index()

distance
0.1    0.027467
0.2    0.134139
0.3    0.246986
0.4    0.249522
0.5    0.188544
0.6    0.101467
0.7    0.037279
0.8    0.012369
0.9    0.002132
1.0    0.000095
Name: proportion, dtype: float64

In [68]:
# Threshold sweep results as a table.
pd.DataFrame(f1s)

Unnamed: 0,threshold,f1_score,prec_score,recall_score
0,0.0,0.433902,0.277059,1.0
1,0.05,0.43504,0.277989,0.999988
2,0.1,0.43504,0.277989,0.999988
3,0.15,0.444454,0.285749,0.999668
4,0.2,0.461914,0.300562,0.997294
5,0.25,0.49141,0.327175,0.986723
6,0.3,0.532405,0.368773,0.957085
7,0.35,0.578351,0.425103,0.904374
8,0.4,0.623124,0.498137,0.831842
9,0.45,0.632826,0.573694,0.70555


In [819]:
# Same expression as the previous table cell, kept with the output of a
# different run (that output has only threshold and f1_score columns).
pd.DataFrame(f1s)

Unnamed: 0,threshold,f1_score
0,0.0,0.449812
1,0.05,0.394453
2,0.1,0.394453
3,0.15,0.394453
4,0.2,0.394453
5,0.25,0.394453
6,0.3,0.394453
7,0.35,0.460963
8,0.4,0.471455
9,0.45,0.476128


In [54]:
from sklearn.metrics import roc_auc_score
# Threshold-free summary: ROC AUC of the similarity score against the 1/0
# label, with unmatched pairs scored 0 by fillna.
(sbert_eval_data
 .fillna(0)
 .assign(output=lambda df: df['output'].map({'yes': 1, 'no': 0, 'Yes': 1, 'No': 0}))
 .pipe(lambda df: roc_auc_score(y_true=df['output'], y_score=df['distance']))
)

0.8306007014120191

In [439]:
import tiktoken
# Tokenizer used below to count prompt tokens.
# NOTE(review): presumably the encoding matching the model used in the batch requests - confirm.
enc = tiktoken.get_encoding("o200k_base")

In [440]:
# Tokenize every prompt; relies on `all_prompts` built in the prompt-assembly cell.
tokenized_prompts = enc.encode_batch(all_prompts)

In [444]:
# Total number of prompt tokens across all prompts. Rebinds `t`.
t = sum(list(map(len, tokenized_prompts)))

In [446]:
# Tokens in millions times 0.075.
# NOTE(review): presumably a price per million input tokens, giving an input-cost
# estimate that excludes output tokens - confirm the rate.
t / 1_000_000 * 0.075

9.963498750000001

# Try Original Narrative DF

In [106]:
import json
def _load_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    # Context manager closes the handle; ``json.load(open(path))`` left it open.
    with open(path) as json_file:
        return json.load(json_file)


# Concatenate the train and test narrative records into one list.
parsed_narr = (
    _load_json('../data/v2_narr_parsed/v2_train_set_narr.json') +
    _load_json('../data/v2_narr_parsed/v2_test_set_narr.json')
)

narr_df = pd.DataFrame(parsed_narr)

In [108]:
# One row per source: explode the 'sources' lists, drop rows with any missing
# value, then expand each source dict into columns next to 'url'. The index is
# reset first so the two pieces line up row by row in the concat.
source_narr_df = narr_df.explode('sources').dropna().reset_index(drop=True)
source_narr_df = pd.concat([
    source_narr_df[['url']],
    source_narr_df['sources'].pipe(lambda s: pd.DataFrame(s.tolist()))
], axis=1)

In [110]:
# Note the column here is 'Narrative function' (lower-case f), distinct from
# source_df's 'Narrative Function'.
source_narr_df['Narrative function']

0         Provides expert opinion on the risks of Facebo...
1         Provides information about the Cambridge Analy...
2         Provides a notification about push notifications.
3         Introduces the topic of Facebook's data collec...
4         Provides the results of the 2012 Paychex Payro...
                                ...                        
343923    Provides information about its own business pr...
343924    Moderates a discussion about Wall Street's val...
343925    Provides information about market data and ind...
343926    Provides information about the Dow Jones brand...
343927                    Claims copyright over the article
Name: Narrative function, Length: 343928, dtype: object

In [127]:
# Join the two labelings of the same source on (url, Name) and show one joined
# row, so 'Narrative Function' and 'Narrative function' can be compared.
(source_df
     .loc[lambda df: df['Is_Error'] == 'No']
     [['url', 'Name', 'Narrative Function']]
     .merge(source_narr_df[['url', 'Name', 'Narrative function']], on=['url', 'Name'])
     .iloc[5]
     .to_dict()
)

{'url': 'www.ctvnews.ca/sci-tech/instagram-is-rolling-out-new-supervision-tools-for-parents-in-the-u-s-1.5821572',
 'Name': 'Instagram',
 'Narrative Function': '"Event Context": This source provides context for the events that led to the release of the new tools, including the pause of the plan to release a version of Instagram for kids under 13.',
 'Narrative function': 'Provides information about its own actions and features.'}