In [434]:
! pwd 

/Users/spangher/Projects/berkeley-research/information-retrieval-partially-obscured/notebooks


In [17]:
import re
import ast
import pyperclip
import pprint
import os
from together import Together

def _read_api_key(key_path='~/.togetherai-usc-key.txt'):
    """Return the API key stored in `key_path`, stripped of surrounding whitespace.

    `~` is expanded to the current user's home directory so the notebook is not
    tied to one machine's absolute path, and the file is closed deterministically.
    """
    with open(os.path.expanduser(key_path)) as key_file:
        return key_file.read().strip()


client = Together(api_key=_read_api_key())


def query_together(prompt, client=client):
    """Send `prompt` as a single user message and return the reply text.

    Parameters
    ----------
    prompt : str
        Text placed in the user message. A fixed system message
        ("You are an experienced journalist.") is sent ahead of it.
    client : Together
        Client used for the request; defaults to the module-level `client`.

    Returns
    -------
    str
        The `content` of the first choice in the response, i.e. plain text,
        not the raw response object.
    """
    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        messages=[{
                    "role": "system",
                    "content": "You are an experienced journalist."
                },
                {
                    "role": "user",
                    "content": prompt
                }],
        max_tokens=1048,
        temperature=0.1,
        top_p=0.7,
        top_k=50,
        repetition_penalty=1,
        stop=["<|eot_id|>","<|eom_id|>"],
    )
    return response.choices[0].message.content

In [None]:
import pandas as pd 
from unidecode import unidecode

# Load the sentence-level training table and keep only rows whose stripped
# `sent` text is longer than one character.
df = (
    pd.read_csv('../source_classification/full-training-df.csv', index_col=0)
    .loc[lambda frame: frame['sent'].str.strip().str.len() > 1]
)

In [422]:
# Rows of the sixth distinct doc_id (position 5, in order of first appearance),
# with the columns renamed to the names used in the prompt.
df_to_model = (
    df[df['doc_id'].eq(df['doc_id'].drop_duplicates().iloc[5])]
    .rename(columns={'sent': 'sentence', 'head': 'source'})
)
# One JSON object per line, holding only the sentence text and its source label.
json_str = df_to_model.loc[:, ['sentence', 'source']].to_json(orient='records', lines=True)

In [423]:
df_to_model

Unnamed: 0,sentence,sent_idx,source,quote_type,source_type,doc_id,label,affiliation,role
172,China is canceling plans to build more than 10...,0,China,Proposal/Order/Law,Named Group,903,True,Government,Decision Maker
173,"The announcement , made by China ’s National E...",1,China ’s National Energy Administration,Proposal/Order/Law,Named Group,903,True,Government,Decision Maker
174,That includes dozens of projects in 13 provinc...,2,China ’s National Energy Administration,Proposal/Order/Law,,903,True,,
175,Those projects alone would have had a combined...,3,Greenpeace,Published Work/Press Report,Named Group,903,True,NGO,Informational
176,The cancellations make it likelier that China ...,4,,No Quote,,903,True,,
177,"That huge figure , three times the total coal ...",5,,No Quote,,903,True,,
178,Its coal plants now run at about half of capac...,6,,No Quote,,903,True,,
179,"Nevertheless , China ’s capacity would have su...",7,,No Quote,,903,True,,
180,The new announcements are in addition to a ser...,8,,No Quote,,903,True,,
182,"The key thing is that yes , China has a long w...",10,Lauri Myllyvirta,Indirect Quote,Named Individual,903,True,NGO,Informational


In [424]:
# Put the (source, sentence) columns on the system clipboard as tab-separated text.
pyperclip.copy(df_to_model[['source', 'sentence']].to_csv(sep='\t'))

In [18]:
# Prompt template asking for one fact summary per source; `{json_str}` is filled
# with the annotated sentences. The template carries no chat control tokens:
# `<|eot_id|>` is a stop sequence in query_together and must not appear inside
# the user message itself.
INFORMATION_PROMPT = '''
Here is a news article, with each sentence annotated according to the source of its information:
    ```{json_str}```

    Please summarize each source. Include unnamed sources (e.g. "witnesses"). Include any facts that might have come from the source, even if we didn't label it.
    Generate only ONE summary per source. Group sources that are clearly the same but named slightly differently. For example: "Andrew Dresden" and "Dresden" should be grouped together as one source. "Lao Diplomats" and "Laotian Diplomats" should be grouped together as one source.
    Split source annotations that refer to multiple sources into separate summaries. For example: if the annotation is "John and Jane", generate two separate summaries, one for "John" and one for "Jane". 
    
    For each source, provide the following information:
        (1) Name: who the source is.
        (2) Original Name: What their original name(s) are in our annotations.
        (3) Information: Restate the facts provided by the source. Be as SPECIFIC and be as VERBOSE as possible. 
            Contextualize all events the source describes, names mentioned and everything the source says with AS MUCH BACKGROUND INFORMATION relating to the main events of the article so I can fully understand the information the source is giving.
            For example, don't just say "the crash", say "the plane crash carrying key senior Laotian government officials"
            Or for another example, don't just say "There were about 20 passengers on board", say "there were 20 passengers on board the plane crash carrying key senior Laotian government officials."  
    Output the summary in a list of python dictionaries with one key per number. Don't say anything else.
'''

# Prompt template asking for each source's narrative function, perspective and
# centrality; `{json_str}` is filled with the annotated sentences.
NARRATIVE_PROMPT = '''
Here is a news article, with each sentence annotated according to the source of its information:
    ```{json_str}```

    Please describe the role each source plays. Include unnamed sources (e.g. "witnesses"). Include any facts that might have come from the source, even if we didn't label it.
    Generate only ONE summary per source. Group sources that are clearly the same but named slightly differently. For example: "Andrew Dresden" and "Dresden" should be grouped together as one source. "Lao Diplomats" and "Laotian Diplomats" should be grouped together as one source.
    Split source annotations that refer to multiple sources into separate summaries. For example: if the annotation is "John and Jane", generate two separate summaries, one for "John" and one for "Jane". 
    
    For each source, provide the following information:
        (1) Name: who the source is.
        (2) Original Name: What their original name(s) are in our annotations.
        (3) Narrative function: What is their narrative function in the article? (1-2 sentences)
        (4) Perspective: What is their perspective on the main events of the article? Choose from "Authoritative", "Informative", "Supportive", "Skeptical", "Against", "Neutral".
        (5) Centrality: How central is this source to the main events of the article? Choose from "High", "Medium", "Low".
        (6) JUSTIFY your choice for (4) and (5) in 1-2 sentences. 
    Output the summary in a list of python dictionaries with one key per number. Output only the python dictionary.
'''

# Prompt template asking for each source's perspective and centrality, each with
# its own justification; `{json_str}` is filled with the annotated sentences.
# NOTE(review): the centrality labels here are "Central"/"Medium"/"Low" while
# NARRATIVE_PROMPT uses "High"/"Medium"/"Low" — confirm which set downstream code expects.
CENTRALITY_PROMPT = '''
Here is a news article, with each sentence annotated according to the source of its information:
    ```{json_str}```

    Please describe the role each source plays. Include unnamed sources (e.g. "witnesses"). Include any facts that might have come from the source, even if we didn't label it.
    Generate only ONE summary per source. Group sources that are clearly the same but named slightly differently. For example: "Andrew Dresden" and "Dresden" should be grouped together as one source. "Lao Diplomats" and "Laotian Diplomats" should be grouped together as one source.
    Split source annotations that refer to multiple sources into separate summaries. For example: if the annotation is "John and Jane", generate two separate summaries, one for "John" and one for "Jane". 
    
    For each source, provide the following information:
        (1) Name: who the source is.
        (2) Original Name: What their original name(s) are in our annotations.
        (3) Perspective: What is their perspective on the main events of the article? Choose from "Authoritative", "Informative", "Supportive", "Skeptical", "Against", "Neutral". Then, JUSTIFY your choice in 1-2 sentences.
        (4) Centrality: How central is this source to the main events of the article? Choose from "Central", "Medium", "Low". Then, JUSTIFY your choice in 1-2 sentences.
            
    JUSTIFY your choice for (3) and (4) in 1-2 sentences. Output the summary in a list of python dictionaries with one key per number.
'''
# ?

In [426]:
import re
import ast
import pyperclip
import pprint
import os
from together import Together

# Fill the narrative-role template with the annotated sentences and send it to the model.
prompt = NARRATIVE_PROMPT.format(json_str=json_str)
response = query_together(prompt=prompt)

In [427]:
# query_together returns the reply text itself; still accept a raw response
# object in case `response` came from a direct client call.
r = response if isinstance(response, str) else response.choices[0].message.content

# Extract the bracketed list from the reply. The match is greedy (first '[' to
# last ']') so that a ']' inside one of the string values does not cut the list short.
pattern = r'\[.*\]'
output = re.search(pattern, r, flags=re.DOTALL)
if output is None:
    raise ValueError(f'No bracketed list found in model reply: {r[:200]!r}')

# Evaluate the list as a Python literal (no arbitrary code execution).
parsed_output = ast.literal_eval(output[0])

In [428]:
print(r)

[
    {
        "Name": "China",
        "Original Name": "China",
        "Narrative function": "Provides the main announcement of canceling coal-fired power plants, setting the tone for the article.",
        "Perspective": "Authoritative",
        "Centrality": "High",
        "Justification": "As the government making the announcement, China is the central authority on this decision, and their perspective is authoritative. The article revolves around this announcement, making China a high-centrality source."
    },
    {
        "Name": "China's National Energy Administration",
        "Original Name": "China\u2019s National Energy Administration",
        "Narrative function": "Provides specific details about the canceled projects, such as the number of projects and their locations.",
        "Perspective": "Informative",
        "Centrality": "Medium",
        "Justification": "As the agency responsible for energy administration, their perspective is informative, providing factua

In [429]:
# Copy the model's reply text to the clipboard. query_together already returns
# a string, so only unpack `.choices` when handed a raw response object.
pyperclip.copy(
    response if isinstance(response, str) else response.choices[0].message.content
)

In [430]:
# Tabulate the parsed reply (expected to be a list of dictionaries, one per source).
response_df = pd.DataFrame(parsed_output)

In [431]:
# pprint.pprint(response_df.to_dict( orient='records'), indent=4, width=120)

In [432]:
# Put the tabulated reply on the system clipboard as tab-separated text.
pyperclip.copy(response_df.to_csv(sep='\t'))

# Try to partially obscure sources from the original news articles

In [1]:
from datasets import load_from_disk
import pandas as pd
import numpy as np 
from tqdm.auto import tqdm
import json
# Load the previously generated obscured-source records; the context manager
# closes the file handle as soon as the JSON has been parsed.
with open('../data/sources_data_70b__900_1000_obscured.json') as obscured_file:
    orig_obscured_sources = json.load(obscured_file)
orig_obscured_sources_df = pd.DataFrame(orig_obscured_sources)

In [3]:
# Alternative data location, currently disabled:
# data_dir = '../data'
data_dir = '../../../bloomberg-research/press-releases/data/s_p_500_backlinks'
# Read the gzip-compressed JSON-lines file (one record per line) into a DataFrame.
source_df = pd.read_json(f'{data_dir}/full-source-scored-data.jsonl.gz', lines=True, compression='gzip')
# Optional restriction to the articles listed in orig_obscured_sources_df, currently disabled:
# source_df = source_df.loc[lambda df: df['article_url'].isin(orig_obscured_sources_df['article_url'])]

In [4]:
source_df.shape

(341175, 6)

In [4]:
# Load the dataset stored on disk under `<data_dir>/all-coref-resolved`.
article_d = load_from_disk(f'{data_dir}/all-coref-resolved')

Loading dataset from disk:   0%|          | 0/41 [00:00<?, ?it/s]

In [5]:
# Distinct article URLs present in source_df; a set keeps the per-row membership test cheap.
a_urls = set(source_df['article_url'])
# Keep only the dataset rows whose article_url is in a_urls, using 10 worker processes.
filtered_article_d = article_d.filter(lambda x: x['article_url'] in a_urls, num_proc=10)
# Pandas copy of the filtered dataset.
filtered_article_df = filtered_article_d.to_pandas()

Filter (num_proc=10):   0%|          | 0/496380 [00:00<?, ? examples/s]

In [79]:
# Quote types whose attribution is blanked out below.
disallowed_quote_types = {'Other', 'No Quote'}

# Join the filtered articles with source_df on article_url, keep the three
# parallel list columns, and explode them together so each row is one list
# element (presumably one sentence — confirm against the data schema).
sentences_with_quotes = (
    filtered_article_d
         .to_pandas()
         .merge(source_df, on='article_url')
         [['article_url', 'attributions', 'quote_type', 'sent_lists']]
         .explode(['attributions', 'quote_type', 'sent_lists'])
)

# Replace the attribution with NaN wherever the quote type is disallowed.
# Vectorized equivalent of a row-wise apply: `where` keeps values where the
# condition holds and fills NaN elsewhere.
sentences_with_quotes = sentences_with_quotes.assign(
    attributions=lambda df: df['attributions'].where(
        ~df['quote_type'].isin(disallowed_quote_types)
    )
)

In [80]:
# Position of the example article inspected in the following cells. It is used both
# as a row position in orig_obscured_sources_df and as an index into the unique
# article URLs of sentences_with_quotes.
# NOTE(review): this assumes those two orderings line up — confirm.
idx = 5

In [92]:
# Obscured-source descriptions recorded for the article at row position `idx`.
orig_obscured_sources_df['obscured_sources'].iloc[idx]

{'District Court of The Hague': 'The District Court of The Hague is a judicial institution.',
 'Nestle': 'OUTPUT: Nestle is a multinational food and drink company.',
 'Impossible Foods': 'OUTPUT: Impossible Foods is a company.',
 'Nielsen': 'Nielsen is a market research firm.'}

In [82]:
# All rows of the article whose URL is the `idx`-th distinct URL (in order of
# first appearance), re-indexed from zero.
one_article = (
    sentences_with_quotes[
        sentences_with_quotes['article_url'].eq(
            sentences_with_quotes['article_url'].unique()[idx]
        )
    ]
    .reset_index(drop=True)
)

# One JSON object per line with the sentence text and its attribution.
json_str = one_article.loc[:, ['sent_lists', 'attributions']].to_json(orient='records', lines=True)

In [83]:
one_article

Unnamed: 0,article_url,attributions,quote_type,sent_lists
0,www.wsj.com/articles/nestle-loses-fight-with-i...,,No Quote,The order by the District Court of The Hague i...
1,www.wsj.com/articles/nestle-loses-fight-with-i...,Nestle,Statement/Public Speech,Nestle said it would rebrand the product as Se...
2,www.wsj.com/articles/nestle-loses-fight-with-i...,,No Quote,The ruling is a setback for the world's bigges...
3,www.wsj.com/articles/nestle-loses-fight-with-i...,Nestle spokesman,Indirect Quote,We are disappointed by this provisional ruling...
4,www.wsj.com/articles/nestle-loses-fight-with-i...,,No Quote,"Nestle's Incredible Burger, made from soy and ..."
5,www.wsj.com/articles/nestle-loses-fight-with-i...,Nestle,Direct Quote,"Outside Europe, the burger is sold in Australi..."
6,www.wsj.com/articles/nestle-loses-fight-with-i...,Nestle,Background/Narrative,Nestle also sells a different plant-based burg...
7,www.wsj.com/articles/nestle-loses-fight-with-i...,,No Quote,Related Articles\n Plant-Based Meat Makers Com...
8,www.wsj.com/articles/nestle-loses-fight-with-i...,Impossible Foods,Court Proceeding,The U.S. company sought an injunction against ...
9,www.wsj.com/articles/nestle-loses-fight-with-i...,,No Quote,The legal spat underscores the race for domina...


In [88]:
# Prompt template asking only for biographical information about each source;
# `{json_str}` is filled with the annotated sentences.
# NOTE: this rebinds INFORMATION_PROMPT, replacing the fact-summary template
# defined earlier in this notebook under the same name.
INFORMATION_PROMPT = '''
Here is a news article, with each sentence annotated according to the source of its information:
    ```{json_str}```

    Please restate the biographical information used to describe the source in the article. 
    Do NOT include any information that the source gives to the story. ONLY include biographical information about the source.
    Include unnamed sources (e.g. "witnesses"). Include any biographical information that might pertain to the source, even if we didn't label it.
    Generate only ONE summary per source. Group sources that are clearly the same but named slightly differently. For example: "Andrew Dresden" and "Dresden" should be grouped together as one source. "Lao Diplomats" and "Laotian Diplomats" should be grouped together as one source.
    Split source annotations that refer to multiple sources into separate summaries. For example: if the annotation is "John and Jane", generate two separate summaries, one for "John" and one for "Jane". 
    
    For each source, provide the following information:
        (1) Name: who the source is.
        (2) Original Name: What their original name(s) are in our annotations.
        (3) Information: Restate the biographical information provided by the source. Be as SPECIFIC and be as VERBOSE as possible. 
            Contextualize all biographical information about the source with AS MUCH BACKGROUND INFORMATION mentioned in the article so I can fully understand who the source is.
            Be verbose.
            For example, don't just say "the crash", say "the plane crash carrying key senior Laotian government officials".
            Don't include any information that the source directly provided to the story.
    Output the summary in a list of python dictionaries with one key per number. Don't say anything else.
'''

In [89]:
# , including biographical events and names mentioned

In [90]:
# Fill INFORMATION_PROMPT with the one-article JSON lines and send it to the model;
# query_together returns the reply text.
prompt = INFORMATION_PROMPT.format(json_str=json_str)
response = query_together(prompt=prompt)

In [91]:
print(response)

[
    {
        "Name": "Nestle",
        "Original Name": "Nestle",
        "Information": "Nestle is the world's biggest packaged-foods maker, a Swiss giant in the food industry."
    },
    {
        "Name": "Nestle spokesman",
        "Original Name": "Nestle spokesman",
        "Information": "The Nestle spokesman is a representative of Nestle, the world's biggest packaged-foods maker, a Swiss giant in the food industry."
    },
    {
        "Name": "Impossible Foods",
        "Original Name": "Impossible Foods",
        "Information": "Impossible Foods is a U.S. company that produces plant-based meat products, founded in 2011, and is seeking to enter the European market."
    },
    {
        "Name": "Nielsen",
        "Original Name": "Nielsen",
        "Information": "Nielsen is a research firm that tracks sales data in U.S. retail stores."
    },
    {
        "Name": "Smithfield Foods Inc.",
        "Original Name": "Smithfield Foods Inc.",
        "Information": "Smithfield

In [47]:
# , including biographical events and names mentioned