In [293]:
import ast
import json
import os
import pprint
import re

import pyperclip
from together import Together
# Set TOKENIZERS_PARALLELISM to 'false' for any library that reads it.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'


# Read the Together API key inside a context manager so the file handle is
# closed right away instead of being left open for the garbage collector.
# NOTE(review): the key path is absolute and user-specific; anyone else running
# this notebook has to change it.
with open('/Users/spangher/.togetherai-usc-key.txt') as key_file:
    client = Together(api_key=key_file.read().strip())
def query_together(
    prompt,
    client=client,
    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    system_prompt="You are an experienced journalist.",
    max_tokens=2048,
    temperature=0.1,
):
    """Send a single-turn chat request and return the reply text.

    Args:
        prompt: Text sent as the user message.
        client: Object exposing ``chat.completions.create``; defaults to the
            module-level ``client``.
        model: Model identifier passed to the API.
        system_prompt: Text sent as the system message.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.

    Returns:
        The ``message.content`` of the first returned choice.

    All new keyword arguments default to the values that were previously
    hard-coded, so existing calls behave exactly as before.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.7,
        top_k=50,
        repetition_penalty=1,
        stop=["<|eot_id|>", "<|eom_id|>"],
    )
    # Only the first choice is used.
    return response.choices[0].message.content

In [213]:
# Prompt template asking the model to return only the article text from a
# scraped page that still contains leftover non-article text.
# Filled with str.format; placeholder: {article_text}.
CLEAN_ARTICLE_TEXT_PROMPT = """
You are a helpful editor assistant. I scraped this news article from the web and removed HTML tags, however, 
there is a lot of extraneous text from the HTML page that remains. Please only extract text related to the news article.
Do NOT miss any article text. Here is the news article:

```{article_text}```

Return only the CLEANED text of the news article, nothing else.
"""

# Prompt template asking for the structural role of one sentence within a
# document, answered as `"LABEL": DESCRIPTION`.
# Filled with str.format; placeholders: {document_type}, {definitions},
# {document}, {sentence}.
# Note: the instructions also allow the bare reply "Error" (no colon), so code
# that splits replies on ':' must cope with a reply that has no colon.
DISCOURSE_PROMPT = """
You will receive a sentence from a {document_type}. Your task is to identify the structural role that this sentence plays in the overall {document_type}.
Please generate a generalizable keyword label to describe the structural role. The label must be as generalizable as possible and should not be document-specific. 

{definitions}

For reference, I'll also give you the full {document_type}.

[Examples]
Example 1:

Sentence: "This annual festival, which welcomes and celebrates large, gay men -- 'bears' -- on the tip of Cape Cod, ended Saturday."
Your response: 
"Main Event": This sentence directly describes a primary event, noting its conclusion, which is the focal point of the document.

Example 2:
Sentence: "But the league has also become one of the most prestigious summer internships for aspiring broadcasters."
Your response:
"Current Context": This sentence explains the background that contributes to the league's reputation, making it a relevant aspect of the broader discussion within the document.

[Instructions]
Again, determine a generalizable structure labels in the document.

Each keyword label must reflect a SINGLE structural feature instead of a combination of features.
If you add a new label, add a SHORT GENERAL LABEL and a DESCRIPTION OF THAT LABEL.
If the sentence is an error, return "Error". 
Return in the same format as the examples: "LABEL": DESCRIPTION. Please feel free to add a new label if it helps describe a sentence.

Now it's your turn. Here's a {document_type}:

[{document_type}]
```{document}```

What role does the following sentence play in the {document_type}?

[Sentence]
```{sentence}```

Please only state your response as "LABEL": DESCRIPTION. Say nothing else."""

# Prompt template asking for one summary per informational source in an
# article, each with the keys "Name", "Biography" and "Information".
# Filled with str.format; placeholder: {news_article}. The doubled braces
# ({{ and }}) in the examples are str.format escapes for literal braces.
SOURCE_EXTRACTION_PROMPT = """
You are a helpful news assistant. Here is a news article:

```{news_article}```

Please summarize each informational source providing information in the article. 
Include unnamed or passively expressed sources (e.g. "witnesses", "price signals") if there is information attributable to them.
Include any facts that might have come from the source.
Make sure each source you return refer to just one source. For example: if "John and Jane" both contribute the same information, generate two separate summaries, one for "John" and one for "Jane". 
Generate only ONE summary per source.

For each source, provide the following information:
    (1) Name: just the name of the source.
    (2) Biography: A brief biography of the source mentioned in the article.
    (3) Information: Restate the facts provided by the source. Be as SPECIFIC and be as VERBOSE as possible. 
        Contextualize ALL the information the source describes. State the full names of all people, places, events and ideas mentioned and
        everything the source says with AS MUCH BACKGROUND INFORMATION from the article so I can fully understand the information
        the source is giving. I will look at each source independently without looking at any others, so help me understand the context.

Here are some examples:
example 1:
{{ "Name": "Supermarkets around the country",
   "Biography": "Retail stores that sell food and other household items",
   "Information": "Supermarkets around the country alerted shoppers that prices are likely to continue going up due to the avian flu outbreak, with eggs now average $2.88 per dozen, up 52% since the first confirmed case of avian influenza in February."
}}

example 2:
{{
  "Name": "The article's author (unnamed)",
  "Biography": "The author of the article",
  "Information": "The author stated that Wing, which is collaborating with FedEx and Walgreens on drone delivery, was the first to receive a limited Part 135 certificate. Wing is launching operations in Virginia this month, and the Standard certification allows UPS to send an unlimited number of drones to the skies, for their cargo load to exceed 55 pounds and for them to fly at night."
}}

example 3:
{{
   "Name": "Delta's customers",
   "Biography": "People who travel with Delta Air Lines",
   "Information": "Delta's customers suggested that they preferred more space on flights amid the COVID-19 pandemic, and they continue to tell Delta that more space provides more peace of mind."
}}

example 4:
{{
   "Name": "European Union countries",
   "Biography": "Countries that are part of the European Union",
   "Information": "European Union countries are working on adopting copyright rules that allow news companies and publishers to negotiate payments with large tech companies like Facebook, Microsoft and Google that use their content on their platforms."
}}

Output the summary in a list of python dictionaries as in the examples. Don't say anything else.
"""

# Prompt template asking for the narrative role of each listed source,
# answered as dictionaries with the keys "Name" and "Narrative Function".
# Filled with str.format; placeholders: {news_article}, {target_sources}.
# The doubled braces ({{ and }}) are str.format escapes for literal braces.
# The \" sequences render as plain double quotes, so the examples are not
# strict JSON; replies in this format need a tolerant parser.
# Example 3 now closes its "Narrative Function" string like the other two
# examples, and the typo "playes" is corrected.
NARRATIVE_KEYWORD_PROMPT = """
You will receive a news article and a set of sources to examine in that article.

For each source in the list, provide the following information, once per source:
    (1) Name: Exactly copy the name of the source.
    (2) Narrative Function: Give a generic keyword label to categorize the narrative role the source plays in the article. 
    Infer why the author used the source, and a generalizable statement about the role they play in the article.
    Don't just summarize their identity. Return in the format: "LABEL": DESCRIPTION.

Here are example outputs. Again, your main task here is to identify a generalizable label that can characterize the narrative role of each source and why the author used them. 

[Examples]
Example 1:

{{
    "Name": "Match Group",
    "Narrative Function": "\"Counterpoint\": This source is used to compare to the main actor in the news article and provide grounding."
}}

Example 2:

{{
    "Name": "Dubai Airshow",
    "Narrative Function": "\"More Context\": This source is used to further expand the context offered and offer a visual setting."
}}

Example 3:
{{

    "Name": "Ann Gough",
    "Narrative Function": "\"Victim\": This source provides the voice of a user for the product, giving us a personal view of the harm caused by the event."
}}

[Instructions]

Now it's your turn. Here is a news article:

```{news_article}```

Please examine the narrative role of each of the following sources: 

```[{target_sources}]```

For each source, answer the questions above. Output the summary in a list of python dictionaries as in the examples. Don't say anything else.
"""


# Prompt template asking, per listed source, for "Name", "Perspective",
# "Centrality" and "Is_Error".
# Filled with str.format; placeholders: {news_article}, {target_sources}.
# NOTE(review): the final instruction says "as in the examples", but this
# template contains no examples. Confirm whether that is intended.
CENTRALITY_AND_PERSPECTIVE_PROMPT = """
You will receive a news article and a set of sources to examine in that article.
    
    For each source, provide the following information:
        (1) Name: who the source is.
        (2) Perspective: What is their perspective on the main events of the article? Choose as many labels as fit from: ["Authoritative", "Informative", "Supportive", "Skeptical", "Against", "Neutral"].
        (3) Centrality: How central is this source to the main events of the article? Choose from "High", "Medium", "Low".
        (4) Is_Error: Did we annotate this source in error? This can happen for many reasons, including if a sentence from the webpage was included in the story unintentionally. Answer with "Yes" or "No".

Here is a news article:

```{news_article}```

Please examine the role of each of the following sources: 

```[{target_sources}]```

For each source, answer the questions above. Output the summary in a list of python dictionaries as in the examples. Don't say anything else.
"""

In [237]:
def robust_parse_narr_json_str(t):
    """Leniently parse a model reply listing sources and their narrative roles.

    The reply is expected to look like a list of dictionaries with one key per
    line, but need not be valid JSON (the "Narrative Function" value usually
    contains unescaped inner quotes), so it is parsed line by line.

    Args:
        t: Raw reply text. Square brackets and backticks are removed first, so
            any such characters inside names or descriptions are lost.

    Returns:
        A list of dicts, one per ``{...}`` chunk that produced at least one
        field. Each dict has ``'Name'`` (surrounding quotes and trailing comma
        removed) and/or ``'Narrative Function'`` (the rest of the line,
        whitespace-stripped, surrounding quotes kept). Chunks with neither
        field are skipped, so an empty reply yields ``[]``.
    """
    # Match on the quoted key at the start of the line. A plain substring test
    # for 'Name' also fires on descriptions containing words such as "Named"
    # and would overwrite the parsed name with the description line.
    name_re = re.compile(r'''^\s*["']Name["']\s*:\s*(.*)$''')
    func_re = re.compile(r'''^\s*["']Narrative Function["']\s*:\s*(.*)$''')

    cleaned = t.replace(']', '').replace('[', '').replace('`', '').strip()

    records = []
    # \s* rather than \s+ so that dictionaries written as "},{" still split.
    for chunk in re.split(r'\},\s*\{', cleaned):
        record = {}
        for line in chunk.replace('{', '').replace('}', '').strip().split('\n'):
            name_match = name_re.match(line)
            if name_match:
                value = name_match.group(1).strip().rstrip(',').strip()
                # Drop one pair of matching surrounding quotes, whether or not
                # a trailing comma followed the value.
                if len(value) >= 2 and value[0] == value[-1] and value[0] in '"\'':
                    value = value[1:-1]
                record['Name'] = value
                continue
            func_match = func_re.match(line)
            if func_match:
                record['Narrative Function'] = func_match.group(1).strip()
        if record:
            records.append(record)
    return records

In [158]:
import spacy
nlp = spacy.load("en_core_web_lg")


def sentence_splitting_pipeline(texts):
    """Split each text into sentences with the module-level spaCy pipeline.

    Args:
        texts: Iterable of strings, passed to ``nlp.pipe``.

    Returns:
        A list with one entry per input text; each entry is the list of
        sentence strings (``sent.text``) found in that text.
    """
    sentences_list = []
    for doc in nlp.pipe(texts):
        doc_sentences = [sent.text for sent in doc.sents]
        sentences_list.append(doc_sentences)
    return sentences_list



In [160]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm.auto import tqdm

# Sentence-embedding model; used further down through model.encode(...) to
# embed article texts for cosine-similarity matching.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Student Writing

In [209]:
import pandas as pd
# Load the student articles; later cells read the 'url' and 'article_text' columns.
student_article_df = pd.read_csv('../data/batch_article_text.csv')

In [230]:
# Show the 'url' field of the first row (the article used in the cells below).
student_article_df.iloc[0]['url']

'/2024/10/08/dream-big-says-ezra-frech-a-two-time-paralympic-gold-medalist/'

In [214]:
# Fill the cleaning prompt with the first row's scraped 'article_text'.
clean_prompt = CLEAN_ARTICLE_TEXT_PROMPT.format(article_text=student_article_df['article_text'].iloc[0])

In [215]:
# The model's reply to the cleaning prompt is treated as the article text from here on.
student_article_text = query_together(clean_prompt)

In [227]:
# Split the single student article into sentences, then trim each sentence and
# replace inner newlines with spaces. Despite the name, `sentences_by_doc` ends
# up as a flat list of sentence strings for this one document.
sentences_by_doc = [
    sentence.strip().replace('\n', ' ')
    for sentence in sentence_splitting_pipeline([student_article_text])[0]
]

In [278]:
# Put the sentences on the clipboard, one per line.
pyperclip.copy('\n'.join(sentences_by_doc))

In [231]:
# Ask the model for the structural role of every sentence, one request per
# sentence. The joined document is the same for every request, so it is built
# once here instead of being re-joined on each iteration.
# A plain loop (not a comprehension) keeps the replies collected so far in
# `responses` if a request fails part-way through.
document_text = '\n'.join(sentences_by_doc)
responses = []
for s in tqdm(sentences_by_doc):
    prompt = DISCOURSE_PROMPT.format(document_type='news article', definitions='', document=document_text, sentence=s)
    responses.append(query_together(prompt))

  0%|          | 0/39 [00:00<?, ?it/s]

In [233]:
# Alternative: copy the LABEL part of each reply instead of the description.
# pyperclip.copy('\n'.join(list(map(lambda x: x.split(':')[0], responses))))
# Copy the DESCRIPTION part: everything after the first ':' of each reply.
# partition() is used instead of split(':')[1] because the prompt allows the
# bare reply "Error" (no colon, which made the index raise IndexError) and
# because a description containing a further colon was cut off at it.
pyperclip.copy('\n'.join(reply.partition(':')[2] for reply in responses))

In [234]:
# Three-step source annotation for the current article:
#   1. extract the sources, 2. label each source's narrative role,
#   3. rate each source's perspective and centrality; then join on 'Name'.
article_text = '\n'.join(sentences_by_doc)

prompt = SOURCE_EXTRACTION_PROMPT.format(news_article=article_text)
r = query_together(prompt=prompt)
# Strip markdown code fences before parsing, exactly as is done for `cent_r`
# below; a fenced reply would otherwise make json.loads fail.
source_df = pd.DataFrame(json.loads(r.replace('`python\n', '').replace('`', '')))

# The same comma-separated list of source names feeds both follow-up prompts.
target_sources = ', '.join(source_df['Name'].tolist())

prompt = NARRATIVE_KEYWORD_PROMPT.format(news_article=article_text, target_sources=target_sources)
narr_r = query_together(prompt=prompt)
narr_df = pd.DataFrame(robust_parse_narr_json_str(narr_r))

prompt = CENTRALITY_AND_PERSPECTIVE_PROMPT.format(news_article=article_text, target_sources=target_sources)
cent_r = query_together(prompt=prompt)
cent_df = pd.DataFrame(json.loads(cent_r.replace('`python\n', '').replace('`', '')))

# merge() defaults to an inner join: a source whose name is not reproduced
# identically in all three replies is dropped from the result.
full_ex_df = source_df.merge(narr_df, on='Name').merge(cent_df, on='Name')

In [246]:
# Create the output directory first: to_csv does not create missing parent
# directories and would fail on a fresh checkout without `cache/`.
os.makedirs('cache', exist_ok=True)
full_ex_df.to_csv('cache/student_ex_source_df_2.csv', index=False)

# Match with a professional article

In [253]:
import glob
# Collect every file in the source-summaries directory whose name contains
# 'article_text'. glob.glob does not guarantee any particular ordering.
professional_article_text_files = glob.glob('../../../berkeley-research/conditional-information-retrieval/data/v3_sources/v3_source_summaries/*article_text*')

In [256]:
# Read each matched file as JSON Lines into its own DataFrame.
all_files = [
    pd.read_json(path, lines=True)
    for path in professional_article_text_files
]

In [263]:
# Pool all professional articles and keep at most 50,000 of them.
# min(...) keeps sample() from raising when fewer rows are available, and the
# fixed random_state makes the draw (and therefore the matched article further
# down) reproducible across kernel restarts.
full_file_df = pd.concat(all_files)
full_file_df = full_file_df.sample(n=min(50_000, len(full_file_df)), random_state=42)

In [264]:
# 2. Calculate embeddings by calling model.encode()
# One embedding per sampled professional article, computed from its 'response' text.
embeddings = model.encode(full_file_df['response'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

In [265]:
# Match professional articles with the student article:
# embed the student article (its sentences re-joined with newlines) as a single
# text, then compare it against every professional-article embedding.
sample_emb = model.encode(['\n'.join(sentences_by_doc)])
cos_sims = cosine_similarity(sample_emb, embeddings)

In [270]:
# Pick the professional article with the highest cosine similarity.
# np.argmax works on the flattened array; with a single query text this is
# presumably the column (article) index. TODO confirm cos_sims has one row.
sample_professional_article = full_file_df.iloc[np.argmax(cos_sims)]
sample_professional_text = sample_professional_article['response']

In [274]:
# Put the matched article's 'url' on the clipboard.
pyperclip.copy(sample_professional_article['url'])

In [286]:
# Re-bind `sentences_by_doc` to the matched professional article: split it into
# sentences, trim each one and replace inner newlines with spaces. The result
# is a flat list of sentence strings and overwrites the student article's list.
sentences_by_doc = [
    sentence.strip().replace('\n', ' ')
    for sentence in sentence_splitting_pipeline([sample_professional_text])[0]
]

In [287]:
# Put the professional article's sentences on the clipboard, one per line.
pyperclip.copy('\n'.join(sentences_by_doc))

In [280]:
# Ask the model for the structural role of every sentence of the professional
# article, one request per sentence. The joined document is the same for every
# request, so it is built once instead of being re-joined on each iteration.
# A plain loop (not a comprehension) keeps the replies collected so far in
# `responses` if a request fails part-way through.
document_text = '\n'.join(sentences_by_doc)
responses = []
for s in tqdm(sentences_by_doc):
    prompt = DISCOURSE_PROMPT.format(document_type='news article', definitions='', document=document_text, sentence=s)
    responses.append(query_together(prompt))

  0%|          | 0/64 [00:00<?, ?it/s]

In [289]:
# Alternative: copy the LABEL part of each reply instead of the description.
# pyperclip.copy('\n'.join(list(map(lambda x: x.split(':')[0], responses))))
# Copy the DESCRIPTION part: everything after the first ':' of each reply.
# partition() is used instead of split(':')[1] because the prompt allows the
# bare reply "Error" (no colon, which made the index raise IndexError) and
# because a description containing a further colon was cut off at it.
pyperclip.copy('\n'.join(reply.partition(':')[2] for reply in responses))

In [294]:
# Three-step source annotation for the professional article:
#   1. extract the sources, 2. label each source's narrative role,
#   3. rate each source's perspective and centrality; then join on 'Name'.
article_text = '\n'.join(sentences_by_doc)

prompt = SOURCE_EXTRACTION_PROMPT.format(news_article=article_text)
r = query_together(prompt=prompt)
# Strip markdown code fences before parsing, exactly as is done for `cent_r`
# below; a fenced reply would otherwise make json.loads fail.
source_df = pd.DataFrame(json.loads(r.replace('`python\n', '').replace('`', '')))

# The same comma-separated list of source names feeds both follow-up prompts.
target_sources = ', '.join(source_df['Name'].tolist())

prompt = NARRATIVE_KEYWORD_PROMPT.format(news_article=article_text, target_sources=target_sources)
narr_r = query_together(prompt=prompt)
narr_df = pd.DataFrame(robust_parse_narr_json_str(narr_r))

prompt = CENTRALITY_AND_PERSPECTIVE_PROMPT.format(news_article=article_text, target_sources=target_sources)
cent_r = query_together(prompt=prompt)
cent_df = pd.DataFrame(json.loads(cent_r.replace('`python\n', '').replace('`', '')))

# merge() defaults to an inner join: a source whose name is not reproduced
# identically in all three replies is dropped from the result.
full_ex_df = source_df.merge(narr_df, on='Name').merge(cent_df, on='Name')

In [296]:
# Create the output directory first: to_csv does not create missing parent
# directories and would fail on a fresh checkout without `cache/`.
os.makedirs('cache', exist_ok=True)
full_ex_df.to_csv('cache/professional_ex_source_df_2.csv', index=False)

# Professional Writing

In [17]:
sample_file_name = "../../../berkeley-research/conditional-information-retrieval/data/v3_sources/v3_source_summaries/test_sources__article_text__0_500.txt"
text_df = pd.read_json(sample_file_name, lines=True)
# Drop responses that mention 'python' or contain '<div', then clean the rest:
# remove backticks, remove remaining HTML-like tags, and trim whitespace.
# `regex` is passed explicitly: since pandas 2.0 str.replace treats the pattern
# as a literal string by default, so without regex=True the tag pattern never
# matched and no tags were removed.
text_df = (text_df
 .loc[lambda df: ~df['response'].str.contains('python', case=False)]
 .loc[lambda df: ~df['response'].str.contains('<div', case=False)]
 .assign(response=lambda df: (
     df['response']
     .str.replace('`', '', regex=False)
     .str.replace(r'<[^>]*>', '', regex=True)
     .str.strip()
 ))
 )

In [51]:
# Cut the article body out of the second response: keep what follows the
# "Here is the cleaned-up text:" preamble and precedes the model's closing
# remark, then trim surrounding whitespace.
sample_text = (
    text_df['response']
    .iloc[1]
    .split('Here is the cleaned-up text:')[1]
    .split('I removed the extraneous text from the news article')[0]
    .strip()
)
print(sample_text)

To mark Suicide Awareness Month, a team of researchers set out to determine who teens turn to when faced with mental health concerns. by Gianna Melillo | Sep. 08, 2022 Story at a glance More educators than parents report being approached by young adults with mental health concerns. Findings of the new survey underscore the important role teachers and educators can play in promoting youth mental health. In an effort to improve access to care, the Biden Administration has allotted nearly $300 million to bolster mental health resources in schools. New data suggests teens reach out to educators more than their parents about mental health concerns, while experts stress both adults can play complementary roles in supporting young people's mental health. Most educators, 78 percent, have been approached by a child about a mental or emotional concern, according to a CVS Health/Morning Consult survey published Thursday. That's compared to 58 percent of parents who report the same, underscoring h

# News Discourse Prompt

In [62]:
# Here `sentences_by_doc` stays a list of per-document sentence lists (one
# entry, for `sample_text`); the cells below index it with [0].
sentences_by_doc = sentence_splitting_pipeline([sample_text])

In [66]:
# Ask the model for the structural role of every sentence of `sample_text`,
# one request per sentence, passing the full text as the reference document.
responses = []
for s in tqdm(sentences_by_doc[0]):
    prompt = DISCOURSE_PROMPT.format(document_type='news article', definitions='', document=sample_text, sentence=s)
    responses.append(query_together(prompt))

  0%|          | 0/28 [00:00<?, ?it/s]

In [74]:
# pyperclip is already imported in the first cell; this re-import is redundant.
import pyperclip
# Put the sample article's sentences on the clipboard, one per line.
pyperclip.copy('\n'.join(sentences_by_doc[0]))

In [78]:
# Copy the DESCRIPTION part: everything after the first ':' of each reply.
# partition() is used instead of split(':')[1] because the prompt allows the
# bare reply "Error" (no colon, which made the index raise IndexError) and
# because a description containing a further colon was cut off at it.
pyperclip.copy('\n'.join(reply.partition(':')[2] for reply in responses))

# Sample Source Parsing

In [108]:
# Paths to three JSON Lines files for the 0-500 test split: source summaries,
# centrality/perspective annotations, and narrative-keyword annotations.
sample_source_file_name = "../../../berkeley-research/conditional-information-retrieval/data/v3_sources/v3_source_summaries/test_sources__summaries__0_500.txt"
sample_source_centrality_name = "../../../berkeley-research/conditional-information-retrieval/data/v3_sources/v3_source_summaries/test_sources__centrality-perspective__0_500.txt"
sample_source_narrative_keyword_file_name = "../../../berkeley-research/conditional-information-retrieval/data/v3_sources/v3_source_summaries/test_sources__narrative-keyword__0_500.txt"
# Each file is read with one JSON object per line.
source_summary_df = pd.read_json(sample_source_file_name, lines=True)
source_centrality_df = pd.read_json(sample_source_centrality_name, lines=True)
source_narrative_keyword_df = pd.read_json(sample_source_narrative_keyword_file_name, lines=True)

In [100]:
import json 
# Parse the source list from the row labelled 3: take the text after the
# "Here is the list of sources:" preamble, drop backticks, and load it as JSON.
source_example_df = pd.DataFrame(json.loads(source_summary_df.loc[3]['response'].split('Here is the list of sources:')[1].replace('`', '')))

In [133]:
# Parse the centrality/perspective reply of the row labelled 3 (backticks removed) as JSON.
centrality_df = pd.DataFrame(json.loads(source_centrality_df.loc[3]['response'].replace('`', '')))

In [127]:
# Hand-entered narrative-function annotations, one dict per source.
# Despite the `_str` suffix this is a list of dicts, not a string; each
# "Narrative Function" value has the form `"LABEL": DESCRIPTION`.
narr_func_str = [{
    "Name": "U.S. Surgeon General Vivek Murthy",
    "Narrative Function": '''"Expert": This source is used to provide a quote from an expert in the field, adding credibility to the article and providing a statement about the severity of the issue.'''
},
{
    "Name": "CVS Health President and Chief Executive Officer Karen S. Lynch",
    "Narrative Function": '''"Counterpoint": This source is used to provide a quote from a business leader, adding a counterpoint to the article and providing a statement about the importance of mental health in everyday life.'''
},
{
    "Name": "Educators",
    "Narrative Function": '''"Data": This source is used to provide data about educators' experiences and perspectives on mental health, adding a quantitative element to the article and providing evidence for the importance of educators in supporting young people's mental health.'''
},
{
    "Name": "Parents",
    "Narrative Function": '''"Data": This source is used to provide data about parents' experiences and perspectives on mental health, adding a quantitative element to the article and providing evidence for the importance of parents in supporting young people's mental health.'''
},
{
    "Name": "The Biden Administration",
    "Narrative Function": '''"Solution": This source is used to provide information about a solution to the problem of mental health in schools, adding a sense of hope and action to the article and providing evidence for the importance of government support for mental health initiatives.'''
}]

In [134]:
# NOTE(review): StringIO is not used in any cell visible here; confirm whether
# this import is still needed.
from io import StringIO
# One row per source, with columns 'Name' and 'Narrative Function'.
narr_funct_df = pd.DataFrame(narr_func_str)

In [136]:
# Put the narrative-function strings on the clipboard, one per line.
pyperclip.copy('\n'.join(narr_funct_df['Narrative Function']))

In [137]:

# Display the narrative-function table.
narr_funct_df

Unnamed: 0,Name,Narrative Function
0,U.S. Surgeon General Vivek Murthy,"""Expert"": This source is used to provide a quo..."
1,CVS Health President and Chief Executive Offic...,"""Counterpoint"": This source is used to provide..."
2,Educators,"""Data"": This source is used to provide data ab..."
3,Parents,"""Data"": This source is used to provide data ab..."
4,The Biden Administration,"""Solution"": This source is used to provide inf..."


In [140]:
# This bare expression is not the last statement of the cell, so it is not displayed.
centrality_df
# Put the 'Perspective' values on the clipboard, one per line.
# NOTE(review): str.join needs every value to be a string; the prompt allows
# several perspective labels per source, so confirm these are not lists.
pyperclip.copy('\n'.join(centrality_df['Perspective']))

In [132]:
# pyperclip.copy('\n'.join(source_example_df['Information']))

In [None]:
prior = 