In [5]:
from datasets import load_from_disk
import pandas as pd 
import pandas as pd
import numpy as np 
from tqdm.auto import tqdm
import requests
import pyperclip



  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# --- Configuration ---------------------------------------------------------
data_dir = '../data'
N_SOURCE_ROWS = 100  # number of rows read from the scored-source file
NUM_PROC = 10        # worker processes used by the dataset filter

# --- Load ------------------------------------------------------------------
source_df = pd.read_json(
    f'{data_dir}/full-source-scored-data.jsonl.gz',
    lines=True,
    compression='gzip',
    nrows=N_SOURCE_ROWS,
)
article_d = load_from_disk('./all-coref-resolved')

# Build the URL lookup once. Constructing the set inside the predicate would
# rebuild it for every article that the filter inspects.
source_urls = set(source_df['article_url'])
filtered_article_d = article_d.filter(
    lambda x: x['article_url'] in source_urls,
    num_proc=NUM_PROC,
)

# Quote types whose attribution is blanked out below.
disallowed_quote_types = set(['Other', 'Background/Narrative', 'No Quote'])

# --- One row per sentence --------------------------------------------------
# Join the filtered articles with the scored sources on URL, keep the
# per-sentence list columns, and explode them in lockstep so each row holds a
# single (attribution, quote_type, sentence) triple. Finally, replace the
# attribution with NaN wherever the quote type is disallowed.
sentences_with_quotes = (
    filtered_article_d
    .to_pandas()
    .merge(source_df, on='article_url')
    [['article_url', 'attributions', 'quote_type', 'sent_lists']]
    .explode(['attributions', 'quote_type', 'sent_lists'])
    .assign(
        attributions=lambda df: df['attributions'].where(
            ~df['quote_type'].isin(disallowed_quote_types),
            np.nan,
        )
    )
)

In [7]:
# Ask a locally served llama3 model (Ollama-style /api/generate endpoint) to
# summarize the source annotations of up to N_SAMPLE_ARTICLES articles, one
# request per article, and save the results to samples.csv.
N_SAMPLE_ARTICLES = 20
OLLAMA_GENERATE_URL = 'http://localhost:11434/api/generate'

# Compute the distinct URLs once instead of on every iteration. Slicing (rather
# than indexing by position) tolerates fewer than N_SAMPLE_ARTICLES articles.
sample_urls = sentences_with_quotes['article_url'].unique()[:N_SAMPLE_ARTICLES]

samples = []
for article_url in tqdm(sample_urls, desc='summarizing sources'):
    one_article = (
        sentences_with_quotes
            .loc[lambda df: df['article_url'] == article_url]
            .reset_index(drop=True)
    )
    annotated = one_article[['sent_lists', 'attributions']]
    # JSON-lines goes into the prompt; the tab-separated form is stored next to
    # the model's answer for later inspection.
    json_str = annotated.to_json(lines=True, orient='records')
    doc_str = annotated.to_csv(sep='\t', index=False)
    r = requests.post(
        OLLAMA_GENERATE_URL,
        json = {
            "model": "llama3",
            "prompt":f"""
                Here is a news article, with each sentence annotated according to the source of it's information:
                ```
                {json_str}
                ```

                Please summarize each of our source annotations. Tell me in one paragraph per source: (1) who the source is (2) what informational content they provide to the article. 
                Only rely on the annotations I have provided, don't identify additional sources. Please write in the format of "source:summary".
            """,
            "stream": False
    })
    # Fail with the HTTP error itself rather than a KeyError on 'response'.
    r.raise_for_status()
    samples.append([doc_str, r.json()['response']])

samples_df = pd.DataFrame(samples, columns=['sources', 'summarized_sources'])

samples_df.to_csv('samples.csv', index=False)


KeyboardInterrupt: 

In [8]:

# Spot check: show the generated summary stored at position 1 of samples_df.
print(samples_df['summarized_sources'].iloc[1])


Here are the summaries for each source:

**Socrata Foundation**: The Socrata Foundation provides the core philosophical and mission-oriented content to the article, outlining its purpose and approach to supporting unique organizations that lack resources or financial means.

**Robert Runge**: Robert Runge, a member of Socrata's Board of Directors, offers additional insight into the Socrata Foundation's role in bridging the gap between publicly funded open data projects and underfunded opportunities.

**Mike Duggan**: Detroit Mayor Mike Duggan provides context on why his city turned to data-driven government, highlighting its potential for transparency, accountability, and fact-based decision-making.

**Susan Scrupski**: Susan Scrupski, Founder of Big Mountain Data (BMD), shares the organization's long-term ambition to establish a national open source repository on repeat offender data and discusses its current efforts in tracking heating violations in New York City.

**William Jeffries

In [9]:
# Reload the saved samples from disk and dump the summary text to sources.txt,
# one summary after another, each followed by a newline.
samples_df = pd.read_csv('samples.csv')

# read_csv parses empty fields as NaN, and NaN + '\n' raises TypeError, so
# skip rows that have no summary text.
summaries = samples_df['summarized_sources'].dropna()

with open('sources.txt', 'w') as f:
    for summary in summaries:
        f.write(summary + '\n')


In [8]:
# Prompt template for the obscuring step. It shows the model two worked
# examples and asks it to return only the obscured text.
# `{source}` is the single placeholder and is filled in with str.format by
# obscure(); any literal braces added to this template must therefore be
# doubled ({{ }}).
system_prefix = '''
For each given text, obscure the specific details by leaving out all important information except for a short, generalized biographical description.

Format:
1. **Original**: Identity information + Biographical information + Given information
2. **Obscured**: Identity information + Biographical information

Here're some examples:
1. **Socrata Foundation:** The Socrata Foundation provides information about its philanthropic philosophy and mandate to support unique organizations that lack resources or financial means to fulfill their data-driven mission. It also explains how it will proactively support open data efforts that deliver social impact and long-term value.
   **Socrata Foundation:** The Socrata Foundation supports organizations lacking resources or financial means.

2. **Robert Runge:** Robert Runge, a member of the Socrata Board of Directors, provides additional context on the role of the Socrata Foundation in bridging the gap between publicly funded open data projects and underfunded or unfunded opportunities.
   **Robert Runge:** Robert Runge, a board member of the Socrata Foundation.

It's important to return the obscured text only.
Here's the text:
{source}
'''

In [28]:
def obscure(source, model="llama3", url='http://localhost:11434/api/generate', timeout=None):
    """Ask the LLM for an obscured version of one source description.

    The text is substituted into the ``system_prefix`` template and sent as a
    single non-streaming generate request.

    Parameters
    ----------
    source : str
        Text to obscure; inserted at the ``{source}`` placeholder.
    model : str, optional
        Model name sent in the request payload. Defaults to ``"llama3"``.
    url : str, optional
        Generate endpoint to POST to. Defaults to the local server address.
    timeout : float or tuple or None, optional
        Passed to ``requests.post``. ``None`` (the default) waits
        indefinitely.

    Returns
    -------
    str
        The ``'response'`` field of the JSON reply.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    prompt = system_prefix.format(source=source)
    r = requests.post(
        url,
        json={
            "model": model,
            "prompt": prompt,
            "stream": False,
        },
        timeout=timeout,
    )
    # Report HTTP-level failures directly; otherwise an error payload would
    # only show up as a KeyError on 'response' below.
    r.raise_for_status()
    return r.json()['response']

In [29]:
# Read every non-blank line of sources.txt first so the file is closed before
# the slow per-line model calls start. Lines holding only whitespace are
# skipped too, not just bare newlines, so no empty prompt reaches the model.
with open('sources.txt', 'r') as f:
    source_lines = [line for line in f if line.strip()]

# Each line is passed through unchanged, trailing newline included.
obscured_sources = [
    obscure(line)
    for line in tqdm(source_lines, desc='obscuring sources')
]

with open('obscured_sources.txt', 'w') as f:
    for obscured_source in obscured_sources:
        f.write(obscured_source + '\n')


