In [2]:
from datasets import load_from_disk
import pandas as pd 

In [None]:
! mkdir ../data 
! wget https://storage.googleapis.com/usc-data/all-coref-resolved.tar.gz
! wget https://storage.googleapis.com/usc-data/full-source-scored-data.jsonl.gz
! mv all-coref-resolved.tar.gz ../data
! mv full-source-scored-data.jsonl.gz ../data
! tar -xvzf ../data/all-coref-resolved.tar.gz

In [16]:
import os

import pandas as pd
import numpy as np 
from tqdm.auto import tqdm

# Directory that holds `full-source-scored-data.jsonl.gz` and `all-coref-resolved/`.
# Defaults to the location the download cell writes to; set the DATA_DIR environment
# variable before starting the kernel to read from another checkout (e.g. a local
# research directory) without editing the notebook.
data_dir = os.environ.get('DATA_DIR', '../data')

# Only the first 100 JSON lines are read (nrows=100), which keeps this exploratory
# notebook fast; every downstream cell is therefore restricted to those rows.
source_df = pd.read_json(f'{data_dir}/full-source-scored-data.jsonl.gz', lines=True, compression='gzip', nrows=100)

In [6]:
# Load the `all-coref-resolved` dataset that was saved to disk under data_dir.
coref_dataset_path = f'{data_dir}/all-coref-resolved'
article_d = load_from_disk(coref_dataset_path)

Loading dataset from disk:   0%|          | 0/41 [00:00<?, ?it/s]

In [9]:
# Build the URL lookup set once, outside the predicate: constructing it inside the
# lambda would rebuild it from source_df for every example being filtered.
source_urls = set(source_df['article_url'])

# Keep only the articles whose URL also appears in source_df.
filtered_article_d = article_d.filter(lambda x: x['article_url'] in source_urls, num_proc=10)

Filter (num_proc=10):   0%|          | 0/496380 [00:00<?, ? examples/s]

In [49]:
# Quote types whose attribution should be blanked out below.
disallowed_quote_types = {'Other', 'Background/Narrative', 'No Quote'}

# Join the filtered articles with source_df on URL, then explode the three
# list-valued columns in lockstep so each element gets its own row.
# Finally, replace `attributions` with NaN wherever `quote_type` is disallowed.
# The masking is vectorized (isin + where) instead of a row-wise apply(axis=1):
# same result, no Python call per row, and it also works when the frame is empty.
sentences_with_quotes = (
    filtered_article_d
    .to_pandas()
    .merge(source_df, on='article_url')
    [['article_url', 'attributions', 'quote_type', 'sent_lists']]
    .explode(['attributions', 'quote_type', 'sent_lists'])
    .assign(
        attributions=lambda df: df['attributions'].where(
            ~df['quote_type'].isin(disallowed_quote_types)
        )
    )
)

In [94]:
# Select every row of a single article: the URL at position 1 among the distinct
# URLs (in order of first appearance), re-indexed from 0.
article_urls = sentences_with_quotes['article_url'].unique()
one_article = (
    sentences_with_quotes[sentences_with_quotes['article_url'] == article_urls[1]]
    .reset_index(drop=True)
)

In [95]:
# Serialize the same two columns of the selected article in two text formats:
# tab-separated values (with header, no index) and JSON Lines (one record per row).
annotated_sentences = one_article[['sent_lists', 'attributions']]
doc_str = annotated_sentences.to_csv(sep='\t', index=False)
json_str = annotated_sentences.to_json(lines=True, orient='records')

Before running the request cell below, start the Llama 3 model locally with Ollama by running this in your terminal:

`ollama run llama3`

In [77]:
import requests
import pyperclip

In [None]:
# Put the JSON Lines rendering on the system clipboard (handy for pasting into a
# chat UI). The clipboard holds a single value, so copying doc_str first and then
# json_str only ever left json_str there; pass doc_str here instead if the
# tab-separated rendering is the one you want to paste.
pyperclip.copy(json_str)

In [105]:
# Send the annotated article to the local Ollama server and ask for a
# per-source summary. "stream": False requests a single JSON reply rather than
# a stream of partial chunks.
r = requests.post(
    'http://localhost:11434/api/generate', 
    json = {
        "model": "llama3",
        "prompt":f"""
            Here is a news article, with each sentence annotated according to the source of it's information:
            ```
            {json_str}
            ```

            Please summarize each of our source annotations. Tell me in one paragraph per source: (1) who the source is (2) what informational content they provide to the article. 
            Only rely on the annotations I have provided, don't identify additional sources.
        """,
        "stream": False
})
# Surface HTTP errors (e.g. model not available) here, with the status code,
# rather than as a confusing KeyError when the reply body is read later.
r.raise_for_status()

In [106]:
# The generated text is stored under the 'response' key of the JSON reply body.
generated_summary = r.json()['response']
print(generated_summary)

**Socrata Foundation**: The Socrata Foundation is a 501(c)(3) organization that provides information about its philanthropic philosophy and mission. They describe their support for unique organizations that lack resources or financial means to fulfill their data-driven missions. They also mention the importance of open data in removing barriers to social justice and economic progress.

**Robert Runge**: Robert Runge, a member of Socrata's Board of Directors, provides additional context about the Socrata Foundation's purpose and how it bridges the gap between publicly funded open data projects and underfunded or unfunded opportunities.

**Mike Duggan**: Detroit Mayor Mike Duggan shares his perspective on why he turned to data-driven government in Detroit. He highlights the importance of transparency, accountability, and fact-based decision-making enabled by open data. He also explains how the Socrata Foundation helped Detroit gain access to the necessary technology and infrastructure.

