In [None]:
from datasets import load_from_disk
import pandas as pd


In [None]:
! mkdir ../data
! wget https://storage.googleapis.com/usc-data/all-coref-resolved.tar.gz
! wget https://storage.googleapis.com/usc-data/full-source-scored-data.jsonl.gz
! mv all-coref-resolved.tar.gz ../data
! mv full-source-scored-data.jsonl.gz ../data
! tar -xvzf ../data/all-coref-resolved.tar.gz

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
# Directory that holds the downloaded data files.
data_dir = '../data'
# How many records to read from the source-scored file. Kept small for quick
# experimentation; set to None to read the whole file.
N_SOURCE_ROWS = 100
source_df = pd.read_json(
    f'{data_dir}/full-source-scored-data.jsonl.gz',
    lines=True,           # the file is JSON Lines: one record per line
    compression='gzip',
    nrows=N_SOURCE_ROWS,  # nrows is only honoured together with lines=True
)

In [None]:
# Load the saved article dataset from its on-disk directory.
coref_dataset_path = '../all-coref-resolved'
article_d = load_from_disk(coref_dataset_path)

In [None]:
# Keep only the articles whose URL appears in source_df.
# The URL set is built once up front; the original rebuilt it from the whole
# column inside the predicate, i.e. once for every row being filtered.
source_article_urls = set(source_df['article_url'])
filtered_article_d = article_d.filter(
    lambda x: x['article_url'] in source_article_urls,
    num_proc=10,
)

In [None]:
#experimentation, please ignore
print(type(filtered_article_d))
articles = filtered_article_d.to_pandas()

# Join the articles to source_df on URL, reusing the frame converted above
# instead of calling to_pandas() a second time.
sentences_with_quotes = articles.merge(source_df, on='article_url')

print(sentences_with_quotes.columns)
# Show at most the first 100 URLs. head(100) simply yields fewer items when the
# merge produced fewer than 100 rows, where the original range(100)/iloc loop
# raised IndexError.
for article_url in sentences_with_quotes['article_url'].head(100):
    print(article_url)


In [None]:
# Quote types whose attribution should be blanked out below.
disallowed_quote_types = {'Other', 'Background/Narrative', 'No Quote'}

# Join articles to source_df on URL, keep the columns of interest, and explode
# the three list-valued columns together so each list element gets its own row
# (presumably one row per sentence -- TODO confirm against the data schema).
sentences_with_quotes = (
    filtered_article_d
         .to_pandas()
         .merge(source_df, on='article_url')
         [['article_url', 'attributions', 'quote_type', 'sent_lists',]]
         .explode(['attributions', 'quote_type', 'sent_lists'])
)

# Replace the attribution with NaN on rows whose quote type is disallowed.
# A vectorized isin/mask replaces the original row-wise apply(axis=1): it avoids
# a Python call per row and also works on an empty frame, where apply(axis=1)
# returns a DataFrame rather than a Series.
sentences_with_quotes = sentences_with_quotes.assign(
    attributions=lambda df: df['attributions'].mask(
        df['quote_type'].isin(disallowed_quote_types)
    )
)

In [None]:
# Pick the second distinct article URL and keep only the rows for that article,
# renumbering them from zero.
second_article_url = sentences_with_quotes['article_url'].unique()[1]
is_second_article = sentences_with_quotes['article_url'] == second_article_url
one_article = sentences_with_quotes.loc[is_second_article].reset_index(drop=True)

In [None]:
# Serialize the selected article's sentence/attribution columns two ways:
# tab-separated text (no index column) and JSON Lines (one record per row).
annotated_sentences = one_article[['sent_lists', 'attributions']]
doc_str = annotated_sentences.to_csv(sep='\t', index=False)
json_str = annotated_sentences.to_json(orient='records', lines=True)

I ran llama3 locally using a cool program called Ollama: https://ollama.com/. You can follow the link to install the program.

This can be useful for trying things out before we work out compute access.

Run this in your terminal:

`ollama run llama3`

Even Llama 3 8B is pretty powerful. Llama 3 70B is better, but it may not run on your local computer.

In [None]:
import requests
import pyperclip

In [None]:
# Put the JSON Lines rendering on the clipboard.
# The original copied doc_str first and then immediately overwrote the clipboard
# with json_str, so the first copy never had any lasting effect. Copy doc_str
# here instead if the tab-separated version is the one you want.
pyperclip.copy(json_str)

In [None]:
# Upper bound on how long to wait for the local model. Generation can be slow,
# so this is generous, but without any timeout requests would wait forever if
# the server hangs.
OLLAMA_TIMEOUT_SECONDS = 600

# Send the annotated article to the server at localhost:11434 and ask for a
# per-source summary. "stream": False is passed so the reply can be read with a
# single r.json() call.
r = requests.post(
    'http://localhost:11434/api/generate',
    json = {
        "model": "llama3",
        "prompt":f"""
            Here is a news article, with each sentence annotated according to the source of it's information:
            ```
            {json_str}
            ```

            Please summarize each of our source annotations. Tell me in one paragraph per source: (1) who the source is (2) what informational content they provide to the article.
            Only rely on the annotations I have provided, don't identify additional sources.
        """,
        "stream": False
    },
    timeout=OLLAMA_TIMEOUT_SECONDS,
)
# Surface HTTP errors here rather than as a confusing failure when the response
# body is read later.
r.raise_for_status()

In [None]:
# Decode the JSON reply and show the text stored under its 'response' key.
generated_summary = r.json()['response']
print(generated_summary)