In [36]:
import pandas as pd
import orjson 
import orjsonl

# Read Data

### NYTimes

In [44]:
from more_itertools import flatten
import xopen
import gzip
import pandas as pd 
from urllib.parse import urlparse

# Load the pre-fetched NYTimes business records: a gzipped JSON Lines file,
# parsed one JSON document per line.
fname = '../data/open-sourced-articles/nytimes-business-articles-sans-html.jsonl.gz'
with xopen.xopen(fname, 'rb') as f:
    nytimes_fetched = [orjson.loads(raw_line) for raw_line in f]

# Keep only the records whose `links` entry has at least one element.
nytimes_fetched_df = (
    pd.DataFrame(nytimes_fetched)
    .loc[lambda frame: frame['links'].str.len().gt(0)]
)

In [68]:
def flatten_list_of_links(df):
    """Expand each article's list of link dicts into one row per link.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``article_url`` column and a ``links`` column whose
        values are iterables of dicts (one dict per link).

    Returns
    -------
    pandas.DataFrame
        One row per link, with the link dict's own keys as columns plus a
        ``url`` column holding the ``article_url`` of the article the link
        came from. When an ``article_url`` occurs more than once in ``df``,
        only its first row is used.

    Notes
    -----
    A new dict is built for every link, so the dicts stored inside ``df`` are
    left untouched (no ``url`` key is injected into the caller's data).
    """
    unique_articles = df.drop_duplicates('article_url')
    all_links = [
        # Copy rather than mutate: the link dicts are shared with `df`.
        {**link, 'url': article_url}
        for article_url, links in zip(
            unique_articles['article_url'], unique_articles['links']
        )
        for link in links
    ]
    return pd.DataFrame(all_links)

In [66]:
all_nyt_links_df = flatten_list_of_links(nytimes_fetched_df)


In [105]:
# Heuristics for spotting press-release links: (column to search, substring).
# Every rule is matched case-insensitively and labelled "<column>: <substring>"
# in the `method` column.
nyt_press_release_rules = [
    ('text', 'press release'),
    ('text', 'news release'),
    ('href', 'prnewswire'),
    ('href', 'businesswire'),
    ('href', 'press'),
    ('href', 'release'),
]

press_release_articles = (
    # One sub-frame per rule, stacked in rule order; a link that satisfies
    # several rules therefore appears once per rule.
    pd.concat([
        all_nyt_links_df
        .loc[all_nyt_links_df[column].str.contains(needle, case=False)]
        .assign(method=f'{column}: {needle}')
        for column, needle in nyt_press_release_rules
    ])
    # Host part of the link target.
    .assign(domain=lambda df: df['href'].map(lambda href: urlparse(href).netloc))
    # Discard links whose host contains "nytimes" ...
    .loc[lambda df: ~df['domain'].str.contains('nytimes')]
    # ... and hrefs that begin with "/".
    .loc[lambda df: ~df['href'].str.startswith('/')]
)

In [113]:
# NOTE: this is the number of matched link rows, not distinct articles — the
# frame is a concatenation of per-rule matches, so a link that satisfies
# several rules is counted once per rule.
print('Total articles:', press_release_articles.shape[0])

Total articles: 10240


In [114]:
print('Total unique articles:', press_release_articles['url'].nunique())

Total unique articles: 5602


In [115]:
press_release_articles['method'].value_counts()

## how are these being used in the article?
## are these articles about press releases, or something else that just happens to use press release?
##      our preference is

method
href: release          4429
href: press            4392
text: news release      661
href: businesswire      389
href: prnewswire        349
text: press release      20
Name: count, dtype: int64

In [116]:
# Keep a single matched link per article (the first row for each article
# `url`), then count how often each link target occurs.
press_release_articles.drop_duplicates('url')['href'].value_counts()

href
http://www.bls.gov/news.release/empsit.nr0.htm                                                                                                                             49
http://www.dol.gov/opa/media/press/eta/ui/current.htm                                                                                                                      28
http://www.bea.gov/newsreleases/national/gdp/gdpnewsrelease.htm                                                                                                            23
http://www.bls.gov/news.release/cpi.nr0.htm                                                                                                                                19
https://www.bls.gov/news.release/empsit.nr0.htm                                                                                                                            16
                                                                                                                             

In [174]:
press_release_articles.drop_duplicates('url').shape 

(5602, 5)

In [173]:
nytimes_fetched_df.shape 

(100200, 9)

### WSJ

In [118]:
# Load the pre-fetched WSJ business records: a gzipped JSON Lines file,
# parsed one JSON document per line.
fname = '../data/open-sourced-articles/wsj-business-articles-sans-html.jsonl.gz'
with xopen.xopen(fname, 'rb') as f:
    wsj_fetched = [orjson.loads(raw_line) for raw_line in f]

# Keep only the records whose `links` entry has at least one element.
wsj_fetched_df = (
    pd.DataFrame(wsj_fetched)
    .loc[lambda frame: frame['links'].str.len().gt(0)]
)

In [122]:
all_wsj_links_df = flatten_list_of_links(wsj_fetched_df)

In [123]:
all_wsj_links_df

Unnamed: 0,text,href,url
0,\n\n\n\n\n\n\n,https://www.wsj.com/articles/eviction-bans-exp...,"com,wsj)/amp/articles/com,wsj)/amp/articles/ev..."
1,\n Real Estate\n,https://www.wsj.com/news/realestate?mod=breadc...,"com,wsj)/amp/articles/com,wsj)/amp/articles/ev..."
2,Will Parker,https://www.wsj.com/news/author/will-parker,"com,wsj)/amp/articles/com,wsj)/amp/articles/ev..."
3,citing a Supreme Court,https://www.wsj.com/articles/supreme-court-dec...,"com,wsj)/amp/articles/com,wsj)/amp/articles/ev..."
4,failed to reach an agreement,https://www.wsj.com/articles/house-adjourns-wi...,"com,wsj)/amp/articles/com,wsj)/amp/articles/ev..."
...,...,...,...
33326623,Legal Policies,https://www.wsj.com/policy/legal-policies,"com,wsj)/articles/com,wsj)/articles/u-s-paymen..."
33326624,Google Play,https://play.google.com/store/apps/details?id=...,"com,wsj)/articles/com,wsj)/articles/u-s-paymen..."
33326625,App Store,https://apps.apple.com/us/app/the-wall-street-...,"com,wsj)/articles/com,wsj)/articles/u-s-paymen..."
33326626,Sign In,https://accounts.wsj.com/login?target=http%3A%...,"com,wsj)/articles/com,wsj)/articles/u-s-paymen..."


In [None]:
# Links pointing at muckrack.com. regex=False makes this a literal substring
# match; with the default regex=True the "." would match any character.
all_wsj_links_df.loc[lambda df: df['href'].str.contains('muckrack.com', regex=False)]

In [161]:
# Tag every link with the host (netloc) of its target. This adds a `domain`
# column to all_wsj_links_df in place.
all_wsj_links_df['domain'] = all_wsj_links_df['href'].map(lambda href: urlparse(href).netloc)

# Drop self-links. The test is a plain substring match, so any host that
# merely contains "wsj" is dropped as well.
all_wsj_links_df_sans_self_links = all_wsj_links_df.loc[
    ~all_wsj_links_df['domain'].str.contains('wsj')
]

In [171]:
# Heuristics for spotting press-release links: (column to search, substring).
# Every rule is matched case-insensitively and labelled "<column>: <substring>"
# in the `method` column.
wsj_press_release_rules = [
    ('text', 'press release'),
    ('text', 'news release'),
    ('href', 'prnewswire'),
    ('href', 'businesswire'),
    ('href', 'press'),
    ('href', 'release'),
    ('href', 'announcement'),
    ('text', 'announcement'),
    ('text', 'statement'),
]

wsj_press_release_articles = (
    # One sub-frame per rule, stacked in rule order; a link that satisfies
    # several rules therefore appears once per rule.
    pd.concat([
        all_wsj_links_df_sans_self_links
        .loc[all_wsj_links_df_sans_self_links[column].str.contains(needle, case=False)]
        .assign(method=f'{column}: {needle}')
        for column, needle in wsj_press_release_rules
    ])
    # Host part of the link target, then drop hosts containing "wsj".
    .assign(domain=lambda df: df['href'].map(lambda href: urlparse(href).netloc))
    .loc[lambda df: ~df['domain'].str.contains('wsj')]
    # The relative-href filter is left disabled for WSJ:
    # .loc[lambda df: ~df['href'].str.startswith('/')]
)

In [172]:
wsj_press_release_articles['method'].value_counts()

method
href: press           134
href: release         106
text: statement        29
href: announcement      6
href: businesswire      3
text: announcement      3
href: prnewswire        2
text: news release      1
Name: count, dtype: int64

In [169]:
# Spot-check: target of the third external link whose anchor text mentions
# "statement" (case-insensitive).
statement_mask = all_wsj_links_df_sans_self_links['text'].str.contains('statement', case=False)
all_wsj_links_df_sans_self_links.loc[statement_mask, 'href'].iloc[2]

'http://files.shareholder.com/downloads/YHOO/1133897927x0x436856/1ae67a98-19a9-49d2-a9f9-74f09d500817/YHOO_Q410PressRelease_Final.pdf'

In [None]:
# ---------------------------------------
# reuters :
# business, markets, technology
# ---------------------------------------
# washington post:
# business, technology,
# ---------------------------------------

In [None]:
# we definitely want to have multiple news sources 
#
## crawling stuff from the web 

## we'd like to have richer details in the flash headlines, but
## it's not new reporting, just a summary (etc.) of other documents or whatever is out there.
## 
## look at lexical form a
## NI FLASH HEADLINES
## BFW are quick stories
## 
## NI PRESS RELEASE

## Does Bloomberg cover more government or business press releases? 
# -> Will have to filter down by the subject
## Reporters break things down based on expertise:
    ## example: Reporter from europe covers the EU press release


## Models
## instruction-tuned BloombergGPT
## 
## Evaluation:
## * We don't want to generate the entire article from just the press release
## * How 
## Take the story and summary of it, and then the summary becomes the gold-standard of 
## What we are trying to produce
## Should the summary be totally automatic
## Take the opening paragraph and that's a summary 
## 
## Flash headlines, it's less necessary.
## 
## ************ How can we make sure we validate the summaries generated?
## 
## We can generate bullet points from the press release alone
## We can generate bullet points from the article
## We can summarize the common points between the two
## These three will actually come out as quite different.

## If humans are asked to summarize the news:
##    * When the news adds background, the summarization point will be very different.
##    * When they read both, that will be very different.

## Ex.
## News article covers press release and adds information about the stock
## 

## Setting up the evaluation
## Metrics for scoring
## Starting to augment this... dense retrieval?
## Here's a summary of the press release, find other things that are related? 
## Produce a bulleted list of the major highlights from the press release
## Turn each one of the bullets into a query and search the archive for similar articles. 
## 
## Ex. A drug is mentioned along with other things... one bullet point will mention the drug
## so, search the archive for similar things about that.

## Having a step where we produce a bunch of stuff, following up on them

## You can say to the language model: "A company just produced this, what questions
## do you have about this?"
# Can the model generate a bunch of things:
# 1. What's the approval timeline... 

In [145]:
# The 20 most frequently linked hosts among the non-self WSJ links
# (value_counts orders by descending count, so head(20) takes the top 20).
top_sites = all_wsj_links_df_sans_self_links['domain'].value_counts().head(20)

In [146]:
# Show the links whose host is NOT one of the hosts in `top_sites`,
# i.e. the long tail of outbound domains.
all_wsj_links_df_sans_self_links.loc[lambda df: ~df['domain'].isin(top_sites.index)]

Unnamed: 0,text,href,url,domain
54,Google’s Auto Update policy website,https://support.google.com/chrome/a/answer/622...,"com,wsj)/amp/articles/com,wsj)/amp/articles/be...",support.google.com
58,the outdated device,https://www.acer.com/ac/en/US/content/recyclin...,"com,wsj)/amp/articles/com,wsj)/amp/articles/be...",www.acer.com
59,Gallium,https://galliumos.org,"com,wsj)/amp/articles/com,wsj)/amp/articles/be...",galliumos.org
60,optimized for Chromebooks,https://wiki.galliumos.org/Hardware_Compatibility,"com,wsj)/amp/articles/com,wsj)/amp/articles/be...",wiki.galliumos.org
61,advanced hackery,https://wiki.galliumos.org/Installing,"com,wsj)/amp/articles/com,wsj)/amp/articles/be...",wiki.galliumos.org
...,...,...,...,...
33326099,REA,https://www.rea-group.com/,"com,wsj)/articles/com,wsj)/articles/u-s-paymen...",www.rea-group.com
33326100,realtor.com,https://www.realtor.com,"com,wsj)/articles/com,wsj)/articles/u-s-paymen...",www.realtor.com
33326101,Storyful,https://storyful.com/,"com,wsj)/articles/com,wsj)/articles/u-s-paymen...",storyful.com
33326103,The Sun,https://www.thesun.co.uk/,"com,wsj)/articles/com,wsj)/articles/u-s-paymen...",www.thesun.co.uk


# Slosh

In [176]:
import glob

In [180]:
glob.glob('../data/open-sourced-articles/reuters-*')

['../data/open-sourced-articles/reuters-business-cc-articles-to-fetch.txt.gz',
 '../data/open-sourced-articles/reuters-technology-cc-articles-to-fetch.txt.gz',
 '../data/open-sourced-articles/reuters-markets-cc-articles-to-fetch.txt.gz']

In [192]:
import xopen

# Merge every per-section Reuters list (files matching reuters-*-cc*) into a
# single gzipped output file, copying the raw lines unchanged.
reuters_lines = []
# sorted(): glob returns paths in arbitrary order, so sort for a reproducible merge order.
for path in sorted(glob.glob('../data/open-sourced-articles/reuters-*-cc*')):
    # Context manager so each input handle is closed once it has been read
    # (the bare `for line in xopen.xopen(...)` form never closed it).
    with xopen.xopen(path, 'rb') as src:
        for line in src:
            reuters_lines.append(line)

with xopen.xopen('../data/open-sourced-articles/reuters-business-articles-to-fetch.txt.gz', 'wb') as f:
    for line in reuters_lines:
        f.write(line)

In [220]:
# Merge every per-section Washington Post list (files matching wp-*-cc*) into
# a single gzipped output file, copying the raw lines unchanged.
wp_lines = []
# sorted(): glob returns paths in arbitrary order, so sort for a reproducible merge order.
for path in sorted(glob.glob('../data/open-sourced-articles/wp-*-cc*')):
    # Context manager so each input handle is closed once it has been read
    # (the bare `for line in xopen.xopen(...)` form never closed it).
    with xopen.xopen(path, 'rb') as src:
        for line in src:
            wp_lines.append(line)

with xopen.xopen('../data/open-sourced-articles/wp-business-articles-to-fetch.txt.gz', 'wb') as f:
    for line in wp_lines:
        f.write(line)

In [10]:

from bs4 import BeautifulSoup

# Take at most the first 1000 records of the NYTimes dump that still carries
# the raw article HTML.
fname = '../data/open-sourced-articles/nytimes-business-articles.jsonl.gz'
articles = [record for _, record in zip(range(1000), orjsonl.stream(fname))]
article_df = pd.DataFrame(articles)
htmls = article_df['article_html'].drop_duplicates()

# Parse the last distinct HTML document of the sample.
# NOTE(review): no parser is named here, so bs4 chooses one itself — confirm
# whether a specific parser should be pinned.
soup = BeautifulSoup(htmls.iloc[-1])

# Anchors that have an href attribute and non-empty text ...
links = [anchor for anchor in soup.find_all('a', href=True) if anchor.get_text() != '']
# ... reduced to plain {'text', 'href'} dicts.
links_obj = [{'text': anchor.get_text(), 'href': anchor['href']} for anchor in links]

In [None]:
import orjsonl
import gzip 
import xopen 
import jsonlines

# Scratch comparison of several ways to write/read gzipped JSON Lines, using
# the `links_obj` records as sample data.

# test orjsonl
# NOTE(review): `orjsonl.append` receives an already-open file object as
# `path` and a one-element list as `data` — confirm against the orjsonl docs
# that this is the intended usage.
with xopen.xopen('test.jsonl.gz', 'wb') as f:
    for obj in links_obj:
        orjsonl.append(path=f, data=[obj])

# Read back at most the first 100 records.
test = []
for idx, f in enumerate(orjsonl.stream('test.jsonl.gz')):
    if idx == 100:
        break
    test.append(f)
# (A stray `gzip.compress(line_str)` used to sit here; `line_str` is not
# assigned until the loop below, so it raised NameError on a fresh kernel.)


# test gzip compress: each record is compressed on its own and the compressed
# chunks are written back to back into a plain binary file.
with open('test-stream.jsonl.gz', 'wb') as f:
    for line in links_obj:
        line_str = orjson.dumps(line)
        gzipped = gzip.compress(line_str + b'\n')
        f.write(gzipped)

# test jsonlines
with xopen.xopen('test-stream-jsonwriter.jsonl.gz', 'wb') as f:
    with jsonlines.Writer(f) as w:
        for line in links_obj:
            w.write(line)

# test regular xopen
with xopen.xopen('test-bin.txt', mode='wb') as f:
    f.write(b'hello world\n')

with xopen.xopen('test-bin.txt') as f:
    print(f.read())

# Walk the whole file so that `line` ends up bound to its last line; the
# context manager closes the handle, which the bare for-loop did not.
with xopen.xopen('test-stream.jsonl.gz', 'rb') as f:
    for line in f:
        pass

orjson.loads(line)

In [33]:
# test gzip compress 
with xopen.xopen('test-stream.jsonl.gz', 'wb') as f:
    for line in links_obj:
        line_str = orjson.dumps(line) + b'\n'
        # gzipped = gzip.compress(line_str + b'\n')
        f.write(line_str)

In [34]:
# Read back only the first line of test-stream.jsonl.gz: the loop exits on its
# first iteration, leaving that line bound to `line` for inspection.
fname = 'test-stream.jsonl.gz'
# Initialised but not filled in this cell.
all_data = []
with xopen.xopen(fname, 'rb') as f:
    for line in f:
        break 

In [35]:
line 

b'{"text":"Skip to content","href":"#site-content"}\n'

In [25]:
line_str

b'{"text":"Subscriptions","href":"https://www.nytimes.com/subscription?campaignId=37WXW"}\n'

In [15]:
line 

b'{"text":"Skip to content","href":"#site-content"}\n'