In [1]:
from tqdm.auto import tqdm
from urllib.parse import urlparse
import pandas as pd
import orjson 
import orjsonl
import spacy 
spacy_model = spacy.load('en_core_web_lg')
from more_itertools import flatten
import xopen
import gzip
import os 
import pandas as pd 
from urllib.parse import urlparse
from tldextract import extract

def flatten_list_of_links(df):
    all_links = []
    df_to_iter = (
        df
        .drop_duplicates('article_url')
        .set_index('article_url')
        ['links']
    )
    for url, links in tqdm(df_to_iter.items(), total=len(df_to_iter)):
        for link in links:
            link['url'] = url
            all_links.append(link)
    return pd.DataFrame(all_links)

domain_exclusions = open('../data/utility-files/domain-exclusions-master-list.txt').read().split('\n')
domain_exclusions = list(filter(lambda x: x != '', domain_exclusions))
text_candidates = ['press release', 'news release', ]
href_whitelist_candidates = [
    'prnewswire',
    'businesswire',
    'press',
    'release',
    'globenewswire',
    'news',
]
def find_press_release(href, text, debug=False):
    for s in ['/', '#', 'mailto:']:
        if href.startswith(s):
            if debug:
                return False, 'domain starts with', s
            else:
                return False

    # parse domain
    try:
        domain = urlparse(href).netloc
        domain = tldextract.extract(domain).domain
    except:
        if debug:
            return False, 'urlparse', None
        else:
            return False

    # blacklist
    if d in domain_exclusions:
        if debug:
            return False, 'domain contains', d
        else:
            return False
        
    # text    
    for t in text_candidates:
        if t in text:
            return True 
    
    # href 
    for h in href_whitelist_candidates:
        if h in href:
            return True

    return False

def open_and_process(fname):
    fetched = []
    with xopen.xopen(fname, 'rb') as f:
        for line in f:
            fetched.append( orjson.loads(line))

    fetched_df = pd.DataFrame(fetched).loc[lambda df: df['links'].str.len() > 0]

    tqdm.pandas()
    all_links_df = flatten_list_of_links(fetched_df)
    press_release_articles = (
        all_links_df
            .loc[lambda df: df.progress_apply(find_press_release, axis=1)]
    )
    return fetched_df, press_release_articles

def retrieve_ents_for_col(id_col, text_col):
    desired_ents = set(['ORG', 'PRODUCT', 'FAC', 'LAW', 'EVENT'])
    ner_pipe = spacy_model.pipe(
        text_col.str.replace('\n', ' '), 
        disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "textcat"]
    )
    entities = []
    for idx, doc in tqdm(zip(id_col, ner_pipe), total=len(id_col)):
        ents = list(filter(lambda x: x.label_ in desired_ents, doc.ents))
        if len(ents) > 0:
            ents = list(map(str, ents))
            entities.append({
                'ents': ents,
                'idx': idx
            })
    return entities

def load_discourse_df(filename):
    discourse_df = []
    with xopen.xopen(filename) as f:
        for line in f:
            discourse_doc = orjson.loads(line)
            discourse_df.append(pd.DataFrame(discourse_doc))
    return pd.concat(discourse_df)

In [2]:
urlparse('http://www.nytimes.com').netloc

'www.nytimes.com'

# Read Data

### NYTimes

In [3]:
fname = '../data/open-sourced-articles/nytimes-business-articles-sans-html.jsonl.gz'
nytimes_fetched = []
with xopen.xopen(fname, 'rb') as f:
    for line_idx, line in enumerate(f):
        if line_idx > 10000:
            break
        
        nytimes_fetched.append( orjson.loads(line))

nytimes_fetched_df = pd.DataFrame(nytimes_fetched).loc[lambda df: df['links'].str.len() > 0]

In [4]:
all_nyt_links_df = flatten_list_of_links(nytimes_fetched_df)

  0%|          | 0/9980 [00:00<?, ?it/s]

In [8]:
t = all_nyt_links_df['href'].apply(lambda href: urlparse(href).netloc)

In [12]:
from tldextract import tldextract

In [13]:
tldextract.extract('help.nytimes.com')

ExtractResult(subdomain='help', domain='nytimes', suffix='com')

In [10]:
t.loc[lambda s: s != '']

30                help.nytimes.com
31                   www.nytco.com
32                help.nytimes.com
33                help.nytimes.com
34                   www.nytco.com
                    ...           
1454988           help.nytimes.com
1454989           help.nytimes.com
1454990    spiderbites.nytimes.com
1454991           help.nytimes.com
1454992            www.nytimes.com
Name: href, Length: 1355760, dtype: object

In [19]:
tqdm.pandas()
fpath = '../data/open-sourced-articles/nytimes-parsed-press-release-urls.csv'
if not os.path.exists(fpath):
    nyt_press_release_article_map = (
        all_nyt_links_df
            .loc[lambda df: df.progress_apply(find_press_release, axis=1)]
    )
    nyt_press_release_article_map.to_csv(fpath)
else:
    nyt_press_release_article_map = pd.read_csv(fpath, index_col=0)

In [21]:
print('Total articles:', nyt_press_release_article_map.shape[0])

Total articles: 11981


In [22]:
print('Total unique articles:', nyt_press_release_article_map['url'].nunique())

Total unique articles: 8276


In [7]:
## how are these being used in the article?
## are these articles about press releases, or something else that just happens to use press release?
##      our preference is

In [11]:
# (nytimes_fetched_df
#  .loc[lambda df: df['article_url'].isin(press_release_articles['url'])]
#  .to_csv('../data/open-sourced-articles/nytimes-articles-that-have-press-releases.csv')
# )

In [14]:
fpath = '../data/open-sourced-articles/nytimes-business-articles-with-entities.jsonl.gz'
if not os.path.exists(fpath):
    articles = retrieve_ents_for_col(
        nytimes_fetched_df['article_url'], nytimes_fetched_df['article_text']
    )
    nyt_ent_article_df = pd.DataFrame(articles)
    nyt_ent_article_df = (
        nytimes_fetched_df
             .assign(article_text=lambda df: df['article_text'].str.replace('\n', ' '))
             [['article_text', 'article_url']]
             .drop_duplicates('article_text')
             .merge(
                 nyt_ent_article_df.drop_duplicates('article'), 
                 left_on='article_text', 
                 right_on='article', 
                 how='right'
             )
             .drop(columns='article_text')
    )
    nyt_ent_article_df['ents'] = nyt_ent_article_df['ents'].apply(set)
    nyt_ent_article_df.to_json(
     fpath, 
     compression='gzip',
     lines=True, orient='records')
else:
    nyt_ent_article_df = pd.read_json(fpath, compression='gzip', 
                                      lines=True, orient='records')

In [29]:
## press releases
fpath = '../data/open-sourced-articles/nytimes-press-release-files-with-entities.jsonl.gz'
if not os.path.exists(fpath):
    nyt_press_release_df = pd.read_json(
        '../data/open-sourced-articles/nytimes-press-release-files.jsonl', 
        lines=True
    )
    nyt_press_release_entities = retrieve_ents_for_col(
        nyt_press_release_df['article_url'],
        nyt_press_release_df['article_text']
    )
    press_release_entities_df = pd.DataFrame(nyt_press_release_entities)
    nyt_press_release_df_with_entities = (
        nyt_press_release_df
             .merge(
                 press_release_entities_df, 
                 left_on='article_url',
                 right_on='idx'
             )
             .drop(columns='idx')
    )
    nyt_press_release_df_with_entities.to_json(
        fpath, compression='gzip', orient='records', lines=True
    )
else:
    nyt_press_release_df_with_entities = pd.read_json(
        fpath, compression='gzip', orient='records', lines=True
    )

In [30]:
nyt_merged_press_release_and_articles = (
    nyt_press_release_df_with_entities
     .rename(columns=lambda x: x.replace('article_', 'press_release_'))
     .rename(columns={'ents': 'press_release_ents'})
     .merge(
        nyt_press_release_article_map
             .rename(columns={'href': 'press_release_url', 'url': 'article_url', 'text': 'link_text'})
         ,
        on='press_release_url'
    )
     .drop(columns=['press_release_authors', 'press_release_top_image', 'press_release_video', 'press_release_wayback_timestamp', 'all_press_release_wayback_timestamps',])
     .merge(nyt_ent_article_df, on='article_url')
     .rename(columns={'article': 'article_text'})
     .rename(columns={'ents': 'article_ents'})
)

In [31]:
idx = 200
num_chars = 1000
press_release_ents = nyt_merged_press_release_and_articles['press_release_ents'].iloc[idx]
press_release_ents = set(
    pd.Series(press_release_ents)
        .value_counts()
        .loc[lambda s: s > 1]
        .index
)
news_ents = nyt_merged_press_release_and_articles['article_ents'].iloc[idx]
merged_ents = set(press_release_ents) & set(news_ents)
print(press_release_ents)
print('\n ------- PRESS RELEASE----------\n')
print('url: ' + nyt_merged_press_release_and_articles['press_release_url'].iloc[idx])
print()
print(nyt_merged_press_release_and_articles['press_release_text'].iloc[idx][:num_chars] + '...')
print()
print('\n------- NEWS ARTICLE----------\n')
print(nyt_merged_press_release_and_articles['article_text'].iloc[idx][:num_chars] + '...')

{'ISO New England', 'ISO'}

 ------- PRESS RELEASE----------

url: http://isonewswire.com/updates/2014/1/28/new-england-governors-announce-proposal-to-expand-regional-e.html

The six New England governors have announced a cooperative regional initiative designed to expand energy infrastructure in New England.

A letter sent to ISO New England on January 21, from the New England States Committee on Electricity (NESCOE), an organization made up of representatives appointed by the governors in each state, proposed a plan to expand the capacity of the natural gas pipeline system in the region and facilitate the development of additional electric transmission line infrastructure to improve access to renewable power sources. To spur the construction of new or expanded natural gas pipeline capacity, the states have asked the ISO to seek a tariff change to allow the ISO to collect the costs from electric market participants.

ISO New England is encouraged by the states’ focus on developing add

In [47]:
cc_to_fetch = '../data/open-sourced-articles/nytimes-cc-business-articles-to-fetch.txt.gz'

In [53]:
fetch_data = []
with xopen.xopen(cc_to_fetch) as f_handle:
    for line in f_handle:
        chunks = line.split(' ')
        fetch_data.append({
            'url': chunks[0],
            'date': chunks[1]
        })

In [55]:
to_fetch_data_df = pd.DataFrame(fetch_data)

In [83]:
strip_url = lambda s: (
    s.str.split('?').str.get(0)
     .str.split(')').str.get(-1)
     .str.strip()
)

nyt_ent_article_df['article_url'] = nyt_ent_article_df['article_url'].pipe(strip_url)
to_fetch_data_df['url'] = to_fetch_data_df['url'].pipe(strip_url)

In [87]:
nyt_ent_article_df = (nyt_ent_article_df
 .merge(to_fetch_data_df, how='left', left_on='article_url', right_on='url')
 .drop(columns='url')
)

In [90]:
nyt_press_release_article_map['url'] = nyt_press_release_article_map['url'].pipe(strip_url)

In [94]:
nyt_ent_article_df.head()

Unnamed: 0,article_url,article,ents,date
0,/video/business/100000002794456/bmw-ramps-up-p...,new video loaded: BMW Ramps Up Production at U...,"[BMW Ramps Up Production, U.S. Plant ]",20230208234349
1,/video/business/1194817103719/executive-pursui...,1:02 Biden Says Inflation Report Is ‘Proof’ H...,"[FTX Founder Leaves Court, Federal Reserve Rai...",20230128065940
2,/video/business/1194817093331/the-tussle-over-...,new video loaded: The Tussle Over CNet CNBC ...,[CNet CNBC Advertisement Continue],20230207144642
3,/interactive/2019/03/29/business/boeing-737-ma...,"This is a Boeing 737 in normal flight, pointed...","[Boeing, 737, Boeing, 737 Max, Boeing, MCAS, M...",20230127040043
4,/guides/business/manage-a-successful-team,This last point may not seem as big a deal as ...,[International Medical Corps],20230131235214


In [126]:
## discourse 
full_nytimes_discourse_df = load_discourse_df(
    '../data/open-sourced-articles/nytimes-articles-news-discourse.jsonl.gz'
)

In [127]:
nyt_article_grouped_discourse_df = (
    full_nytimes_discourse_df
     .loc[lambda df: 
          ~df['discourse_preds'].isin(['NA'])
     ]
     .sort_values(['doc_id', 'sent_idx'])
     .groupby('doc_id')[['sentences', 'discourse_preds']]
     .aggregate(list)
     .reset_index()
)

In [128]:
nyt_article_grouped_discourse_df['doc_id'] = nyt_article_grouped_discourse_df['doc_id'].pipe(strip_url)
nyt_article_grouped_discourse_df_merged  = (
    nyt_article_grouped_discourse_df
     .merge(
         nyt_ent_article_df, 
         left_on='doc_id', 
         right_on='article_url', 
     )
     .drop(columns=['article', 'article_url'])
     .assign(is_target_article=lambda df: df['doc_id'].isin(nyt_press_release_article_map['url']))
)

In [125]:
(nyt_article_grouped_discourse_df_merged
 .to_json(
     '../data/open-sourced-articles/nytimes-articles-for-factual-verification.jsonl.gz',
     orient='records',
     compression='gzip',
     lines=True
 )
)

In [132]:
full_nytimes_discourse_df = full_nytimes_discourse_df.sort_values(['doc_id', 'sent_idx'])

In [135]:
doc_ids = full_nytimes_discourse_df['doc_id'].unique()

In [146]:
doc_ids = list(filter(lambda x: '/video' not in x and '/slideshow' not in x, doc_ids))

In [172]:
pd.options.display.max_colwidth = 400
pd.options.display.html.use_mathjax = False
doc_id = doc_ids[-10006]
(full_nytimes_discourse_df
 .loc[lambda df: df['doc_id'] == doc_id]
 [['sentences', 'discourse_preds']]
)

Unnamed: 0,sentences,discourse_preds
0,"Kevin Hassett, the chairman of the White House Council of Economic Advisers, said on Sunday that administration economists expected continued boosts to growth this year from the $1.5 trillion in tax cuts that Mr. Trump signed in 2017, including further increases in labor force participation by older workers.",Distant_Expectations_Consequences
1,Other economists have criticized the sustained forecast of 3 percent growth over a decade as overly optimistic.,Distant_Evaluation
2,"Mr. Hassett defended it by pointing to the administration's forecast for last year, which was only slightly above the actual growth rate. """,Distant_Evaluation
3,"We have the same forecast we had last year,"" Mr. Hassett said, ""because we got last year precisely correct.""",Distant_Evaluation
4,"The budget is the first of Mr. Trump's tenure to theoretically adhere to spending caps that Congress adopted under President Barack Obama, only to bust in the years since.",Cause_General
5,"But Mr. Trump accomplishes this only through budgetary legerdemain by pushing much of his 4.7 percent military spending increase out of the regular budget and into an account called Overseas Contingency Operations that has been used mainly to finance wars in Iraq, Afghanistan and Syria and has therefore been exempt from congressional caps.",Cause_General
6,"At the same time, domestic discretionary programs would be cut 5 percent, an idea certain to go nowhere with Democrats.",Distant_Expectations_Consequences
7,"Mr. Trump is hoping to avoid a repeat of last year's budget deal, in which he was forced to agree to major domestic spending increases to secure his military boost.",Distant_Expectations_Consequences
8,"But he has even less sway now than he did last year, when Republicans controlled both houses of Congress.",Distant_Evaluation
9,"The overseas operations fund would receive $165 billion, compared with $69 billion this year, even as Mr. Trump is scaling back military operations in Afghanistan and Syria.",Distant_Expectations_Consequences


In [173]:
nyt_article_grouped_discourse_df_merged['ents']

Unnamed: 0,doc_id,sentences,discourse_preds,ents,date,is_target_article
0,/1981/01/05/business/advertising-wither-braniff-s-account-as-chairman-retires.html,"[With the word out that Harding L. Lawrence is retiring as chairman and chief executive of Braniff International, agency executives are going to have a field day betting on when the airline will break with its agency., Wells, Rich, Greene is Braniff's current agency; its chairman is Mary Wells Lawrence, wife of Harding Lawrence., She married him, as a matter of fact, soon after she started the...","[Distant_Expectations_Consequences, Cause_General, Distant_Historical, Distant_Historical]","[Braniff International, Wells, Rich, Greene, Braniff, Braniff, Braniff, Trans World Airlines, T.W.A., Madison Avenue, Foote, Cone & Belding]",20191121232819,False
1,/1981/01/06/business/kennecott-s-problems-persist.html,"[It was to get away from the sharp turns in the metals markets and the world glut of copper that developed in the mid-1970's that Kennecott bought Carborundum., That Kennecott needed to diversify seems clear, but the Carborundum acquisition was controversial and it provoked a 1978 proxy fight with T. Roland Berner, chairman of Curtiss-Wright., Mr. Berner won seats on Kennecott's board - for hi...","[Distant_Historical, Distant_Evaluation, Cause_General, Cause_General, Distant_Evaluation, Distant_Historical, Distant_Historical, Distant_Historical, Cause_General, Cause_General, Cause_General, Distant_Evaluation, Distant_Historical, Cause_General, Cause_General, Cause_General, Distant_Expectations_Consequences, Distant_Evaluation, Distant_Evaluation, Distant_Evaluation, Distant_Expectations...","[Kennecott, Carborundum, Carborundum, Curtiss-Wright, Kennecott, Kennecott, Carborundum, Kennecott, Carborundum, Kennecott, the Exxon Corporation, Carborundum, Kennecott, Kennecott, Chase Brass and Copper Company, Kennecott, Kennecott, Venture, Mitsubishi, the Mitsubishi Corporation, Kennecott]",20190823091225,False
2,/1981/01/06/business/venture-capitalist-a-rise-to-riches.html,"[He is equally quick to decide when he does not like a situation. '', Once a well-known investment bank took me to California to look at an electronics company they were in,'' he relates. '', The first thing I noticed was a gigantic Mercedes with initialed plates., The president said it was his, leased by the company., Then we went up to his office, which was almost as big as the production ar...","[Distant_Evaluation, Distant_Anecdotal, Distant_Anecdotal, Distant_Anecdotal, Cause_General, Distant_Evaluation, Distant_Historical, Distant_Evaluation, Distant_Evaluation, Distant_Evaluation, Distant_Anecdotal, Distant_Evaluation, Distant_Anecdotal, Distant_Evaluation, Distant_Expectations_Consequences, Distant_Evaluation, Cause_General, Cause_General, Cause_General, Distant_Evaluation, Dista...","[Mercedes, Blunt Approach Officers, Park Avenue, the Telxon Corporation, Adler & Company, The New York Times, The New York Times's, New York Times, The Data General Corporation, the Four Seasons, The Quilted Giraffe, Advanced Technology Laboratories, Squibb, Acuity Systems, Midas, Sanders Technology, Chapter 11, Adler & Company, Venad Associates, Citicorp, the Morgan Guaranty Trust Company, N...",20190717135851,False
3,/1981/01/10/business/sun-to-phase-out-shipbuilding.html,"[When the phasing out is finished, with the completion of seven ships now under construction, the present work force of 4,200 in Chester will be reduced to 1,100 workers, Mr. Campbell said., The seven ships include three freighters., ''This is going to have a catastrophic effect on the city of Chester,'' said Joseph Battle, Mayor of the city of 52,000 people, reached in his office about 10 mil...","[Distant_Expectations_Consequences, Cause_General, Distant_Evaluation, Distant_Historical, Distant_Expectations_Consequences, Distant_Evaluation, Cause_General, Cause_General, Distant_Historical, Cause_General, Distant_Evaluation, Distant_Evaluation, Cause_General]","[Sun Ship, Pennslvania's, Department of Labor and Industry, Chester]",20200222050217,False
4,/1981/01/12/business/aston-martin-challenge-for-new-owners.html,"[Victor Gauntlett, the 38-year-old head of Pace Petroleum, a Bentley and Aston Martin enthusiast who enters and sponsors automobile competitions, became a director and shareholder of Aston Martin last July and now becomes the executive chairman of the company., Tim Hearley, C.H.I.'s chairman (also made an Aston director last year), is now also a chairman, but without any executive duties., Mr....","[Cause_General, Cause_General, Distant_Historical, Cause_General, Cause_General, Distant_Historical, Distant_Historical, Distant_Historical, Distant_Historical, Distant_Expectations_Consequences, Cause_General, Distant_Evaluation, Cause_General, Main, Distant_Evaluation, Distant_Evaluation, Distant_Evaluation, Distant_Evaluation, Distant_Historical, Distant_Historical, Distant_Historical, Dist...","[Pace Petroleum, Bentley, Aston Martin, C.H.I., Aston, Aston, Aston, Aston, Aston Martin Lagonda Inc., Aston, Lagonda, London Motor Show, Lagonda, The New York Times, The New York Times's, New York Times, Aston, Aston, Aston Martin, V8, Aston, Aston Martin V8, Lagonda, Rolls-Royce, World War II, Newport Pagnell]",20190120033912,False
...,...,...,...,...,...,...
84789,/video/business/smallbusiness/1247468015760/vintage-filings-llc.html,"[Shai Stern, the C.E.O. of Vintage Filings LLC, explains how he saved his company after the market collapsed.]",[Distant_Anecdotal],[Vintage Filings LLC],20191120122351,False
84790,/video/business/smallbusiness/1247468445721/milemeter.html,"[How MileMeter, an auto insurance company, was saved by venture capital funds in the middle of the financial crisis.]",[Distant_Historical],[MileMeter],20190818064319,False
84791,/video/business/smallbusiness/1248069170884/surviving-on-the-front-lines.html,"[At the New York Times Small Business Summit, two business owners discussed networking, hiring and firing, and the role of luck.]",[Distant_Anecdotal],[the New York Times Small Business Summit],20190917092907,False
84792,/video/business/worldbusiness/1194832504471/dhl-chief-on-move-to-cut-jobs.html,"[CNBC interview with John Mullen, global chief executive of DHL Express, on the company's moves to cut jobs and close branches in the United States.]",[Cause_General],"[CNBC, DHL Express]",20190920170255,False


In [178]:
doc_ids = nyt_article_grouped_discourse_df_merged['doc_id'].drop_duplicates().sample(frac=1)

In [224]:
from dateutil.parser import parse as date_parse
import numpy as np
from more_itertools import flatten

In [195]:
nyt_article_grouped_discourse_df_merged['date'] = (
    nyt_article_grouped_discourse_df_merged['date'].apply(lambda x: date_parse(x) if isinstance(x, str) else np.nan)
)

In [223]:
nyt_article_grouped_discourse_df_merged['ents'] = nyt_article_grouped_discourse_df_merged['ents'].apply(set)

In [225]:
to_exclude = set(
    nyt_article_grouped_discourse_df_merged
     .pipe(lambda s: pd.Series(list(flatten(s['ents'].tolist()))))
     .value_counts()
     .pipe(lambda s: s/s.sum())
     .loc[lambda s: s > .01]
     .index.tolist()
)

In [226]:
to_exclude

{'New York Times', 'The New York Times', "The New York Times's"}

In [231]:
nyt_article_grouped_discourse_df_merged['ents'] = nyt_article_grouped_discourse_df_merged['ents'].apply(lambda x: x - to_exclude)

In [254]:
from datetime import timedelta

article = (
    nyt_article_grouped_discourse_df_merged
        .loc[lambda df: df['doc_id'] == doc_ids[1]]
        .iloc[0]
)
article_matching_keys = set(article['ents'])
article_date = article['date']

start_dt = article_date - timedelta(days=2 * 30)
archive = (
    nyt_article_grouped_discourse_df_merged
        .loc[lambda df: df['date'] < article_date]
        .loc[lambda df: df['date'] > start_dt]
        .loc[lambda df: df['doc_id'] != doc_id]
        .loc[lambda df: df['ents'].apply(lambda x: len(set(x) & article_matching_keys) > 0)]
)

In [255]:
(archive['ents']
     .pipe(lambda s: pd.Series(list(flatten(s.tolist()))))
     .value_counts()
     .pipe(lambda s: s/s.sum())
     .loc[lambda s: s > .01]
     .index.tolist()
)

['Venture', 'Mitsubishi', 'Congress']

In [259]:
archive.shape 

(8, 6)

In [260]:
archive['sentences'].str.len().sum()

347

### WSJ

In [11]:
import xopen
import orjson

fname = '../data/open-sourced-articles/wsj-business-articles-sans-html.jsonl.gz'
wsj_fetched = []
with xopen.xopen(fname, 'rb') as f:
    for line in f:
        wsj_fetched.append( orjson.loads(line))

wsj_fetched_df = pd.DataFrame(wsj_fetched).loc[lambda df: df['links'].str.len() > 0]

In [5]:
tqdm.pandas()
all_wsj_links_df = flatten_list_of_links(wsj_fetched_df)
wsj_press_release_articles = all_wsj_links_df.loc[lambda df: df.progress_apply(find_press_release, axis=1)]

In [41]:
excluded_urls = [
    'https://www.dowjones.com/professional/newswires/',
    'https://www.getnewsmart.com/',
    'https://newscorp.com/business/dow-jones/',
    'https://www.dowjones.com/products/newswires/',
]

In [43]:
wsj_press_release_articles = wsj_press_release_articles.loc[lambda df: ~df['href'].isin(excluded_urls)]

In [46]:
wsj_press_release_articles.to_csv('../data/open-sourced-articles/wsj-parsed-press-release-urls.csv')

In [50]:
wsj_relevant_articles = wsj_fetched_df.loc[lambda df: df['article_url'].isin(wsj_press_release_articles['url'])]

# Barrons

In [8]:
fname = '../data/open-sourced-articles/barrons-business-articles-sans-html.jsonl.gz'
barrons_fetched_df, barrons_press_release_article_map = open_and_process(fname)

  0%|          | 0/91420 [00:00<?, ?it/s]

  0%|          | 0/5334347 [00:00<?, ?it/s]

In [17]:
barrons_fetched_df = barrons_fetched_df.drop_duplicates('article_text')

In [24]:
barrons_entities = retrieve_ents_for_col(barrons_fetched_df['article_url'],
                                         barrons_fetched_df['article_text'])

  0%|          | 0/66309 [00:00<?, ?it/s]

In [28]:
barrons_entities_df = pd.DataFrame(barrons_entities)

In [99]:
# barrons_entities_df.to_json('../data/open-sourced-articles/barrons-article-entities.json.gz', compression='gzip', lines=True, orient='records')

In [33]:
barrons_fetched_df_with_ents = barrons_fetched_df.merge(
    barrons_entities_df,
    left_on='article_url',
    right_on='idx'
).drop(columns='idx')
barrons_fetched_df_with_ents['ents'] = barrons_fetched_df_with_ents['ents'].apply(set)

In [38]:
barrons_press_release_df = (pd
 .read_json('../data/open-sourced-articles/barrons-press-release-files.jsonl', lines=True, orient='records')
 .rename(columns=lambda x: x.replace('article_', 'press_release_'))
)

In [51]:
barrons_pr_entities = retrieve_ents_for_col(
    barrons_press_release_df['press_release_url'],
    barrons_press_release_df['press_release_text'].apply(lambda x: x[:10_000])
)

  0%|          | 0/969 [00:00<?, ?it/s]

In [98]:
# pd.DataFrame(barrons_pr_entities).to_json('../data/open-sourced-articles/barrons-press-release-entities.json.gz', compression='gzip', lines=True, orient='records')

In [54]:
barrons_press_release_df_with_ents = barrons_press_release_df.merge(
    pd.DataFrame(barrons_pr_entities),
    left_on='press_release_url',
    right_on='idx'
).drop(columns='idx')

In [60]:
barrons_merged_df = (
    barrons_press_release_df_with_ents
        .rename(columns={'ents': 'press_release_ents'})
        .merge(
            barrons_press_release_article_map
             .rename(columns={'href': 'press_release_url', 'url': 'article_url', 'text': 'link_text'}),
            on='press_release_url'
        )
 .rename(columns={"homepage_key": "press_release_homepage_key"})
 .drop(columns=['press_release_authors', 'press_release_top_image', 'press_release_video', 'press_release_wayback_timestamp', 'all_press_release_wayback_timestamps',])
 .merge(barrons_fetched_df_with_ents, on='article_url')
 .rename(columns={'ents': 'article_ents'})
 .drop(columns=['article_authors', 'article_top_image', 'article_video', 'source'])
)

In [91]:
most_common_ents = set(
    barrons_press_release_df_with_ents['ents']
     .pipe(lambda s: pd.Series(list(flatten(s.tolist())))).value_counts()
     .head(3).index
)

In [130]:
idx = 0
num_chars = 500
press_release_ents = barrons_merged_df['press_release_ents'].iloc[idx]
press_release_ents = set(
    pd.Series(press_release_ents)
        .value_counts()
        .loc[lambda s: s > 1]
        .index
)
press_release_ents -= most_common_ents
news_ents = barrons_merged_df['article_ents'].iloc[idx]
merged_ents = set(press_release_ents) & set(news_ents)
print(press_release_ents)
print('\n ------- PRESS RELEASE----------\n')
print('url: ' + barrons_merged_df['press_release_url'].iloc[idx])
print()
print(barrons_merged_df['press_release_text'].iloc[idx][:num_chars] + '...')
print()
print('\n------- NEWS ARTICLE----------\n')
print(barrons_merged_df['article_text'].iloc[idx][:num_chars] + '...')

{'REEcorner', 'American Axle & Manufacturing', 'AVs', 'REE'}

 ------- PRESS RELEASE----------

url: https://www.prnewswire.com/news-releases/ree-commences-trials-of-all-new-electric-p7-modular-platform-for-delivery-fleets-301454522.html

Supporting up to 8,800 lbs. max payload, the P7 platform packs up to 35% more packages than comparable commercial vehicles or the equivalent and can comfortably carry up to 30 passengers, making it the optimal platform for target markets such as delivery and logistic fleet owners, transit authorities, school buses and mobility operators. REEcorner and X-by-Wire technology allows each wheel to move independently for enhanced driving dynamics and safety with all-wheel steer, drive and brake option...


------- NEWS ARTICLE----------

Cars are the new phones. Look no further than CES 2022 in Las Vegas. Many of the cool announcements at the consumer electronics show are all about a cutting-edge technology: the automobile.

Take Sony (ticker: SONY). It mak

In [93]:
pd.options.display.max_colwidth = 500
(barrons_fetched_df_with_ents
     .loc[lambda df: df['ents'].apply(lambda x: len(x& press_release_ents) > 0)]
     [['article_text']]
)

Unnamed: 0,article_text
6410,"Text size\n\nUsed car prices hit a record recently, in what can be described as a surprising development amid a global pandemic and falling new car sales. There are a few reasons for the rise, which are a positive for all automotive stocks.\n\nThe Manheim used vehicle price index hit a record in mid-June, rising about 4% year over year, 7% compared with the May reading, and surging more than 16% compared with the April pandemic-induced low.\n\nIt’s a surprising data point. “What planet am I ..."
15128,"Text size\n\nAbout the author: Clifford Winston is a senior fellow at the Brookings Institution.\n\nCongestion and delays at ports and on highways are raising fears of a disrupted holiday shopping season. It may be too late to rescue this year’s Christmas stockings, but there is a way to reduce congestion delays significantly and prevent future breakdowns of the supply chain. Policymakers should see the supply-chain crisis as a reason to prioritize autonomous transportation across the econom..."
15602,"Text size\n\nAbout the authors: Clifford Winston is a senior fellow at the Brookings Institution. Joan Winston is a technology policy analyst in Washington, D.C.\n\nAutonomous vehicles have the potential to address major social problems that exist today and will persist in the future, including nerve-wracking congestion, millions of fatal and nonfatal accidents, and violent police confrontations with drivers. AVs can enable increased economic activity without increasing the spread of a virus..."
15930,"Forget the race into space. The competition to be the first country to launch driverless cars has grabbed the attention of more nations than a new lunar landing.\n\nThe United Kingdom has emerged as the No. 1 location on earth to support autonomous vehicles, or AVs, in a new analysis conducted by the Society of Motor Manufacturers and Traders, or SMMT. The U.K. lobby group has calculated that it will generate an economic boost of 62 billion pounds sterling ($81.1 billion) per annum by 2030, ..."
33465,"Text size\n\nIt’s been a cruel winter for many parts of the U.S. but it’s been even colder for the auto industry, with Ford Motor (F) and General Motors (GM) firmly in the red year-to-date, along with suppliers like American Axle & Manufacturing (AXL) and Delphi Technologies (DLPH).\n\nSpencer Platt/Getty Images\n\nOf course, there’s plenty of obvious reasons why, like the specter of a trade war and NAFTA risks. But all that noise has created a buying opportunity, argues Deutsche Bank’s Rod ..."
41355,"Text size\n\nREE Automotive Ltd. [NASDAQ: ""REE""], a leader in e-Mobility, today announced that it will open its U.S. headquarters in Austin, Texas to address the growing U.S. market demand for mission-specific EVs from delivery and logistics companies, Mobility-as-a-Service and new technology players. In addition, Austin will be the location of REE's first asset-light Integration Center for the assembly and testing of its disruptive REEcorner(TM) technology and ultra-modular EV platforms. Th..."


In [139]:
full_barrons_discourse_df = load_discourse_df(
    '../data/open-sourced-articles/barrons-articles-news-discourse.jsonl.gz'
)

In [145]:
barrons_articles_with_discourse = (full_barrons_discourse_df
 .loc[lambda df: df['discourse_preds'].isin(['Main', 'Cause_General', 'Cause_Specific'])]
 .groupby('doc_id')
 [['sentences', 'discourse_preds']].aggregate(list)
)

In [152]:
sort_order = {'Main':0, 'Cause_General':1, 'Cause_Specific': 2}
barrons_articles_with_discourse = (barrons_articles_with_discourse
 .loc[lambda df: 
  df['discourse_preds']
      .apply(lambda x: len(set(x)  & set(['Main', 'Cause_General'])) > 1)
     ]
 .apply(lambda x: list(zip(x['sentences'], x['discourse_preds'])), axis=1)
 .apply(lambda x: sorted(x, key=lambda y: sort_order[y[1]]))
)

In [159]:
(barrons_articles_with_discourse
 .pipe(lambda s: pd.DataFrame(s.iloc[0]))
 .loc[lambda df: df[0].str.split(' ').str.len() > 3]
)

Unnamed: 0,0,1
0,"Lawsuits reveal that big wealth management companies are poring over emails, texts, printer usage, and more to find evidence that advisors may be violating employment or nonsolicitation agreements.",Main
2,"Since acquiring Merrill Lynch at the nadir of the financial crisis, Bank of America has been putting its stamp on the storied brokerage and integrating it more tightly into the larger bank.",Cause_General
3,An overhaul of Merrill's advisor training program earlier this year represents the latest step in this evolution.,Cause_General
6,"In this Big Q, advisors share how they're responding, including borrowing more money, recalibrating clients' bond strategies, and looking to commodities and real estate as an inflation hedge.",Cause_General
7,"Whether the goal is growing the business or partially cashing out in a liquidity event, more and more RIA owners are selling minority stakes in their firms.",Cause_General
8,"Also, in the ""best of 2021"" spirit, this week we share some of the most insightful answers to questions in our weekly Advisor Q&A. Topics range from portfolio construction to balancing work and family to the risks that might lie ahead for an industry that has experienced growth and prosperity for many years.",Cause_General
9,"But amid the changes, some veteran Merrill advisors managing lots of money are walking out the door.",Cause_Specific
10,"In just 18 months through early May, when this story was published, Canada's CI Financial gobbled up 16 U.S. wealth management firms, quickly turning the company into a major force in the white-hot M&A market for RIAs.",Cause_Specific


In [None]:
# ---------------------------------------
# reuters :
# business, markets, technology
# ---------------------------------------
# washington post:
# business, technology,
# ---------------------------------------

In [None]:
# we definitely want to have multiple news sources 
#
## crawling stuff from the web 

## we'd like to have more richer details in the flash headlines, but 
## it's not new reporting. just summarized/etc. of other documents or whatever is out there.
## 
## look at lexical form a
## NI FLASH HEADLINES
## BFW are quick stories
## 
## NI PRESS RELEASE

## Does Bloomberg cover more government or business press releases? 
# -> Will have to filter down by the subject
## Reporters break things down based on expertise:
    ## example: Reporter from europe covers the EU press release

## Models
## instruction-tuned BloombergGPT
## 
## Evaluation:
## * We don't want to generate the entire article from just the press release
## * How 
## Take the story and summary of it, and then the summary becomes the gold-standard of 
## What we are trying to produce
## Should the summary be totally automatic
## Take the opening paragraph and that's a summary 
## 
## Flash headlines, it's less necessary.
## 
## ************ How can we make sure we validate the summaries generated?
## 
## We can generate bullet points from the press release alone
## We can generate bullet points from the article
## We can summarize the common points between the two
## These three will actually come out as quite different.

## If humans are asked to summarize the news:
##    * When the news adds background, the summarization point will be very different.
##    * When they read both, that will be very different.
## Ex.
## News article covers press release and adds information about the stock
## 

## Setting up the evaluation
## Metrics for scoring
## Starting to augment this... dense retrieval?
## Here's a summary of the press release, find other things that are related? 
## Produce a bulleted list of the major highlights from the press release
## Turn each one of the bullets into a query and search the archive for similar articles. 
## 
## Ex. A drug is mentioned along with other things... one bullet point will mention the drug
## so, search the archive for similar things about that.

## Having a step where we produce a bunch of stuff, following up on them

## You can say  to Language model: "A company just produced this, what questions 
# do you have aboubt this?"
# Can the model generate a bunch of things:
# 1. What's the approval timeline... 

# Process Domain Blacklist

In [None]:
import glob
domain_files = glob.glob('../data/utility-files/*.*')

all_domain_files = []
for f in domain_files:
    if f.endswith('.json'):
        df= pd.read_json(f, lines=True)
    else:
        df = pd.read_csv(f)
    all_domain_files.append(df)

news_domains_df = pd.concat(all_domain_files)
news_domains_df= news_domains_df.loc[lambda df: ~df['domain'].str.contains('.gov', regex=False)]
news_domains = news_domains_df['domain'].str.split('.').str.get(0).drop_duplicates().tolist()

social_media_blog_lists = glob.glob('../data/utility-files/Social-media-Blocklists/*.txt')
all_social_media_sites = []
for l_file in social_media_blog_lists:
    with open(l_file) as f:
        lines = f.read().split('\n')
        matching_lines= list(filter(lambda x: x.startswith('||'), lines))
        all_social_media_sites += matching_lines

social_media_domains = list(map(lambda x: x.replace('||','').replace('^', ''), all_social_media_sites))
social_media_domains = list(set(map(lambda x: tldextract.extract(x).domain, social_media_domains)))

additional_exclusions = [
    'google',
    'xinhuanet',
    'news.sky',
    'billboard',
    'india',
    'the-sun',
    'timesofindia'
]

# domain exclusions
domain_exclusions = news_domains + social_media_domains + additional_exclusions

with open('../data/utility-files/domain-exclusions-master-list.txt', 'w') as f:
    for d in domain_exclusions:
        f.write(d)
        f.write('\n')

# Slosh

In [176]:
import glob

In [180]:
glob.glob('../data/open-sourced-articles/reuters-*')

['../data/open-sourced-articles/reuters-business-cc-articles-to-fetch.txt.gz',
 '../data/open-sourced-articles/reuters-technology-cc-articles-to-fetch.txt.gz',
 '../data/open-sourced-articles/reuters-markets-cc-articles-to-fetch.txt.gz']

In [192]:
import xopen

reuters_lines = []
for f in glob.glob('../data/open-sourced-articles/reuters-*-cc*'):
    for line in xopen.xopen(f, 'rb'):
        reuters_lines.append(line)
with xopen.xopen('../data/open-sourced-articles/reuters-business-articles-to-fetch.txt.gz', 'wb') as f:
    for line in reuters_lines:
        f.write(line)

In [220]:
wp_lines = []
for f in glob.glob('../data/open-sourced-articles/wp-*-cc*'):
    for line in xopen.xopen(f, 'rb'):
        wp_lines.append(line)
        
with xopen.xopen('../data/open-sourced-articles/wp-business-articles-to-fetch.txt.gz', 'wb') as f:
    for line in wp_lines:
        f.write(line)

In [None]:
import orjsonl
import gzip 
import xopen 
import jsonlines

# test orjsonl
with xopen.xopen('test.jsonl.gz', 'wb') as f:
    for obj in links_obj:
        orjsonl.append(path=f, data=[obj])

test = []
for idx, f in enumerate(orjsonl.stream('test.jsonl.gz')):
    if idx == 100:
        break 
    test.append(f) 
gzipped = gzip.compress(line_str)


# test gzip compress 
with open('test-stream.jsonl.gz', 'wb') as f:
    for line in links_obj:
        line_str = orjson.dumps(line)
        gzipped = gzip.compress(line_str + b'\n')
        f.write(gzipped)

# test jsonlines
with xopen.xopen('test-stream-jsonwriter.jsonl.gz', 'wb') as f:
    with jsonlines.Writer(f) as w:
        for line in links_obj:
            w.write(line)

# test regular xopen
with xopen.xopen('test-bin.txt', mode='wb') as f:
    f.write(b'hello world\n')

with xopen.xopen('test-bin.txt') as f:
    print(f.read())

for line in xopen.xopen('test-stream.jsonl.gz', 'rb'):
    line 

orjson.loads(line)

In [33]:
# test gzip compress 
with xopen.xopen('test-stream.jsonl.gz', 'wb') as f:
    for line in links_obj:
        line_str = orjson.dumps(line) + b'\n'
        # gzipped = gzip.compress(line_str + b'\n')
        f.write(line_str)

In [34]:
fname = 'test-stream.jsonl.gz'
all_data = []
with xopen.xopen(fname, 'rb') as f:
    for line in f:
        break 

In [35]:
line 

b'{"text":"Skip to content","href":"#site-content"}\n'

In [25]:
line_str

b'{"text":"Subscriptions","href":"https://www.nytimes.com/subscription?campaignId=37WXW"}\n'

In [15]:
line 

b'{"text":"Skip to content","href":"#site-content"}\n'

# Read Bloomberg Press Release Data

In [None]:
import glob
from tqdm.auto import tqdm
press_release_files = glob.glob('../data/zomo-downloads/*rele*')

all_press_releases = []
for f in tqdm(press_release_files):
    all_press_releases.append(pd.read_excel(f))

all_press_releases_df = pd.concat(all_press_releases)

# all_press_releases_df.to_csv('../data/zomo-downloads/all-press-release-df.csv')

found_df = (
    all_press_releases_df[['release_web_url']]
     .dropna()
     .assign(
        found=lambda df:
         df['release_web_url'].pipe(lambda s: 
         s.str.contains('prnewswire', case=False) |
         s.str.contains('businesswire', case=False) |
         s.str.contains('press', case=False) |
         s.str.contains('/news/', case=False) |
         s.str.contains('release', case=False)     
)))

'ReportPage',
'contentDownload'
'Archives'

found_df['found'].value_counts()

import string

def split_all_punct(s):
    for p in string.punctuation:
        s = s.replace(p, ' ')
    return s.split()

urls_split_by_punct  = (found_df
 .loc[lambda df: df['found'] == False]
 ['release_web_url']
 .apply(split_all_punct)
)

(urls_split_by_punct
 .pipe(lambda s: pd.Series(list(flatten(s.tolist()))))
 .value_counts()
 .head(20)
)