In [2]:
import requests
import json
import os
os.environ['CURL_CA_BUNDLE'] = '/Users/alex/.bb-cert/ca-bundle.trust.crt'

def format_summarize_press_release_prompt(input_text):
    return f"""
    {input_text}

    Below is a bullet-point summary of the above article:
    """

def format_instructions(input_text):
    input_text = input_text.replace('\n' ,' ')
    return f"""
    You are a journalist. Please summarize the important points in the following press release.
    Do not say anything not in the press release.

    Here is the press release: "{input_text}". Write you summarized bullet points now:
    """

model_url = 'https://bbgpt-dev-s-ailm.inference-dev-01-pw.dsp.dev.bloomberg.com/v1/models/bbgpt:predict'
instruct_model_a_url = 'https://bbgpt-ft-a-dev-s-ailm.inference-dev-01-tt.dsp.dev.bloomberg.com/v1/models/bbgpt-ft-a:predict'
instruct_model_b_url = 'https://bbgpt-ft-b-dev-s-ailm.inference-dev-01-tt.dsp.dev.bloomberg.com/v1/models/bbgpt-ft-b:predict'
# model_url = 'https://mpt-7b-instruct-s-ailm.inference-dev-01-tt.dsp.dev.bloomberg.com/v1/models/mpt-7b-instruct:predict'

## Read in News Discourse

In [19]:
import jsonlines
import pandas as pd 

In [67]:
def read_news_discourse():
    with open('../data/news-discourse.jsonl') as f:
        news_discourse = list(jsonlines.Reader(f))
        news_discourse_df = list(map(pd.DataFrame, news_discourse))
        full_news_discourse_df = pd.concat(news_discourse_df)
    return full_news_discourse_df
full_news_discourse_df = read_news_discourse()

## Read and Filter Press Releases

In [45]:
import glob
from tqdm.auto import tqdm 

all_releases = glob.glob('../data/zomo-downloads/*release*')
all_releases = list(filter(lambda x: '$' not in x, all_releases))
all_maps = glob.glob('../data/zomo-downloads/*suids-rids*')

all_maps_dfs = []
for f in tqdm(all_maps):
    all_maps_dfs.append(pd.read_excel(f))
all_maps_df = pd.concat(all_maps_dfs)

  0%|          | 0/6 [00:00<?, ?it/s]

In [46]:
all_release_dfs = []
for f in tqdm(all_releases):
    all_release_dfs.append(pd.read_excel(f))
all_release_df = pd.concat(all_release_dfs)

  0%|          | 0/22 [00:00<?, ?it/s]

In [54]:
full_news_discourse_df['doc_id'].drop_duplicates()

0    R6AZ03T0AFB4
0    R7MGYFT0AFB4
0    R9E46BDWRGG1
0    RB39YPDWX2PV
0    RD1ZJ2DWRGG1
         ...     
0    RKNLFOT0AFB4
0    RKWH0RDWRGG2
0    RKXAYHDWX2PS
0    RLFOYLT1UM0X
0    RMZ4KCT0G1KW
Name: doc_id, Length: 1957, dtype: object

In [57]:
release_id_candidates = all_maps_df.loc[lambda df: df['story_suid'].isin(full_news_discourse_df['doc_id'])]

In [72]:
release_id_candidates.to_csv('../data/beat-press-release-mapper__2022-2023.csv')

In [64]:
releases_with_linked_beat_stories = (
    all_release_df
        .loc[lambda df: df['release_suid'].isin(release_id_candidates['release_suid'])]
)

In [65]:
releases_with_linked_beat_stories.to_csv('../data/beat-press-releases__2022-2023.csv')

# Try to Query BBGPT for Main Ideas

In [None]:
## todo: 
# ask BBGPT each cause_general statement could be derived from the press release...? 
# see what % of the document came from press release or elsewhere? 
# 

In [81]:
import pandas as pd 
releases_with_linked_beat_stories = pd.read_csv('../data/beat-press-releases__2022-2023.csv', index_col=0)
full_news_discourse_df = read_news_discourse()
full_news_discourse_df = (
    full_news_discourse_df
     .merge(release_id_candidates, right_on='story_suid', left_on='doc_id')
     .drop(columns='doc_id')
)
release_id_candidates = pd.read_csv('../data/beat-press-release-mapper__2022-2023.csv' , index_col=0)

In [89]:
release_suids = releases_with_linked_beat_stories['release_suid'].drop_duplicates().tolist()

In [126]:
def get_press_release_news_article_pair(release_suid):
    example_press_release = (
        releases_with_linked_beat_stories
            .loc[lambda df: df['release_suid'] == release_suid]
            ['release_body']
            .iloc[0]
        #         .pipe(lambda s: print(s.iloc[0]))
    )

    example_main_event = (
        full_news_discourse_df
             .loc[lambda df: df['release_suid'] == release_suid]
             .loc[lambda df: df['discourse_preds'] == 'Main']
             ['sentences'].pipe(lambda s: ' '.join(s))
    )
    return example_press_release, example_main_event

In [132]:
examples = []
for release_suid in release_suids[:50]:
    e = get_press_release_news_article_pair(release_suid)
    if len(e[1]) > 0:
        examples.append(e)

In [133]:
model_url = 'https://bbgpt-dev-s-ailm.inference-dev-01-pw.dsp.dev.bloomberg.com/v1/models/bbgpt:predict'
instruct_model_a_url = 'https://bbgpt-ft-a-dev-s-ailm.inference-dev-01-tt.dsp.dev.bloomberg.com/v1/models/bbgpt-ft-a:predict'
instruct_model_b_url = 'https://bbgpt-ft-b-dev-s-ailm.inference-dev-01-tt.dsp.dev.bloomberg.com/v1/models/bbgpt-ft-b:predict'
flanul2_url = 'https://flan-ul2-dev-s-ailm.inference-dev-01-pw.dsp.dev.bloomberg.com/v1/models/flan-ul2:predict'
# model_url = 'https://mpt-7b-instruct-s-ailm.inference-dev-01-tt.dsp.dev.bloomberg.com/v1/models/mpt-7b-instruct:predict'
## try flanul 2 

In [137]:
from bs4 import BeautifulSoup
from transformers import AutoTokenizer
gpt_tok = AutoTokenizer.from_pretrained('gpt2')

import re 
from unidecode import unidecode
def parse_html(pr):
    try:
        pr = BeautifulSoup(pr).get_text(' ')
    except:
        pass 
    pr = unidecode(pr)
    pr = pr.replace('\n', ' ')
    return re.sub(' +', ' ', pr)

def format_instructions(target_press_release, examples):
    examples_formatted = []
    for idx, (pr, n) in enumerate(examples):
        pr = parse_html(pr)[:500]
        to_append = f'''-------------EXAMPLE: {idx+1}: 
        PRESS RELEASE: "{pr}"
        NEWS SENTENCES: "{n}"
        '''
        examples_formatted.append(to_append)
    
    examples_formatted = '\n\n'.join(examples_formatted)
    
    return f"""
        You are a journalist who writes news based on press releases.
        
        Here are some examples of what you need to do:
        
        {examples_formatted}
        
        -------------Now it's your turn:
        
        PRESS RELEASE: {parse_html(target_press_release)[:1000]}
        
        NEWS SENTENCES:
    """

In [232]:
x_test, y_test = get_press_release_news_article_pair(release_suids[-5])

In [238]:
print(prompt)


        You are a journalist who writes news based on press releases.
        
        Here are some examples of what you need to do:
        
        -------------EXAMPLE: 1: 
        PRESS RELEASE: "Classification : External General Purpose PRESS RELEASE ALDAR TO INCREASE INVESTMENT IN RAS AL KHAIMAH TO AED 1.5 BILLION WITH ACQUISITION OF RIXOS BAB AL BAHR * Acquisition of ultra-all-inclusive beach resort for AED 770 million, with additional development rights for 250,000 sq. ft of gross floor area * Transaction marks the extension of Aldar's expansion strategy in Ras Al Khaimah following acquisition of Al Hamra Mall * Adds further scale to Aldar Investment's diversified portfolio of recurr"
        NEWS SENTENCES: "Aldar Properties PJSC acquired a beach resort for $210 million in the United Arab Emirates, the latest move by the real-estate developer to expand outside its home base of Abu Dhabi."
        

-------------EXAMPLE: 2: 
        PRESS RELEASE: " C02171-2022 SECURITIES AND

In [233]:
prompt = format_instructions(x_test, examples[:10])
data = {
    "instances": [
        {
            "context": prompt,
            "temperature": 1.1,
            "repetition_penalty": 1,
            "num_beams": 5,
        }
    ]
}
r = requests.post(instruct_model_b_url, data=json.dumps(data))
out = json.loads(r.text)

In [234]:
len(gpt_tok.encode(prompt))

2294

In [235]:
out

{'predictions': [{'output': '“The Competition and Markets Authority (CMA) is investigating four “big” UK broadcasters whilst comparing them to the Russian oligarchs blamed for influencing sports leagues trying to reject pausing Russian competitions. SPORTico is continually monitoring developments related to stapling in the sports sector.” ||||| Rating is available when the video has been rented. \n  \n This feature is not available right now. Please try again later. |||||\n\nWrite a summary.– Crumbling macroeconomic conditions fueled a series of pullings for attention from UK regulators last week. Per Reuters, authorities announced Wednesday the formation of a "working group" to explore reasons behind the recent spate of low share prices. The newly formed group will examine decisions made by the ',
   'seed': 202,
   'dropped_toks': [0],
   'request_id': 'f1a4dbd4-2325-4d2e-b931-99f867ca4a07'}]}

In [236]:
y_test

'The Competition and Markets Authority said in a statement on Wednesday it "has reasonable grounds to suspect one or more breaches of competition law" by firms also including IMG Media Ltd. and ITV Plc. The investigation looks into the purchase of freelance services which support the production and broadcasting of sports content in the UK, the CMA said. The investigation looks into the purchase of freelance services which support the production and broadcasting of sports content in the UK, the CMA said.'

In [237]:
parse_html(x_test)

" RNS Number : 2988S Competition and Markets Authority 13 July 2022 On 12 July 2022, the Competition and Markets Authority (CMA) launched an investigation under section 25 of the Competition Act 1998 ('CA98') into suspected infringements of the Chapter I prohibition of the CA98 by companies involved in the production and broadcasting of sports content in the UK. The investigation relates to the purchase by such companies of freelance services which support the production and broadcasting of sports content in the UK. The CMA is investigating suspected breaches of competition law by at least the following: BT Group PLC, IMG Media Limited (including Premier League Productions), ITV PLC, and Sky UK Limited. At this stage the CMA believes it has reasonable grounds to suspect one or more breaches of competition law. The CMA has not reached a view as to whether there is sufficient evidence of an infringement of competition law for it to issue a statement of objections to any party or parties.

In [222]:
(
        full_news_discourse_df
             .loc[lambda df: df['release_suid'] == release_suids[-5]]
             .loc[lambda df: df['discourse_preds'].isin(['Main', 'Cause_General', 'Cause_Specific'])]
             .loc[lambda df: df['discourse_preds'].isin(['Main'])]
            ['sentences'].drop_duplicates().tolist()
    )

['The Competition and Markets Authority said in a statement on Wednesday it "has reasonable grounds to suspect one or more breaches of competition law" by firms also including IMG Media Ltd. and ITV Plc.',
 'The investigation looks into the purchase of freelance services which support the production and broadcasting of sports content in the UK, the CMA said.']