In [1]:
# ! pip install pandas==1.5.3
# ! pip install numpy==1.23.1

In [2]:
import pandas as pd 
from pyspark.sql.functions import array_contains, array_intersect, array, udf, size
from pyspark.sql.functions import lit , size, col
from pyspark.sql.types import BooleanType
import re 
from bloomberg.ai.librarian import Librarian, get_config
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import lit
import pandas as pd
from bs4 import BeautifulSoup


def check_overlap(nicodes, require_included=False):
    """Decide whether an article should be kept, based on its NI codes.

    Args:
        nicodes: iterable of objects exposing a ``nicode_val`` attribute
            (the elements of the ``nicodes`` column), or ``None``.
        require_included: when True, the article must also carry at least one
            code from ``to_include``. Defaults to False, which reproduces the
            previous effective behaviour: the include check used to be
            computed and then unconditionally overwritten with ``True``.

    Returns:
        bool: False if any code is in ``to_exclude``; otherwise True, unless
        ``require_included`` is set and no code is in ``to_include``.
    """
    to_exclude = {
        # 'BORGBV',
        # 'BORGFF',
        # 'CMPAUTO',
        # 'BORGDONE',
        # 'BORGSCND',
        # 'BORG2',
        # 'BORG2DON',
        # 'CYBORG',
        'HEADS',
    }

    to_include = {
        'MGMTCHG',   # Management change
        'INI',       # Company IPOs
        'SECONDBA',  # Secondary offerings
        'ERN',       # Earnings cannot be fully automated, because we never know what
                     # companies will put in there. We have extraction to pull
                     # non-numerical numbers; it's not job cuts that we look out for.
        # NOTE(review): 'BORDONE' may be a typo for 'BORGDONE' (the spelling used in
        # the exclude list above) - confirm before enabling require_included.
        'BORDONE',   # Automated stories with human intervention
        'JOBCUTS',   # This is hard, because companies use different wordings
        'CREDITCG',
    }

    # A null column value reaches a Python UDF as None; treat it as "no codes"
    # instead of raising inside the Spark job.
    nicode_vals = {code.nicode_val for code in (nicodes or [])}

    if not nicode_vals.isdisjoint(to_exclude):
        return False
    if require_included:
        return not nicode_vals.isdisjoint(to_include)
    return True
    
# Wrap check_overlap as a Spark UDF that yields a boolean column.
check_overlap_udf = udf(check_overlap, BooleanType())

In [None]:
# Set the Spark sizing properties on the config before handing it to Librarian.
config = get_config()
for spark_key, spark_value in {
    "spark.executor.instances": 30,
    "spark.executor.memory": "20G",
    "spark.driver.memory": "20G",
}.items():
    config.spark.properties[spark_key] = spark_value

librarian = Librarian(cfg=config)
spark = librarian.spark_session

# Fetch the prod news archive for 2022-01-01 .. 2023-01-30 and keep only rows
# whose bloom_lang equals 1 (NOTE(review): presumably English - confirm).
newsarchive_df = (
    librarian.datasets.newsarchive
        .prod()
        .pvf_newsldap_4()
        .pvf_newsldap_6()
        .begin(2022, 1, 1)
        .end(2023, 1, 30)
        .fetch()
        .filter(col("bloom_lang") == 1)
)

In [9]:
# Load the SUID mapping from the gzipped CSV (first column used as the index)
# and drop every row that has a missing value.
suid_rid_map = pd.read_csv('all_suid_rid_maps.csv.gz', index_col=0).dropna()

In [10]:
# Convert the pandas mapping into a Spark DataFrame so it can be joined
# against the news archive.
suid_rid_map_df = spark.createDataFrame(suid_rid_map) 

  for column, series in pdf.iteritems():


In [40]:
# press_release_na_df = (
#     newsarchive_df
#        .join(suid_rid_map_df, suid_rid_map_df.release_suid==newsarchive_df.suid)
#        .select(['story_suid', 'suid', 'wire', 'headline', 'body', 'timeofarrival', 'bunch_id', 'nicodes'])
# )

In [11]:
# Match each archive row to the mapping on story_suid == suid, then keep the
# release id together with the article fields used downstream.
full_newsarchive_df = newsarchive_df.join(
    suid_rid_map_df,
    on=suid_rid_map_df.story_suid == newsarchive_df.suid,
).select(
    'release_suid', 'suid', 'wire', 'headline',
    'body', 'timeofarrival', 'bunch_id', 'nicodes',
)

In [12]:
# Restrict to wires 25 and 2345, then apply the NI-code filter.
beat_articles_df = (
    full_newsarchive_df
        .where(col('wire').isin(25, 2345))
        .where(check_overlap_udf(col('nicodes')))
)

In [13]:
# Number of articles remaining after the wire and NI-code filters.
beat_articles_df.count()

23/07/03 19:52:11 WARN kns=aspangher appId=spark-application-1688413485984 execId=driver ExecutorPodsAllocator: Executors with ids 51,52,53,54,55 were not detected in the Kubernetes cluster after 60000 ms despite the fact that a previous allocation attempt tried to create them. The executors may have been deleted but the application missed the deletion event.
                                                                                

102294

In [15]:
# Output settings: one gzip-compressed JSON part file under output_dir.
output_dir = 'hdfs://DOB2-GEN/user/aspangher/press-release-newsarchive/beat-article-limited.json'
file_format = 'json'
compression = "gzip"
partition_count = 1

# Replaces any existing output at output_dir.
beat_articles_df.repartition(partition_count).write.save(
    path=output_dir,
    format=file_format,
    mode="overwrite",
    compression=compression,
)

23/07/03 19:52:44 INFO kns=aspangher appId=spark-application-1688413485984 execId=driver FileOutputCommitter: File Output Committer Algorithm version is 1
23/07/03 19:52:44 INFO kns=aspangher appId=spark-application-1688413485984 execId=driver FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
23/07/03 19:53:12 WARN kns=aspangher appId=spark-application-1688413485984 execId=driver ExecutorPodsAllocator: Executors with ids 56,57,58,59,60 were not detected in the Kubernetes cluster after 60000 ms despite the fact that a previous allocation attempt tried to create them. The executors may have been deleted but the application missed the deletion event.
23/07/03 19:54:13 WARN kns=aspangher appId=spark-application-1688413485984 execId=driver ExecutorPodsAllocator: Executors with ids 61,62,63,64,65 were not detected in the Kubernetes cluster after 60000 ms despite the fact that a previous allocation attempt tri

In [16]:
# ! pip install bloomberg.ai.remoteio
from bloomberg.ai.remoteio import RemoteIO

In [19]:
# List what was written to the HDFS output location (same path as output_dir).
RemoteIO.ls('hdfs://DOB2-GEN/user/aspangher/press-release-newsarchive/beat-article-limited.json')

['hdfs:///user/aspangher/press-release-newsarchive/beat-article-limited.json/_SUCCESS',
 'hdfs:///user/aspangher/press-release-newsarchive/beat-article-limited.json/part-00000-699a8040-6a06-4664-ae7d-4e40d06da694-c000.json.gz']

In [None]:
import os

from bloomberg.ai.remoteio import BCSConfig

# Credentials are read from the environment so they never live in the notebook.
# Export BCS_ACCESS_KEY and BCS_SECRET_KEY before starting the kernel; a missing
# variable raises KeyError here instead of failing later during the transfer.
# NOTE(review): the key pair that was previously hardcoded in this cell has been
# committed and should be treated as leaked - rotate it.
bcs_cfg = BCSConfig(
    endpoint_url="http://s3.dev.obdc.bcs.bloomberg.com",
    access_key=os.environ["BCS_ACCESS_KEY"],
    secret_key=os.environ["BCS_SECRET_KEY"],
    proxy=None
)

In [27]:
# Copy the HDFS output directory to the S3 location, passing bcs_cfg as the
# storage config for the destination side.
RemoteIO.transfer(
    'hdfs://DOB2-GEN/user/aspangher/press-release-newsarchive/beat-article-limited.json',
    's3://aspangher/press-release-newsarchive/beat-article-limited',
    to_storage_config=bcs_cfg
)

23/07/03 20:11:35 WARN kns=aspangher appId=spark-application-1688413485984 execId=driver ExecutorPodsAllocator: Executors with ids 146,147,148,149,150 were not detected in the Kubernetes cluster after 60000 ms despite the fact that a previous allocation attempt tried to create them. The executors may have been deleted but the application missed the deletion event.
23/07/03 20:12:37 WARN kns=aspangher appId=spark-application-1688413485984 execId=driver ExecutorPodsAllocator: Executors with ids 151,152,153,154,155 were not detected in the Kubernetes cluster after 60000 ms despite the fact that a previous allocation attempt tried to create them. The executors may have been deleted but the application missed the deletion event.
23/07/03 20:13:38 WARN kns=aspangher appId=spark-application-1688413485984 execId=driver ExecutorPodsAllocator: Executors with ids 156,157,158,159,160 were not detected in the Kubernetes cluster after 60000 ms despite the fact that a previous allocation attempt trie

In [43]:
# Pull a tiny sample to the driver as a pandas DataFrame for manual inspection.
# Despite the name, this holds up to two rows.
one_article = beat_articles_df.limit(2).toPandas()

In [45]:
# Parse the body of the first sampled article. No parser is named here, so
# BeautifulSoup chooses one itself.
soup = BeautifulSoup(one_article['body'].iloc[0])

In [49]:
# Attribute dict of the first <a> tag in the parsed body.
soup.find_all('a')[0].attrs

{'href': 'bbg://securities/MRCO%20IN%20Equity',
 'title': 'Company Overview',
 'itemscope': 'itemscope',
 'itemprop': 'StoryLink'}

In [53]:
# Text content of the first <a> tag in the parsed body.
soup.find_all('a')[0].get_text()

'Marico'

23/07/03 20:20:47 WARN kns=aspangher appId=spark-application-1688413485984 execId=driver ExecutorPodsAllocator: Executors with ids 191,192,193,194,195 were not detected in the Kubernetes cluster after 60000 ms despite the fact that a previous allocation attempt tried to create them. The executors may have been deleted but the application missed the deletion event.


In [7]:
from pyspark.sql.types import ArrayType, StructType, StructField, StringType

def get_hyperlinks(body):
    """Extract every ``<a>`` tag that has a non-empty ``href`` from a markup string.

    Args:
        body: markup text of one article; may be ``None`` or empty.

    Returns:
        list[dict]: one dict per link, holding a copy of the tag's attributes
        plus a ``text`` key with the stripped link text. An empty list is
        returned when ``body`` is ``None`` or empty.
    """
    # Nothing to parse; also keeps null rows from raising inside the UDF.
    if not body:
        return []

    # bs4 is imported lazily because this function runs on Spark workers.
    # Install it only when the import actually fails, instead of shelling out
    # to pip on every single call (i.e. once per row).
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        import importlib
        import os
        os.system("pip install beautifulsoup4")
        # Make the freshly installed package visible to the import system.
        importlib.invalidate_caches()
        from bs4 import BeautifulSoup

    # Name the parser explicitly so every worker parses the markup the same
    # way regardless of which optional parsers happen to be installed.
    soup = BeautifulSoup(body, "html.parser")

    output = []
    for a in soup.find_all('a'):
        if not a.attrs.get('href'):
            continue
        # Copy so that adding 'text' does not mutate the parsed tag itself.
        attrs = dict(a.attrs)
        attrs['text'] = a.get_text().strip()
        output.append(attrs)

    return output
    
# Each extracted hyperlink is returned as a struct of string fields; the UDF
# yields an array of those structs.
_hyperlink_fields = ['href', 'title', 'itemscope', 'itemprop', 'text']
get_hyperlinks_udf = udf(
    get_hyperlinks,
    ArrayType(
        StructType([StructField(name, StringType()) for name in _hyperlink_fields])
    ),
)

In [4]:
# All archive rows on wire 25 that pass the NI-code filter (no SUID-map join).
full_beat_df = (
    newsarchive_df
        .where(F.col('wire') == 25)
        .where(check_overlap_udf(F.col('nicodes')))
)

In [None]:
# Same wire / NI-code filter as full_beat_df, capped at 2000 rows; extract the
# hyperlinks of each body and collect the result to the driver as pandas.
hyperlinks_df = (
    newsarchive_df
        .where(F.col('wire') == 25)
        .where(check_overlap_udf(F.col('nicodes')))
        .limit(2000)
        .select(get_hyperlinks_udf(F.col('body')).alias('hyperlinks'))
        .toPandas()
)

