In [1]:
import pandas as pd

In [4]:
ls ../data/zomo-downloads/

2022-01-01__2022-02-01__release-data.xlsx
2022-01-01__2022-03-01__suids-rids.xlsx
2022-03-01__2022-05-01__suids-rids.xlsx
2022-05-01__2022-07-01__suids-rids.xlsx
2022-07-01__2022-09-01__suids-rids.xlsx
2022-09-01__2022-11-01__suids-rids.xlsx
2022-11-01__2023-02-01__suids-rids.xlsx
~$2022-01-01__2022-02-01__release-data.xlsx


In [20]:
from bs4 import BeautifulSoup

In [6]:
first_batch = pd.read_excel('../data/zomo-downloads/2022-01-01__2022-02-01__release-data.xlsx')

In [13]:
t = first_batch.loc[lambda df: df['release_body'].str.len() == 10_000]

In [23]:
soup = BeautifulSoup(t['release_body'].iloc[3]).get_text(' ')

In [83]:
from pyspark.sql.functions import array_contains, array_intersect, array, udf, size
from pyspark.sql.functions import lit , size, col
from pyspark.sql.types import BooleanType
import re 
from bloomberg.ai.librarian import Librarian, get_config
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import lit


def check_overlap(nicodes):
    to_exclude = [
        'BORGBV',
        'BORGFF',
        'CMPAUTO',
        'BORGDONE',
        'BORGSCND',
        'BORG2',
        'BORG2DON',
        'CYBORG',
        'HEADS'
    ]
    
    to_include = [
        'MGMTCHG', # Management Change
        'INI', # Company IPOs
        'SECONDBA', # secondary offerings
        'ERN', # Earnings cannot be fully automated, because we never know what companies will put in there
               # We have extraction to pull non-numerical numbers, it’s not job cuts that we look out for 
        'BORDONE', #: automated stories with human intervention 
        'JOBCUTS',  # This is hard, because companies use different wordings to use 
        'CREDITCG'
    ]
    
    nicode_vals = list(map(lambda x: x.nicode_val, nicodes))
    is_excluded = len(set(nicode_vals) & set(to_exclude)) == 0
    is_included = len(set(nicode_vals) & set(to_include)) > 0
    return is_excluded & is_included
    
check_overlap_udf = udf(f=check_overlap, returnType=BooleanType())

In [None]:
config = get_config()
config.spark.properties["spark.executor.instances"] = 20
config.spark.properties["spark.executor.memory"] = "4G"
config.spark.properties["spark.driver.memory"] = "6G"

librarian = Librarian(cfg=config)
spark = librarian.spark_session

In [88]:
wires_to_include = [
    284, 1883, 12, 39, 1814,
    809, 89, 733, 1481, 324,
    1883, 1923, 1814, 1925, 
    1884, 1886, 835, 1924, 921,
    1169, 529, 1926, 412,
    831, 1897, 1887, 927
]

In [3]:
newsarchive_df = (
    librarian
        .datasets
        .newsarchive
        .prod()
        .pvf_newsldap_4()
        .pvf_newsldap_6()
        .begin(2023, 4, 1)
        .end(2023, 4, 30)
    .fetch()
)
newsarchive_df = newsarchive_df.filter(newsarchive_df.bloom_lang == 1)

23/06/28 16:34:39 WARN kns=aspangher appId=spark-application-1687970047226 execId=driver DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
23/06/28 16:34:49 WARN kns=aspangher appId=spark-application-1687970047226 execId=driver package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [49]:
press_release_filtered_df = (
    newsarchive_df
        .filter(newsarchive_df.wire.isin(wires_to_include))
        .select(['suid', 'wire', 'headline', 'body', 'timeofarrival', 'bunch_id'])
)

In [70]:
newsarchive_with_press_releases_df = (newsarchive_df
   .join(press_release_filtered_df.select('bunch_id'), on='bunch_id', how='left')
   .select(['suid', 'wire', 'headline', 'body', 'timeofarrival', 'bunch_id', 'nicodes'])
)

newsarchive_with_press_releases_df = (
    newsarchive_with_press_releases_df
        .filter(newsarchive_with_press_releases_df.wire.isin([25, 2345]))
)

beat_articles_df = (
    newsarchive_with_press_releases_df
     .filter(check_overlap_udf(newsarchive_with_press_releases_df.nicodes))
)

In [116]:
press_release_filtered_df.columns

['suid', 'wire', 'headline', 'body', 'timeofarrival', 'bunch_id']

In [117]:
beat_articles_df.columns

['suid', 'wire', 'headline', 'body', 'timeofarrival', 'bunch_id', 'nicodes']

In [118]:
joined_df = (
    press_release_filtered_df
    .withColumnRenamed('suid', 'press_release_suid')
    .withColumnRenamed('wire', 'press_release_wire')
    .withColumnRenamed('headline', 'press_release_headline')
    .withColumnRenamed('body', 'press_release_body')
    .withColumnRenamed('timeofarrival', 'press_release_datetime')    
    .join(
        beat_articles_df
            .withColumnRenamed('suid', 'news_article_suid')
            .withColumnRenamed('wire', 'news_article_wire')
            .withColumnRenamed('headline', 'news_article_headline')
            .withColumnRenamed('body', 'news_article_body')
            .withColumnRenamed('timeofarrival', 'news_article_datetime')
            .withColumnRenamed('nicodes', 'news_article_nicodes')        
        , on='bunch_id', how='inner')
)

In [128]:
RemoteIO.ls('hdfs://POB2/user/aspangher/')

[]

In [134]:
partition_count = 1
file_format = 'json'
output_dir = 'file:///test.json'
compression = "none"
(joined_df
     .repartition(partition_count)
     .write.mode("overwrite")
     .format(file_format)
     .option("compression", compression)
     .save(output_dir)
)

23/06/29 18:14:36 INFO kns=aspangher appId=spark-application-1687970047226 execId=driver FileOutputCommitter: File Output Committer Algorithm version is 1
23/06/29 18:14:36 INFO kns=aspangher appId=spark-application-1687970047226 execId=driver FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
23/06/29 18:14:36 ERROR kns=aspangher appId=spark-application-1687970047226 execId=driver FileOutputCommitter: Mkdirs failed to create file:/test.json/_temporary/0
23/06/29 18:15:04 WARN kns=aspangher appId=spark-application-1687970047226 execId=driver TaskSetManager: Lost task 0.0 in stage 200.0 (TID 11812, 10.208.157.244, executor 11): java.io.IOException: Mkdirs failed to create file:/test.json/_temporary/0/_temporary/attempt_202306291815043152790991228277009_0200_m_000000_11812 (exists=false, cwd=file:/workspace)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:456)
	at org.apache.hado

Py4JJavaError: An error occurred while calling o680.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:231)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:178)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:126)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:962)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:962)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:414)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:398)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:287)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 200.0 failed 4 times, most recent failure: Lost task 0.3 in stage 200.0 (TID 11815, 10.208.157.251, executor 13): java.io.IOException: Mkdirs failed to create file:/test.json/_temporary/0/_temporary/attempt_202306291815046863977987715006190_0200_m_000000_11815 (exists=false, cwd=file:/workspace)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:456)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:441)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1067)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1048)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:937)
	at org.apache.spark.sql.execution.datasources.CodecStreams$.createOutputStream(CodecStreams.scala:81)
	at org.apache.spark.sql.execution.datasources.CodecStreams$.createOutputStreamWriter(CodecStreams.scala:92)
	at org.apache.spark.sql.execution.datasources.json.JsonOutputWriter.<init>(JsonOutputWriter.scala:47)
	at org.apache.spark.sql.execution.datasources.json.JsonFileFormat$$anon$1.newInstance(JsonFileFormat.scala:83)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.newOutputWriter(FileFormatDataWriter.scala:126)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.<init>(FileFormatDataWriter.scala:111)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:269)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:210)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:467)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:470)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2143)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:200)
	... 32 more
Caused by: java.io.IOException: Mkdirs failed to create file:/test.json/_temporary/0/_temporary/attempt_202306291815046863977987715006190_0200_m_000000_11815 (exists=false, cwd=file:/workspace)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:456)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:441)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1067)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1048)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:937)
	at org.apache.spark.sql.execution.datasources.CodecStreams$.createOutputStream(CodecStreams.scala:81)
	at org.apache.spark.sql.execution.datasources.CodecStreams$.createOutputStreamWriter(CodecStreams.scala:92)
	at org.apache.spark.sql.execution.datasources.json.JsonOutputWriter.<init>(JsonOutputWriter.scala:47)
	at org.apache.spark.sql.execution.datasources.json.JsonFileFormat$$anon$1.newInstance(JsonFileFormat.scala:83)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.newOutputWriter(FileFormatDataWriter.scala:126)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.<init>(FileFormatDataWriter.scala:111)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:269)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:210)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:467)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:470)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [119]:
joined_df.count()

                                                                                

88

In [72]:
joined_df.groupby('bunch_id').count().orderBy('count', ascending=False).limit(1000).show()



+------------+-----+
|    bunch_id|count|
+------------+-----+
|RTPVSZMB2SK6|    3|
|RTRDQ8MB2SJQ|    2|
|RSL8LQMB2SJR|    2|
|RSXDJNMB2SJK|    1|
|RSIP9WMB2SJN|    1|
|RSJ7YEMB2SJL|    1|
|RSMX7JMB2SJS|    1|
|RSXHOTNL4AO2|    1|
|RSYCFABLKPOJ|    1|
|RTG2DXMB2SJS|    1|
|RTRNO1MB2SKX|    1|
|RSOPRYMB2SJP|    1|
|RT02ZYMB2SJO|    1|
|RT3OEMMB2SJN|    1|
|RTT99WDWRGG0|    1|
|RT0962MB2SJT|    1|
|RTBWP0MB2SJN|    1|
|RTCMYBMB2SJP|    1|
|RTCPSUMB2SK7|    1|
|RTT4G2MB2SJZ|    1|
+------------+-----+
only showing top 20 rows



                                                                                

In [75]:
joined_pandas_df = (
        joined_df
        .filter(
            joined_df
                .bunch_id
                .isin(joined_df.groupby('bunch_id')
                      .count()
                      .orderBy('count', ascending=False)
                      .limit(1)
                      .bunch_id
                     )
        )
        .limit(1000)
        .toPandas()
)

                                                                                ]

In [76]:
one_df = joined_pandas_df.loc[lambda df: df['bunch_id'] == df['bunch_id'].value_counts().index[0]]

In [95]:
from bloomberg.ai.remoteio import RemoteIO

In [113]:
RemoteIO.ls('hdfs://POB2-GEN/user/aspangher/')

[]

In [54]:
from bs4 import BeautifulSoup

In [40]:
press_releases = newsarchive_with_press_releases_df.limit(200).toPandas()

                                                                                

In [9]:
joined_df

Unnamed: 0,bunch_id,suid,wire,headline,body,timeofarrival,nicodes,suid.1,wire.1,headline.1,body.1,timeofarrival.1,nicodes.1
0,RL7AWWT1UM0Y,RL7AWWT1UM0Y,25,Water Theft Proves Lucrative in a Dangerously ...,"<sections>\n<section class=""news-rsf-text-head...",2023-04-20T13:00:00.003,"[(CMD, None, False, False, False, False, 0, No...",RL7AWWT1UM0Y,25,Water Theft Proves Lucrative in a Dangerously ...,"<sections>\n<section class=""news-rsf-text-head...",2023-04-20T13:00:00.003,"[(CMD, None, False, False, False, False, 0, No..."
1,RQ001LT1UM0W,RTGJ4AT0G1KX,25,China Central Bank Hints It Will Dial Back Pan...,"<sections>\n<section class=""news-rsf-text-head...",2023-04-21T09:02:15.535,"[(EMTOPZ4, None, False, False, False, False, 4...",RTGJ4AT0G1KX,25,China Central Bank Hints It Will Dial Back Pan...,"<sections>\n<section class=""news-rsf-text-head...",2023-04-21T09:02:15.535,"[(EMTOPZ4, None, False, False, False, False, 4..."
2,RQ001LT1UM0W,RTGJ4AT0G1KX,25,China Central Bank Hints It Will Dial Back Pan...,"<sections>\n<section class=""news-rsf-text-head...",2023-04-21T09:02:15.535,"[(EMTOPZ4, None, False, False, False, False, 4...",RQ001LT1UM0W,25,China Central Bank Signals It Will Dial Back P...,"<sections>\n<section class=""news-rsf-text-head...",2023-04-21T07:10:25.223,"[(ECOCURZ3, None, False, False, False, False, ..."
3,RQ001LT1UM0W,RQ001LT1UM0W,25,China Central Bank Signals It Will Dial Back P...,"<sections>\n<section class=""news-rsf-text-head...",2023-04-21T07:10:25.223,"[(ECOCURZ3, None, False, False, False, False, ...",RTGJ4AT0G1KX,25,China Central Bank Hints It Will Dial Back Pan...,"<sections>\n<section class=""news-rsf-text-head...",2023-04-21T09:02:15.535,"[(EMTOPZ4, None, False, False, False, False, 4..."
4,RQ001LT1UM0W,RQ001LT1UM0W,25,China Central Bank Signals It Will Dial Back P...,"<sections>\n<section class=""news-rsf-text-head...",2023-04-21T07:10:25.223,"[(ECOCURZ3, None, False, False, False, False, ...",RQ001LT1UM0W,25,China Central Bank Signals It Will Dial Back P...,"<sections>\n<section class=""news-rsf-text-head...",2023-04-21T07:10:25.223,"[(ECOCURZ3, None, False, False, False, False, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,RT0V2XDWX2PS,RT242YDWRGG3,25,Treasury Investors Ready for a Slow Grinding R...,"<sections>\n<section class=""news-rsf-abstract""...",2023-04-13T13:56:58.864,"[(GLOBFEAT, None, False, False, False, False, ...",RT1CWET1UM13,25,China's Big Trade Surprise Cushions Tech Weakn...,"<sections>\n<section class=""news-rsf-abstract""...",2023-04-13T04:09:50.762,"[(GLOBFEAT, None, False, False, False, False, ..."
996,RT0V2XDWX2PS,RT242YDWRGG3,25,Treasury Investors Ready for a Slow Grinding R...,"<sections>\n<section class=""news-rsf-abstract""...",2023-04-13T13:56:58.864,"[(GLOBFEAT, None, False, False, False, False, ...",RT2H7KDWX2PS,25,JPMorgan Back-to-Office Call Shows Real Estate...,"<sections>\n<section class=""news-rsf-abstract""...",2023-04-13T18:40:32.058,"[(GLOBFEAT, None, False, False, False, False, ..."
997,RT0V2XDWX2PS,RT242YDWRGG3,25,Treasury Investors Ready for a Slow Grinding R...,"<sections>\n<section class=""news-rsf-abstract""...",2023-04-13T13:56:58.864,"[(GLOBFEAT, None, False, False, False, False, ...",RT208YDWX2PW,25,Treasury Yields Test Session Lows on US PPI Da...,"<sections>\n<section class=""news-rsf-abstract""...",2023-04-13T12:34:10.585,"[(GLOBFEAT, None, False, False, False, False, ..."
998,RT0V2XDWX2PS,RT242YDWRGG3,25,Treasury Investors Ready for a Slow Grinding R...,"<sections>\n<section class=""news-rsf-abstract""...",2023-04-13T13:56:58.864,"[(GLOBFEAT, None, False, False, False, False, ...",RT20W8DWRGG1,25,Dollar Approaches Key Area Signaling More Decl...,"<sections>\n<section class=""news-rsf-abstract""...",2023-04-13T12:48:08.737,"[(GLOBFEAT, None, False, False, False, False, ..."


In [86]:
additional_wires = [
    'SPC', # Wire for S&P releases
    'FII', # Fitch releases 
    'EDG', # 
]

In [87]:
wire_code_df.loc[lambda df: df['abbreviation'].isin(additional_wires)]

Unnamed: 0,wire_code,abbreviation,location,full_name
5,284,EDG,FILINGS,Edgar SEC-Online


In [None]:
import pandas as pd 
wire_code_df = pd.read_csv('wire-codes-with-columns.csv', index_col=0)
_ = wire_code_df.loc[lambda df: df['full_name'].str.contains('First Word')]

In [None]:
t = (newsarchive_with_press_releases_df
     .filter(check_overlap_udf(newsarchive_with_press_releases_df.nicodes))
     .limit(100)
     .toPandas()
)
(t['nicodes']
 .apply(lambda x: list(map(lambda y: y.nicode_val, x)))
 .str.join(' ')
 .str.split(' ', expand=True)
 .unstack().dropna().value_counts()
 .head(20)
)

NWSKAFNQ     100
BNALL        100
2500         100
BIZNEWS       98
FINNEWS       96
BUSINESS      96
READ25        95
NCAS          91
READ          91
READ50        91
ALLTOP        90
READ100       83
MSCIWORLD     81
WWTOP         78
READ150       77
ONWEB         72
WWTOPFEAT     72
COS           69
READ250       66
TOP           64
dtype: int64

In [152]:
from bs4 import BeautifulSoup
import re 
from unidecode import unidecode

In [157]:
soup = BeautifulSoup(t['body'].iloc[2])

In [158]:
print(re.sub(' +', ' ', unidecode(soup.get_text(' ').strip())))

Bank Turmoil Seen Crimping Credit at Double Powell's Estimate 
 Bank Turmoil Seen Crimping Credit at Double Powell's Estimate 
 Most economists see half-point hike impact or more from banks Consensus sees recession coming, likely this quarter or next 
 By Steve Matthews and Sarina Yoo (Bloomberg) -- 
 US bank stress will tighten credit by twice as much as expected by Federal Reserve Chair Jerome Powell , said economists surveyed by Bloomberg, tipping the economy into recession. Almost all of the economists expect the Federal Open Market Committee to hike interest rates another quarter percentage point at its May 2-3 meeting, to a target range of 5% to 5.25%. But the higher borrowing costs will be amplified by the fallout from the March collapse of two US banks, which a majority of the economists found to be equivalent to a Fed hike of about half a percentage point or more. Powell has estimated the impact at roughly a quarter point. "Inflation remains unacceptably high, but banking stre

In [13]:
bunch_ids_with_at_least_two_rows = (
    newsarchive_df
        .join(filtered_df.select('bunch_id'), on='bunch_id', how='left')
        .distinct()
        .groupBy("bunch_id")
        .count()
        .filter("`count` >= 2")
        .select(["bunch_id"])
)

In [14]:
bunch_ids_with_at_least_two_rows = bunch_ids_with_at_least_two_rows.rdd.map(lambda x :  x[0]).collect()
# print(f"len(bunch_ids_with_at_least_two_rows) = {len(bunch_ids_with_at_least_two_rows)}")

                                                                                

In [10]:
df_filted_by_bunch_id = (
    newsarchive_df.filter(newsarchive_df.bunch_id.isin(bunch_ids_with_at_least_two_rows))
        .sort('bunch_id')
        .select(['suid', 'wire', 'headline', 'body', 'bloom_lang', 'timeofarrival', 'bunch_id'])
)
# df_filted_by_bunch_id = df_filted_by_bunch_id.filter(df_filted_by_bunch_id.wire.isin(

AttributeError: 'PipelinedRDD' object has no attribute '_get_object_id'

In [23]:
df_filted_by_bunch_id.count()

                                                                                

192

In [24]:
df_filted_by_bunch_id.show()



+------------+-------------+----+--------------------+----+----------+--------------------+------------+
|        suid|wire_mnemonic|wire|            headline|body|bloom_lang|       timeofarrival|    bunch_id|
+------------+-------------+----+--------------------+----+----------+--------------------+------------+
|QMJGLKMB2SJQ|          GO1|1883|FDA: Abbreviated ...|    |         1|2021-01-07T00:41:...|QMJGLKMB2SJQ|
|QMJGLKMB2SJQ|          GO1|1883|FDA: Abbreviated ...|    |         1|2021-01-07T00:41:...|QMJGLKMB2SJQ|
|QMJU2BMB2SJW|          GO6|1884|India Res Bank: U...|    |         1|2021-01-07T05:32:...|QMJU2BMB2SJW|
|QMJU2BMB2SJW|          GO6|1884|India Res Bank: U...|    |         1|2021-01-07T05:32:...|QMJU2BMB2SJW|
|QMJY46MB2SJU|          GO5|1886|Germany Stats: Pr...|    |         1|2021-01-07T07:00:...|QMJY46MB2SJU|
|QMJY46MB2SJU|          GO5|1886|Germany Stats: Pr...|    |         1|2021-01-07T07:00:...|QMJY46MB2SJU|
|QMK0XCMB2SJZ|          GO5|1886|Stats Austria: Wh...| 

                                                                                

In [None]:
final_df = df_filted_by_bunch_id.withColumn("row_number",F.row_number().over(Window.partitionBy(df_filted_by_bunch_id.headline).orderBy(df_filted_by_bunch_id.headline.desc()))).filter(F.col("row_number")==1).drop("row_number").sort("bunch_id")

In [31]:
import pandas as pd 

# Format newsdiscourse data

In [35]:
newsdiscourse_df = (
    pd.read_csv('../models/discourse-model/data/news-discourse-training-data.csv')
        .assign(s_id=lambda df: df['s_id'].str.replace('S', '').astype(int))
)

In [48]:
newsdiscourse_df = (
    newsdiscourse_df
     .sort_values(['name', 's_id'])
     .assign(split=lambda df: df['file'].str.split('/').str.get(4))
     .assign(event_tag=lambda df: df['event_tag'].fillna('Error'))
     .groupby('name')
     [['sentence', 'event_tag', 'split']]
     .aggregate(list)
     .assign(split=lambda df: df['split'].str.get(0) )
)

In [53]:
(newsdiscourse_df
 .to_json(
     path_or_buf='../models/discourse-model/data/news-discourse-data.jsonl',
     orient='records',
     lines=True
 )
)