In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cwd = "/content/drive/MyDrive/NewsTrading/trading_bot"
%cd /content/drive/MyDrive/NewsTrading/trading_bot

/content/drive/MyDrive/NewsTrading/trading_bot


In [3]:
!pip install html2text
!pip install datefinder
!pip install -U dask[complete]

Collecting html2text
  Downloading html2text-2020.1.16-py3-none-any.whl (32 kB)
Installing collected packages: html2text
Successfully installed html2text-2020.1.16
Collecting datefinder
  Downloading datefinder-0.7.3-py2.py3-none-any.whl (10 kB)
Installing collected packages: datefinder
Successfully installed datefinder-0.7.3
Collecting dask[complete]
  Downloading dask-2023.9.2-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting lz4>=4.3.2 (from dask[complete])
  Downloading lz4-4.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting distributed==2023.9.2 (from dask[complete])
  Downloading distributed-2023.9.2-py3-none-any.whl (994 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m994.9/994.9 kB[0m [31m20.1 MB/s[0m et

In [4]:
import ast

import dask
import dask.dataframe as dd
import pandas as pd
from dask.distributed import Client
from src.preprocessing.news_parser import filter_body, time, body_formatter

In [5]:
# client = Client(memory_limit='25GB', processes=False,
#                 n_workers=2, threads_per_worker=1)
# client

In [6]:
# Set dask's scheduler configuration to "threads" for the computations that follow.
dask.config.set(scheduler="threads")

<dask.config.set at 0x7aeed9dc6230>

In [7]:
# Directories for the HTML-to-text step: raw input and converted output.
input_dir, output_dir = "data/raw_bzg/", "data/unraw1_bzg/"

Als erstes müssen wir die HTML-Dokumente zu normalem Text umwandeln, ansonsten sind die Text-Zellen zu groß und führen zu Problemen mit PyArrow/Dask.

In [8]:
# for year in range(2019, 2020):
#     print(year)
#     df = pd.read_parquet(f"{input_dir}story_df_raw_{year}.parquet")
#     df = dd.from_pandas(df, npartitions=12)
#     df["html_body"] = df["html_body"].apply(body_formatter, meta=pd.Series(dtype="str"))
#     df = df.rename(columns={"html_body":"body"})
#     name_function = lambda x: f"data-{year}-{x}.parquet"
#     df.to_parquet(output_dir, name_function=name_function)

Daten neu partitionieren, sodass alle Partitionen etwa die gleiche Größe haben.

In [9]:
# Directories for the repartitioning step: converted text in, repartitioned data out.
input_dir, output_dir = "data/unraw1_bzg/", "data/unraw2_bzg/"

# ddf = dd.read_parquet(input_dir+"*.parquet")
# ddf2 = ddf.repartition(npartitions=50)
# name_function = lambda x: f"data-{x}.parquet"
# ddf2.to_parquet(output_dir, name_function=name_function)

Ein bisschen die Daten säubern...

In [41]:
# Absolute directories for the cleaning step, anchored at the project root.
input_dir = f"{cwd}/data/unraw2_bzg/"
output_dir = f"{cwd}/data/unraw3_bzg/"

In [42]:
# Lazily load every parquet file in the input directory as one dask DataFrame.
parquet_glob = f"{input_dir}*.parquet"
ddf = dd.read_parquet(parquet_glob)

In [43]:
# Drop rows for which no stock ticker is recorded (empty `stocks` string).
has_ticker = ddf["stocks"] != ""
ddf = ddf[has_ticker]

In [44]:
# Convert `channels` from its string representation back to a Python object (list).
# ast.literal_eval only accepts Python literals, so unlike eval it cannot execute
# code that might be embedded in the stored data.
ddf["channels"] = ddf["channels"].apply(ast.literal_eval, meta=pd.Series(dtype='object'))

Untersuche als nächstes die Behauptung, dass **PRNewswire** und **Businesswire** den gesamten Markt für Pressemeldungen in den USA kontrollieren. Wenn dem so ist, und sie nicht noch weitere, unwichtige Meldungen veröffentlichen, dann können wir einfach die Newsartikel nach diesen Autoren filtern und uns viel Arbeit ersparen.

In [45]:
import re

In [46]:
# Switch dask's scheduler configuration to "processes" for the computations that follow.
dask.config.set(scheduler="processes")

def infer_author(body):
  """Return the first known newswire whose name occurs in ``body``.

  The candidates are checked in a fixed order (PRNewswire, Globe Newswire,
  Business Wire, ACCESSWIRE) using a case-insensitive search. When several
  names occur in the text, the one listed first wins, regardless of where it
  appears in the text.

  Args:
    body: Article text. Non-string values (e.g. None or NaN for a missing
      body) are treated as "no author found" instead of raising a TypeError.

  Returns:
    The matching newswire name exactly as spelled in the candidate list, or
    None if none of the candidates occurs in the text.
  """
  if not isinstance(body, str):
    return None
  for author in ("PRNewswire", "Globe Newswire", "Business Wire", "ACCESSWIRE"):
    # re.escape keeps the name a literal match even if a future entry
    # contains regex metacharacters.
    if re.search(re.escape(author), body, re.IGNORECASE) is not None:
      return author
  return None

# Tag every article with the newswire name found in its body text.
inferred_author_meta = pd.Series(dtype="string")
ddf["inferred_author"] = ddf["body"].apply(infer_author, meta=inferred_author_meta)

In [48]:
# Top-10 value counts of the recorded author and of the inferred author, side by side.
# The variable name (including its spelling) is kept because later cells refer to it.
top_recorded_authors = ddf["author"].value_counts().head(10)
top_inferred_authors = ddf["inferred_author"].value_counts().head(10)
auhtor_value_counts = pd.concat([top_recorded_authors, top_inferred_authors], axis=1)

In [50]:
# Display the author / inferred_author comparison table.
auhtor_value_counts

Unnamed: 0,author,inferred_author
Benzinga,1061214,
PRNewswire,305720,587242.0
Globe Newswire,293466,475171.0
Business Wire,268561,293052.0
Newsfile,70877,
ACCESSWIRE,62615,81054.0
"AB Digital, Inc.",9936,
WebWire,6404,
PRWeb,2617,
News Direct,2080,


In [49]:
# Sum each column, then take the difference between consecutive column totals
# (inferred_author total minus author total).
auhtor_value_counts.sum().diff()

author                  NaN
inferred_author   -646971.0
dtype: float64

Ungefähr 650k Nachrichten werden ausgelassen, wenn nur die vier Hauptvertreiber von Pressemeldungen berücksichtigt werden.

In [51]:
# Keep only the articles for which a newswire could be inferred.
has_inferred_author = ddf["inferred_author"].notnull()
ddf = ddf[has_inferred_author]

In [52]:
# Cast the inferred author column to the pandas "string" dtype.
ddf = ddf.astype({"inferred_author": "string"})

In [53]:
# Turn `channels` back into its string representation before writing it out.
channels_meta = pd.Series(dtype="string")
ddf["channels"] = ddf["channels"].apply(str, meta=channels_meta)

In [54]:
def name_function(x):
  """Return the parquet file name for the value ``x`` handed in by ``to_parquet``."""
  return f"data-{x}.parquet"

# Write the cleaned data set to the output directory, one file per call of name_function.
ddf.to_parquet(output_dir, name_function=name_function)

Als nächstes wollen wir das News-Parser-Modul anwenden.

In [55]:
# Absolute directories for the next step: read the cleaned data written by the previous step.
# NOTE(review): output_dir points at unraw2_bzg, which the cleaning step used as its
# input directory — confirm this is intended before anything is written there.
input_dir = f"{cwd}/data/unraw3_bzg/"
output_dir = f"{cwd}/data/unraw2_bzg/"

In [56]:
# Load the data set stored in the input directory as a dask DataFrame.
ddf = dd.read_parquet(input_dir)

In [57]:
# Number of articles per inferred newswire.
ddf["inferred_author"].value_counts().compute()

PRNewswire        587242
Globe Newswire    475171
Business Wire     293052
ACCESSWIRE         81054
Name: inferred_author, dtype: Int64

In [59]:
# Total number of articles across all inferred newswires.
ddf["inferred_author"].value_counts().sum().compute()

1436519

In [68]:
# Articles whose `channels` value contains "Earnings" (about 100k rows).
is_earnings = ddf["channels"].apply(
    lambda channels: "Earnings" in channels,
    meta=pd.Series(dtype=bool),
)
earnings_ddf = ddf[is_earnings]

In [69]:
# Inferred-author counts for the earnings articles only, to contrast with the
# counts over all news articles.
earnings_ddf["inferred_author"].value_counts().head(10)

Globe Newswire    44589
PRNewswire        31440
ACCESSWIRE        16434
Name: inferred_author, dtype: Int64

SyntaxError: ignored

Deskriptive Statistiken unseres Datensatzes. Haben wir genug Daten, um unser Modell zu trainieren?

In [None]:
# Around 3k tickers at this moment
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code —
# only load pickle files from a trusted source.
# NOTE(review): the path is relative, so this presumably relies on the working
# directory set by the earlier %cd — confirm.
russel_tickers = pd.read_pickle("data/tickers.pkl")

In [None]:
# Keep only earnings articles whose ticker is among the Russell 3000 tickers
# (leaves about 41k rows).
in_russell = earnings_ddf["stocks"].isin(russel_tickers.categories)
earnings_ddf = earnings_ddf[in_russell]

In [None]:
# Earnings articles whose body mentions a dividend increase:
# around 140 in 5 years (every 10 days).
# meta declares the boolean result dtype up front, like the other apply calls in
# this notebook, so dask does not have to infer it.
dividends = earnings_ddf[earnings_ddf.body.apply(lambda x: "dividend increase " in x, meta=pd.Series(dtype=bool))]

Dask Series Structure:
npartitions=1
    string
       ...
Name: author, dtype: string
Dask Name: unique-agg, 4 graph layers