In [None]:
import dask.dataframe as dd
import pandas as pd
from dask.distributed import Client, wait
from news_parser import filter_body, time, body_formatter

In [None]:
# Local in-process cluster: one worker with a single thread (processes=False
# keeps it inside this kernel's process) and a 14GB memory limit.
# Leaving `client` as the last expression shows its summary below the cell.
client = Client(
    n_workers=1,
    threads_per_worker=1,
    processes=False,
    memory_limit='14GB',
)
client

In [None]:
# Start the Active Memory Manager (AMM) through the client.
# NOTE(review): the cluster above has a single worker, so there are presumably
# no replicas for the AMM to drop — confirm this call is still needed.
client.amm.start()

### First, strip the HTML from the documents; otherwise the html_body cells are too large for pyarrow to handle.

In [None]:
input_dir = "data/parquet_bzg_stories/"
# Forward slashes work on every OS (Windows included) and match the
# "data/benzinga/*.parquet" pattern used when the files are read back.
# The previous backslash form only resolved to data/benzinga on Windows.
output_dir = "data/benzinga/"

# Parallelized version with Dask Client.
# for year in range(2017, 2019):
#     print(year)
#     df = pd.read_parquet(f"{input_dir}story_df_raw_{year}.parquet")
#     df = dd.from_pandas(df, npartitions=12)
#     df["html_body"] = df["html_body"].apply(body_formatter, meta=pd.Series(dtype="str"))
#     df = df.rename(columns={"html_body":"body"})
#     name_function = lambda x: f"data-{year}-{x}.parquet"
#     df.to_parquet(output_dir, name_function=name_function)

# Use this version if memory is not sufficient for the Dask variant above:
# body_formatter is applied in pandas first and only the already-stripped
# frame is partitioned and written.
for year in range(2017, 2019):
    print(year)
    df = pd.read_parquet(f"{input_dir}story_df_raw_{year}.parquet")
    df["html_body"] = df["html_body"].apply(body_formatter)
    # Plain reassignment instead of inplace=True; the resulting frame is the same.
    df = df.rename(columns={"html_body": "body"})
    ddf = dd.from_pandas(df, npartitions=10)
    # `year=year` binds the current loop value when the lambda is created, so
    # the file names cannot pick up a later `year` if the call is ever deferred.
    # Files are named data-<year>-<partition>.parquet, one per partition.
    name_function = lambda x, year=year: f"data-{year}-{x}.parquet"
    ddf.to_parquet(output_dir, name_function=name_function)

### Repartition with dask

In [None]:
# Disabled: reload every parquet file under data/benzinga/ as one Dask DataFrame.
# NOTE(review): while this stays commented out, `ddf` in the cells below is the
# frame left over from the last iteration of the stripping loop — confirm that
# this is intended.
# ddf = dd.read_parquet("data/benzinga/*.parquet")
# NOTE(review): eval() executes whatever expression each cell contains;
# presumably `channels` holds stringified lists — if so, ast.literal_eval would
# be the safer parser. TODO confirm.
# ddf["channels"] = ddf["channels"].apply(eval)

### Apply the proper news parsing (TODO: adjust filter_body, since body_formatter has already been applied to the HTML)

In [None]:
# Alias, not a copy: the column assignment below is also visible through `ddf`,
# which the following cells keep using.
story_df = ddf

# Count the rows before the parser is attached, so the count itself does not
# trigger the expensive filter_body pass.
n_rows = len(story_df)

start = time.time()
# The stripping step renamed `html_body` to `body`, so `body` is the column that
# actually exists here.
# NOTE(review): filter_body now receives text that body_formatter has already
# processed — confirm filter_body copes with that (see the TODO in the heading).
story_df["body"] = story_df.apply(
    lambda x: filter_body(x.body, x.stocks, x.author, x.time),
    axis=1,
    meta=pd.Series(dtype="str"),
)
# Dask only records the apply above. persist() submits the work to the cluster
# and wait() blocks until it has finished, so the timing covers the real parsing.
# `ddf` is rebound as well so later cells reuse the persisted result instead of
# parsing everything a second time.
ddf = story_df = story_df.persist()
wait(story_df)
end = time.time()

print(f"Time elapsed: {end-start}s")
# max(..., 1) avoids a ZeroDivisionError on an empty frame.
print(f"Average seconds required per body: {(end-start)/max(n_rows, 1)}s")

# story_df["NewsTimestamp"] = pd.to_datetime(story_df.time)
# story_df.drop(columns=["time"], inplace=True)

# Applying parser to news

In [None]:
# Materialise the whole Dask DataFrame. As the last expression of the cell the
# result is displayed, but it is not bound to any name.
# NOTE(review): if the parsed data is needed afterwards, assign the result;
# if only a preview is wanted, ddf.head() would avoid loading everything.
ddf.compute()

# Misc

In [None]:
# Number of rows; shape[0] is lazy on a Dask DataFrame, hence the compute().
ddf.shape[0].compute()

In [None]:
# List the column names.
ddf.columns

In [None]:
# Distinct values of the `channels` column, materialised for display.
ddf.channels.unique().compute()

In [None]:
# Show the dtype of each column.
ddf.dtypes

In [None]:
# Largest (i.e. most recent) value in the `time` column.
ddf.time.max().compute()

In [None]:
# Group the rows by the calendar year of `time` and count the non-null entries
# of every column per year.
ddf.groupby(ddf.time.dt.year).count().compute()

In [None]:
# Lazily keep only the rows whose `time` falls in 2017 or later (all columns).
is_2017_or_later = ddf["time"].dt.year >= 2017
tagged_news = ddf.loc[is_2017_or_later, :]

In [None]:
# Materialise the lazy selection and rebind the same name to the result.
# NOTE(review): because the name is reused, running this cell a second time
# calls .compute() on the already-materialised result — re-run the filter cell
# first, or use a separate name for the computed frame.
tagged_news = tagged_news.compute()

In [None]:
## TODO: Assess the quality of the BZG channels.
# In how many news items does the term EBIT, EBITDA or Earnings occur, and how many news items were labelled with the `Earnings` channel tag?
# We may have to do the tagging ourselves, which may be a good idea anyway. The question then is how to filter the news, since we cannot use all
# news items (some are redundant and useless).
# Check how this is handled in the literature!

In [None]:
# Display the selected news from 2017 onwards.
tagged_news