In [None]:
import os
from bs4 import BeautifulSoup
from collections import defaultdict
from dataset import get_path
dataset_name ='miguelaenlle/massive-stock-news-analysis-db-for-nlpbacktests'
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
import numpy as np
import lzma
import pickle
import torch

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Output locations for the per-ticker artifacts. All three directories are
# created up front (no error if they already exist).
ticker_dir = "datasets/ticker_data/"
embeddings_dir = "datasets/ticker_data/embeddings/"
sentiments_dir = "datasets/ticker_data/sentiments/"
for _out_dir in (ticker_dir, embeddings_dir, sentiments_dir):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# Directory for the parsed CSV chunks, plus the path of the small marker file
# that lives inside it.
processed_data_dir = "datasets/processed/"
processed_count_file = os.path.join(processed_data_dir, "processed.count")
os.makedirs(processed_data_dir, exist_ok=True)

In [None]:
# Tickers of interest.
companies = ['AAPL','META', 'GOOGL','AMZN', 'MSFT', "FB", "TSLA", "NFLX"]
# NOTE(review): the list above is discarded on the next line, so the selection
# ends up empty and `is_valid_ticker` rejects every ticker. Confirm whether
# this override is a leftover from debugging.
companies = []
# Stored as a dict (ticker -> 1) so membership checks are hash lookups.
companies = dict.fromkeys(companies, 1)


def is_valid_ticker(t):
    """Return True when `t` is a key of the module-level `companies` mapping."""
    return t in companies

In [None]:
# !pip install sentence_transformers

In [None]:
# Device identifier used when placing the models.
device = "cuda:1"

In [None]:
import ast

from sentence_transformers import SentenceTransformer

# Parsed CSV chunks, visited in sorted filename order.
csvs = [
    os.path.join(processed_data_dir, name)
    for name in sorted(os.listdir(processed_data_dir))
    if name.endswith('.csv')
]

# Sentence-embedding model, moved to the target device and put in eval mode.
model_id = 'paraphrase-MiniLM-L6-v2'
sentence_model = SentenceTransformer(model_id)
sentence_model.to(device)
sentence_model.eval()

# Text-classification pipeline used for the sentiment labels.
task = "text-classification"
model_id = "mrm8488/deberta-v3-ft-financial-news-sentiment-analysis"
classifier = pipeline(task, model_id, device=device)
classifier.model.eval()

# Results are stored as sentiments[ticker][date] and embeddings[ticker][date].
sentiments = {c: {} for c in companies}
embeddings = {c: {} for c in companies}

count = 0

with torch.no_grad():
    for csv in csvs:
        print(csv)
        df = pd.read_csv(csv)

        for _, row in tqdm(df.iterrows()):
            ticker = row['Ticker']
            if is_valid_ticker(ticker):
                date = row['Date']
                # The News cell is the string form of a Python list of
                # snippets: rebuild the list and merge it into one text.
                news = row['News']
                news = " ".join(ast.literal_eval(news))
                embedding = sentence_model.encode(news)
                try:
                    # Only the first 16384 characters go to the classifier.
                    result = classifier(news[:16384])[0]
                except Exception:
                    # Report the joined and raw lengths, then re-raise.
                    print(len(news), len(row['News']))
                    raise
                sentiments[ticker][date] = result
                embeddings[ticker][date] = embedding

In [None]:
# Persist the per-ticker results as xz-compressed pickles. Tickers without any
# sentiment entries are skipped for both outputs.
for ticker, ticker_sentiments in sentiments.items():
    if ticker_sentiments:
        outputs = (
            (sentiments_dir, ticker_sentiments),
            (embeddings_dir, embeddings[ticker]),
        )
        for out_dir, payload in outputs:
            with lzma.open(f"{out_dir}/{ticker}.xz", "wb") as f:
                pickle.dump(payload, f)

In [None]:


# Ad-hoc check: build the sentiment pipeline on the default CUDA device and
# classify a single text.
model_id = "mrm8488/deberta-v3-ft-financial-news-sentiment-analysis"
task = "text-classification"
classifier = pipeline(task, model_id, device='cuda')

# NOTE(review): the sample sentence is overwritten straight away by `news`,
# which must already exist in the kernel from an earlier cell. On a fresh
# kernel where that cell assigned nothing, this raises NameError.
text = "Tesla cars are not as good as expected"
text = news
result = classifier(text)
print(result)


In [None]:
# Display the device the classifier's underlying model is currently on.
classifier.model.device

In [None]:
# pip install tf-keras

In [None]:
# Shell escape: list the contents of datasets/.
!ls datasets/

In [None]:
# Shell escape: create datasets/ if it is missing (-p: no error when it already exists).
!mkdir -p datasets/

In [None]:
# Display the HF_HOME environment variable; raises KeyError when it is not set.
os.environ['HF_HOME']

In [None]:
from dataset import load_dataset

# Load the analyst-ratings CSV of the stock-news dataset through the `dataset`
# helper module (presumably project-local — verify).
file_name = 'raw_analyst_ratings.csv'
dataset_name = 'miguelaenlle/massive-stock-news-analysis-db-for-nlpbacktests'
raw_analyst_ratings_df = load_dataset(dataset_name, file_name)

In [None]:
# Path of the raw text dump, located under the directory get_path() returns for the dataset.
raw_data_path = os.path.join(get_path(dataset_name),'raw_data/data.txt')


In [None]:
# Sort the ratings by their 'date' column in descending order, modifying the frame in place.
raw_analyst_ratings_df.sort_values(by=['date'], ascending=False, inplace=True)


In [None]:
# Replace the index with a fresh 0..n-1 range in place; the old index is dropped, not kept as a column.
raw_analyst_ratings_df.reset_index(drop=True, inplace=True)

In [None]:
# Display the first row of the ratings frame.
raw_analyst_ratings_df.iloc[0]

In [None]:
# Spot check: 'url' of the first row whose 'Unnamed: 0' value equals 4
# (raises IndexError if no row matches).
raw_analyst_ratings_df[raw_analyst_ratings_df['Unnamed: 0'] == 4]['url'].iloc[0]

In [None]:
# Lookup from each row's 'Unnamed: 0' value to the first 10 characters of its
# 'date' string. If a key repeats, the last row wins.
idx_data = {
    row['Unnamed: 0']: row['date'][:10]
    for _, row in tqdm(raw_analyst_ratings_df.iterrows())
}

In [None]:
# Two-level accumulator: the first key yields an inner defaultdict, whose keys yield lists.
news = defaultdict(lambda: defaultdict(list))

In [None]:
# Resume point: the count stored in the marker file, or 0 when the file is
# missing or does not hold an integer.
start = 0
if os.path.isfile(processed_count_file):
    with open(processed_count_file, 'r') as rf:
        # Read once so the content is still available for the error report;
        # a second read() on the same handle would return an empty string.
        saved_count = rf.read()
    try:
        start = int(saved_count)
        print(start)
    except ValueError:
        # Not an integer: show what the file actually contains and keep start at 0.
        print(saved_count)

In [None]:
def save_df_to_csv(df, st, end):
    """Write `df` as a CSV chunk and record `end` in the count marker file.

    The chunk is stored in `processed_data_dir` under a name built from the
    zero-padded (7 digit) `st` and `end` values. Afterwards
    `processed_count_file` is overwritten with `end`.
    """
    chunk_name = f"{st:07d}_{end:07d}.csv"
    df.to_csv(os.path.join(processed_data_dir, chunk_name))
    with open(processed_count_file, 'w') as marker:
        marker.write(str(end))

In [None]:
def save_dict(news, st, end):
    """Flatten the nested `news` mapping into rows and hand them to `save_df_to_csv`.

    `news` is loaded with `pd.DataFrame.from_dict`, the resulting index becomes
    a `Ticker` column, and the remaining columns are melted into `Date`/`News`
    pairs. Rows whose `News` value is missing are dropped before saving.
    """
    wide = pd.DataFrame.from_dict(news)
    wide = wide.reset_index().rename(columns={'index': 'Ticker'})
    long_form = wide.melt(id_vars=['Ticker'], var_name='Date', value_name='News')
    has_news = long_form['News'].notna()
    save_df_to_csv(long_form[has_news], st, end)

In [None]:
def _collect_ticker_news(nodes, news, dt):
    """Append the text of every node containing a ticker span to `news[dt][ticker]`.

    `news[dt]` is only touched once a ticker is found, so a date without any
    ticker-tagged text never gets an (empty) entry.
    """
    for node in nodes:
        ticker = node.find("span", {"class": "ticker"})
        if ticker:
            tnews = node.get_text(strip=False).replace('\xa0', ' ')
            news[dt][ticker.get_text()].append(tnews)


count = 0  # number of complete records seen so far
news = defaultdict(lambda: defaultdict(list))  # news[dt][ticker] -> list of texts
text = ""  # lines of the record currently being accumulated
end = start  # lower bound used in the name of the next saved chunk
with open(raw_data_path, 'r', encoding="utf8") as rf:
    for line in rf:
        text += line
        # A record is complete once the accumulated text ends with a closing div line.
        if not text.endswith("</div>\n"):
            continue
        record, text = text, ""
        # Records numbered below `start` are skipped. Comparing the counter
        # here (instead of a flag that is flipped only after a line has been
        # consumed) also parses a first record that fits on a single line.
        if count >= start:
            try:
                idx = int(record[:10])
            except ValueError as e:
                # The first 10 characters are not an integer id: report and skip.
                print(e)
            else:
                dt = idx_data[idx]
                soup = BeautifulSoup(record[10:], 'html.parser')
                _collect_ticker_news(soup.find_all("p", {"class": "block core-block"}), news, dt)
                for ul in soup.find_all("ul", {"class": "block core-block"}):
                    _collect_ticker_news(ul.find_all("li"), news, dt)
        count += 1
        # Flush a chunk every 10000 records, but only when something was
        # collected. This runs after skipped records too, so a bad record on
        # the boundary cannot delay the checkpoint.
        if count % 10000 == 0 and len(news) > 0:
            save_dict(news, end, count)
            end = count + 1
            news = defaultdict(lambda: defaultdict(list))
# Save whatever was collected after the last full chunk.
save_dict(news, end, count)