# EDA

# Notebook Environment

In [None]:
UPGRADE_PY = False
INSTALL_DEPS = False
if INSTALL_DEPS:
  !pip install ijson
  !pip install yfinance
  !pip install pandas-market-calendars
if UPGRADE_PY:
  !mamba create -n py311 -y
  !source /opt/conda/bin/activate py312 && mamba install python=3.11 jupyter mamba -y

  !sudo rm /opt/conda/bin/python3
  !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python3
  !sudo rm /opt/conda/bin/python3.10
  !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python3.10
  !sudo rm /opt/conda/bin/python
  !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python

In [None]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

# Detect the hosting platform (Colab, Kaggle, or a local checkout) and point
# DATA_PATH / MODEL_PATH at the matching locations.
IN_KAGGLE = IN_COLAB = False
try:
    # https://www.tensorflow.org/install/pip#windows-wsl2
    import google.colab
    from google.colab import drive
except ImportError:
    # Only a missing google.colab package means "not Colab". A bare `except:`
    # would also hide Drive mount failures (and KeyboardInterrupt) and silently
    # fall through to the local paths.
    IN_COLAB = False
else:
    drive.mount('/content/drive')
    DATA_PATH = "/content/drive/MyDrive/EDT dataset"
    MODEL_PATH = "./models/bert_news"
    IN_COLAB = True
    print('Colab!')

if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ and not IN_COLAB:
    print('Running in Kaggle...')
    # List every attached input file so the dataset layout shows up in the output.
    for dirname, _, filenames in os.walk('/kaggle/input'):
        for filename in filenames:
            print(os.path.join(dirname, filename))
    DATA_PATH = "/kaggle/input/uscorpactionnews"
    MODEL_PATH = "/kaggle/input/bert_news/tensorflow2/bert_news/1/bert_news"
    IN_KAGGLE = True
    print('Kaggle!')
elif not IN_COLAB and not IN_KAGGLE:
    # Neither hosted platform was detected: use paths relative to the notebook.
    IN_KAGGLE = False
    DATA_PATH = "./data/"
    MODEL_PATH = "./models/bert_news"
    print('Normal!')

MODEL_BASE = "google-bert/bert-base-cased"

In [None]:
import numpy as np
import math
import shutil
import matplotlib.pyplot as plt
import pandas as pd

import re
import os
from pathlib import Path
from tqdm import tqdm

import tensorflow as tf

print(f'Tensorflow version: [{tf.__version__}]')
tf.get_logger().setLevel('INFO')

# Report the interpreter that is actually running this kernel.
# `!python --version` asks whichever `python` comes first on the shell PATH,
# which is not necessarily the kernel's interpreter.
print(f'Python {sys.version.split()[0]}')

# Data Analysis

In [None]:
## Create Datasets for Training
def read_wnut(file_path, encoding='utf-8'):
    """Parse a tab-separated token/tag file into parallel per-document lists.

    Each non-blank line holds ``token<TAB>tag``. Documents are separated by a
    blank line (optionally containing a single tab).

    Args:
        file_path: Path (str or Path) of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale.

    Returns:
        tuple: ``(token_docs, tag_docs)`` where ``token_docs[i]`` and
        ``tag_docs[i]`` are equal-length lists for document ``i``. An empty
        file yields ``([], [])``.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    file_path = Path(file_path)

    raw_text = file_path.read_text(encoding=encoding).strip()
    if not raw_text:
        # Without this guard the single empty "document" fails to unpack below.
        return [], []

    token_docs = []
    tag_docs = []
    for doc in re.split(r'\n\t?\n', raw_text):
        tokens = []
        tags = []
        for line in doc.split('\n'):
            # Runs of extra blank lines leave empty fragments behind the split.
            if not line.strip():
                continue
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        # Skip fragments that contained nothing but blank lines.
        if tokens:
            token_docs.append(tokens)
            tag_docs.append(tags)

    return token_docs, tag_docs

# Load both event-detection splits. The "test_*" variables hold dev.txt.
ner_train_file = os.path.join(DATA_PATH, 'Event_detection', 'train.txt')
ner_dev_file = os.path.join(DATA_PATH, 'Event_detection', 'dev.txt')

train_ner_texts, train_ner_tags = read_wnut(ner_train_file)
test_ner_texts, test_ner_tags = read_wnut(ner_dev_file)

# Train documents come first, followed by the dev documents.
combined_texts = [*train_ner_texts, *test_ner_texts]
combined_tags = [*train_ner_tags, *test_ner_tags]

# Show one sample document: its tokens, then its tags.
for sample_doc in (combined_texts[1], combined_tags[1]):
    print(sample_doc)



In [None]:
def identify_duplicates_and_labels(train_texts, train_tags, test_texts, test_tags):
    """Find documents that appear in both splits and return their tags side by side.

    Documents are compared by their space-joined token text. If the same text
    occurs several times within one split, the tags of its last occurrence are
    the ones reported for that split.

    Args:
        train_texts: Token lists of the first split.
        train_tags: Tag lists parallel to ``train_texts``.
        test_texts: Token lists of the second split.
        test_tags: Tag lists parallel to ``test_texts``.

    Returns:
        list[tuple]: One ``(text, n_words, train_tags, test_tags)`` tuple per
        shared document, ordered by first appearance in ``train_texts``.
        ``n_words`` is the whitespace-split word count of the joined text.
    """
    # Key each document by its joined text so the two splits can be compared.
    train_docs = {' '.join(text): tags for text, tags in zip(train_texts, train_tags)}
    test_docs = {' '.join(text): tags for text, tags in zip(test_texts, test_tags)}

    # Walk train_docs in insertion order instead of intersecting two sets:
    # the iteration order of a set of strings changes between interpreter runs,
    # which made this result (and everything printed from it) non-reproducible.
    return [
        (doc, len(doc.split()), doc_train_tags, test_docs[doc])
        for doc, doc_train_tags in train_docs.items()
        if doc in test_docs
    ]

# Collect the documents that occur in both the train and dev splits.
duplicates_details = identify_duplicates_and_labels(
    train_texts=train_ner_texts,
    train_tags=train_ner_tags,
    test_texts=test_ner_texts,
    test_tags=test_ner_tags,
)

from collections import Counter

def summarize_duplicates(duplicates_details):
    """Print the duplicate count and, per non-'O' label, how many duplicates carry it.

    Args:
        duplicates_details: Sequence of 4-tuples whose last two items are the
            tag lists of the same document in the two splits.

    Returns:
        None. The summary is written to stdout.
    """
    num_duplicates = len(duplicates_details)
    label_counter = Counter()

    for _doc, _n_words, tags_in_train, tags_in_test in duplicates_details:
        # A label counts once per duplicated document, regardless of how many
        # tokens carry it or which split it came from.
        labels_in_doc = {
            tag
            for tag_seq in (tags_in_train, tags_in_test)
            for tag in tag_seq
            if tag != 'O'
        }
        label_counter.update(labels_in_doc)

    print("Number of duplicates:", num_duplicates)
    print("Unique labels and their counts (excluding 'O'):")
    for label in label_counter:
        print(f"{label}: {label_counter[label]}")

# Print the duplicate count and the per-label breakdown.
summarize_duplicates(duplicates_details=duplicates_details)

In [None]:
def _has_event_tag(tags):
    """Return True when at least one tag in `tags` differs from 'O'."""
    for tag in tags:
        if tag != 'O':
            return True
    return False


def _document_labels(tags):
    """Return the distinct tags of a document, without 'O' unless it is the only tag."""
    distinct = set(tags)
    if distinct == {'O'}:
        return distinct
    return distinct - {'O'}


# One row per document: the token list and its parallel tag list.
data = dict(texts=combined_texts, tags=combined_tags)
df = pd.DataFrame(data)
num_docs = df.shape[0]
print(f"Number of documents: {num_docs}")

# A document is "labeled" when any of its tokens has a non-'O' tag.
num_labeled_docs = df['tags'].apply(_has_event_tag).sum()
print(f"Number of labeled documents: {num_labeled_docs}")

unique_tags_per_doc = df['tags'].apply(set)
all_unique_tags = set.union(*unique_tags_per_doc)
print(f"Unique NER tags across all documents: {all_unique_tags}")

# Document-level frequency: each tag counted at most once per document.
doc_level_tags = []
for doc_tag_set in unique_tags_per_doc:
    doc_level_tags.extend(doc_tag_set)
tag_counts = pd.Series(doc_level_tags).value_counts()
print("Unique NER Tag Counts:")
print(tag_counts)

# Token-level frequency: every tag occurrence counted.
all_tags = []
for doc_tags in df['tags']:
    all_tags.extend(doc_tags)
tag_counts = pd.Series(all_tags).value_counts()
print("NER Tag Counts:")
print(tag_counts)


df['text_labels'] = df['tags'].apply(_document_labels)
# Keep only documents whose label set is not exactly {'O'}.
has_event_labels = df['text_labels'].apply(lambda labels: labels != {'O'})
df_filtered = df[has_event_labels]
print("\nExample of documents with unique NER tags (excluding 'O' unless sole tag):")
print(df_filtered[['texts', 'text_labels']].head())

## Visualizations

In [None]:
def _annotate_bar_percentages(ax, bars, total):
    """Write each horizontal bar's share of `total`, as a percentage, at the bar's end."""
    for bar in bars:
        width = bar.get_width()
        percentage = (width / total) * 100
        ax.text(width, bar.get_y() + bar.get_height() / 2, f' {percentage:.1f}%', va='center')


# 2x2 overview of how NER tags are distributed over documents and tokens.
# Bar lengths (x-axes) are raw counts; the percentage is in each bar's annotation.
fig, axs = plt.subplots(2, 2, figsize=(14, 14))
fig.suptitle('NER Tag Distribution Analysis', fontsize=16)

# Chart 1: Unique NER Tag Sets per Document
unique_tag_sets_counts = df_filtered['text_labels'].apply(lambda x: ', '.join(sorted(x))).value_counts()
total_documents = unique_tag_sets_counts.sum()
bars = axs[0, 0].barh(unique_tag_sets_counts.index, unique_tag_sets_counts, color='skyblue')
axs[0, 0].set_title('Sets per Document')
axs[0, 0].set_xlabel('% Documents')
_annotate_bar_percentages(axs[0, 0], bars, total_documents)

# Chart 2: All Unique NER Tags Across Documents (Excluding 'O')
all_unique_tags_counts = pd.Series([tag for tags in unique_tags_per_doc for tag in tags if tag != 'O']).value_counts()
total_tags = all_unique_tags_counts.sum()
bars = axs[0, 1].barh(all_unique_tags_counts.index, all_unique_tags_counts, color='lightgreen')
axs[0, 1].set_title('Tags Across Documents (Excluding \'O\')')
axs[0, 1].set_xlabel('% Frequency')
_annotate_bar_percentages(axs[0, 1], bars, total_tags)

# Chart 3: 'O' vs. Everything Not 'O' for Documents
docs_with_o = df['text_labels'].apply(lambda labels: 'No Events' if labels == {'O'} else 'Events').value_counts()
total_docs = docs_with_o.sum()
bars = axs[1, 0].barh(docs_with_o.index, docs_with_o, color=['#66b3ff', '#ff9999'])
axs[1, 0].set_title('Docs with No Event vs. Docs with Events')
axs[1, 0].set_xlabel('% Documents')
_annotate_bar_percentages(axs[1, 0], bars, total_docs)

# Chart 4: 'O' vs. Everything Not 'O' for Tokens
token_tag_counts = pd.Series([tag for sublist in df['tags'] for tag in sublist]).value_counts()
total_tokens = token_tag_counts.sum()
# errors='ignore' keeps the 'Event' side consistent with .get('O', 0): a corpus
# with no 'O' tag at all must not raise KeyError here.
tokens_with_o_counts = {
    'No Event': token_tag_counts.get('O', 0),
    'Event': token_tag_counts.drop('O', errors='ignore').sum(),
}
bars = axs[1, 1].barh(list(tokens_with_o_counts.keys()), list(tokens_with_o_counts.values()), color=['#ff9999', '#66b3ff'])
axs[1, 1].set_title('Tokens with no Event vs. With Event')
axs[1, 1].set_xlabel('% Frequency')
_annotate_bar_percentages(axs[1, 1], bars, total_tokens)

# Leave room at the top for the figure-level title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


# Trading Benchmark Data Analysis

In [None]:

import ijson

# Location of the trading-benchmark news file, resolved under DATA_PATH.
BACKTEST_PATH = f'{DATA_PATH}/Trading_benchmark/evaluate_news.json'

def dataset_generator(data_path, max_len=512):
    """Stream records from a JSON array file, yielding one trimmed record at a time.

    The file is parsed incrementally with ijson, so it is never loaded whole.
    Note that a later cell in this notebook defines another function with the
    same name; after that cell runs, this definition is shadowed.

    Args:
        data_path: Path of the JSON file whose top level is an array of objects.
        max_len: Maximum number of whitespace-separated words kept from the
            concatenated title and text.

    Yields:
        dict: ``{'text': <title + " " + text, truncated>, 'labels': <item's
        'labels' value, or {} when absent>}``. Records lacking a 'title' or
        'text' key are skipped.
    """
    with open(data_path, 'rb') as stream:
        for record in ijson.items(stream, 'item'):
            try:
                words = (record['title'] + " " + record['text']).split()
            except KeyError:
                # Incomplete record: nothing to build the text from.
                continue
            yield {
                'text': " ".join(words[:max_len]),
                'labels': record.get('labels', {}),
            }


# Pull just the first record to inspect its structure.
dataset = dataset_generator(BACKTEST_PATH)
first_record = next(dataset)

for caption, field in (("Text:", 'text'), ("Labels:", 'labels')):
    print(caption, first_record[field])

## Augment Backtest with Market Close

In [None]:
import yfinance as yf
from datetime import datetime
import json

def fetch_snp500_data():
    """Download daily S&P 500 (^GSPC) closes for 2020-2022, keyed by calendar date.

    Returns:
        pandas.Series: Closing prices indexed by ``datetime.date`` objects.

    Raises:
        RuntimeError: If the download returned no rows.
    """
    print("Fetching S&P 500 data from 2020 to 2022...")
    data = yf.download('^GSPC', start='2020-01-01', end='2022-12-31')
    if data.empty:
        # Fail here with a clear message rather than with a KeyError on the
        # first date lookup further downstream.
        raise RuntimeError("No S&P 500 data was downloaded for ^GSPC (2020-01-01 to 2022-12-31).")

    data.index = pd.to_datetime(data.index).date
    close = data['Close']
    # Depending on the yfinance version, the columns may be a (field, ticker)
    # MultiIndex, in which case data['Close'] is a one-column DataFrame.
    # Reduce it to a Series so that .loc[date] returns a scalar price.
    if isinstance(close, pd.DataFrame):
        close = close.iloc[:, 0]
    return close

def get_snp500_close(date_str, data):
    """Return the entry of `data` for the calendar day at the start of `date_str`.

    Args:
        date_str: String beginning with a ``YYYY-MM-DD`` date. Anything after
            the first space (e.g. a time of day or UTC offset) is ignored.
        data: Object supporting ``.loc[datetime.date]`` lookups, such as a
            Series of closing prices indexed by date.

    Returns:
        Whatever ``data.loc`` holds for that date.

    Raises:
        KeyError: If the date is not present in ``data``.
        ValueError: If the leading part of ``date_str`` is not ``YYYY-MM-DD``.
    """
    # Keep only the date portion; the time/timezone suffix is not needed.
    day_text, _, _ = date_str.partition(' ')
    lookup_day = pd.to_datetime(day_text, format='%Y-%m-%d').date()
    return data.loc[lookup_day]



def _previous_snp500_close(date_str, snp500_data, max_lookback_days=5):
    """Return the latest close strictly before `date_str`'s day, or None if none is found."""
    anchor_day = pd.to_datetime(date_str).date()
    for days_back in range(1, max_lookback_days + 1):
        candidate_day = anchor_day - pd.Timedelta(days=days_back)
        try:
            return get_snp500_close(candidate_day.strftime('%Y-%m-%d'), snp500_data)
        except KeyError:
            # No price for that calendar day; try one day earlier.
            continue
    return None


def dataset_generator(data_path, snp500_data, max_len=512):
    """Yield each item's labels, augmented with the S&P 500 day-over-day change.

    The file is read line by line and every non-blank line is parsed as a JSON
    array of items. For each ``end_time_{i}day`` label (i = 1..3) a matching
    ``mkt_end_time_{i}day`` label is added, holding the fractional change
    ``(close - previous_close) / previous_close`` of the index on that day.
    If no previous close is found within five calendar days, the market label
    for that key is left out. The original code reused a stale value from an
    earlier item in that case, or raised NameError on the first item.

    This definition shadows the earlier ijson-based `dataset_generator`.

    Args:
        data_path: Path of the input file.
        snp500_data: Closing prices accepted by `get_snp500_close`.
        max_len: Unused; kept so existing call sites remain valid.

    Yields:
        dict: The (possibly augmented) ``labels`` dict of each item.

    Raises:
        KeyError: If an ``end_time_{i}day`` date itself has no closing price.
    """
    with open(data_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # A blank line (e.g. a trailing newline) is not valid JSON.
                continue
            items = json.loads(line)
            for item in tqdm(items, desc="dataset_generator"):
                labels = item.get('labels', {})
                for i in range(1, 4):
                    key = f'end_time_{i}day'
                    if key not in labels:
                        continue
                    closing_price_current = get_snp500_close(labels[key], snp500_data)
                    closing_price_previous = _previous_snp500_close(labels[key], snp500_data)
                    if closing_price_previous is None:
                        continue
                    change = (closing_price_current - closing_price_previous) / closing_price_previous
                    # Plain float keeps the value JSON-serializable whatever
                    # numeric type the price lookup returned.
                    labels[f'mkt_end_time_{i}day'] = float(change)
                item['labels'] = labels
                # NOTE(review): only `labels` is yielded, not the whole `item`,
                # so title/text never reach the output. Confirm this is intended.
                yield labels


def save_augmented_data(data_path, output_path, snp500_data):
    """Write every record produced by `dataset_generator` to `output_path`, one JSON document per line.

    Args:
        data_path: Input file forwarded to `dataset_generator`.
        output_path: Destination file; it is created or overwritten.
        snp500_data: Closing prices forwarded to `dataset_generator`.
    """
    augmented_records = dataset_generator(data_path, snp500_data)
    with open(output_path, 'w', encoding='utf-8') as sink:
        for record in augmented_records:
            json.dump(record, sink)
            # Newline-delimit the records so each line parses on its own.
            sink.write('\n')


# Destination of the augmented labels, written next to the notebook.
OUTPUT_PATH = './evaluate_news_augmented.json'

# Download the index closes once, then stream the benchmark file through the augmenter.
snp500_data = fetch_snp500_data()
save_augmented_data(
    data_path=BACKTEST_PATH,
    output_path=OUTPUT_PATH,
    snp500_data=snp500_data,
)