In [2]:
import os
import json
import logging

In [3]:
# Annotated training split, resolved relative to the current working directory.
edt_dataset_path = os.path.join(
    os.getcwd(),
    'EDT_dataset',
    'Event_detection',
    'train.txt',
)
print(edt_dataset_path)

/dcs/large/u5579267/EventExtraction/src/Analysis/EDT_dataset/Event_detection/train.txt


## Convert txt file to json

In [5]:
import json

# Read the whole annotated corpus into memory. The encoding is passed
# explicitly so decoding does not depend on the machine's locale default.
# NOTE(review): assumes the dataset file is UTF-8 encoded — confirm.
with open(edt_dataset_path, "r", encoding="utf-8") as file:
    annotated_text = file.read()

# Split the dataset into articles (consecutive articles are separated by a blank line)
articles = annotated_text.strip().split("\n\n")

# Function to process each article
def process_article(article):
    """Convert one token-per-line annotated article into a sentence/events record.

    Every non-blank line of ``article`` must hold exactly two
    whitespace-separated fields: a word followed by its event tag.

    Args:
        article: Text of a single article, one ``word tag`` pair per line.

    Returns:
        dict: ``{"sentence": [text], "events": tags}`` where ``text`` is every
        word joined by single spaces and ``tags`` lists the distinct non-"O"
        tags in order of first appearance. When the article has tokens but
        none of them carries an event tag, ``tags`` is ``["O"]``; an article
        without any token yields ``[]``.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    words = []
    # A dict keeps insertion order, so it serves as an ordered set here; a plain
    # set would make the order of "events" differ from run to run.
    event_tags = {}

    for line in article.strip().split("\n"):
        fields = line.split()
        if not fields:
            # Whitespace-only lines (e.g. a stray "\r") carry no token.
            continue
        if len(fields) != 2:
            raise ValueError(f"Expected a 'word tag' pair but got: {line!r}")
        word, tag = fields
        words.append(word)
        if tag != "O":
            event_tags[tag] = None

    if event_tags:
        events = list(event_tags)
    elif words:
        # "O" is reported only when no token in the article has an event tag,
        # regardless of where in the article the "O" tokens appear.
        events = ["O"]
    else:
        events = []

    # Joining all words directly gives the same text as splitting on sentence
    # punctuation and re-joining the pieces with a space.
    return {
        "sentence": [" ".join(words)],
        "events": events
    }


# Turn every article into its {"sentence", "events"} record.
results = [process_article(article) for article in articles]

# Destination of the converted training split.
output_path = os.path.join(
    os.getcwd(), "EDT_dataset", "Event_detection", "train.json"
)

# Serialize all records as pretty-printed JSON.
with open(output_path, "w") as file:
    json.dump(results, file, indent=4)

print("Done!")
print("Output saved to:", output_path)


Done!
Output saved to: /Users/adityamahamuni/Work/Warwick/Modules/Dissertation/Event_Extraction_for_Financial_Documents/EDT_dataset/Event_detection/train.json
{'sentence': ['JTI report warns of a \'Gathering Storm\' in the black market English English Intelligence shows criminals are ready for post-Covid boom GENEVA , Sept . 11 , 2020 / / JTI ( Japan Tobacco International ) has published a report , independently verified by Intrinsic Insight Ltd. , entitled \'The Gathering Storm\' , on how the illegal tobacco trade are operating during the Covid-19 global pandemic and preparing to reap the rewards in the economic aftermath that will follow . Law enforcement agencies around the world have welcomed the report , which is based on 63 field studies , conducted across 50 countries including Russia , Canada , Malaysia , and the Philippines where tobacco smugglers currently have a strong presence . JTI intelligence found that the global public health crisis and financial downturn has created t

In [3]:
data_path = os.path.join(os.getcwd(), "EDT_dataset", "Event_detection", "train.json")

# Load the JSON object from the file
with open(data_path, "r") as file:
    data = json.load(file)

# Print the first few entries. A slice is used instead of indexing with -i
# (which yields data[0], data[-1], data[-2]); it really takes the first three
# records and also works when the file holds fewer than three.
for entry in data[:3]:
    print(entry)
    print()

{'sentence': ['JTI report warns of a \'Gathering Storm\' in the black market English English Intelligence shows criminals are ready for post-Covid boom GENEVA , Sept . 11 , 2020 / / JTI ( Japan Tobacco International ) has published a report , independently verified by Intrinsic Insight Ltd. , entitled \'The Gathering Storm\' , on how the illegal tobacco trade are operating during the Covid-19 global pandemic and preparing to reap the rewards in the economic aftermath that will follow . Law enforcement agencies around the world have welcomed the report , which is based on 63 field studies , conducted across 50 countries including Russia , Canada , Malaysia , and the Philippines where tobacco smugglers currently have a strong presence . JTI intelligence found that the global public health crisis and financial downturn has created the conditions for a \'perfect storm\' where organized criminal groups will further exploit public demand for cheap goods , and capitalize on dwindling buying p

### Run Schema Prompts

In [4]:
def get_schema_prompt(sentence):
    """Build the event-extraction instruction prompt for one piece of text.

    Args:
        sentence: Text substituted into the final line of the prompt template.

    Returns:
        str: The instruction template with ``sentence`` filled in.
    """
    # The indentation inside the template is part of the prompt text itself.
    template = """
        Task: Event Extraction
        Instructions:
        1. Identify any financial or corporate events within the sentence.
        2. Classify each identified event using the following types:
           - Acquisition (A)
           - Clinical Trial (CT)
           - Regular Dividend (RD)
           - Dividend Cut (DC)
           - Dividend Increase (DI)
           - Guidance Increase (GI)
           - New Contract (NC)
           - Reverse Stock Split (RSS)
           - Special Dividend (SD)
           - Stock Repurchase (SR)
           - Stock Split (SS)
           - Other/None (O)
        3. Extract the primary corporate entity (company name) directly associated with each event. Use contextual clues and focus on entities performing financial actions. Prioritize the company name.
        4. Output a JSON array of dictionaries, each containing:
            - "event_type": [Use the exact event classification code from the provided list, e.g., "RD" for Regular Dividend]
            - "company": [Identify the primary company performing the financial action, often the publicly traded parent company]
        5. Ensure the output is valid JSON. Double-check for proper formatting, brackets, commas, and quotation marks.
        Extract the event from the following sentence: {sentence}
        """
    logging.debug(f"Generated schema prompt for sentence: {sentence}")
    filled_prompt = template.format(sentence=sentence)
    return filled_prompt

In [5]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load Flan-T5 model and tokenizer.
# NOTE: T5Tokenizer needs the `sentencepiece` package to be installed; without
# it this cell fails with an ImportError.
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Pick the inference device and place the model on it, so tokenized inputs can
# be moved to the same device before generation.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
def get_model_response(prompt, max_length=512):
    """Generate the model's text response for a prompt.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        prompt: Prompt text; it is tokenized with truncation to at most 1024
            tokens.
        max_length: Value passed as ``max_length`` to ``model.generate``.

    Returns:
        str: The first generated sequence, decoded with special tokens skipped.
    """
    logging.debug(f"Generating model response for prompt: {prompt}")
    # Move the inputs to the device the model itself is on instead of relying
    # on a separate global, so inputs and weights can never end up on
    # different devices.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(model.device)
    outputs = model.generate(**inputs, max_length=max_length)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    logging.debug(f"Model response: {response}")
    return response

### Dev/Train dataset

In [6]:
import json

edt_dev_path = os.path.join("/dcs/large/u5579267/EventExtraction", "EDT_dataset", "Event_detection", "dev.txt")

# Read the whole annotated corpus into memory. The encoding is passed
# explicitly so decoding does not depend on the machine's locale default.
# NOTE(review): assumes the dataset file is UTF-8 encoded — confirm.
with open(edt_dev_path, "r", encoding="utf-8") as file:
    annotated_text = file.read()

# Split the dataset into articles (consecutive articles are separated by a blank line)
articles = annotated_text.strip().split("\n\n")

# Function to process each article
def process_article(article):
    """Convert one token-per-line annotated article into a sentence/events record.

    Every non-blank line of ``article`` must hold exactly two
    whitespace-separated fields: a word followed by its event tag.

    Args:
        article: Text of a single article, one ``word tag`` pair per line.

    Returns:
        dict: ``{"sentence": [text], "events": tags}`` where ``text`` is every
        word joined by single spaces and ``tags`` lists the distinct non-"O"
        tags in order of first appearance. When the article has tokens but
        none of them carries an event tag, ``tags`` is ``["O"]``; an article
        without any token yields ``[]``.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    words = []
    # A dict keeps insertion order, so it serves as an ordered set here; a plain
    # set would make the order of "events" differ from run to run.
    event_tags = {}

    for line in article.strip().split("\n"):
        fields = line.split()
        if not fields:
            # Whitespace-only lines (e.g. a stray "\r") carry no token.
            continue
        if len(fields) != 2:
            raise ValueError(f"Expected a 'word tag' pair but got: {line!r}")
        word, tag = fields
        words.append(word)
        if tag != "O":
            event_tags[tag] = None

    if event_tags:
        events = list(event_tags)
    elif words:
        # "O" is reported only when no token in the article has an event tag,
        # regardless of where in the article the "O" tokens appear.
        events = ["O"]
    else:
        events = []

    # Joining all words directly gives the same text as splitting on sentence
    # punctuation and re-joining the pieces with a space.
    return {
        "sentence": [" ".join(words)],
        "events": events
    }


# Turn every article into its {"sentence", "events"} record.
results = [process_article(article) for article in articles]

# Destination of the converted dev split.
output_path = os.path.join(
    "/dcs/large/u5579267/EventExtraction",
    "EDT_dataset",
    "Event_detection",
    "dev.json",
)

# Serialize all records as pretty-printed JSON.
with open(output_path, "w") as file:
    json.dump(results, file, indent=4)

print("Done!")
print("Output saved to:", output_path)


Done!
Output saved to: /dcs/large/u5579267/EventExtraction/EDT_dataset/Event_detection/dev.json
