## Utils

In [1]:
import json
import os

In [2]:
import torch
import json
import logging
import re
import os
import argparse
import random
from enum import Enum

from rank_bm25 import BM25Okapi
from nltk import word_tokenize
from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_recall_fscore_support, classification_report


from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer, BartForConditionalGeneration



  from tqdm.autonotebook import tqdm, trange


In [8]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
def generate_event_information(model, tokenizer, prompt, max_input_length=500, max_new_tokens=50):
    # Tokenize the input prompt, truncating to the maximum input length
    inputs = tokenizer(prompt, return_tensors="pt",
                       truncation=True, max_length=max_input_length)

    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    outputs = model.generate(
        inputs['input_ids'], max_new_tokens=max_new_tokens)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated_text

In [19]:
def get_model_response(tokenizer, model, prompt, max_new_tokens=512):
    logging.debug(f"Generating model response for prompt: {prompt}")
    inputs = tokenizer(prompt, return_tensors="pt",max_length=max_new_tokens, truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=inputs['input_ids'].shape[1] + max_new_tokens, max_new_tokens=200)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    logging.debug(f"Model response: {response}")
    return response

In [20]:
from huggingface_hub import login
from transformers import AutoModel, AutoTokenizer

# Set your Hugging Face token
token = "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"
os.environ['HUGGINGFACE_TOKEN'] = token

# Log in to Hugging Face
login(token=token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /dcs/pg23/u5579267/.cache/huggingface/token
Login successful


In [21]:
os.environ['TRANSFORMERS_CACHE'] = '/dcs/large/u5579267/.huggingface'
os.environ['HF_HOME'] = '/dcs/large/u5579267/.huggingface'

In [22]:
cache_dir='/dcs/large/u5579267/.huggingface'

## Fewshot Learning Experiments

- providing examples for each instance

In [23]:
def sbert_selection(self, sentence, training_data, k):
    embeddings = self.sbert_model.encode(
        [sentence] + [data['sentence'][0] for data in training_data])
    similarities = cosine_similarity([embeddings[0]], embeddings[1:])[0]
    top_k_indices = similarities.argsort()[-k:][::-1]
    return [training_data[i] for i in top_k_indices]


def bm25_selection(self, sentence, training_data, k):
    tokenized_corpus = [word_tokenize(
        data['sentence'][0]) for data in training_data]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = word_tokenize(sentence)
    top_k_indices = bm25.get_top_n(
        tokenized_query, range(len(training_data)), n=k)
    return [training_data[i] for i in top_k_indices]

In [24]:
class EventType(Enum):
    A = "Acquisition (A)"
    CT = "Clinical Trial (CT)"
    RD = "Regular Dividend (RD)"
    DC = "Dividend Cut (DC)"
    DI = "Dividend Increase (DI)"
    GI = "Guidance Increase (GI)"
    NC = "New Contract (NC)"
    RSS = "Reverse Stock Split (RSS)"
    SD = "Special Dividend (SD)"
    SR = "Stock Repurchase (SR)"
    SS = "Stock Split (SS)"
    O = "Other/None (O)"


In [25]:
def load_few_shot_examples(filename="fewshot_examples.txt"):
    # Load few-shot examples from file
    logging.info(f"Loading few-shot examples from: {filename}")
    with open(filename, 'r') as file:
        lines = file.readlines()

    examples = []
    current_example = {"sentence": "", "event": ""}
    for line in lines:
        if line.startswith("Sentence: "):
            if current_example["sentence"]:
                examples.append(current_example)
                current_example = {"sentence": "", "event": ""}
            current_example["sentence"] = line.strip().replace("Sentence: ", "")
        elif line.startswith("Event: "):
            current_example["event"] = line.strip().replace("Event: ", "")

    if current_example["sentence"]:
        examples.append(current_example)

    return examples

In [26]:
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def sbert_selection(sentence, training_data, k):
    logging.info("Selecting few-shot examples using SBERT")
    embeddings = sbert_model.encode([sentence] + [data['sentence'][0] for data in training_data])
    similarities = cosine_similarity([embeddings[0]], embeddings[1:])[0]
    top_k_indices = similarities.argsort()[-k:][::-1]
    return [training_data[i] for i in top_k_indices]



In [27]:
def bm25_selection(self, sentence, training_data, k):
    logging.info("Selecting few-shot examples using BM25")
    tokenized_corpus = [word_tokenize(data['sentence'][0]) for data in training_data]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = word_tokenize(sentence)
    top_k_indices = bm25.get_top_n(tokenized_query, range(len(training_data)), n=k)
    return [training_data[i] for i in top_k_indices]

In [28]:
def fewshot_selection(fewshot_strategy, sentence, training_data, k=5):
    if fewshot_strategy == "random":
        return load_few_shot_examples()
    elif fewshot_strategy == "sbert":
        return sbert_selection(sentence, training_data, k)
    elif fewshot_strategy == "bm25":
        return bm25_selection(sentence, training_data, k)
    else:
        raise ValueError(f"Invalid strategy: {fewshot_strategy}")

In [29]:
def get_schema_prompt(sentence, fewshot_prompt):
    return f"""
        Extract event information from the following sentence and return the most matching event as event_type

        Event_type:
        - Acquisition (A)
        - Clinical Trial (CT)
        - Regular Dividend (RD)
        - Dividend Cut (DC)
        - Dividend Increase (DI)
        - Guidance Increase (GI)
        - New Contract (NC)
        - Reverse Stock Split (RSS)
        - Special Dividend (SD)
        - Stock Repurchase (SR)
        - Stock Split (SS)
        - Other/None (O)

        Sentence: "{sentence}"

        {fewshot_prompt}

        Output:
    """

def get_code_prompt(sentence, fewshot_prompt):
    pass

def get_explanation_prompt(sentence, fewshot_prompt):
    pass

def get_pipeline_prompt(sentence, fewshot_prompt):
    pass

In [37]:
def get_prompt(prompt_type, fewshot_strategy, sentence, training_data, few_shot_examples=[]):
    fewshot_prompt = ""
    if fewshot_strategy:
        fewshot_prompt = "Examples:\n\n".join([f"Sentence: {ex['sentence']}\nEvent: {ex['event']}" for ex in few_shot_examples])
        
    if prompt_type == "schema":
        return get_schema_prompt(sentence, fewshot_prompt)
    elif prompt_type == "code":
        return get_code_prompt(sentence, fewshot_prompt)
    elif prompt_type == "explanation":
        return get_explanation_prompt(sentence, fewshot_prompt)
    elif prompt_type == "pipeline":
        return get_pipeline_prompt(sentence, fewshot_prompt)
    else:
        raise ValueError(f"Invalid prompt type: {prompt_type}")

In [31]:
def extract_event_type(response):
    pass

### Flan T5

In [32]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load Flan-T5 model and tokenizer
model_name = "google/flan-t5-base"
flan_tokenizer = T5Tokenizer.from_pretrained(model_name)
flan_model = T5ForConditionalGeneration.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [33]:
def process_sentence(tokenizer, model, sentence, training_data, fewshot_strategy="random"):
    print(f"Processing sentence: {sentence}")
    prompt = get_prompt("schema", fewshot_strategy, sentence, training_data)
    print(f"Prompt: {prompt}")

    response = get_model_response(tokenizer, model, prompt, max_new_tokens=1024)

    if "llama" in model_name.lower():
        event_type = extract_event_type(response)
    else:
        event_type = response

    return event_type

In [34]:
dataset_path  = "/dcs/large/u5579267/EventExtraction/EDT_dataset/Event_detection/train.json"

In [35]:
def process_dataset(dataset_path, few_shot_strategy="random", samples=5):

    logging.info(f"Loading dataset from: {dataset_path}")
    with open(dataset_path, "r") as file:
        data = json.load(file)

    data = list(reversed(data))
    data = data[500:503]

    results = []
    for item in data:
        sentence = item["sentence"][0]
        extracted_events = process_sentence(flan_tokenizer, flan_model, sentence, data)
        
        results.append({"sentence": sentence, "extracted_events": extracted_events, "actual_events": item["events"]})

        torch.cuda.empty_cache()
        logging.info(f"Extracted events: {extracted_events}")

    logging.info("Processing Data Complete")
    logging.info("=" * 50)
    return results

In [38]:
process_dataset(dataset_path, few_shot_strategy="sbert")

Processing sentence: BWX Technologies Reports Strong Second Quarter 2020 Results and Increases 2020 Full-Year Earnings Guidance Grows 2Q20 EPS to $0.67 ( GAAP ) and $0.71 ( non GAAP ) vs . 2Q19 EPS of $0.62 Reports 2Q20 consolidated revenue of $505 million , up 7% vs . 2Q19 Increases 2020 full year non GAAP EPS guidance to a range of $2.80 $2.90 Increases the Nuclear Operations Group segment revenue guidance to ~10% growth LYNCHBURG , Va . ( ) BWX Technologies , Inc . ( NYSE: BWXT ) ( "BWXT" , "we" , "us" or the "Company" ) reported second quarter 2020 revenue of $505 million , a 7% increase compared with $471 million in the second quarter of 2019 . GAAP net income for the second quarter 2020 was $64.3 million , or $0.67 per diluted share , compared with GAAP net income of $58.9 million , or $0.62 per diluted share , in the prior-year period . Non-GAAP net income for the second quarter 2020 was $67.7 million , or $0.71 per diluted share , compared with non-GAAP net income of $59.1 mill

Both `max_new_tokens` (=200) and `max_length`(=1906) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Processing sentence: PVH Corp . Reports 2018 First Quarter Revenue and EPS above Guidance and Raises Full Year EPS Outlook First quarter revenue increased 16% ( increased 10% on a constant currency basis ) compared to the prior year period and exceeded guidance across all businesses First quarter EPS exceeded guidance and was: GAAP basis: $2.29 compared to guidance of $2.13 to $2.18 Non GAAP basis: $2.36 compared to guidance of $2.20 to $2.25 EPS included a positive impact of $0.20 per share related to foreign currency translation , which was in line with guidance Full year 2018 EPS outlook raised despite reduced foreign currency benefit: GAAP basis: Raised to $8.81 to $8.91 from $8.76 to $8.86 previously Non GAAP basis: Raised to $9.05 to $9.15 from $9.00 to $9.10 previously EPS outlook now includes a reduced positive impact of $0.12 per share related to foreign currency translation , compared to $0.35 previously NEW YORK ( ) PVH Corp . ( NYSE:PVH ) reported 2018 first quarter results

Both `max_new_tokens` (=200) and `max_length`(=1851) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Processing sentence: Trex Company Increases 2007 Revenue Guidance WINCHESTER , Va . ( ) Trex Company , Inc . ( NYSE: TWP ) , manufacturer of Trex decking , railing and fencing , today announced revised net sales guidance for 2007 . The Company now expects net sales to be in the range of $335 million to $345 million ( $350 million to $360 million before taking into account its product replacement reserve ) . This revised guidance compares favorably to the November 6 , 2007 net sales guidance of $315 million to $335 million . Projected sales for 2007 ( before taking into account the product replacement reserve ) represent an approximately 5% increase over the $337 million in net sales recorded for the full year 2006 . Chief Executive Officer Andrew U . Ferrari commented , Our performance in the first two months of the seasonally slow fourth quarter has been stronger than anticipated , particularly at a time when the homebuilding and remodeling markets continue to exhibit weakness . A key

Both `max_new_tokens` (=200) and `max_length`(=1843) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'sentence': 'BWX Technologies Reports Strong Second Quarter 2020 Results and Increases 2020 Full-Year Earnings Guidance Grows 2Q20 EPS to $0.67 ( GAAP ) and $0.71 ( non GAAP ) vs . 2Q19 EPS of $0.62 Reports 2Q20 consolidated revenue of $505 million , up 7% vs . 2Q19 Increases 2020 full year non GAAP EPS guidance to a range of $2.80 $2.90 Increases the Nuclear Operations Group segment revenue guidance to ~10% growth LYNCHBURG , Va . ( ) BWX Technologies , Inc . ( NYSE: BWXT ) ( "BWXT" , "we" , "us" or the "Company" ) reported second quarter 2020 revenue of $505 million , a 7% increase compared with $471 million in the second quarter of 2019 . GAAP net income for the second quarter 2020 was $64.3 million , or $0.67 per diluted share , compared with GAAP net income of $58.9 million , or $0.62 per diluted share , in the prior-year period . Non-GAAP net income for the second quarter 2020 was $67.7 million , or $0.71 per diluted share , compared with non-GAAP net income of $59.1 million , 

### ==================================

In [15]:
from enum import Enum
from sklearn.metrics import confusion_matrix
import numpy as np

In [5]:
class EventType(Enum):
    A = "Acquisition (A)"
    CT = "Clinical Trial (CT)"
    RD = "Regular Dividend (RD)"
    DC = "Dividend Cut (DC)"
    DI = "Dividend Increase (DI)"
    GI = "Guidance Increase (GI)"
    NC = "New Contract (NC)"
    RSS = "Reverse Stock Split (RSS)"
    SD = "Special Dividend (SD)"
    SR = "Stock Repurchase (SR)"
    SS = "Stock Split (SS)"
    O = "Other/None (O)"


In [16]:
def evaluate_events(results):
    logging.info("Evaluating extracted events")

    y_true = []
    y_pred = []

    event_mapping = {
        'Acquisition (A)': 'A',
        'Clinical Trial (CT)': 'CT',
        'Regular Dividend (RD)': 'RD',
        'Dividend Cut (DC)': 'DC',
        'Dividend Increase (DI)': 'DI',
        'Guidance Increase (GI)': 'GI',
        'New Contract (NC)': 'NC',
        'Reverse Stock Split (RSS)': 'RSS',
        'Special Dividend (SD)': 'SD',
        'Stock Repurchase (SR)': 'SR',
        'Stock Split (SS)': 'SS',
        'Other/None (O)': 'O'
    }

    for result in results:
        actual_events = result["actual_events"]
        extracted_event = result["extracted_events"]["event_type"].strip()

        # Normalize the extracted event type
        normalized_extracted_event = event_mapping.get(extracted_event, 'O')

        for actual in actual_events:
            # Normalize the actual event type
            normalized_actual_event = event_mapping.get(actual, 'O')
            actual_event_enum = next((e for e in EventType if e.name == normalized_actual_event), EventType.O)
            y_true.append(actual_event_enum.value)

            matched_event_enum = next((e for e in EventType if e.name == normalized_extracted_event), EventType.O)
            y_pred.append(matched_event_enum.value)

    # Ensure all event types are included in the labels, even if not present in predictions
    unique_labels = [e.value for e in EventType]

    # Calculate the confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=unique_labels)
    logging.info(f"Confusion Matrix:\n{cm}")
    print(f"Confusion Matrix:\n{cm}")

    # Calculate classification metrics from the confusion matrix
    tp = np.diag(cm)  # True Positives
    fp = np.sum(cm, axis=0) - tp  # False Positives
    fn = np.sum(cm, axis=1) - tp  # False Negatives
    tn = np.sum(cm) - (tp + fp + fn)  # True Negatives

    # Avoid division by zero
    precision = np.divide(tp, tp + fp, out=np.zeros_like(tp, dtype=float), where=(tp + fp) != 0)
    recall = np.divide(tp, tp + fn, out=np.zeros_like(tp, dtype=float), where=(tp + fn) != 0)
    f1_score = np.divide(2 * precision * recall, precision + recall, out=np.zeros_like(precision, dtype=float), where=(precision + recall) != 0)

    for i, label in enumerate(unique_labels):
        logging.info(f"Class {label} - Precision: {precision[i]:.2f}, Recall: {recall[i]:.2f}, F1-Score: {f1_score[i]:.2f}, TP: {tp[i]}, FP: {fp[i]}, FN: {fn[i]}, TN: {tn[i]}")
        print(f"Class {label} - Precision: {precision[i]:.2f}, Recall: {recall[i]:.2f}, F1-Score: {f1_score[i]:.2f}, TP: {tp[i]}, FP: {fp[i]}, FN: {fn[i]}, TN: {tn[i]}")

    logging.info("=" * 50)

    return {
        "confusion_matrix": cm,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "tp": tp,
        "fp": fp,
        "fn": fn,
        "tn": tn
    }

In [1]:
with open('fewshot_random_schema.json', 'r') as file:
    results = json.load(file)

evaluate_events(results)