In [8]:
import numpy
import pandas as pd

import re
import os
import json

In [6]:
# Location of the EDT event-detection training split.
# Set the EDT_DATASET_PATH environment variable to point the notebook at another
# copy of the data without editing this cell; the default is the original path.
edt_dataset_path = os.environ.get(
    "EDT_DATASET_PATH",
    "/dcs/large/u5579267/EventExtraction/EDT_dataset/Event_detection/train.json",
)
print(edt_dataset_path)

/dcs/large/u5579267/EventExtraction/EDT_dataset/Event_detection/train.json


In [7]:
def read_edt_dataset(path):
    """Return the text file at ``path`` as a list of raw lines.

    Line terminators are kept, exactly as the file object yields them.
    """
    with open(path, 'r') as handle:
        return list(handle)

# Read the dataset file as raw text lines (nothing is parsed as JSON here).
content = read_edt_dataset(edt_dataset_path)

# Peek at the first raw line to see how the file is laid out.
print(content[0])

[



In [9]:
# Parse the whole JSON file into Python objects
with open(edt_dataset_path, "r") as file:
    data = json.load(file)

# Flatten to one record per entry of each item's "sentence" field, paired with
# the first entry of that item's "events" field
data_list = [
    {"sentence": sentence, "events": item["events"][0]}
    for item in data
    for sentence in item["sentence"]
]

# Build the DataFrame from the flat list of records
edt_df = pd.DataFrame(data_list)

In [10]:
edt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7770 entries, 0 to 7769
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  7770 non-null   object
 1   events    7770 non-null   object
dtypes: object(2)
memory usage: 121.5+ KB


In [14]:
edt_df.events.nunique()

12

In [13]:
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /dcs/pg23/u5579267/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /dcs/pg23/u5579267/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /dcs/pg23/u5579267/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Pre-processing the text

# Shared helpers used by preprocess_text; built once here rather than on every call.
stop_words = set(stopwords.words('english'))    # set, so the per-token membership test is cheap
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise ``text`` and return it as a single space-joined string.

    Steps, in order: lowercase, delete digit runs, delete every character that
    is neither a word character nor whitespace, trim the ends, tokenise, drop
    tokens found in ``stop_words``, then stem and lemmatise each surviving token.
    """
    cleaned = re.sub(r'\d+', '', text.lower())           # lowercase, then strip numbers
    cleaned = re.sub(r'[^\w\s]', '', cleaned).strip()    # strip punctuation and outer whitespace

    normalised = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue    # stopwords are discarded before stemming
        # Stem first, then lemmatise the stem (same order as applying each pass list-wide).
        normalised.append(lemmatizer.lemmatize(ps.stem(token)))

    return ' '.join(normalised)

In [10]:
edt_df_copy = edt_df.copy()

In [11]:
# Clean from the untouched `edt_df` column rather than from the copy itself, so
# re-running this cell never preprocesses text that was already preprocessed.
# NOTE(review): the `edt_df` built earlier in this notebook has `sentence`/`events`
# columns, while this cell needs a `text` column — confirm which frame is expected.
edt_df_copy['text'] = edt_df['text'].apply(preprocess_text)

In [12]:
edt_df_copy.head(n=20)

Unnamed: 0,text,label
0,jti,O
1,report,O
2,warn,O
3,,O
4,,O
5,gather,O
6,storm,O
7,,O
8,,O
9,black,O


In [13]:
edt_df_copy['label'].value_counts()

label
O        3565917
I-NC       21276
I-CT       14291
I-SR       12807
I-A         9166
I-RD        7972
I-DI        6720
I-DC        2630
I-GC        2294
I-SS        2047
I-SD        1918
I-RSS       1499
Name: count, dtype: int64

In [14]:
# Work on a fresh copy so `edt_df_copy` keeps its original string labels.
edt_df_updated = edt_df_copy.copy()
# Integer-encode the label column; `uniques` holds the original label for each code.
labels, uniques = pd.factorize(edt_df_updated['label'])
edt_df_updated['label'] = labels

In [15]:
edt_df_updated.label.value_counts()

label
0     3565917
2       21276
1       14291
3       12807
4        9166
8        7972
5        6720
10       2630
7        2294
6        2047
9        1918
11       1499
Name: count, dtype: int64

In [16]:
edt_df_updated.head(n=20)

Unnamed: 0,text,label
0,jti,0
1,report,0
2,warn,0
3,,0
4,,0
5,gather,0
6,storm,0
7,,0
8,,0
9,black,0


## ML Classification Experiment

In [17]:
import pandas as pd

def process_data(file_path):
    """Build a sentence-level DataFrame from a token-per-line annotation file.

    Every non-blank line must hold a token followed by its label, separated by
    whitespace; blank lines separate sentences. The tokens of a sentence are
    joined with single spaces and the sentence takes the label of its first token.

    Args:
        file_path: Path of the annotation file to read.

    Returns:
        pandas.DataFrame with one row per sentence and the columns ``text``
        and ``label``.

    Raises:
        ValueError: If a non-blank line does not contain both a token and a label.
    """
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    def _flush():
        # Store the sentence collected so far (if any) and start a new one.
        if current_sentence:
            sentences.append(' '.join(current_sentence))
            labels.append(current_labels[0])
        current_sentence.clear()
        current_labels.clear()

    with open(file_path, 'r') as file:
        # Stream the file instead of materialising every line up front.
        for line_number, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                _flush()
                continue

            # The label is the last whitespace-separated field; splitting from the
            # right keeps a token that itself contains whitespace in one piece.
            parts = stripped.rsplit(maxsplit=1)
            if len(parts) != 2:
                raise ValueError(
                    f"{file_path}:{line_number}: expected '<token> <label>', got {stripped!r}"
                )
            token, label = parts
            current_sentence.append(token)
            current_labels.append(label)

    # A file that does not end with a blank line still has a pending sentence.
    _flush()

    df = pd.DataFrame({'text': sentences, 'label': labels})
    return df


df = process_data(edt_dataset_path)

In [18]:
print(df.tail())

                                                   text  label
7765  Gulf Resources , Inc . Announces 1-for-5 Rever...  I-RSS
7766  Bellicum Announces Reverse Stock Split HOUSTON...  I-RSS
7767  ContraFect Announces One-for-Ten Reverse Stock...  I-RSS
7768  TOP Ships Announces Reverse Stock Split ATHENS...  I-RSS
7769  Windtree Therapeutics Announces Reverse Stock ...  I-RSS


In [19]:
import torch
from transformers import AutoTokenizer, AutoModel

from sklearn.preprocessing import LabelEncoder


  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# Load pre-trained FinBERT model and tokenizer.
# Both are looked up by the same checkpoint name so the vocabulary matches the weights.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
# AutoModel is used because generate_embeddings reads `last_hidden_state` from its output.
model = AutoModel.from_pretrained("ProsusAI/finbert")

  return self.fget.__get__(instance, owner)()


In [21]:
def generate_embeddings(texts, batch_size=32):
    """Embed texts as the attention-masked mean of the model's last hidden states.

    The texts are encoded in mini-batches so the whole corpus never has to go
    through the model in a single forward pass. Padding positions are excluded
    from the mean, so a text's embedding does not depend on which other texts
    share its batch.

    Args:
        texts: A string or an iterable of strings.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        torch.Tensor of shape ``(number of texts, hidden size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string would otherwise be sliced character by character below.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return torch.empty((0, model.config.hidden_size))

    pooled = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            outputs = model(**tokens)
            hidden = outputs.last_hidden_state

            # Zero out padding positions and divide by the real token count only.
            mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled.append((hidden * mask).sum(dim=1) / token_counts)

    embeddings = torch.cat(pooled, dim=0)
    return embeddings

In [22]:
# Embed every text in one call, then wrap the resulting tensor in a DataFrame
# (one row per text) so it can be passed to scikit-learn.
embeddings = generate_embeddings(df['text'].tolist())
embeddings_df = pd.DataFrame(embeddings.numpy())

# Replace the string labels with integer class ids; `label_encoder` keeps the mapping.
# NOTE(review): this overwrites df['label'] in place, so re-running the cell refits
# the encoder on the integer ids instead of on the original label strings.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

: 

In [None]:
# `train_test_split` is used here without a visible import; import it locally so
# the cell runs on a fresh kernel instead of raising NameError.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(embeddings_df, df['label'], test_size=0.2, random_state=42)
