# Joint Paradigm for Document-Level Event Extraction and Geoparsing for Indonesian News Text 

## Import

In [22]:
import pandas as pd
import numpy as np
import pprint
import nltk

pp = pprint.PrettyPrinter(indent=2)

## Data Pre-Processing

In [23]:
# Specify the path to the text file
file_path = "event-geoparsing-corpus.txt"

# Open the file
file = open(file_path, "r")

# Read the contents of the file
file_contents = file.read()

# Close the file
file.close()

# Print the contents of the file
pp.pprint(file_contents)

('Kerabat/NNP/O/O\n'
 'Layat/NNP/O/O\n'
 'Jasad/NNP/O/O\n'
 'Jerry/NNP/O/O\n'
 'Wong/NNP/O/O\n'
 'Pengemudi/NNP/O/O\n'
 'Mini/NNP/B-ARG/Vehicle-Arg\n'
 'Cooper/NNP/I-ARG/Vehicle-Arg\n'
 'di/IN/O/O\n'
 'RSCM/NN/B-ORG/Hospital-Arg\n'
 '-/-/O/O\n'
 '-/NNP/O/O\n'
 'Jakarta/NNP/B-PLOC/Published-Arg/(-6.197602429787846, 106.83139222722116)/1\n'
 '-/-/O/O\n'
 'Jenazah/NNP/O/O\n'
 'Jerry/NNP/O/O\n'
 'Wong/NNP/O/O\n'
 ',/,/O/O\n'
 'pengemudi/NN/O/O\n'
 'Mini/NNP/B-ARG/Vehicle-Arg\n'
 'Cooper/NNP/I-ARG/Vehicle-Arg\n'
 'yang/SC/O/O\n'
 'menabrak/VBT/B-EVE/ACCIDENT-EVENT\n'
 'bagian/O/O\n'
 'belakang/NN/O/O\n'
 'truk/NN/B-ARG/Vehicle-Arg\n'
 ',/,/O/O\n'
 'kini/RB/O/O\n'
 'berada/VBI/O/O\n'
 'di/IN/O/O\n'
 'RSCM/NNP/B-ORG/Hospital-Arg\n'
 '\n'
 'Sekitar/CDI/O/O\n'
 '15/CDP/O/O\n'
 'kerabat/NN/O/O\n'
 'dan/CC/O/O\n'
 'teman/NN/O/O\n'
 'datang/VBI/O/O\n'
 'melayat/VBT/O/O\n'
 '\n'
 'Pantauan/IN/O/O\n'
 'detikcom/NN/B-ORG/Reporter-Arg\n'
 'di/IN/O/O\n'
 'kamar/NN/O/O\n'
 'jenazah/NN/O/O\n'
 'RSCM/NNP/

Replace "PLOC" with "LOC" because "PLOC" indicates pseudo-location for the data owner's methodology.
Pseudo location is used to mark location which is not the actual location.

In [24]:
replaced_ploc = file_contents.replace("PLOC", "LOC")

In [25]:
type(file_contents)

str

Each document is splitted by "===" in the text file, so in order to get the documents in an array we need to split them.

In [26]:
documents = file_contents.split("===")
type(documents)

list

In [27]:
documents

['Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\nJerry/NNP/O/O\nWong/NNP/O/O\nPengemudi/NNP/O/O\nMini/NNP/B-ARG/Vehicle-Arg\nCooper/NNP/I-ARG/Vehicle-Arg\ndi/IN/O/O\nRSCM/NN/B-ORG/Hospital-Arg\n-/-/O/O\n-/NNP/O/O\nJakarta/NNP/B-PLOC/Published-Arg/(-6.197602429787846, 106.83139222722116)/1\n-/-/O/O\nJenazah/NNP/O/O\nJerry/NNP/O/O\nWong/NNP/O/O\n,/,/O/O\npengemudi/NN/O/O\nMini/NNP/B-ARG/Vehicle-Arg\nCooper/NNP/I-ARG/Vehicle-Arg\nyang/SC/O/O\nmenabrak/VBT/B-EVE/ACCIDENT-EVENT\nbagian/O/O\nbelakang/NN/O/O\ntruk/NN/B-ARG/Vehicle-Arg\n,/,/O/O\nkini/RB/O/O\nberada/VBI/O/O\ndi/IN/O/O\nRSCM/NNP/B-ORG/Hospital-Arg\n\nSekitar/CDI/O/O\n15/CDP/O/O\nkerabat/NN/O/O\ndan/CC/O/O\nteman/NN/O/O\ndatang/VBI/O/O\nmelayat/VBT/O/O\n\nPantauan/IN/O/O\ndetikcom/NN/B-ORG/Reporter-Arg\ndi/IN/O/O\nkamar/NN/O/O\njenazah/NN/O/O\nRSCM/NNP/B-ORG/Hospital-Arg\n,/,/O/O\nJalan/NN/B-ARG/Street-Arg\nSalemba/NNP/I-ARG/Street-Arg\n,/,/O/O\nJakarta/NNP/B-PLOC/Place-Arg/(-6.1780125136691195, 106.83773321392438)/2\nPusat/NNP/I

### 2.1 Data Exploration

In [28]:
df = pd.DataFrame(documents, columns=['Document'])
df.head()

Unnamed: 0,Document
0,Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\...
1,\n1/CDP/B-ARG/DeathVictim-Arg\nOrang/NN/I-ARG/...
2,\nAnalisis/NNP/O/O\nBMKG/NNP/B-ORG/Reporter-Ar...
3,\n3/CDP/B-ARG/DeathVictim-Arg\nTewas/JJ/I-ARG/...
4,\nTabrak/NN/B-EVE/ACCIDENT-EVENT\nPembatas/NN/...


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Document  84 non-null     object
dtypes: object(1)
memory usage: 804.0+ bytes


In [30]:
first_row = df.iloc[0]
print(first_row.values)

['Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\nJerry/NNP/O/O\nWong/NNP/O/O\nPengemudi/NNP/O/O\nMini/NNP/B-ARG/Vehicle-Arg\nCooper/NNP/I-ARG/Vehicle-Arg\ndi/IN/O/O\nRSCM/NN/B-ORG/Hospital-Arg\n-/-/O/O\n-/NNP/O/O\nJakarta/NNP/B-PLOC/Published-Arg/(-6.197602429787846, 106.83139222722116)/1\n-/-/O/O\nJenazah/NNP/O/O\nJerry/NNP/O/O\nWong/NNP/O/O\n,/,/O/O\npengemudi/NN/O/O\nMini/NNP/B-ARG/Vehicle-Arg\nCooper/NNP/I-ARG/Vehicle-Arg\nyang/SC/O/O\nmenabrak/VBT/B-EVE/ACCIDENT-EVENT\nbagian/O/O\nbelakang/NN/O/O\ntruk/NN/B-ARG/Vehicle-Arg\n,/,/O/O\nkini/RB/O/O\nberada/VBI/O/O\ndi/IN/O/O\nRSCM/NNP/B-ORG/Hospital-Arg\n\nSekitar/CDI/O/O\n15/CDP/O/O\nkerabat/NN/O/O\ndan/CC/O/O\nteman/NN/O/O\ndatang/VBI/O/O\nmelayat/VBT/O/O\n\nPantauan/IN/O/O\ndetikcom/NN/B-ORG/Reporter-Arg\ndi/IN/O/O\nkamar/NN/O/O\njenazah/NN/O/O\nRSCM/NNP/B-ORG/Hospital-Arg\n,/,/O/O\nJalan/NN/B-ARG/Street-Arg\nSalemba/NNP/I-ARG/Street-Arg\n,/,/O/O\nJakarta/NNP/B-PLOC/Place-Arg/(-6.1780125136691195, 106.83773321392438)/2\nPusat/NNP/I

In [31]:
df['Accident_Event'] = ''
df['Fire_Event'] = ''
df['Flood_Event'] = ''
df['Earthquake_Event'] = ''
df.head()

Unnamed: 0,Document,Accident_Event,Fire_Event,Flood_Event,Earthquake_Event
0,Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\...,,,,
1,\n1/CDP/B-ARG/DeathVictim-Arg\nOrang/NN/I-ARG/...,,,,
2,\nAnalisis/NNP/O/O\nBMKG/NNP/B-ORG/Reporter-Ar...,,,,
3,\n3/CDP/B-ARG/DeathVictim-Arg\nTewas/JJ/I-ARG/...,,,,
4,\nTabrak/NN/B-EVE/ACCIDENT-EVENT\nPembatas/NN/...,,,,


In [32]:
def check_event_types(row):
    accident_key = "ACCIDENT-EVENT"
    fire_key = "FIRE-EVENT"
    flood_key = "FLOOD-EVENT"
    earthquake_key = "QUAKE-EVENT"
    document = row['Document']
    if accident_key in document:
        row['Accident_Event'] = True
    if fire_key in document:
        row['Fire_Event'] = True
    if flood_key in document:
        row['Flood_Event'] = True
    if earthquake_key in document:
        row['Earthquake_Event'] = True
    return row

In [33]:
df.apply(check_event_types, axis=1)

Unnamed: 0,Document,Accident_Event,Fire_Event,Flood_Event,Earthquake_Event
0,Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\...,True,,,
1,\n1/CDP/B-ARG/DeathVictim-Arg\nOrang/NN/I-ARG/...,,True,,
2,\nAnalisis/NNP/O/O\nBMKG/NNP/B-ORG/Reporter-Ar...,,,,True
3,\n3/CDP/B-ARG/DeathVictim-Arg\nTewas/JJ/I-ARG/...,,,,True
4,\nTabrak/NN/B-EVE/ACCIDENT-EVENT\nPembatas/NN/...,True,,,
...,...,...,...,...,...
79,\nGempa/NNP/B-EVE/QUAKE-EVENT\nMagnitudo/NNP/B...,,,,True
80,"\nHujan/NN/O/O\nDeras/NNP/O/O\n,/,/O/O\nKawasa...",,,True,
81,\nBanjir/NN/B-EVE/FLOOD-EVENT\n3/CDP/B-ARG/Hei...,,,True,
82,\nBus/NN/O/O\nAngkut/NN/O/O\nRombongan/NN/O/O\...,True,,,


In [34]:
df = df.fillna(False)
df

Unnamed: 0,Document,Accident_Event,Fire_Event,Flood_Event,Earthquake_Event
0,Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\...,,,,
1,\n1/CDP/B-ARG/DeathVictim-Arg\nOrang/NN/I-ARG/...,,,,
2,\nAnalisis/NNP/O/O\nBMKG/NNP/B-ORG/Reporter-Ar...,,,,
3,\n3/CDP/B-ARG/DeathVictim-Arg\nTewas/JJ/I-ARG/...,,,,
4,\nTabrak/NN/B-EVE/ACCIDENT-EVENT\nPembatas/NN/...,,,,
...,...,...,...,...,...
79,\nGempa/NNP/B-EVE/QUAKE-EVENT\nMagnitudo/NNP/B...,,,,
80,"\nHujan/NN/O/O\nDeras/NNP/O/O\n,/,/O/O\nKawasa...",,,,
81,\nBanjir/NN/B-EVE/FLOOD-EVENT\n3/CDP/B-ARG/Hei...,,,,
82,\nBus/NN/O/O\nAngkut/NN/O/O\nRombongan/NN/O/O\...,,,,


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Document          84 non-null     object
 1   Accident_Event    84 non-null     object
 2   Fire_Event        84 non-null     object
 3   Flood_Event       84 non-null     object
 4   Earthquake_Event  84 non-null     object
dtypes: object(5)
memory usage: 3.4+ KB


In [36]:
true_counts = df.apply(lambda x: x.astype(bool).sum())
print(true_counts)

Document            83
Accident_Event       0
Fire_Event           0
Flood_Event          0
Earthquake_Event     0
dtype: int64


# Embedding

In [37]:
# NLTK
from nltk.tokenize import sent_tokenize, word_tokenize

# Warnings
import warnings
warnings.filterwarnings(action='ignore')

# Word2Vec
import gensim
from gensim.models import Word2Vec

# JSON
import json

# OS
import os

In [38]:
# Function to get text data from JSON
def extract_text_from_json(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
        content = data['content']
        # Pre-process data 
    return content

# Function to traverse `newspapers` folder & extract each JSON files
def extract_text_from_json_files(root_dir):
    text_data = []
    for subdir, dirs, files in os.walk(root_dir):
        for file in files:
            file_path = os.path.join(subdir, file)
            if file_path.endswith('.json'):
                text_data.append(extract_text_from_json(file_path))
    return text_data

In [39]:
# Indonesian News Dataset 2018 as Corpus 
# https://huggingface.co/datasets/indonesian-nlp/id_newspapers_2018/tree/main

root_directory = '/Users/azurast/Downloads/newspapers/json'
text_data = extract_text_from_json_files(root_directory)

In [40]:
text_data

[]

In [41]:
# Tokenize the text data
tokenized_data = [gensim.utils.simple_preprocess(text) for text in text_data]

In [42]:
# Train the Word2Vec model
# model = Word2Vec(tokenized_data, vector_size=100, window=5, min_count=5, workers=4)

In [43]:
# Save the trained model
#model.save('indonesian_newspapers_word2vec.model')

In [44]:
# 1. Let's see the word embedding for "kebakaran" by accessing the "wv" attribute and passing in "kebakaran" as the key.
model = Word2Vec.load('my_model_indonesian_newspapers_word2vec.model')
print("Embedding for kebakaran:", model.wv["kebakaran"])

Embedding for kebakaran: [-0.11402367  0.37845838 -0.10772562 -0.13374746  0.04284624 -0.4803884
  0.24623325  0.70485073 -0.15914932 -0.22895694 -0.02735815 -0.49519628
 -0.17123197  0.0778406   0.12902541 -0.1309303   0.14402567 -0.3005387
 -0.10067452 -0.6012565   0.07891174  0.18998279  0.39396274 -0.00951392
 -0.16941355  0.05007969 -0.3464864  -0.15260598 -0.3600642   0.0784976
  0.477373    0.03955512  0.24122703 -0.2837372  -0.3004585   0.42481065
  0.12425724 -0.12694457 -0.07098845 -0.41624826 -0.03917486 -0.20446767
 -0.11558851  0.10804968  0.451089    0.02398587 -0.33546603 -0.07152762
  0.13858435  0.21837054  0.14388753 -0.27416983 -0.13308528 -0.27555966
 -0.3208372   0.02202458  0.20332256 -0.05704811 -0.36412022  0.10308675
  0.02870579  0.10630788  0.06790478 -0.1164986  -0.56474173  0.1377047
  0.16009648  0.2849244  -0.5242075   0.25276956 -0.24397208  0.09804486
  0.20336673 -0.14771955  0.271215    0.11228774 -0.00281903 -0.10425534
 -0.3532008   0.09768607 -0.18

In [45]:
# 2. Inspect the model vocabulary by accessing keys of the "wv.index_to_key" attribute. We'll print the first 20 words.
print("Vocabulary length:", len(model.wv))
print("Check first 20 vocab:", model.wv.index_to_key[:20])

Vocabulary length: 578
Check first 20 vocab: [',', 'di', '-', 'yang', '"', 'dan', '(', ')', 'Jakarta', 'ini', 'dari', 'rumah', 'ke', 'ada', 'itu', 'terjadi', 'air', 'warga', 'banjir', 'korban']


## Train Test Split

In [46]:
df

Unnamed: 0,Document,Accident_Event,Fire_Event,Flood_Event,Earthquake_Event
0,Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\...,,,,
1,\n1/CDP/B-ARG/DeathVictim-Arg\nOrang/NN/I-ARG/...,,,,
2,\nAnalisis/NNP/O/O\nBMKG/NNP/B-ORG/Reporter-Ar...,,,,
3,\n3/CDP/B-ARG/DeathVictim-Arg\nTewas/JJ/I-ARG/...,,,,
4,\nTabrak/NN/B-EVE/ACCIDENT-EVENT\nPembatas/NN/...,,,,
...,...,...,...,...,...
79,\nGempa/NNP/B-EVE/QUAKE-EVENT\nMagnitudo/NNP/B...,,,,
80,"\nHujan/NN/O/O\nDeras/NNP/O/O\n,/,/O/O\nKawasa...",,,,
81,\nBanjir/NN/B-EVE/FLOOD-EVENT\n3/CDP/B-ARG/Hei...,,,,
82,\nBus/NN/O/O\nAngkut/NN/O/O\nRombongan/NN/O/O\...,,,,


In [52]:
df['Event_Type'] = ''
df

Unnamed: 0,Document,Accident_Event,Fire_Event,Flood_Event,Earthquake_Event,EventType,Event_Type
0,Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\...,,,,,,
1,\n1/CDP/B-ARG/DeathVictim-Arg\nOrang/NN/I-ARG/...,,,,,,
2,\nAnalisis/NNP/O/O\nBMKG/NNP/B-ORG/Reporter-Ar...,,,,,,
3,\n3/CDP/B-ARG/DeathVictim-Arg\nTewas/JJ/I-ARG/...,,,,,,
4,\nTabrak/NN/B-EVE/ACCIDENT-EVENT\nPembatas/NN/...,,,,,,
...,...,...,...,...,...,...,...
79,\nGempa/NNP/B-EVE/QUAKE-EVENT\nMagnitudo/NNP/B...,,,,,,
80,"\nHujan/NN/O/O\nDeras/NNP/O/O\n,/,/O/O\nKawasa...",,,,,,
81,\nBanjir/NN/B-EVE/FLOOD-EVENT\n3/CDP/B-ARG/Hei...,,,,,,
82,\nBus/NN/O/O\nAngkut/NN/O/O\nRombongan/NN/O/O\...,,,,,,


In [53]:
def fill_event_type(row):
    if row['Accident_Event'] == True:
        row['Event_Type'] = "Accident"
    if row['Fire_Event'] == True:
        row['Event_Type'] = "Fire"
    if row['Flood_Event'] == True:
        row['Event_Type'] = "Flood"
    if row['Earthquake_Event'] == True:
        row['Event_Type'] = "Earthquake"
    return row

In [54]:
df = df.apply(fill_event_type, axis=1)

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Document          84 non-null     object
 1   Accident_Event    84 non-null     object
 2   Fire_Event        84 non-null     object
 3   Flood_Event       84 non-null     object
 4   Earthquake_Event  84 non-null     object
 5   EventType         84 non-null     object
 6   Event_Type        84 non-null     object
dtypes: object(7)
memory usage: 4.7+ KB


In [56]:
df[df['Event_Type'].isna()]

Unnamed: 0,Document,Accident_Event,Fire_Event,Flood_Event,Earthquake_Event,EventType,Event_Type


In [57]:
df = df.fillna("")

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Document          84 non-null     object
 1   Accident_Event    84 non-null     object
 2   Fire_Event        84 non-null     object
 3   Flood_Event       84 non-null     object
 4   Earthquake_Event  84 non-null     object
 5   EventType         84 non-null     object
 6   Event_Type        84 non-null     object
dtypes: object(7)
memory usage: 4.7+ KB


In [59]:
pos_document_list = df['Document'].tolist()
tokenized_data = []
tokenized_documents = []

for document in pos_document_list:
    lines = document.split('\n')
    document_tokens = []
    for line in lines:
        print("line", line)
        # Skip empty line
        if line.strip() == '':
            continue
        # Split each line by the delimiter    
        parts = line.strip().split("/")
        word = parts[0]
        document_tokens.append(word)
    tokenized_data.append(document_tokens)

print(tokenized_data)

line Kerabat/NNP/O/O
line Layat/NNP/O/O
line Jasad/NNP/O/O
line Jerry/NNP/O/O
line Wong/NNP/O/O
line Pengemudi/NNP/O/O
line Mini/NNP/B-ARG/Vehicle-Arg
line Cooper/NNP/I-ARG/Vehicle-Arg
line di/IN/O/O
line RSCM/NN/B-ORG/Hospital-Arg
line -/-/O/O
line -/NNP/O/O
line Jakarta/NNP/B-PLOC/Published-Arg/(-6.197602429787846, 106.83139222722116)/1
line -/-/O/O
line Jenazah/NNP/O/O
line Jerry/NNP/O/O
line Wong/NNP/O/O
line ,/,/O/O
line pengemudi/NN/O/O
line Mini/NNP/B-ARG/Vehicle-Arg
line Cooper/NNP/I-ARG/Vehicle-Arg
line yang/SC/O/O
line menabrak/VBT/B-EVE/ACCIDENT-EVENT
line bagian/O/O
line belakang/NN/O/O
line truk/NN/B-ARG/Vehicle-Arg
line ,/,/O/O
line kini/RB/O/O
line berada/VBI/O/O
line di/IN/O/O
line RSCM/NNP/B-ORG/Hospital-Arg
line 
line Sekitar/CDI/O/O
line 15/CDP/O/O
line kerabat/NN/O/O
line dan/CC/O/O
line teman/NN/O/O
line datang/VBI/O/O
line melayat/VBT/O/O
line 
line Pantauan/IN/O/O
line detikcom/NN/B-ORG/Reporter-Arg
line di/IN/O/O
line kamar/NN/O/O
line jenazah/NN/O/O
line RSCM/N

In [60]:
# Train the Word2Vec model
my_model = Word2Vec(tokenized_data, vector_size=100, window=5, min_count=5, workers=4)

In [None]:
# Save the trained model
# my_model.save('my_model_indonesian_newspapers_word2vec.model')

In [61]:
document_vectors = []

for tokens in tokenized_data:
    # If the document has no tokens (e.g., after preprocessing), assign a default vector
    if not tokens:
        document_vectors.append(np.zeros(my_model.vector_size))
    else:
        # Calculate the document vector by averaging the word vectors
        document_vector = np.mean([my_model.wv[word] for word in tokens if word in my_model.wv.index_to_key], axis=0)
        document_vectors.append(document_vector)

In [62]:
y_vectors = []
for label in df['Event_Type']:
    label_vector = np.mean([my_model.wv[word] for word in label if word in my_model.wv.index_to_key], axis=0)
    y_vectors.append(label_vector)

In [63]:
document_vectors[0]

array([-0.12387558,  0.3450927 , -0.0721269 , -0.05232852,  0.01720145,
       -0.4560713 ,  0.20645854,  0.60567826, -0.22441961, -0.17900987,
       -0.02367817, -0.47163802, -0.11995075,  0.05255514,  0.14703786,
       -0.15496871,  0.09423275, -0.24966626, -0.13772894, -0.49646738,
        0.05740146,  0.163016  ,  0.3809803 , -0.05315839, -0.12483926,
        0.0769779 , -0.34598184, -0.18044128, -0.3089661 ,  0.0409544 ,
        0.40049723,  0.048063  ,  0.22175628, -0.18704057, -0.23429811,
        0.43017083,  0.1358005 , -0.11894071, -0.14714123, -0.3522848 ,
       -0.03139261, -0.2230216 , -0.12332384,  0.10893395,  0.37661967,
       -0.04837448, -0.28859988, -0.11931238,  0.12882304,  0.16780868,
        0.15381701, -0.26329908, -0.06231876, -0.25818405, -0.27213198,
        0.06029445,  0.15896788, -0.11193493, -0.36640272,  0.09775911,
        0.01228678,  0.07225887,  0.05842808, -0.10962836, -0.43739718,
        0.14187858,  0.19295003,  0.26495758, -0.5382732 ,  0.24

In [64]:
y_vectors[3]

nan

In [66]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder

tfidf_vectorizer = TfidfVectorizer()

X_train,X_test,y_train,y_test = train_test_split(document_vectors, df['Event_Type'], train_size=0.8, random_state=42)


In [67]:
# Min max scaling
min_value = np.min(X_train, axis=0)
max_value = np.max(X_train, axis=0)
scaled_X_train = (X_train - min_value) / (max_value - min_value)
scaled_X_test = (X_test - min_value) / (max_value - min_value)

In [68]:
label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

In [69]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(scaled_X_train, y_train_encoded)

In [70]:
y_pred = nb.predict(scaled_X_test)

In [71]:
from sklearn.metrics import confusion_matrix, classification_report
confusion_matrix(y_test_encoded, y_pred)

array([[17]])

In [72]:
print(classification_report(y_pred, y_test_encoded))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17

    accuracy                           1.00        17
   macro avg       1.00      1.00      1.00        17
weighted avg       1.00      1.00      1.00        17



# Joint Paradigm
**Phase 1: Trigger & Argument Identification**

**Phase 2: Trigger & Argument Classification**

In [73]:
df_copy = df

In [74]:
df_copy = df_copy.drop(['Accident_Event', 'Fire_Event',	'Flood_Event', 'Earthquake_Event'], axis=1, inplace=False)

In [75]:
df_copy = df_copy.drop(['EventType'], axis=1, inplace=False)

In [76]:
df_copy

Unnamed: 0,Document,Event_Type
0,Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\...,
1,\n1/CDP/B-ARG/DeathVictim-Arg\nOrang/NN/I-ARG/...,
2,\nAnalisis/NNP/O/O\nBMKG/NNP/B-ORG/Reporter-Ar...,
3,\n3/CDP/B-ARG/DeathVictim-Arg\nTewas/JJ/I-ARG/...,
4,\nTabrak/NN/B-EVE/ACCIDENT-EVENT\nPembatas/NN/...,
...,...,...
79,\nGempa/NNP/B-EVE/QUAKE-EVENT\nMagnitudo/NNP/B...,
80,"\nHujan/NN/O/O\nDeras/NNP/O/O\n,/,/O/O\nKawasa...",
81,\nBanjir/NN/B-EVE/FLOOD-EVENT\n3/CDP/B-ARG/Hei...,
82,\nBus/NN/O/O\nAngkut/NN/O/O\nRombongan/NN/O/O\...,


In [77]:
df_copy['Document_ID'] = ''
df_copy

Unnamed: 0,Document,Event_Type,Document_ID
0,Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\...,,
1,\n1/CDP/B-ARG/DeathVictim-Arg\nOrang/NN/I-ARG/...,,
2,\nAnalisis/NNP/O/O\nBMKG/NNP/B-ORG/Reporter-Ar...,,
3,\n3/CDP/B-ARG/DeathVictim-Arg\nTewas/JJ/I-ARG/...,,
4,\nTabrak/NN/B-EVE/ACCIDENT-EVENT\nPembatas/NN/...,,
...,...,...,...
79,\nGempa/NNP/B-EVE/QUAKE-EVENT\nMagnitudo/NNP/B...,,
80,"\nHujan/NN/O/O\nDeras/NNP/O/O\n,/,/O/O\nKawasa...",,
81,\nBanjir/NN/B-EVE/FLOOD-EVENT\n3/CDP/B-ARG/Hei...,,
82,\nBus/NN/O/O\nAngkut/NN/O/O\nRombongan/NN/O/O\...,,


In [78]:
df_copy = df_copy[['Document_ID', 'Document', 'Event_Type']]

In [79]:
df_copy

Unnamed: 0,Document_ID,Document,Event_Type
0,,Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\...,
1,,\n1/CDP/B-ARG/DeathVictim-Arg\nOrang/NN/I-ARG/...,
2,,\nAnalisis/NNP/O/O\nBMKG/NNP/B-ORG/Reporter-Ar...,
3,,\n3/CDP/B-ARG/DeathVictim-Arg\nTewas/JJ/I-ARG/...,
4,,\nTabrak/NN/B-EVE/ACCIDENT-EVENT\nPembatas/NN/...,
...,...,...,...
79,,\nGempa/NNP/B-EVE/QUAKE-EVENT\nMagnitudo/NNP/B...,
80,,"\nHujan/NN/O/O\nDeras/NNP/O/O\n,/,/O/O\nKawasa...",
81,,\nBanjir/NN/B-EVE/FLOOD-EVENT\n3/CDP/B-ARG/Hei...,
82,,\nBus/NN/O/O\nAngkut/NN/O/O\nRombongan/NN/O/O\...,


In [80]:
class MyToken:
    def __init__(self, doc_id, word=None, pos_tag=None, event=None, argument=None, latlong=None, admin_level=None):
        self.doc_id = doc_id
        self.word = word if word is not None else ""
        self.pos_tag = pos_tag if pos_tag is not None else ""
        self.event = event if event is not None else ""
        self.argument = argument if argument is not None else ""
        self.latlong = latlong if latlong is not None else ""
        self.admin_level = admin_level if admin_level is not None else ""

In [92]:
import re

my_docs = df_copy['Document'].tolist()

my_docss = []

for index, document in enumerate(my_docs):
    lines = document.split('\n')
    document_tokens = []
    for line in lines:
        # Skip empty line
        if line.strip() == '':
            continue
        # Split each line by the delimiter    

        pattern1 = r"(\d{1,2}/\d{1,2}/\d{4})/(.*)"
        result1 = re.findall(pattern1, line)

        if result1: 
            first_part = result1[0][0]
            other_parts = result1[0][1].split('/')
            parts = [first_part] + other_parts
        else:
            parts = line.strip().split("/")
        
        # word = parts[0]
        # pos_tag = parts[1]
        # event = parts[2]
        # argument = parts[3]
        # latlong = parts[4]
        # admin_level = parts[5]
        if len(parts) == 6:
            token = MyToken(index, parts[0], parts[1], parts[2], parts[3], parts[4], parts[5])
        elif len(parts) == 5:
            token = MyToken(index, parts[0], parts[1], parts[2], parts[3], parts[4])
        elif len(parts) == 4:
            token = MyToken(index, parts[0], parts[1], parts[2], parts[3])
        elif len(parts) == 3:
            token = MyToken(index, parts[0], parts[1], parts[2])
        elif len(parts) == 2:
            token = MyToken(index, parts[0], parts[1])
        elif len(parts) == 1:
            token = MyToken(index, parts[0])
        else:
            token = MyToken(index)
        document_tokens.append(token)
    my_docss.append(document_tokens)

In [93]:
# ada yang salah nih harusnya 84 semua
print(len(my_docss))
# print(len(my_docss[0]))
# print(len(my_docss[1]))
# print(len(my_docss[2]))
# print(len(my_docss[3]))
# print(len(my_docss[4]))
# print(len(my_docss[5]))

84


In [95]:
documentids = []
words = []
pos_tags = []
events = []
arguments = []
coordinates = []
admin_levels = []

for document in my_docss:
    for token in document:
        documentids.append(token.doc_id)
        words.append(token.word)
        pos_tags.append(token.pos_tag)
        events.append(token.event)
        arguments.append(token.argument)
        coordinates.append(token.latlong)
        admin_levels.append(token.admin_level)

In [96]:
print(len(documentids))
print(len(words))
print(len(pos_tags))
print(len(events))
print(len(arguments))
print(len(coordinates))
print(len(admin_levels))

15438
15438
15438
15438
15438
15438
15438


In [97]:
new_df = pd.DataFrame({'doc_id': documentids, 'word': words, 'pos_tag': pos_tags, 'event': events, 'argument': arguments, 'coordinate': coordinates, 'admin_level': admin_levels})

In [98]:
from IPython.display import display

pd.set_option("display.max_rows", 20)
display(new_df)

Unnamed: 0,doc_id,word,pos_tag,event,argument,coordinate,admin_level
0,0,Kerabat,NNP,O,O,,
1,0,Layat,NNP,O,O,,
2,0,Jasad,NNP,O,O,,
3,0,Jerry,NNP,O,O,,
4,0,Wong,NNP,O,O,,
...,...,...,...,...,...,...,...
15433,82,Kecamatan,NNP,B-LOC,Place-Arg,"(-8.120459667921391, 112.3846910222594)",3
15434,82,Kesamben,NNP,I-LOC,Place-Arg,,
15435,82,",",",",O,O,,
15436,82,Kabupaten,NNP,B-PLOC,Place-Arg,"(-8.116787994159438, 112.21988109991273)",2


In [101]:
new_df.to_csv('dataframe/transformed.csv')

In [99]:
def get_unique_values_overview(column):
    column_unique_count = column.nunique()
    column_unique_values = column.unique()
    column_unique_value_counts = column.value_counts()

    print("==================" + column.name + "=================\n")
    
    print("Number of unique values in column:")
    pp.pprint(column_unique_count)

    # print("Unique values in column:")
    # pp.pprint(column_unique_values)

    pd.set_option('display.max_rows', None)
    print("Each unnique values count:")
    pp.pprint(column_unique_value_counts)

    print("==========================================\n")

In [100]:
# Try creating train test split with still 80/20 split
# But, make sure the percentage of the event trigger & argument role is also proportional
# Count the number of -EVENT and Arguments

get_unique_values_overview(new_df['event'])
get_unique_values_overview(new_df['argument'])

# based on the values here masih ada salah parsing karena masih ada arguments location di arguments & argument roles di event


Number of unique values in column:
20
Each unnique values count:
event
O         11073
I-ARG      1048
B-ARG       950
B-EVE       596
B-PLOC      446
B-LOC       322
B-ORG       288
I-ORG       285
I-PLOC      184
I-LOC       129
I-EVE       103
DD            3
8             2
              2
5             2
4             1
3             1
9             1
6             1
1             1
Name: count, dtype: int64


Number of unique values in column:
62
Each unnique values count:
argument
O                             11046
Place-Arg                      1043
Time-Arg                        410
OfficerOfficial-Arg             395
FLOOD-EVENT                     195
DeathVictim-Arg                 195
FIRE-EVENT                      178
Height-Arg                      172
Street-Arg                      157
                                146
ACCIDENT-EVENT                  129
QUAKE-EVENT                     125
Vehicle-Arg                     119
Strength-Arg                     94
Ce

In [114]:
# load csv
mydf = pd.read_csv('dataframe/transformed.csv')

In [115]:
get_unique_values_overview(mydf['event'])
get_unique_values_overview(mydf['argument'])


Number of unique values in column:
11
Each unnique values count:
event
O         11072
I-ARG      1048
B-ARG       962
B-EVE       597
B-PLOC      446
B-LOC       322
B-ORG       288
I-ORG       285
I-PLOC      184
I-LOC       129
I-EVE       103
Name: count, dtype: int64


Number of unique values in column:
58
Each unnique values count:
argument
O                             11046
Place-Arg                      1043
Time-Arg                        422
OfficerOfficial-Arg             395
FLOOD-EVENT                     195
DeathVictim-Arg                 195
FIRE-EVENT                      178
Height-Arg                      172
Street-Arg                      157
ACCIDENT-EVENT                  129
QUAKE-EVENT                     125
Vehicle-Arg                     119
Strength-Arg                     94
Central-Arg                      89
Facility-Arg                     81
Published-Arg                    79
Plate-Arg                        59
Cause-Arg                        50
Ho