# Joint Paradigm for Document-Level Event Extraction and Geoparsing for Indonesian News Text 

## 1. Import

In [1]:
import pandas as pd
import numpy as np
import pprint
import nltk

pp = pprint.PrettyPrinter(indent=2)

## 2. Data Pre-Processing

In [2]:
# Specify the path to the text file
file_path = "event-geoparsing-corpus.txt"

# Open the file
file = open(file_path, "r")

# Read the contents of the file
file_contents = file.read()

# Close the file
file.close()

# Print the contents of the file
pp.pprint(file_contents)

('Kerabat/NNP/O/O\n'
 'Layat/NNP/O/O\n'
 'Jasad/NNP/O/O\n'
 'Jerry/NNP/O/O\n'
 'Wong/NNP/O/O\n'
 'Pengemudi/NNP/O/O\n'
 'Mini/NNP/B-ARG/Vehicle-Arg\n'
 'Cooper/NNP/I-ARG/Vehicle-Arg\n'
 'di/IN/O/O\n'
 'RSCM/NN/B-ORG/Hospital-Arg\n'
 '-/-/O/O\n'
 '-/NNP/O/O\n'
 'Jakarta/NNP/B-PLOC/Published-Arg/(-6.197602429787846, 106.83139222722116)/1\n'
 '-/-/O/O\n'
 'Jenazah/NNP/O/O\n'
 'Jerry/NNP/O/O\n'
 'Wong/NNP/O/O\n'
 ',/,/O/O\n'
 'pengemudi/NN/O/O\n'
 'Mini/NNP/B-ARG/Vehicle-Arg\n'
 'Cooper/NNP/I-ARG/Vehicle-Arg\n'
 'yang/SC/O/O\n'
 'menabrak/VBT/B-EVE/ACCIDENT-EVENT\n'
 'bagian/O/O\n'
 'belakang/NN/O/O\n'
 'truk/NN/B-ARG/Vehicle-Arg\n'
 ',/,/O/O\n'
 'kini/RB/O/O\n'
 'berada/VBI/O/O\n'
 'di/IN/O/O\n'
 'RSCM/NNP/B-ORG/Hospital-Arg\n'
 '\n'
 'Sekitar/CDI/O/O\n'
 '15/CDP/O/O\n'
 'kerabat/NN/O/O\n'
 'dan/CC/O/O\n'
 'teman/NN/O/O\n'
 'datang/VBI/O/O\n'
 'melayat/VBT/O/O\n'
 '\n'
 'Pantauan/IN/O/O\n'
 'detikcom/NN/B-ORG/Reporter-Arg\n'
 'di/IN/O/O\n'
 'kamar/NN/O/O\n'
 'jenazah/NN/O/O\n'
 'RSCM/NNP/

Replace "PLOC" with "LOC" because "PLOC" indicates pseudo-location for the data owner's methodology.
Pseudo location is used to mark location which is not the actual location.

In [3]:
replaced_ploc = file_contents.replace("PLOC", "LOC")

In [4]:
type(file_contents)

str

Each document is splitted by "===" in the text file, so in order to get the documents in an array we need to split them.

In [5]:
documents = file_contents.split("===")
type(documents)

list

In [6]:
documents

['Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\nJerry/NNP/O/O\nWong/NNP/O/O\nPengemudi/NNP/O/O\nMini/NNP/B-ARG/Vehicle-Arg\nCooper/NNP/I-ARG/Vehicle-Arg\ndi/IN/O/O\nRSCM/NN/B-ORG/Hospital-Arg\n-/-/O/O\n-/NNP/O/O\nJakarta/NNP/B-PLOC/Published-Arg/(-6.197602429787846, 106.83139222722116)/1\n-/-/O/O\nJenazah/NNP/O/O\nJerry/NNP/O/O\nWong/NNP/O/O\n,/,/O/O\npengemudi/NN/O/O\nMini/NNP/B-ARG/Vehicle-Arg\nCooper/NNP/I-ARG/Vehicle-Arg\nyang/SC/O/O\nmenabrak/VBT/B-EVE/ACCIDENT-EVENT\nbagian/O/O\nbelakang/NN/O/O\ntruk/NN/B-ARG/Vehicle-Arg\n,/,/O/O\nkini/RB/O/O\nberada/VBI/O/O\ndi/IN/O/O\nRSCM/NNP/B-ORG/Hospital-Arg\n\nSekitar/CDI/O/O\n15/CDP/O/O\nkerabat/NN/O/O\ndan/CC/O/O\nteman/NN/O/O\ndatang/VBI/O/O\nmelayat/VBT/O/O\n\nPantauan/IN/O/O\ndetikcom/NN/B-ORG/Reporter-Arg\ndi/IN/O/O\nkamar/NN/O/O\njenazah/NN/O/O\nRSCM/NNP/B-ORG/Hospital-Arg\n,/,/O/O\nJalan/NN/B-ARG/Street-Arg\nSalemba/NNP/I-ARG/Street-Arg\n,/,/O/O\nJakarta/NNP/B-PLOC/Place-Arg/(-6.1780125136691195, 106.83773321392438)/2\nPusat/NNP/I

### 2.1 Data Exploration

In [7]:
df = pd.DataFrame(documents, columns=['Document'])
df.head()

Unnamed: 0,Document
0,Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\...
1,\n1/CDP/B-ARG/DeathVictim-Arg\nOrang/NN/I-ARG/...
2,\nAnalisis/NNP/O/O\nBMKG/NNP/B-ORG/Reporter-Ar...
3,\n3/CDP/B-ARG/DeathVictim-Arg\nTewas/JJ/I-ARG/...
4,\nTabrak/NN/B-EVE/ACCIDENT-EVENT\nPembatas/NN/...


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Document  84 non-null     object
dtypes: object(1)
memory usage: 800.0+ bytes


In [9]:
first_row = df.iloc[0]
print(first_row.values)

['Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\nJerry/NNP/O/O\nWong/NNP/O/O\nPengemudi/NNP/O/O\nMini/NNP/B-ARG/Vehicle-Arg\nCooper/NNP/I-ARG/Vehicle-Arg\ndi/IN/O/O\nRSCM/NN/B-ORG/Hospital-Arg\n-/-/O/O\n-/NNP/O/O\nJakarta/NNP/B-PLOC/Published-Arg/(-6.197602429787846, 106.83139222722116)/1\n-/-/O/O\nJenazah/NNP/O/O\nJerry/NNP/O/O\nWong/NNP/O/O\n,/,/O/O\npengemudi/NN/O/O\nMini/NNP/B-ARG/Vehicle-Arg\nCooper/NNP/I-ARG/Vehicle-Arg\nyang/SC/O/O\nmenabrak/VBT/B-EVE/ACCIDENT-EVENT\nbagian/O/O\nbelakang/NN/O/O\ntruk/NN/B-ARG/Vehicle-Arg\n,/,/O/O\nkini/RB/O/O\nberada/VBI/O/O\ndi/IN/O/O\nRSCM/NNP/B-ORG/Hospital-Arg\n\nSekitar/CDI/O/O\n15/CDP/O/O\nkerabat/NN/O/O\ndan/CC/O/O\nteman/NN/O/O\ndatang/VBI/O/O\nmelayat/VBT/O/O\n\nPantauan/IN/O/O\ndetikcom/NN/B-ORG/Reporter-Arg\ndi/IN/O/O\nkamar/NN/O/O\njenazah/NN/O/O\nRSCM/NNP/B-ORG/Hospital-Arg\n,/,/O/O\nJalan/NN/B-ARG/Street-Arg\nSalemba/NNP/I-ARG/Street-Arg\n,/,/O/O\nJakarta/NNP/B-PLOC/Place-Arg/(-6.1780125136691195, 106.83773321392438)/2\nPusat/NNP/I

In [10]:
df['Accident_Event'] = ''
df['Fire_Event'] = ''
df['Flood_Event'] = ''
df['Earthquake_Event'] = ''
df.head()

Unnamed: 0,Document,Accident_Event,Fire_Event,Flood_Event,Earthquake_Event
0,Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\...,,,,
1,\n1/CDP/B-ARG/DeathVictim-Arg\nOrang/NN/I-ARG/...,,,,
2,\nAnalisis/NNP/O/O\nBMKG/NNP/B-ORG/Reporter-Ar...,,,,
3,\n3/CDP/B-ARG/DeathVictim-Arg\nTewas/JJ/I-ARG/...,,,,
4,\nTabrak/NN/B-EVE/ACCIDENT-EVENT\nPembatas/NN/...,,,,


In [11]:
def check_event_types(row):
    accident_key = "ACCIDENT-EVENT"
    fire_key = "FIRE-EVENT"
    flood_key = "FLOOD-EVENT"
    earthquake_key = "QUAKE-EVENT"
    document = row['Document']
    if accident_key in document:
        row['Accident_Event'] = True
    if fire_key in document:
        row['Fire_Event'] = True
    if flood_key in document:
        row['Flood_Event'] = True
    if earthquake_key in document:
        row['Earthquake_Event'] = True
    return row

In [12]:
df.apply(check_event_types, axis=1)

Unnamed: 0,Document,Accident_Event,Fire_Event,Flood_Event,Earthquake_Event
0,Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\...,True,,,
1,\n1/CDP/B-ARG/DeathVictim-Arg\nOrang/NN/I-ARG/...,,True,,
2,\nAnalisis/NNP/O/O\nBMKG/NNP/B-ORG/Reporter-Ar...,,,,True
3,\n3/CDP/B-ARG/DeathVictim-Arg\nTewas/JJ/I-ARG/...,,,,True
4,\nTabrak/NN/B-EVE/ACCIDENT-EVENT\nPembatas/NN/...,True,,,
...,...,...,...,...,...
79,\nGempa/NNP/B-EVE/QUAKE-EVENT\nMagnitudo/NNP/B...,,,,True
80,"\nHujan/NN/O/O\nDeras/NNP/O/O\n,/,/O/O\nKawasa...",,,True,
81,\nBanjir/NN/B-EVE/FLOOD-EVENT\n3/CDP/B-ARG/Hei...,,,True,
82,\nBus/NN/O/O\nAngkut/NN/O/O\nRombongan/NN/O/O\...,True,,,


In [13]:
df = df.fillna(False)
df

Unnamed: 0,Document,Accident_Event,Fire_Event,Flood_Event,Earthquake_Event
0,Kerabat/NNP/O/O\nLayat/NNP/O/O\nJasad/NNP/O/O\...,True,,,
1,\n1/CDP/B-ARG/DeathVictim-Arg\nOrang/NN/I-ARG/...,,True,,
2,\nAnalisis/NNP/O/O\nBMKG/NNP/B-ORG/Reporter-Ar...,,,,True
3,\n3/CDP/B-ARG/DeathVictim-Arg\nTewas/JJ/I-ARG/...,,,,True
4,\nTabrak/NN/B-EVE/ACCIDENT-EVENT\nPembatas/NN/...,True,,,
...,...,...,...,...,...
79,\nGempa/NNP/B-EVE/QUAKE-EVENT\nMagnitudo/NNP/B...,,,,True
80,"\nHujan/NN/O/O\nDeras/NNP/O/O\n,/,/O/O\nKawasa...",,,True,
81,\nBanjir/NN/B-EVE/FLOOD-EVENT\n3/CDP/B-ARG/Hei...,,,True,
82,\nBus/NN/O/O\nAngkut/NN/O/O\nRombongan/NN/O/O\...,True,,,


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Document          84 non-null     object
 1   Accident_Event    84 non-null     object
 2   Fire_Event        84 non-null     object
 3   Flood_Event       84 non-null     object
 4   Earthquake_Event  84 non-null     object
dtypes: object(5)
memory usage: 3.4+ KB


In [15]:
true_counts = df.apply(lambda x: x.astype(bool).sum())
print(true_counts)

Document            83
Accident_Event      22
Fire_Event          21
Flood_Event         24
Earthquake_Event    19
dtype: int64


# 3. Event Extraction

## Embedding

In [28]:
# NLTK
from nltk.tokenize import sent_tokenize, word_tokenize

# Warnings
import warnings
warnings.filterwarnings(action='ignore')

# Word2Vec
import gensim
from gensim.models import Word2Vec

# JSON
import json

# OS
import os

In [32]:
# Function to get text data from JSON
def extract_text_from_json(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
        content = data['content']
        # Pre-process data 
    return content

# Function to traverse `newspapers` folder & extract each JSON files
def extract_text_from_json_files(root_dir):
    text_data = []
    for subdir, dirs, files in os.walk(root_dir):
        for file in files:
            file_path = os.path.join(subdir, file)
            if file_path.endswith('.json'):
                text_data.append(extract_text_from_json(file_path))
    return text_data

In [33]:
# Indonesian News Dataset 2018 as Corpus 
# https://huggingface.co/datasets/indonesian-nlp/id_newspapers_2018/tree/main

root_directory = '/Users/azurast/Downloads/newspapers/json'
text_data = extract_text_from_json_files(root_directory)

In [34]:
text_data

['Barcelona, CNN Indonesia -- Samsung akhirnya resmi memperkenalkan Galaxy S6 dan S6 Edge di Barcelona, Spanyol. Hanya saja produsen asal Korea Selatan itu belum berbicara soal harga. Pun begitu SamMobile, blog yang kerap membocorkan produk dan strategi Samsung, telah mendapatkan bocoran harga kedua ponsel tersebut untuk wilayah Eropa yang diklaim berasal dari salah satu pejabat Samsung. Galaxy S6 tersedia dalam berbagai pilihan kapasitas, yang terkecil 32 GB dibanderol dengan harga 699 euro, atau setara dengan Rp 10,1 juta. Sementara untuk versi di atasnya yakni 64 GB akan dipatok 799 euro, atau setara Rp 11,6 juta dan versi 128 GB dijual Rp 13 juta. Sedangkan untuk Galaxy S6 Edge untuk varian 32 GB, 64 GB dan 128 GB masing-masing dijual 849 euro, 949 euro dan 1049 euro atau setara Rp 15,2 juta. Aura Seksi Samsung Galaxy S6 Mulai Terpancar Resmi Dirilis, Ini Spesifikasi Galaxy S6 dan S6 Edge Di Barcelona, Samsung Luncurkan Duet Penggempur Apple Namun perlu diketahui bahwa harga terseb

In [36]:
# Tokenize the text data
tokenized_data = [gensim.utils.simple_preprocess(text) for text in text_data]

In [37]:
# Train the Word2Vec model
model = Word2Vec(tokenized_data, vector_size=100, window=5, min_count=5, workers=4)

In [38]:
# Save the trained model
model.save('indonesian_newspapers_word2vec.model')

In [39]:
# 1. Let's see the word embedding for "kebakaran" by accessing the "wv" attribute and passing in "kebakaran" as the key.
print("Embedding for kebakaran:", model.wv["kebakaran"])

Embedding for kebakaran: [ 2.0414462  -3.75315     0.39765197 -1.7338927  -0.0590134  -0.20744218
 -2.3028045  -4.8425446   6.326513   -1.1452388   4.8490887  -1.7260503
 -3.203366   -2.1963425  -1.1707214  -0.04511541  1.1918947   6.069368
  2.0980356  -5.1221356   3.9840798   0.4244976   2.0141964   0.4137472
 -0.37643066  0.61278456 -2.8625507   0.735602   -7.5731854   2.717005
  1.0401915  -1.4523487   3.1961093   2.7365744   1.5074236   2.698923
  0.965628    0.3513012   3.349316   -0.31840843 -0.53646666 -1.6347815
  1.1439109  -0.13277027  4.488565    1.6412611  -3.3502626   0.20625757
  0.09625302 -0.74816716 -2.5186696  -1.3817745   3.7868476  -1.0377343
 -2.3360722   2.5491724   1.4747478   0.75335705 -2.381334   -0.29497525
 -4.0756025   5.9198675  -1.2310169   3.0239708   6.1691537   0.975601
 -5.1645217   0.5768625  -1.4440215  -5.3698053   0.11450747  0.72386944
  0.58176506  5.371713    3.8904994  -0.36991042 -0.5352621   3.572925
 -4.8806863  -1.164251   -0.10001501 -3.

In [45]:
# 2. Inspect the model vocabulary by accessing keys of the "wv.index_to_key" attribute. We'll print the first 20 words.
print("Vocabulary length:", len(model.wv))
print("Check first 20 vocab:", model.wv.index_to_key[:20])

Vocabulary length: 171598
Check first 20 vocab: ['yang', 'di', 'dan', 'ini', 'dengan', 'untuk', 'itu', 'dari', 'dalam', 'pada', 'akan', 'tidak', 'juga', 'ke', 'jakarta', 'ada', 'tersebut', 'saat', 'bisa', 'kata']
