In [28]:
import pandas as pd

In [29]:
def load_corpus(path):
  lines = [line.strip() for line in open(path)]
  return lines
lines = load_corpus('/content/sample_data/data/TRAIN_FILE.TXT')

In [30]:
def clean_str(text):
  text = text.replace('<e1>', ' _e11_ ')
  text = text.replace('</e1>', ' _e12_ ')
  text = text.replace('<e2>', ' _e21_ ')
  text = text.replace('</e2>', ' _e22_ ')
  return text.split("\t")[1].strip().lstrip('\"').rstrip('\"')

In [31]:
check_relations = {"Cause-Effect" : True,
                   "Component-Whole" : True,
                   "Entity-Origin" : True,
                   "Content-Container" : True,
                   "Other" : True}

In [32]:
def data_preprocessing(lines):
  data = []
  relations = []
  for i in range(0, len(lines), 4):
    sentence = lines[i]
    label = lines[i+1]
    relation = label.strip()
    if("(" in label.strip()):
      relation = label.strip()[:-7]
      if(relation not in check_relations):
        relation = "Other"
      data.append([clean_str(sentence).strip(), relation])
    if(relation not in relations):
      relations.append(relation)
  return data, relations      
data, relations = data_preprocessing(lines)
print(data)



In [33]:
relation_dict = {}
label_dict = {}
for i in range(len(relations)):
  relation_dict[relations[i]] = i
  label_dict[i] = relations[i]
print(relation_dict)


{'Component-Whole': 0, 'Other': 1, 'Cause-Effect': 2, 'Content-Container': 3, 'Entity-Origin': 4}


In [34]:
data = []
for i in range(0, len(lines), 4):
  sentence = lines[i]
  label = lines[i+1]
  relation = label.strip()
  if("(" in label.strip()):
    relation = label.strip()[:-7]
    if(relation not in check_relations):
      relation = "Other"
  data.append([clean_str(sentence).strip(), relation_dict[relation]])

In [35]:
df = pd.DataFrame(data, columns=['sentences', 'labels'])

In [36]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [40]:
n_most_common_words = 20000 # vocabulary size
max_len = 100
def sentence_tokenizer(df):
  # Initialization
  tokenizer = Tokenizer(num_words=n_most_common_words, filters='!"#$%&()*+,-./:;=?@[]^_`{|}~', lower=True)
  # Fit and transformation
  tokenizer.fit_on_texts(df['sentences'].values)
  sequences = tokenizer.texts_to_sequences(df['sentences'].values)
  word_index = tokenizer.word_index
  print('Found %s unique tokens.' % len(word_index))
  # Padding
  X = pad_sequences(sequences, maxlen=max_len, padding= 'post')
  return X
X = sentence_tokenizer(df)
print(X)

Found 19551 unique tokens.
[[   1   92   21 ...    0    0    0]
 [   1    2  447 ...    0    0    0]
 [   1    2  228 ...    0    0    0]
 ...
 [  10  765   23 ...    0    0    0]
 [   1 1016    8 ...    0    0    0]
 [   1    2 1566 ...    0    0    0]]


In [41]:
from keras.utils.np_utils import to_categorical
labels = to_categorical(df['labels'], num_classes=len(df.labels.unique()))
X_train, X_test, y_train, y_test = train_test_split(X , labels, test_size=0.20, random_state=42)

In [42]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Embedding

embedding_size = 300
model = Sequential()
model.add(Embedding(n_most_common_words, embedding_size, input_length=X.shape[1]))
model.add(Bidirectional(LSTM(128, dropout=0.7, recurrent_dropout=0.7)))
model.add(Dense(labels.shape[1], activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 300)          6000000   
                                                                 
 bidirectional (Bidirectiona  (None, 256)              439296    
 l)                                                              
                                                                 
 dense (Dense)               (None, 5)                 1285      
                                                                 
Total params: 6,440,581
Trainable params: 6,440,581
Non-trainable params: 0
_________________________________________________________________
None


In [43]:
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f637c1bf910>

In [44]:
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

y_test_gold = []

index, true_label = np.where(y_test == 1)

In [45]:
def number_to_category(label):
  if(label in label_dict):
    return label_dict[label]
  else:
    return "Other"  

In [46]:
prediction_probas = model.predict(X_test) 
predictions = [np.argmax(pred) for pred in prediction_probas]

print(confusion_matrix(true_label, predictions))
print(classification_report(true_label, predictions, digits=3))

[[ 95  87   2   7   6]
 [ 50 823  12  13  40]
 [  6  36 138   0  19]
 [  5  37   1  85   2]
 [  4  54   2   2  74]]
              precision    recall  f1-score   support

           0      0.594     0.482     0.532       197
           1      0.794     0.877     0.833       938
           2      0.890     0.693     0.780       199
           3      0.794     0.654     0.717       130
           4      0.525     0.544     0.534       136

    accuracy                          0.759      1600
   macro avg      0.719     0.650     0.679      1600
weighted avg      0.758     0.759     0.755      1600



In [61]:
lines = load_corpus('/content/sample_data/data/TEST_FILE.txt')
data = []
for sentence in lines:
  data.append([clean_str(sentence).strip(), relation_dict[relation]])
# print(data)  
df = pd.DataFrame(data, columns=['sentences', 'relation'])
X = sentence_tokenizer(df)

prediction_probas = model.predict(X) 
predictions = [np.argmax(pred) for pred in prediction_probas]


Found 10608 unique tokens.


In [67]:
print(len(predictions))
print(len(df['sentences']))
for i, row in df.iterrows():
    print('index: ', i, 'sentence: ', row['sentences'], 'relation: ', number_to_category(predictions[i]))
    if i == 50:
      break


1
1
index:  0 sentence:  He is  _e11_ intelligent _e12_ . He is a  _e21_ good _e22_  chess player. relation:  Other


In [72]:
lines = ['8003	"The <e1>workpackage</e1> comprise of the following <e2>tasks</e2>: Task 2.1 Compilation of long term data sets."']
data = []
for sentence in lines:
  data.append([clean_str(sentence).strip(), relation_dict[relation]])
df = pd.DataFrame(data, columns=['sentences', 'relation'])
X = sentence_tokenizer(df)

prediction_probas = model.predict(X) 
predictions = [np.argmax(pred) for pred in prediction_probas]

for i, row in df.iterrows():
    print('index: ', i, 'sentence: ', row['sentences'], 'relation: ', number_to_category(predictions[i]))


Found 18 unique tokens.
index:  0 sentence:  The  _e11_ workpackage _e12_  comprise of the following  _e21_ tasks _e22_ : Task 2.1 Compilation of long term data sets. relation:  Entity-Origin
