In [1]:
import os
import os.path as osp
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.corpus import stopwords
english_stopwords = stopwords.words("english")
import numpy as np
import re
import seaborn as sns
sns.set_theme(style="whitegrid")
import matplotlib.pyplot as plt
import pandas as pd
import string

import pickle
import random
import collections

In [11]:
def getFiles(path):
    """List the text and annotation files found directly inside ``path``.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).

    Returns
    -------
    tuple[list[str], list[str]]
        ``(text_files, ann_files)``: bare file names (no directory part)
        ending in ``.txt`` and ``.ann`` respectively, each sorted by name.

    The two counts are printed as a quick sanity check.
    """
    # os.listdir returns names in arbitrary order; sort once so the result is
    # reproducible across machines and file systems.
    all_files = sorted(os.listdir(path))
    # Match the full extension, dot included, so names such as "notes_txt"
    # or "scann" are not mistaken for data files.
    text_files = [f for f in all_files if f.endswith('.txt')]
    ann_files = [f for f in all_files if f.endswith('.ann')]
    print("Numer of ann files: ", len(ann_files))
    print("Numer of text files: ", len(text_files))
    return text_files, ann_files


def labelIOBReactions(path, ann_files, text_files):
    """Build per-paragraph IOB labels from ``REACTION_SPAN`` annotations.

    Every line of a text file is treated as one paragraph and receives a
    label: 0 = O (outside any reaction), 1 = I (inside a reaction),
    2 = B (first paragraph of a reaction).

    Parameters
    ----------
    path : str
        Directory containing both the annotation and the text files.
    ann_files : list[str]
        Annotation file names. Each annotation line is split on whitespace:
        the second token is the annotation type, the third and fourth are
        the start/end character offsets into the text file.
    text_files : list[str]
        Text file names; must correspond one-to-one with ``ann_files``
        (same name, different extension).

    Returns
    -------
    labels : dict[str, numpy.ndarray]
        Maps each text file name minus its last four characters (the
        extension) to a float array holding one label per paragraph.
    reactions_per_document : list[int]
        Number of ``REACTION_SPAN`` annotations per document, in sorted
        file-name order.

    Raises
    ------
    ValueError
        If the two file lists cannot be paired one-to-one by name.
    """
    # Sort copies: the caller's lists must not be reordered as a side effect.
    ann_sorted = sorted(ann_files)
    text_sorted = sorted(text_files)

    # Positional pairing is only valid when both lists describe the same
    # documents; otherwise annotations would silently land on the wrong text.
    if len(ann_sorted) != len(text_sorted):
        raise ValueError(
            "Got %d annotation files but %d text files; they must pair one-to-one."
            % (len(ann_sorted), len(text_sorted))
        )

    reactions_per_document = []
    labels = {}
    for af, tf in zip(ann_sorted, text_sorted):
        if osp.splitext(af)[0] != osp.splitext(tf)[0]:
            raise ValueError(
                "Annotation file %r does not match text file %r." % (af, tf)
            )

        with open(osp.join(path, tf), 'r', encoding="utf-8") as fp, \
                open(osp.join(path, af), 'r', encoding="utf-8") as ap:
            # Only trailing whitespace is dropped, so character offsets from
            # the annotation file still index into ``data`` correctly.
            data = fp.read().rstrip()
            ann_lines = ap.readlines()

        # One label per paragraph (newline-separated), initialised to O.
        para_labels = np.zeros(data.count('\n') + 1)
        rpd = 0

        for line in ann_lines:
            ts = line.split()
            # Blank or truncated lines carry no usable span; skip them
            # instead of failing on ts[1] / ts[3].
            if len(ts) < 4 or ts[1] != "REACTION_SPAN":
                continue

            start, end = int(ts[2]), int(ts[3])
            rpd += 1
            # Paragraph index of the span start = newlines preceding it.
            st = data[:start].count('\n')
            # Number of paragraphs the span touches.
            cnts = data[start:end].count('\n') + 1
            para_labels[st] = 2  # beginning of a reaction
            para_labels[st + 1:st + cnts] = 1  # inside a reaction

        labels[tf[:-4]] = para_labels
        reactions_per_document.append(rpd)

    return labels, reactions_per_document

In [7]:
# Root folder of the dataset. Set the REACTION_DATA_DIR environment variable to
# run the notebook on another machine; the default is the original location.
DATA_DIR = os.environ.get(
    "REACTION_DATA_DIR",
    "C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data",
)
# The empty last component makes osp.join append a trailing separator, which
# the original hard-coded paths also ended with.
cv_path = osp.join(DATA_DIR, "dev", "")
train_path = osp.join(DATA_DIR, "train", "")

In [8]:
# Collect the text/annotation file names of both splits
# (getFiles also prints how many of each it found).
train_text_files, train_ann_files = getFiles(path=train_path)
dev_text_files, dev_ann_files = getFiles(path=cv_path)

Numer of ann files:  120
Numer of text files:  120
Numer of ann files:  30
Numer of text files:  30


In [73]:
# Label every paragraph of the dev split, cache the label dictionary on disk,
# and report how many REACTION_SPAN annotations were counted.
dev_labels, dev_reactions_per_document = labelIOBReactions(
    path=cv_path,
    ann_files=dev_ann_files,
    text_files=dev_text_files,
)
with open('dev_labels.pkl', mode='wb') as pkl_out:
    pickle.dump(dev_labels, pkl_out)
print("Total dev reactions: ", sum(dev_reactions_per_document))

Total dev reactions:  1244


In [74]:
# Label every paragraph of the train split, cache the label dictionary on disk,
# and report how many REACTION_SPAN annotations were counted.
train_labels, train_reactions_per_document = labelIOBReactions(
    path=train_path,
    ann_files=train_ann_files,
    text_files=train_text_files,
)
with open('train_labels.pkl', mode='wb') as pkl_out:
    pickle.dump(train_labels, pkl_out)
print("Total train reactions: ", sum(train_reactions_per_document))

Total train reactions:  6378


In [75]:
def create_dataset(files, labels_dict, output_file, test_file=False):
    """Write a paragraph-level CSV with ``para``, ``label`` and, optionally, ``document`` columns.

    Parameters
    ----------
    files : list[str]
        Paths of the text files. Every line of a file becomes one row and is
        stored verbatim, including its trailing newline.
    labels_dict : dict
        Maps a file's base name minus its last four characters (".txt") to a
        sequence of numeric labels (0, 1 or 2), one per line of that file.
    output_file : str
        Destination CSV path.
    test_file : bool, default False
        When true, add a ``document`` column with the source file name of
        every paragraph.

    Raises
    ------
    KeyError
        If ``labels_dict`` has no entry for one of the files.
    ValueError
        If a file's line count differs from its label count, or a label is
        not 0, 1 or 2.
    """
    columns = {'para': [], 'label': []}
    if test_file:
        columns['document'] = []

    for f in files:
        # Accept both "\\" and "/" separators so the lookup key is the bare
        # file name whichever platform produced the path.
        file_name = f.replace('\\', '/').rsplit('/', 1)[-1]

        with open(f, encoding="utf-8") as fp:
            lines = fp.readlines()

        labels = labels_dict[file_name[:-4]]  # drop ".txt"

        # A per-file check keeps one bad document from shifting the labels of
        # every document that follows it.
        if len(labels) != len(lines):
            raise ValueError(
                "%s has %d paragraphs but %d labels."
                % (file_name, len(lines), len(labels))
            )

        for l in labels:
            # Dropping an unknown label would desynchronise the columns, so
            # fail loudly instead.
            if l not in (0, 1, 2):
                raise ValueError("Label Error: %r in %s" % (l, file_name))
            columns['label'].append(str(int(l)))

        columns['para'].extend(lines)
        if test_file:
            columns['document'].extend([file_name] * len(lines))

    df = pd.DataFrame(columns)
    print(df.head())
    df.to_csv(output_file, index=False)


In [76]:
# Full paths of the dev-split text files, used as the test set.
# Sorted because os.listdir gives no ordering guarantee, and matched on the
# full ".txt" extension so unrelated names ending in "txt" are ignored.
test_files = sorted(
    osp.join(cv_path, f) for f in os.listdir(cv_path) if f.endswith('.txt')
)
print(len(test_files))
print(test_files[:5])

30
['C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\dev\\EP3284738A1.txt', 'C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\dev\\EP3305797A1.txt', 'C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\dev\\EP3312182A1.txt', 'C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\dev\\EP3323817A1.txt', 'C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\dev\\EP3330263A1.txt']


In [82]:
VAL_SIZE = 20    # number of training documents held out for validation
SPLIT_SEED = 42  # fixed so the train/validation split is the same on every run

# Sort first: os.listdir order is arbitrary, and a seeded shuffle is only
# reproducible when it starts from the same ordering.
train_files = sorted(
    osp.join(train_path, f) for f in os.listdir(train_path) if f.endswith('.txt')
)
print(len(train_files))

# A dedicated generator keeps the split independent of any other use of the
# global `random` state in the notebook.
random.Random(SPLIT_SEED).shuffle(train_files)

val_files = train_files[:VAL_SIZE]
train_files = train_files[VAL_SIZE:]

print(len(train_files))
print(len(val_files))

120
100
20


In [83]:
def _load_labels(pickle_path):
    """Load a pickled label dictionary and close the file afterwards."""
    # pickle.load can execute arbitrary code: only point this at the label
    # files written by this notebook, never at untrusted input.
    with open(pickle_path, "rb") as f:
        return pickle.load(f)


# Test set: the dev split with its own labels.
create_dataset(test_files, _load_labels("dev_labels.pkl"), "test_data_iob.csv", test_file=True)

# Train and validation sets are both subsets of the train split, so they
# share one label dictionary.
train_label_dict = _load_labels("train_labels.pkl")
create_dataset(train_files, train_label_dict, "train_data_iob.csv", test_file=True)  # train set
create_dataset(val_files, train_label_dict, "val_data_iob.csv", test_file=True)  # validation set

                                                para label         document
0                           FIELD OF THE INVENTION\n     0  EP3284738A1.txt
1  The present invention relates to the field of ...     0  EP3284738A1.txt
2                      BACKGROUND OF THE INVENTION\n     0  EP3284738A1.txt
3  Glutamate which is the most important excitato...     0  EP3284738A1.txt
4  Glutamate receptors (GluR) are mainly divided ...     0  EP3284738A1.txt
                                                para label          document
0  The present invention relates to phosphorescen...     0  US10868254B2.txt
1  In OLEDs, the electroluminescence (EL) charact...     0  US10868254B2.txt
2  In other words, in case of organic light-emitt...     0  US10868254B2.txt
3                                     1. Substrate\n     0  US10868254B2.txt
4  2. Base electrode, hole-injecting (positive po...     0  US10868254B2.txt
                                                para label  \
0          CROSS-REF

In [84]:
# Sanity check on the generated test CSV: number of rows (paragraphs) and how
# many rows carry each label value.
df = pd.read_csv("test_data_iob.csv")
print(len(df))
collections.Counter(df['label'])

12871


Counter({0: 9881, 2: 1236, 1: 1754})

In [85]:
# Sanity check on the generated train CSV: number of rows (paragraphs) and how
# many rows carry each label value.
df = pd.read_csv("train_data_iob.csv")
print(len(df))
collections.Counter(df['label'])

46018


Counter({0: 34567, 2: 5192, 1: 6259})

In [86]:
# Sanity check on the generated validation CSV: number of rows (paragraphs) and
# how many rows carry each label value.
df = pd.read_csv("val_data_iob.csv")
print(len(df))
collections.Counter(df['label'])

7545


Counter({0: 5486, 2: 911, 1: 1148})