In [7]:
import os
import os.path as osp
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.corpus import stopwords
english_stopwords = stopwords.words("english")
import numpy as np
import re
import seaborn as sns
sns.set_theme(style="whitegrid")
import matplotlib.pyplot as plt
import pandas as pd
import string

import pickle
import random
import collections
from nltk.tokenize import sent_tokenize

In [191]:
def getFiles(path):
    all_files = os.listdir(path)
    text_files = [f for f in all_files if f[-3:]=='txt']
    ann_files = [f for f in all_files if f[-3:]=='ann']
    print("Numer of ann files: ", len(ann_files))
    print("Numer of text files: ", len(text_files))
    return text_files, ann_files


def labelIOBReactions(path, ann_files, text_files):
    reactions_per_document = []
    labels = {}
    ann_files.sort()
    text_files.sort()
    # 0: O (Outside) ; 1: I (Inside) ; 2: B (Beginning)
    for af, tf in zip(ann_files, text_files):
        
        with open(osp.join(path, tf), 'r',encoding="utf-8" ) as fp:
            with open(osp.join(path, af), 'r', encoding="utf-8" ) as ap:
                data = fp.read().rstrip()
                
                paras = data.count('\n')+1
              
                para_labels = np.zeros(paras)

                rpd = 0

                lines = ap.readlines()
                for line in lines:

                    ts = line.split()
                    if(ts[1] == "REACTION_SPAN"):
                        rpd +=1
                        st = data[:int(ts[2])].count('\n')
                        cnts = data[int(ts[2]):int(ts[3])].count('\n')+1
                        para_labels[st]=2; #beginning of a reaction
                        for i in range(st+1, st+cnts):
                            para_labels[i]=1; #inside a reaction
                            
                labels[tf[:-4]] = para_labels
                reactions_per_document.append(rpd)
    
    return labels, reactions_per_document

In [196]:
cv_path = "C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\dev\\"
train_path = "C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\train\\"

In [197]:
train_text_files, train_ann_files = getFiles(train_path)
dev_text_files, dev_ann_files = getFiles(cv_path)

Numer of ann files:  120
Numer of text files:  120
Numer of ann files:  30
Numer of text files:  30


In [73]:
dev_labels, dev_reactions_per_document = labelIOBReactions(cv_path, dev_ann_files, dev_text_files)
with open('dev_labels.pkl', 'wb') as f:
    pickle.dump(dev_labels, f)
print("Total dev reactions: ", sum(dev_reactions_per_document))

Total dev reactions:  1244


In [74]:
train_labels, train_reactions_per_document = labelIOBReactions(train_path, train_ann_files, train_text_files)
with open('train_labels.pkl', 'wb') as f:
    pickle.dump(train_labels, f)
print("Total train reactions: ", sum(train_reactions_per_document))

Total train reactions:  6378


In [75]:
def create_dataset(files, labels_dict, output_file, test_file=False):

  if(test_file):
      train_data = {'para':[], 'label':[], 'document':[]}
  else:
      train_data = {'para':[], 'label':[]}

  for f in files:
    with open(f, encoding="utf-8" ) as fp:
        
      lines = fp.readlines()
      train_data['para'].extend(lines)

      labels = labels_dict[f.split('\\')[-1][:-4]] #".txt"
      train_data['label'].extend(labels)
        
      if(test_file):
          train_data['document'].extend([f.split('\\')[-1] for i in range(len(lines))])
      

  nl = []
  for l in train_data['label']:
    if(l==1):
      nl.append("1")
    elif(l==2):
      nl.append("2")
    elif(l==0):
      nl.append("0")
    else:
      print("Label Error")
    
    
  train_data['label'] = nl

  df = pd.DataFrame(train_data)
  print(df.head())
  df.to_csv(output_file, index = False)


In [76]:
test_files = [osp.join(cv_path, f)  for f in os.listdir(cv_path) if f[-3:]=='txt']
print(len(test_files))
print(test_files[:5])

30
['C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\dev\\EP3284738A1.txt', 'C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\dev\\EP3305797A1.txt', 'C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\dev\\EP3312182A1.txt', 'C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\dev\\EP3323817A1.txt', 'C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\dev\\EP3330263A1.txt']


In [82]:
train_files = [osp.join(train_path, f) for f in os.listdir(train_path) if f[-3:]=='txt']
print(len(train_files))
random.shuffle(train_files)

val_files = train_files[:20]
train_files = train_files[20:]

print(len(train_files))
print(len(val_files))

120
100
20


In [83]:
# create_dataset(dev_files, pickle.load(open(dev_label_path, "rb")), "dev_data.csv") # test set
create_dataset(test_files, pickle.load(open("dev_labels.pkl", "rb")), "test_data_iob.csv", test_file=True) # test set
create_dataset(train_files, pickle.load(open("train_labels.pkl", "rb")), "train_data_iob.csv", test_file=True) #train_set
create_dataset(val_files, pickle.load(open("train_labels.pkl", "rb")), "val_data_iob.csv", test_file=True) # validation set

                                                para label         document
0                           FIELD OF THE INVENTION\n     0  EP3284738A1.txt
1  The present invention relates to the field of ...     0  EP3284738A1.txt
2                      BACKGROUND OF THE INVENTION\n     0  EP3284738A1.txt
3  Glutamate which is the most important excitato...     0  EP3284738A1.txt
4  Glutamate receptors (GluR) are mainly divided ...     0  EP3284738A1.txt
                                                para label          document
0  The present invention relates to phosphorescen...     0  US10868254B2.txt
1  In OLEDs, the electroluminescence (EL) charact...     0  US10868254B2.txt
2  In other words, in case of organic light-emitt...     0  US10868254B2.txt
3                                     1. Substrate\n     0  US10868254B2.txt
4  2. Base electrode, hole-injecting (positive po...     0  US10868254B2.txt
                                                para label  \
0          CROSS-REF

In [84]:
df = pd.read_csv("test_data_iob.csv")
print(len(df))
collections.Counter(df['label'])

12871


Counter({0: 9881, 2: 1236, 1: 1754})

In [85]:
df = pd.read_csv("train_data_iob.csv")
print(len(df))
collections.Counter(df['label'])

46018


Counter({0: 34567, 2: 5192, 1: 6259})

In [86]:
df = pd.read_csv("val_data_iob.csv")
print(len(df))
collections.Counter(df['label'])

7545


Counter({0: 5486, 2: 911, 1: 1148})

In [232]:
val = pd.read_csv("val_data_iob.csv")
val_text_files = list(val['document'].unique())
val_ann_files = [k[:-4]+".ann" for k in val_text_files]
print(val_text_files)
print(val_ann_files)

['US20200115323A1.txt', 'EP3733671A1.txt', 'EP1427423B9.txt', 'US20200172489A1.txt', 'US20180065917A1.txt', 'US10730885B1.txt', 'EP3284743A1.txt', 'EP3305788A1.txt', 'US20180110873A1.txt', 'US20180016260A1.txt', 'US20180072753A1.txt', 'US20180051018A1.txt', 'US20180085386A1.txt', 'US20180051008A1.txt', 'US20180037547A1.txt', 'US10633340B1.txt', 'US20200079799A1.txt', 'US20180112132A1.txt', 'EP3257527A1.txt', 'US20200323822A1.txt']
['US20200115323A1.ann', 'EP3733671A1.ann', 'EP1427423B9.ann', 'US20200172489A1.ann', 'US20180065917A1.ann', 'US10730885B1.ann', 'EP3284743A1.ann', 'EP3305788A1.ann', 'US20180110873A1.ann', 'US20180016260A1.ann', 'US20180072753A1.ann', 'US20180051018A1.ann', 'US20180085386A1.ann', 'US20180051008A1.ann', 'US20180037547A1.ann', 'US10633340B1.ann', 'US20200079799A1.ann', 'US20180112132A1.ann', 'EP3257527A1.ann', 'US20200323822A1.ann']


In [233]:
tdata = pd.read_csv("train_data_iob.csv")
train_text_files = list(tdata['document'].unique())
train_ann_files = [k[:-4]+".ann" for k in tdata_text_files]

In [1]:
#######################################################################################################################

In [2]:
# Sentence based models

In [208]:
def labelIOBParas(path, ann_files, text_files):
    all_paras = []
    all_labels = []
    documents = []
    cumm_len  =[]
#     labels = {}
    ann_files.sort()
    text_files.sort()
    # 0: O (Outside) ; 1: I (Inside) ; 2: B (Beginning)
    for af, tf in zip(ann_files, text_files):
        print(tf)
        with open(osp.join(path, tf), 'r',encoding="utf-8" ) as fp:
            with open(osp.join(path, af), 'r', encoding="utf-8" ) as ap:
                annot = ap.read()
                annot = annot.split('\n')

                reactions = []
                try:
                    for line in annot:
                        ts = line.split()
                        if(ts[1] == "REACTION_SPAN"):
                            reactions.append((int(ts[2]), int(ts[3])))
                except:
                    continue


                reactions = sorted(reactions)
#                 print(reactions)
                data = fp.read().rstrip()

#                 para_data = data.split('\n')
                para_data = data.split('\n')
                paras = []
                for pdata in para_data:
                    stt = sent_tokenize(pdata) 
                    if(len(stt)==0):
                        continue
                        
                    for i in range(1, len(stt)):
                        stt[i] = " "+stt[i]

                    if(len("".join(stt))!=len(pdata)):
#                         print(pdata)
#                         print(stt)
                        build = ""
                        for i in range(len(stt)):
                            temp = build + stt[i]
                            if(temp != pdata[:len(temp)]):
                                stt[i] = stt[i][1:]
#                                 print("Incorr:", stt[i])

                            build = build + stt[i]
                            if(build != pdata[:len(build)]):
                                print("Build error")

                    if(len("".join(stt))!=len(pdata))  :
                        print("Fishy:")
                        print(pdata)
                        print(stt) 
                        stt[0] = " "*abs(len(pdata)-len("".join(stt))) + stt[0] 

                    if(len("".join(stt))!=len(pdata)):
                        print("hawwwwww")
                    
                    stt[-1] = stt[-1] + '\n'
                    paras.extend(stt)


                para_labels = np.zeros(len(paras))

                char_cnt = 0
                ri = 0
                pi = 0
                while(ri<len(reactions) and pi<len(paras)):
                    if(reactions[ri][0]>=char_cnt and reactions[ri][0]<char_cnt+len(paras[pi])):
                        para_labels[pi] = 2
                        char_cnt = char_cnt+len(paras[pi])
                        pi+=1
                        while(pi<len(paras) and reactions[ri][1] > char_cnt):
                            para_labels[pi] = 1
                            char_cnt = char_cnt+len(paras[pi])
                            pi+=1

                        ri += 1
                    else:
                        char_cnt = char_cnt+len(paras[pi])
                        pi+=1   


                cls =[]
                s = 0
                for p in paras:
                    s = s+len(p)
                    cls.append(s)

                all_paras.extend(paras)
                all_labels.extend(para_labels)
                documents.extend([tf[:-4]]*len(paras))
                cumm_len.extend(cls)
#                 df = pd.DataFrame({'paras': paras, 'labels':para_labels, 'cumm_chars':cumm_len})

#         break
    df = pd.DataFrame({'paras': all_paras, 'labels':all_labels, 'cls': cumm_len, 'document':documents})
    return df
                            
#                 labels[tf[:-4]] = para_labels
#                 reactions_per_document.append(rpd)
    
#     return labels, reactions_per_document

In [230]:
df_val = labelIOBParas(train_path, val_ann_files, val_text_files)

EP1427423B9.txt
EP3257527A1.txt
EP3284743A1.txt
EP3305788A1.txt
EP3733671A1.txt
US10633340B1.txt
US10730885B1.txt
Fishy:
 &  & (OCH3)3C6H2 &  &  &  &  &  &  & 
[' &  & (OCH3)3C6H2 &  &  &  &  &  &  &']
Fishy:
 &  & (OCH3)3C6H2 &  &  &  &  &  &  & 
[' &  & (OCH3)3C6H2 &  &  &  &  &  &  &']
Fishy:
floxacin &  &  &  &  &  &  & 
['floxacin &  &  &  &  &  &  &']
Fishy:
conazole &  &  &  &  &  &  & 
['conazole &  &  &  &  &  &  &']
Fishy:
floxacin &  &  &  &  &  &  & 
['floxacin &  &  &  &  &  &  &']
Fishy:
conazole &  &  &  &  &  &  & 
['conazole &  &  &  &  &  &  &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
US20180016260A1.txt
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
US20180037547A1.txt
Fishy:
 & 
[' &']
Fishy:
 & Compound &  &  & 
[' & Compound &  &  &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & Compound &  &  & 
[' & Compound &  &  &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 &  &  &  &  & Color & 
[' &  &  &  &  & Colo

In [231]:
df_val.to_csv("val_iob_para.csv", index = False) 
print(len(df_val))

13543


In [227]:
train_df = labelIOBParas(train_path, train_ann_files, train_text_files)
train_df.to_csv("train_iob_para.csv", index = False) 
print(len(train_df))

EP1175383B1.txt
Fishy:
Example 160 omitted & 
['Example 160 omitted &']
EP2474544B1.txt
Fishy:
(IAA) & 98% & 9% &  &  & 
['(IAA) & 98% & 9% &  &  &']
EP2697246B1.txt
EP3202759A1.txt
EP3255042A2.txt
Fishy:
Example 61 & 0.4 & 0.5 &  & 0.9 & 
['Example 61 & 0.4 & 0.5 &  & 0.9 &']
Fishy:
Example 66 & 0.6 & 1.0 &  & 0.3 & 
['Example 66 & 0.6 & 1.0 &  & 0.3 &']
Fishy:
Example 1 & 0.15 & 0.15 & 0.28 & 0.85 &  &  & 
['Example 1 & 0.15 & 0.15 & 0.28 & 0.85 &  &  &']
Fishy:
Example 3 & 1.15 & 0.88 & 1.43 & 2.64 &  &  & 
['Example 3 & 1.15 & 0.88 & 1.43 & 2.64 &  &  &']
Fishy:
Example 5 & 3.59 &  &  &  &  &  & 
['Example 5 & 3.59 &  &  &  &  &  &']
Fishy:
Example 6 & 2.30 &  &  &  &  &  & 
['Example 6 & 2.30 &  &  &  &  &  &']
Fishy:
Example 7 & 0.67 &  &  &  &  &  & 
['Example 7 & 0.67 &  &  &  &  &  &']
Fishy:
Example 8 & 0.01 & 0.01 & 0.01 & 0.06 & 0.19 &  & 
['Example 8 & 0.01 & 0.01 & 0.01 & 0.06 & 0.19 &  &']
Fishy:
Example 9 & 0.20 & 0.09 & 0.20 & 0.63 &  &  & 
['Example 9 & 0.20 & 0.09 & 

US10844022B1.txt
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
US10868254B2.txt
Fishy:
 & 
[' &']
Fishy:
 & phosphorescent green & 
[' & phosphorescent green &']
Fishy:
 & ITO/p-H-1/TCTA & 0 & 
[' & ITO/p-H-1/TCTA & 0 &']
Fishy:
 & ITO/p-TCTA/TCTA & +38 & +5 & 
[' & ITO/p-TCTA/TCTA & +38 & +5 &']
Fishy:
 & 
[' &']
US20180000077A1.txt
Fishy:
 & Sorbitan (Sorb) & 
[' & Sorbitan (Sorb) &']
Fishy:
 & CIP &  &  & 
[' & CIP &  &  &']
US20180002337A1.txt
US20180006248A1.txt
Fishy:
 & (1.95), 696 & 735 &  &  &  &  &  & 
[' & (1.95), 696 & 735 &  &  &  &  &  &']
Fishy:
 & (1.81) &  &  &  &  &  &  & 
[' & (1.81) &  &  &  &  &  &  &']
Fishy:
 & (1.65) & 852 &  &  &  &  &  & 
[' & (1.65) & 852 &  &  &  &  &  &']
Fishy:
 &  & TA & JSC &  &  & 
[' &  & TA & JSC &  &  &']
US20180009741A1.txt
Fishy:
 &  &  & Gram-negative & 
[' &  &  & Gram-negative &']
US20180009776A1.txt
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
US20180009796A1.txt
US20180016278A1.txt
Fishy:
(2S)-2-((1-(3,4-Dimethoxyphenyl)imidazo[1,2-α]quinoxalin-

Fishy:
 & E. coli & E. coli & Human & Human &  &  & B. subtilis &  &  & 
[' & E. coli & E. coli & Human & Human &  &  & B. subtilis &  &  &']
US20180083196A1.txt
Build error
Fishy:
<in-line-formulae>[Ar301]xb11-[(L301)xb1-R301]xb21.  <Formula 301></in-line-formulae>
['<in-line-formulae>[Ar301]xb11-[(L301)xb1-R301]xb21.', '<Formula 301></in-line-formulae>']
Build error
Fishy:
<in-line-formulae>[Ar601]xe11-[(L601)xe1-R601]xe21.  <Formula 601></in-line-formulae>
['<in-line-formulae>[Ar601]xe11-[(L601)xe1-R601]xe21.', '<Formula 601></in-line-formulae>']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
US20180086775A1.txt
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
US20180093962A1.txt
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
US20180098952A1.txt
Fishy:
 &  &  & kinact/KI (mM−1 & 
[' &  &  & kinact/KI (mM−1 &']
Fishy:
 & A. Data Collection & 
[' & A.', ' Data Collection &']
Fishy:
 & 
[' &']
US20180102489A1.txt
Fishy:
 & 
[' &']


US20200308101A1.txt
Fishy:
pCF3 & H & H & CF3 & 
['pCF3 & H & H & CF3 &']
Fishy:
pCl & H & H & Cl & 
['pCl & H & H & Cl &']
Fishy:
pDMA & H & H & N(CH3)2 & 
['pDMA & H & H & N(CH3)2 &']
Fishy:
pDPA & H & H & N(Ph)2 & 
['pDPA & H & H & N(Ph)2 &']
Fishy:
pF & H & H & F & 
['pF & H & H & F &']
Fishy:
pMe & H & H & Me & 
['pMe & H & H & Me &']
Fishy:
pNO2 & H & H & NO2 & 
['pNO2 & H & H & NO2 &']
Fishy:
pOMe & H & H & OMe & 
['pOMe & H & H & OMe &']
Fishy:
BrOMe & Br & H & OMe & 
['BrOMe & Br & H & OMe &']
Fishy:
OMe2 & OMe & H & OMe & 
['OMe2 & OMe & H & OMe &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 &  & collection & 
[' &  & collection &']
Fishy:
 &  & theta = 66.41° & 
[' &  & theta = 66.41° &']
Fishy:
dimensions & b = 7.5218(2) Å & transmission & 
['dimensions & b = 7.5218(2) Å & transmission &']
Fishy:
 & γ = 90.00° & parameters & 
[' & γ = 90.00° & parameters &']
Fishy:
coefficient &  & peak and hole & 
['coefficient &  & peak and hole &']
Fi

In [202]:
df = labelIOBParas(cv_path, dev_ann_files, dev_text_files)

EP3284738A1.txt
[(53986, 54456), (54457, 55090), (55091, 55550), (55756, 56165), (56432, 56839), (57144, 57464), (57685, 58000), (58182, 58575), (58772, 59067), (59284, 59579), (59760, 60075), (60277, 60671), (60886, 61172), (61465, 61825), (62099, 62389), (62631, 62995), (63183, 63506), (63772, 64168), (64417, 64759), (65021, 65436), (65712, 66150), (66324, 66640), (66855, 67334), (67605, 67999), (68175, 68569), (68746, 69112), (69327, 69732), (70045, 70391), (70606, 71040), (71235, 71691), (71887, 72298), (72493, 72883), (73063, 73528), (73821, 74315), (74608, 74978), (75173, 75543), (75738, 76112), (76287, 76661), (76836, 77202), (77397, 77781), (77976, 78343), (78553, 78915), (79110, 79472), (79667, 80050), (80225, 80606), (80781, 81154), (81364, 81710), (81905, 82267), (82447, 82808), (82988, 83372), (83552, 83914), (84142, 84504), (84718, 85080), (85293, 85676), (85889, 86248), (86462, 86810), (87024, 87390), (87600, 87920), (88130, 88529), (88837, 89197), (89392, 89707), (89902,

1.6 × 10−5 & 19 ± 1 & 23 ± 2 & 17 ± 8 &  & 
['1.6 × 10−5 & 19 ± 1 & 23 ± 2 & 17 ± 8 &  &']
Fishy:
5.6 × 10−5 & 19 ± 1 & 18 ± 10 & 25 ± 5 &  & 
['5.6 × 10−5 & 19 ± 1 & 18 ± 10 & 25 ± 5 &  &']
Fishy:
1.6 × 10−4 & 13 ± 1 & 16 ± 1 & 5 ± 3 &  & 
['1.6 × 10−4 & 13 ± 1 & 16 ± 1 & 5 ± 3 &  &']
Fishy:
4.8 × 10−5 & 19 ± 1 & 22 ± 2 & 30 ± 2 &  & 
['4.8 × 10−5 & 19 ± 1 & 22 ± 2 & 30 ± 2 &  &']
Fishy:
1.6 × 10−4 & 16 ± 1 & 20 ± 1 & 19 ± 2 &  & 
['1.6 × 10−4 & 16 ± 1 & 20 ± 1 & 19 ± 2 &  &']
Fishy:
8.0 × 10−5 & 24 ± 2 & 29 ± 1 & 25 ± 2 &  & 
['8.0 × 10−5 & 24 ± 2 & 29 ± 1 & 25 ± 2 &  &']
Fishy:
1.6 × 10−4 & 17 ± 1 & 22 ± 2 & 15 ± 2 &  & 
['1.6 × 10−4 & 17 ± 1 & 22 ± 2 & 15 ± 2 &  &']
Fishy:
 & Viable cells (%) & 
[' & Viable cells (%) &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
US20180072767A1.txt
[(59729, 61512), (62341, 63985), (64555, 65265), (66423, 66701), (67506, 67789), (68705, 68915), (69509, 69797), (70579, 71079), (71906, 72532), (73516, 74126), (109675, 111178), (111776, 112307), (112944, 113

US20200164075A1.txt
[(75705, 76694), (76695, 76808), (77403, 77523), (78079, 78187), (78724, 79546), (79547, 79706), (80401, 80563), (81373, 81523), (82220, 82362), (83104, 83265), (83994, 84150), (84951, 85109), (85875, 86028), (86815, 86978), (87635, 87787), (88456, 89163), (89164, 89327), (90015, 91086)]
Fishy:
 &  & JB7 H-bonding & JB7 Post or & 
[' &  & JB7 H-bonding & JB7 Post or &']
US20200165244A1.txt
[(109713, 110413), (110620, 111154), (111292, 111750), (111869, 112336), (112426, 113160), (113378, 114027), (114688, 114700), (114702, 114718), (114724, 114772), (114778, 114826), (115764, 116398), (117097, 117115), (117117, 117135), (117399, 118035), (118267, 118478), (118749, 119245), (119246, 119930), (120722, 120742), (120744, 120762), (121319, 121721)]
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 
[' &']
Fishy:
 & 

In [187]:
df.to_csv("test_iob_para.csv", index = False) 

In [9]:
with open("C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\dev\\EP3284738A1.txt", 'r',encoding="utf-8" ) as fp:
    data = fp.read().rstrip()
    print(data[55881: 56165])

2-fluoro-5-iodobenzoic acid was replaced by 2-chloro-5-iodobenzoic acid, and 4-fluoroaniline was replaced by 7-oxa-2-azaspiro [3.5] nonane, while the remaining raw materials, reagents and the preparation method were the same as those in Example 1 to give the product ZD002, yield 80%.


In [2]:
with open("C:\\Users\\meais\\Documents\\CMU\\Independent Study\\ReactionExtraction\\data\\dev\\EP3284738A1.ann", 'r',encoding="utf-8" ) as fp:
    data = fp.read()
    ds = data.split('\n')
    for d in ds:
        print(d)

T118	REACTION_SPAN 89392 89707	Example 59 The preparation of 2-fluoro-N-(4-fluorophenyl)-5-((2-methyloxazol-4-yl) ethynyl) benzamide (ZD090) 2-ethynylpyridine was replaced by 2-methyl-4-ethynyloxazole, while the remaining raw materials, reagents and the preparation method were the same as those in Example 1 to give the product ZD090, yield 80%.
T18	REACTION_SPAN 59760 60075	Example 9 The preparation of N-(4-fluorophenyl)-2-methyl-5-(pyridine-2-ethynyl) benzamide (ZD040) 2-fluoro-5-iodobenzoic acid was replaced by 2-methyl-5-iodobenzoic acid, while the remaining raw materials, reagents and the preparation method were the same as those in Example 1 to give the product ZD040, yield 80%.
T65	CUE 73967 74315	2-ethynylpyridine was replaced by 4-ethynyl-2-methylthiazole, 2-fluoro-5-iodobenzoic acid was replaced by 2-trifluoromethyl-5-iodo-picolinic acid, 4-fluoroaniline was replaced by 3,4-dihydroisoquinoline, while the remaining raw materials, reagents and the preparation method were the sam

In [46]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [6]:
l = [(2,1),(3,1),(4,4),(2,0)]
l = sorted(l)
l

[(2, 0), (2, 1), (3, 1), (4, 4)]