# In the name of Allah

### Path to the UpToDate paragraphs data

In [3]:
# Zip archive holding the paragraph table; it is extracted by the next cell.
utd_parag_addr = '/content/drive/MyDrive/UNIVERSITY/Term 3/NLP/NLP_Project/NLP_Final_Project/data/UptoDate_parags.zip'

In [4]:
# IPython shell escape: extract the archive (the recorded output shows it yields parag.csv).
!unzip '$utd_parag_addr'

Archive:  /content/drive/MyDrive/UNIVERSITY/Term 3/NLP/NLP_Project/NLP_Final_Project/data/UptoDate_parags.zip
  inflating: parag.csv               


### Package imports

In [5]:
import requests
import time
import pickle
import pandas as pd
import random
import numpy as np
import os
import re
from tqdm import tqdm

### Path to the pickle file where annotations are saved (if it already exists, it is loaded so annotation can resume)

In [11]:
# Checkpoint file: read at start-up if it exists, rewritten periodically and at the end of the run.
annot_pickle_addr = './annots_list.pickle'

### Path to a pickle file containing the list of paper IDs to annotate (`None` to annotate all papers)

In [9]:
# None (or a path that does not exist) disables filtering, so every paper is annotated.
portion_id_addr = None

### Function to query BERN2 server with exception handling.

In [12]:
def query_plain(text, url="http://bern2.korea.ac.kr/plain", timeout=120):
    """Send ``text`` to a BERN2 plain-text endpoint and return the decoded JSON.

    The request is repeated until it succeeds. Each failure is printed and
    followed by a one-second pause. Only failures whose message contains
    'Expecting value' (the reply could not be decoded as JSON) count toward
    the give-up limit; any other error, such as a connection problem or a
    timeout, is retried indefinitely.

    Args:
        text: Plain text to annotate.
        url: Address of the endpoint that receives the POST request.
        timeout: Seconds to wait for the server before the attempt is treated
            as failed and retried. Without it a stalled connection would
            block forever and never reach the retry logic.

    Returns:
        The decoded JSON reply, or ``{'annotations': []}`` after more than
        ten replies could not be decoded.
    """
    trys = 0
    while True:
        try:
            response = requests.post(url, json={'text': text}, timeout=timeout)
            return response.json()
        except Exception as e:
            # Deliberately broad: this is a best-effort retry loop, so every
            # failure is reported and the request is attempted again.
            print(f"Error in request: {e}")
            print('Sleep for 1 seconds')
            time.sleep(1)
            if 'Expecting value' in str(e):
                trys += 1
                # Strip 'www.<name>.<tld>' substrings before retrying.
                # NOTE(review): this shortens the text, so spans returned by a
                # later successful attempt refer to the stripped text.
                text = re.sub(r'www\.\w+\.\w+', '', text)
            if trys > 10:
                return {'annotations': []}

### Main procedure to annotate paragraphs

In [None]:
def _split_into_chunks(text, max_len=5000):
    """Yield ``(offset, chunk)`` pairs that cover ``text`` without gaps.

    Every chunk is shorter than ``max_len`` characters and satisfies
    ``text[offset:offset + len(chunk)] == chunk``, so a span reported for a
    chunk maps back to the full text by adding ``offset``. A chunk ends right
    after a '.' whenever one is available within the limit; a stretch with no
    '.' is cut at the length limit instead. ``max_len`` must be at least 2.
    """
    start = 0
    total = len(text)
    while start < total:
        if total - start < max_len:
            end = total
        else:
            # Last sentence end that still keeps the chunk under the limit.
            last_dot = text.rfind('.', start, start + max_len - 1)
            end = last_dot + 1 if last_dot >= start else start + max_len - 1
        yield start, text[start:end]
        start = end


# Load the portion_id file with the ids of the papers to annotate.
# Without such a file (no address given, or the file is missing) all papers are annotated.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
if portion_id_addr is not None and os.path.exists(portion_id_addr):
    with open(portion_id_addr, 'rb') as f:
        portion_id = pickle.load(f)
else:
    portion_id = None

# Load previously saved annotations so an interrupted run can continue.
if os.path.exists(annot_pickle_addr):
    with open(annot_pickle_addr, 'rb') as f:
        l = pickle.load(f)
else:
    l = []

# Ids of the papers that already have at least one saved annotation.
# A set gives constant-time membership tests in the loop below.
# NOTE(review): resuming works per paper. If a checkpoint is written while a
# paper with several paragraphs is only partly annotated and the run then
# stops, the remaining paragraphs of that paper are skipped on the next run.
proccesed_papers_id = {row[0] for row in l}

# Load uptodate paragraphs
df_iter = pd.read_csv('/content/parag.csv').iterrows()
i = 0  # requests sent since the last checkpoint
for ind, row in tqdm(df_iter):
    paper_id = row['paper_id']
    if paper_id in proccesed_papers_id:
        continue
    if portion_id is not None and paper_id not in portion_id:
        continue
    paragraph_id = row['parag_id']

    # After more than 180 requests: save progress and pause before continuing.
    if i > 180:
        i = 0
        with open(annot_pickle_addr, 'wb') as f:
            pickle.dump(l, f)
        print('Sleep for 5 seconds')
        time.sleep(5)

    text = row['text']
    if not isinstance(text, str):
        # pandas reads an empty cell as NaN (a float); there is nothing to annotate.
        continue

    # Paragraphs shorter than 5000 characters are sent in one request.
    # Longer ones are sent in sentence-aligned pieces; each piece is an exact
    # slice of the paragraph, so no sentence is lost and adding the slice
    # offset turns the returned spans into positions in the full paragraph.
    for offset, chunk in _split_into_chunks(text):
        annotations = query_plain(chunk)['annotations']
        i += 1
        for annot in annotations:
            span = annot['span']
            l.append([paper_id, paragraph_id, annot['mention'],
                      annot['id'], annot['obj'], annot['prob'],
                      offset + span['begin'], offset + span['end']])

# Last pickle dump
with open(annot_pickle_addr, 'wb') as f:
    pickle.dump(l, f)

### Create dataframe from annotations

In [None]:
# Column names, in the same order as the fields of each annotation row in `l`.
annotation_columns = [
    'paper_id', 'parag_id', 'mention', 'object_id',
    'object_type', 'prob', 'begin_ind', 'end_ind',
]
bern_df = pd.DataFrame(data=l, columns=annotation_columns)

### Get the unique mentions and save them as a pickle file

In [None]:
# Collect every distinct value of the 'mention' column as a plain list.
mention_values = pd.unique(bern_df['mention'])
unique_mentions = list(mention_values)

# Persist the list so it can be reloaded without rebuilding the dataframe.
with open('all_unique_mentions.pickle', mode='wb') as mentions_file:
    pickle.dump(unique_mentions, mentions_file)