### Creates IOB output labels and POS tags corresponding to each token in the input text ###

In [1]:
import pandas as pd
import numpy as np
import json
import re
import os, glob
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# Resources needed at runtime: 'punkt' backs word_tokenize, and the perceptron
# tagger model backs nltk.pos_tag (used by tokenize() below). Without the
# tagger download, pos_tag raises LookupError on a machine that has never
# fetched it.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead -- confirm against the installed
# NLTK version.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ankit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ankit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def tokenize(string):
    """Split ``string`` into NLTK word tokens and tag each one.

    Returns a pair ``(tokens, pos_tags)``: the tokens produced by
    ``word_tokenize`` and, in the same order, the tag component of each
    ``(token, tag)`` pair returned by ``nltk.pos_tag``.
    """
    tokens = word_tokenize(string)
    tagged = nltk.pos_tag(tokens)
    pos_tags = [tag for _, tag in tagged]
    return tokens, pos_tags

In [9]:
def annotate_tokens_by_position(text, position):
    """Tokenize ``text`` and give every token an IOB-style label and a POS tag.

    Text lying outside every keyphrase span is tokenized and labelled ``'O'``.
    Inside a keyphrase span the first token is labelled ``'B_<type>'`` and the
    remaining tokens ``'I_<type>'``, where ``<type>`` is the keyphrase's type,
    so the different keyphrase types stay distinguishable in the output.

    Args:
        text: the document, as a single string.
        position: sequence of keyphrase records indexable as
            ``(start, end, phrase, type)``; only ``start``, ``end`` and
            ``type`` are read. Records are processed in the order given, so
            they are expected to be sorted by ``start``. A record whose span
            touches characters already consumed by an earlier keyphrase is
            skipped.

    Returns:
        ``(text_tokens, text_annotations, text_pos_tags)`` -- three parallel
        lists holding the tokens, their output labels and their POS tags.

    NOTE(review): ``end`` is treated as an inclusive index (the keyphrase is
    ``text[start:end + 1]``). brat standoff offsets are end-exclusive, which
    would pull one extra character into each keyphrase -- confirm against the
    annotation data before changing.
    """
    text_tokens = []
    text_annotations = []
    text_pos_tags = []
    # covered[i] is True once character i has been emitted, either as
    # outside text or as part of a keyphrase.
    covered = np.zeros(len(text), dtype=bool)
    start = 0  # first character not yet consumed

    def _append_outside(segment):
        # Tokenize a stretch of non-keyphrase text and label every token 'O'.
        tokens, tags = tokenize(segment)
        for token, tag in zip(tokens, tags):
            if token == '':
                continue
            text_tokens.append(token)
            text_annotations.append('O')
            text_pos_tags.append(tag)

    for span in position:
        span_start, span_end = span[0], span[1]

        # Emit the plain text between the previous keyphrase and this one.
        if span_start - start >= 1:
            covered[start:span_start] = True
            _append_outside(text[start:span_start])

        # Overlapping / nested keyphrase: its characters were already emitted.
        if covered[span_start:span_end + 1].any():
            continue

        covered[span_start:span_end + 1] = True
        label = span[3]
        keyphrase_tokens, tags = tokenize(text[span_start:span_end + 1])
        for i, (token, tag) in enumerate(zip(keyphrase_tokens, tags)):
            text_annotations.append(('B_' if i == 0 else 'I_') + label)
            text_tokens.append(token)
            text_pos_tags.append(tag)

        start = span_end + 1

    # Emit whatever follows the last keyphrase (or the whole text when there
    # are no keyphrases); previously these tokens were silently dropped.
    if start < len(text):
        _append_outside(text[start:])

    return text_tokens, text_annotations, text_pos_tags
            

In [None]:
# Which split is being processed; used as the prefix of the output file names.
mode = 'train'
# Directory holding the input files for that split.
data_dir = 'data/train2/'

## To process the test split instead of the train split, uncomment:
# data_dir = 'data/semeval_articles_test/'
# mode = 'test'

In [None]:
# Collect the extension-less path of every .txt document in data_dir.
# os.path.splitext strips only the final extension, so a name that itself
# contains dots (e.g. "doc.v2.txt") is not truncated at its first dot.
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# the generated arrays reproducible across runs and machines.
filenames = [
    os.path.join(data_dir, os.path.splitext(file)[0])
    for file in sorted(os.listdir(data_dir))
    if file.endswith(".txt")
]
len(filenames)

In [10]:
def _parse_annotation_line(line):
    """Parse one .ann line into ``(start, end, phrase, type)``.

    Returns ``None`` for blank lines and for lines whose id does not start
    with 'T'. For a discontinuous annotation (several ``start end`` pairs
    separated by ';' in the span field) the start of the first fragment and
    the end of the last fragment are used.
    """
    line_split = line.split()
    if not line_split or line_split[0][0] != 'T':
        return None

    tab_fields = line.rstrip("\n").split("\t")
    # Only the span field decides whether the annotation is discontinuous;
    # a ';' inside the keyphrase text itself must not trigger that branch.
    span_field = tab_fields[1] if len(tab_fields) > 1 else ""
    if ';' not in span_field:
        return (int(line_split[2]), int(line_split[3]),
                " ".join(line_split[4:]), line_split[1])

    spans = span_field.split(";")
    first_fragment = spans[0].split(" ")   # [type, start, end]
    last_fragment = spans[-1].split(" ")   # [start, end]
    return (int(first_fragment[1]), int(last_fragment[1]),
            tab_fields[2], first_fragment[0])


# One entry per successfully processed document, kept in parallel.
annotations = []
text_sentences = []
pos_tags = []
for file in filenames:
    # Reset per file so the error report below never prints an unbound name
    # or a line left over from the previous document.
    line = None
    try:
        # Context managers close both handles even when parsing fails.
        with open(file + '.txt', 'r', encoding="utf8") as text_file, \
                open(file + '.ann', 'r', encoding="utf8") as annotation_file:
            position = []
            for line in annotation_file:
                record = _parse_annotation_line(line)
                if record is not None:
                    position.append(record)
            text = text_file.read().replace("\n", " ")

        # annotate_tokens_by_position walks the spans left to right.
        position = sorted(position, key=lambda tup: tup[0])

        text_tokens, text_annotations, text_tags = annotate_tokens_by_position(text, position)
        annotations.append(text_annotations)
        text_sentences.append(text_tokens)
        pos_tags.append(text_tags)
    except Exception as e:
        # Best effort: report the offending document and move on to the next.
        print(line)
        print(file)
        print(e)

In [8]:
# Persist the per-document label, token and POS-tag lists.
# The inner lists differ in length from document to document, so the arrays
# must be built with dtype=object: recent NumPy versions refuse to create an
# array from ragged nested sequences otherwise. Object arrays are pickled,
# hence allow_pickle=True (np.load needs allow_pickle=True as well).
with open('data/' + mode + '_tokens_annotations.npy', 'wb') as f:
    np.save(f, np.array(annotations, dtype=object), allow_pickle=True)
with open('data/' + mode + '_tokens_sentences.npy', 'wb') as f:
    np.save(f, np.array(text_sentences, dtype=object), allow_pickle=True)
with open('data/'+ mode + '_pos_tag.npy', 'wb') as f:
    np.save(f, np.array(pos_tags, dtype=object), allow_pickle=True)