Skip to content

Commit

Permalink
appending pos tags
Browse files Browse the repository at this point in the history
  • Loading branch information
Hrant-Khachatrian committed Apr 20, 2018
1 parent d67f5cc commit a29ee89
Showing 1 changed file with 29 additions and 2 deletions.
31 changes: 29 additions & 2 deletions add_sdg.py
Expand Up @@ -6,6 +6,7 @@
import json
import os
from subprocess import check_call
from tqdm import tqdm


def sdg_id(sdg):
Expand All @@ -15,6 +16,30 @@ def sdg_id(sdg):
def remove_sdg_id(sdg):
    """Return ``sdg`` without its first line (the ID line).

    Everything after the first newline is returned unchanged. If ``sdg``
    contains no newline at all, the result is an empty string.
    """
    # partition() splits once at the first '\n'; the tail is everything
    # after the ID line.
    _id_line, _newline, remainder = sdg.partition('\n')
    return remainder


def parse_sdg_line(sdg_line):
    """Split one tab-separated SDG line into its first six fields.

    Returns a tuple ``(id, word, lemma, pos, parent_id, edge)`` where
    ``id`` (column 0) and ``parent_id`` (column 4) are converted to ``int``
    and the remaining fields are kept as strings. Columns beyond the sixth
    are ignored.

    Raises ``ValueError`` if an integer column is not numeric and
    ``IndexError`` if the line has too few columns.
    """
    columns = sdg_line.split('\t')
    return (
        int(columns[0]),   # token id
        columns[1],        # word
        columns[2],        # lemma
        columns[3],        # POS tag
        int(columns[4]),   # parent id
        columns[5],        # edge label
    )


def append_tokenized_text(sample):
    """Add token and POS-tag lists derived from ``sample['sdg']``.

    Each non-blank line of ``sample['sdg']`` is parsed with
    ``parse_sdg_line``; its word is appended to ``tokenized_text`` and its
    POS tag to ``pos_tags``. Both lists are stored on ``sample`` under the
    keys ``'tokenized_text'`` and ``'pos_tags'``.

    Args:
        sample: A dict with an ``'sdg'`` string of newline-separated,
            tab-delimited lines.

    Returns:
        The same ``sample`` dict, modified in place.
    """
    tokenized_text = []
    pos_tags = []
    for line in sample['sdg'].split('\n'):
        # An empty graph or a trailing newline produces empty strings from
        # split('\n'); they carry no token and cannot be parsed, so skip them.
        if not line.strip():
            continue
        _, word, _, pos, _, _ = parse_sdg_line(line)
        tokenized_text.append(word)
        pos_tags.append(pos)
    sample['tokenized_text'] = tokenized_text
    sample['pos_tags'] = pos_tags
    return sample

def main():
parser = argparse.ArgumentParser()
parser.add_argument('--input_text', '-it', required=True, type=str)
Expand All @@ -25,7 +50,7 @@ def main():
args = parser.parse_args()

sdg_output = os.path.join(args.tmp_dir, 'output.conll')

if args.model == 'stanford':
check_call(['bash', 'submodules/conll_parser/parse_script.sh',
args.input_text, sdg_output])
Expand All @@ -45,13 +70,15 @@ def main():
with io.open(args.input_json, 'r', encoding='utf-8') as f:
data = json.load(f)

for i in range(len(data)):
for i in tqdm(range(len(data))):
id = data[i]['id']
if id not in sdg_dict:
print("Sentence with ID='{}' has no SDG graph".format(id))
continue

data[i]['sdg'] = sdg_dict[id]
data[i] = append_tokenized_text(data[i])


with io.open(args.output_json, 'w', encoding='utf-8') as f:
dumps = json.dumps(data, indent=True)
Expand Down

0 comments on commit a29ee89

Please sign in to comment.