<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#create-a-dataset-with-two-stars-at-the-end-of-both-entities" data-toc-modified-id="create-a-dataset-with-two-stars-at-the-end-of-both-entities-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>create a dataset with two stars at the end of both entities</a></span></li><li><span><a href="#Create-a-dataset-with-a-star-at-the-end-of-first-entity" data-toc-modified-id="Create-a-dataset-with-a-star-at-the-end-of-first-entity-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Create a dataset with a star at the end of first entity</a></span></li><li><span><a href="#Read-translated-datasets-in-German" data-toc-modified-id="Read-translated-datasets-in-German-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Read translated datasets in German</a></span></li><li><span><a href="#Process-translated-german-sentences" data-toc-modified-id="Process-translated-german-sentences-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Process translated german sentences</a></span></li><li><span><a href="#create-dictionary-of-label-pairs" data-toc-modified-id="create-dictionary-of-label-pairs-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>create dictionary of label pairs</a></span></li><li><span><a href="#Filter" data-toc-modified-id="Filter-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Filter</a></span></li></ul></div>

In [50]:
import pandas as pd
import numpy as np
import re

In [51]:
# Load the raw English SemEval test file, one list item per physical line
# (line endings are kept).
with open('data/semeval/test_file_full.txt') as source_file:
    eng_small = list(source_file)

In [52]:
# The file is read in strides of four lines: offset 0 holds the sentence line,
# offset 1 holds the relation label.
sentences_raw = eng_small[0::4]
labels = eng_small[1::4]

# Keep only the text between the opening `\t"` and the closing `"\n` of each sentence line.
sentences_ent = []
for raw_line in sentences_raw:
    quoted_parts = re.findall(r'(?<=\t")(.*)(?="\n)', raw_line)
    sentences_ent.append(quoted_parts[0])

In [None]:
def extract_entities(lst_with_tags):
    """Collect the text enclosed in <e1>…</e1> and <e2>…</e2> for every sentence.

    Parameters
    ----------
    lst_with_tags : iterable of str
        Sentences that may contain ``<e1>…</e1>`` / ``<e2>…</e2>`` markup.

    Returns
    -------
    tuple of (list of list of str, list of list of str)
        Two lists aligned with the input. Each item is the list of all spans
        found for that entity in the sentence; it is empty when the sentence
        has no such tag pair.
    """
    # Materialise once so that a generator argument is not exhausted by the first pass.
    sentences = list(lst_with_tags)
    # Non-greedy `.*?` stops at the nearest closing tag. A greedy `.*` would run from
    # the first opening tag to the LAST closing tag when a tag occurs more than once.
    e1_de = [re.findall(r'(?<=<e1>)(.*?)(?=</e1>)', string) for string in sentences]  # get entity 1
    e2_de = [re.findall(r'(?<=<e2>)(.*?)(?=</e2>)', string) for string in sentences]  # get entity 2
    return e1_de, e2_de

In [4]:
# Preprocess the English dataset: strip trailing whitespace from every sentence,
# then pull out the tagged entity spans.
sentences_ent = list(map(str.rstrip, sentences_ent))
e1, e2 = extract_entities(sentences_ent)

### create a dataset with two stars at the end of both entities 

In [5]:
# Replace entity markers with stars for correct translation:
# opening tags are removed, each closing tag becomes a single '*'.
sentences_ = [
    st.replace('<e1>', '')
      .replace('<e2>', '')
      .replace('</e1>', '*')
      .replace('</e2>', '*')
    for st in sentences_ent
]

In [6]:
# Save the star-marked English sentences (input for the translation step).
# The encoding is given explicitly so the file does not depend on the platform
# default; the translated counterparts are read back as UTF-8.
with open('data/semeval_de_new/test_sentences_en_stars.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in sentences_)

### Create a dataset with a star at the end of first entity

In [None]:
# Replace entity markers for translation, this time marking only the first entity:
# `</e1>` becomes '*', every other tag is dropped.
sentences_1 = []
for st in sentences_ent:
    st = st.replace('<e1>', '')
    st = st.replace('<e2>', '')
    st = st.replace('</e1>', '*')
    st = st.replace('</e2>', '')
    sentences_1.append(st)

In [None]:
# Save the one-star English sentences (input for the translation step).
# The encoding is given explicitly so the file does not depend on the platform
# default; the translated counterparts are read back as UTF-8.
with open('data/semeval_de_new/test_sentences_en_star1.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in sentences_1)

### Read translated datasets in German 

**Note**: This step should eventually be replaced with a call to a working DeepL translation API. However, the API is not freely available, so the translation is currently done manually and the translated files are read from disk below.

In [8]:
# Read the translated German sentences, one list item per line (line endings kept).

# two-star variant: a star after each of the two entities
with open('data/semeval_de_new/test_sentences_de_stars.txt', 'r', encoding = 'utf-8') as stars_file:
    de_small = list(stars_file)

# one-star variant: a star after the first entity only
with open('data/semeval_de_new/test_sentences_de_star1.txt', 'r', encoding = 'utf-8') as star1_file:
    de_small1 = list(star1_file)

### Process translated german sentences 

In [12]:
# replace * with entity end tags

def return_entity_tags(lst_with_stars):
    """Turn the '*' markers of translated sentences back into <e1>/<e2> tags.

    For every sentence: trailing whitespace and double quotes are removed, the
    first '*' becomes ``</e1>`` and every remaining '*' becomes ``</e2>``.
    The opening tag is then inserted in place of the space in front of the
    token that directly precedes the closing tag.

    Parameters
    ----------
    lst_with_stars : iterable of str
        Sentences in which entity ends are marked with '*'.

    Returns
    -------
    list of str
        Re-tagged sentences. Each one starts with the single space that is
        prepended during processing; it is kept to preserve the existing output.
    """
    sentences_de = []

    for sent_de in lst_with_stars:

        sent = sent_de.rstrip()
        # delete special quotation marks
        sent = re.sub(r'\"', "", sent)
        # add a space at the beginning of each sentence so that an entity that is
        # the very first token still has a preceding space for the patterns below
        sent = re.sub(r'^', ' ', sent)

        # replace e1: only the FIRST star closes entity 1.
        # `count` is passed by keyword; passing it positionally is deprecated.
        sent = re.sub(r'\*', "</e1>", sent, count=1)
        # NOTE(review): inside the character class, `\'-\/` is a RANGE (apostrophe
        # through slash), so it also admits ( ) * + , and . — confirm this is intended.
        sent = re.sub(r'[ ](?=[a-zA-Z\u00C0-\u017F\'-\/0-9\&]+<\/e1>)', ' <e1>', sent)

        # replace e2: every remaining star closes entity 2
        sent = re.sub(r'\*', "</e2>", sent)
        sent = re.sub(r'[ ](?=[a-zA-Z\u00C0-\u017F\'-\/0-9\&]+<\/e2>)', ' <e2>', sent)

        sentences_de.append(sent)

    return sentences_de

In [None]:
# Re-insert <e1>/<e2> tags into both translated variants.
sentences_de = return_entity_tags(de_small) # two-star variant: stars at the end of both entities
sentences_de1 = return_entity_tags(de_small1) # one-star variant: star after the 1st entity only

In [None]:
# test: eyeball every re-tagged two-star sentence next to its index
for idx, tagged_sentence in enumerate(sentences_de):
    print(idx, tagged_sentence)

In [16]:
# Entity spans per sentence: *_de come from the two-star variant,
# *_de1 from the one-star variant.
e1_de, e2_de = extract_entities(sentences_de)
e1_de1, e2_de1 = extract_entities(sentences_de1)

In [17]:
# indices of two-star sentences in which no e1 span was found
e1_missing = [idx for idx, spans in enumerate(e1_de) if not spans]

In [18]:
# indices of two-star sentences in which no e2 span was found
e2_missing = [idx for idx, spans in enumerate(e2_de) if not spans]

In [19]:
# how many two-star sentences ended up without a tagged e1 / e2
len(e1_missing), len(e2_missing)

(0, 140)

In [41]:
# Compare e2 from the two-star variant with e1 from the one-star variant to
# identify changed labels: if both runs tagged the same span, the span the
# two-star run calls e2 is the one the one-star run calls e1.
# - `and` replaces the bitwise `&`.
# - The leading `e2_de[i]` check skips sentences without an e2 span; otherwise two
#   empty match lists compare equal and are wrongly flagged.
# - 'Other' has no direction, so it is never flagged.
changed_labels = [
    i for i in range(len(e2_de))
    if e2_de[i] and e2_de[i] == e1_de1[i] and labels[i] != 'Other\n'
]

In [42]:
# number of sentences flagged in changed_labels
len(changed_labels)

73

In [None]:
# test: show label, tagged English sentence and raw German translation for every flagged index
for idx in changed_labels:
    print(idx, labels[idx], sentences_ent[idx], de_small[idx])

In [34]:
# spot check: label stored at index 191
labels[191]

'Other\n'

### create dictionary of label pairs

In [44]:
# distinct label strings present in the data (each still carries its trailing newline)
set(labels)

{'Cause-Effect(e1,e2)\n',
 'Cause-Effect(e2,e1)\n',
 'Component-Whole(e1,e2)\n',
 'Component-Whole(e2,e1)\n',
 'Content-Container(e1,e2)\n',
 'Content-Container(e2,e1)\n',
 'Entity-Destination(e1,e2)\n',
 'Entity-Destination(e2,e1)\n',
 'Entity-Origin(e1,e2)\n',
 'Entity-Origin(e2,e1)\n',
 'Instrument-Agency(e1,e2)\n',
 'Instrument-Agency(e2,e1)\n',
 'Member-Collection(e1,e2)\n',
 'Member-Collection(e2,e1)\n',
 'Message-Topic(e1,e2)\n',
 'Message-Topic(e2,e1)\n',
 'Other\n',
 'Product-Producer(e1,e2)\n',
 'Product-Producer(e2,e1)\n'}

In [45]:
# Lookup table: relation label -> same relation with the argument order swapped.
# Keys and values keep the trailing newline, exactly as the labels are read from file.
_directed_relations = (
    'Cause-Effect',
    'Component-Whole',
    'Content-Container',
    'Entity-Destination',
    'Entity-Origin',
    'Instrument-Agency',
    'Member-Collection',
    'Message-Topic',
    'Product-Producer',
)

opp_dir_rel = {}
for _relation in _directed_relations:
    _forward = _relation + '(e1,e2)\n'
    _backward = _relation + '(e2,e1)\n'
    opp_dir_rel[_forward] = _backward
    opp_dir_rel[_backward] = _forward

# 'Other' carries no direction and maps to itself
opp_dir_rel['Other\n'] = 'Other\n'

In [53]:
# Build the German label list on a COPY of the English labels.
# A plain `labels_new = labels` only creates a second name for the same list, so the
# in-place flip would also overwrite `labels`, and re-running the cell would flip the
# labels back again.
labels_new = list(labels)
for i in changed_labels:
    # always derive the flipped label from the untouched original
    labels_new[i] = opp_dir_rel[labels[i]]

### Filter 

In [25]:
# Drop every sentence (and its label) for which no e2 span could be tagged.
# Membership is tested against a set; `i not in <list>` inside the comprehension
# would rescan the whole list for every sentence.
e2_missing_idx = set(e2_missing)
sentences_de_final = [sentences_de[i] for i in range(len(sentences_de)) if i not in e2_missing_idx]
labels_de_final = [labels_new[i] for i in range(len(sentences_de)) if i not in e2_missing_idx]

In [54]:
# sanity check: sentences and labels must have the same length after filtering
len(sentences_de_final), len(labels_de_final)

(2577, 2577)

In [58]:
# One record per kept sentence: running index and quoted sentence on the first line,
# the label (which brings its own newline) on the second, then a 'Comment: ' line
# and an empty line.
combined = []
for idx in range(len(sentences_de_final)):
    record = '{}   "{}"\n{}Comment: \n\n'.format(idx, sentences_de_final[idx], labels_de_final[idx])
    combined.append(record)

In [59]:
# write the final German records to disk
with open('data/semeval_de_new/final/test_file_de_final.txt', 'w', encoding = 'utf-8') as out_file:
    out_file.writelines(combined)

In [69]:
# For review: also save the German result side by side with the English original.
# The English side is filtered with the same indices as the German side, so positions
# line up. Membership is tested against a set; `i not in <list>` inside the
# comprehension would rescan the whole list for every sentence.
e2_missing_idx = set(e2_missing)
sentences_en_final = [sentences_ent[i] for i in range(len(sentences_ent)) if i not in e2_missing_idx]
# NOTE(review): `labels` is expected to still hold the ORIGINAL English labels here;
# verify that it has not been modified in place upstream.
labels_en_final = [labels[i] for i in range(len(sentences_ent)) if i not in e2_missing_idx]

# Record layout (labels bring their own trailing newline):
#   <idx>   "<German sentence>"
#   <German label>
#   "<English sentence>"
#   Old Label: <English label>
#   Comment:
#   <empty line>
comb_review = [
    '{}   "{}"\n{}"{}"\nOld Label: {}Comment:\n\n'.format(
        i,
        sentences_de_final[i],
        labels_de_final[i],
        sentences_en_final[i],
        labels_en_final[i],
    )
    for i in range(len(sentences_de_final))
]

# write the review file
with open('data/semeval_de_new/review/test_for_review.txt', 'w', encoding = 'utf-8') as f:
    f.writelines(comb_review)