In [1]:
import pandas as pd

In [2]:
# Remove the comment lines altogether before loading the files as TSV.
# read_csv(comment='#') is not an option because '#' also occurs as a token.
def strip_comment_lines(src_path, dst_path):
    """Copy src_path to dst_path, skipping every line that starts with '#'.

    Both files are opened as UTF-8 inside a `with` block, so the input and
    the output handle are closed when the function returns, even if reading
    or writing fails part-way.
    """
    with open(src_path, 'r', encoding='utf-8') as infile, \
            open(dst_path, 'w', encoding='utf-8') as outfile:
        outfile.writelines(line for line in infile if not line.startswith('#'))


strip_comment_lines('../data/en_ewt-up-train.conllu', '../data/train_without_comments.tsv')
strip_comment_lines('../data/en_ewt-up-test.conllu', '../data/test_without_comments.tsv')

### Train set split

In [3]:
# yes there are up to 35 predicates in one sentence in the training data
header = ['ID','FORM','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC','UP:PRED','UP:ARGHEADS_1','UP:ARGHEADS_2','UP:ARGHEADS_3','UP:ARGHEADS_4','UP:ARGHEADS_5','UP:ARGHEADS_6','UP:ARGHEADS_7','UP:ARGHEADS_8','UP:ARGHEADS_9','UP:ARGHEADS_10','UP:ARGHEADS_11','UP:ARGHEADS_12','UP:ARGHEADS_13','UP:ARGHEADS_14','UP:ARGHEADS_15','UP:ARGHEADS_16','UP:ARGHEADS_17','UP:ARGHEADS_18','UP:ARGHEADS_19','UP:ARGHEADS_20','UP:ARGHEADS_21','UP:ARGHEADS_22','UP:ARGHEADS_23','UP:ARGHEADS_24','UP:ARGHEADS_25','UP:ARGHEADS_26','UP:ARGHEADS_27','UP:ARGHEADS_28','UP:ARGHEADS_29','UP:ARGHEADS_30','UP:ARGHEADS_31','UP:ARGHEADS_32','UP:ARGHEADS_33','UP:ARGHEADS_34','UP:ARGHEADS_35']
# header names taken from: 
# https://universaldependencies.org/format.html
# https://universalpropositions.github.io/

In [4]:
# Load the comment-free training file; the column names are supplied via
# `header` (names=...).
train_path = '../data/train_without_comments.tsv'
train_df = pd.read_csv(
    train_path,
    sep='\t',
    names=header,
    encoding='utf-8',
    # this quotechar is needed to avoid misinterpretation of doublequote tokens
    quotechar='№',
)

  train_df = pd.read_csv(train_path, sep='\t', names=header, encoding='utf-8',quotechar='№')


In [5]:
# remove rows with CopyOf= in the MISC column
# 'CopyOf=' is matched as a literal substring (regex=False); rows whose MISC
# is NaN are kept (na=False makes them count as "does not contain").
# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
train_df = train_df[~train_df['MISC'].str.contains('CopyOf=', na=False, regex=False)].copy()

In [6]:
# replace NaN values in predicate column with '_'
# The filled column is assigned back through DataFrame.assign instead of
# calling fillna(inplace=True) on train_df['UP:PRED']: that chained in-place
# call operates on a temporary Series, is deprecated, and leaves train_df
# unchanged when pandas Copy-on-Write is active.
train_df = train_df.assign(**{'UP:PRED': train_df['UP:PRED'].fillna('_')})

In [7]:
def sent(row, list_firsts):
    """Return the 1-based sentence number of `row` if it opens a sentence.

    row: a DataFrame row as passed by DataFrame.apply(axis=1); only its index
        label (row.name) is used.
    list_firsts: index labels of the sentence-initial rows, in order.

    Returns the 1-based position of row.name in list_firsts, or None when the
    row is not the first token of a sentence.
    """
    for ix, first in enumerate(list_firsts):
        if row.name == first:
            return ix + 1
    return None


# tokens that are at the beginning of each sentence
is_sentence_start = train_df['ID'] == 1
start_of_sent = train_df.index[is_sentence_start].tolist()
# get sentence IDs for each sentence: sentence-initial rows are numbered
# 1, 2, 3, ... and every other row is left NaN.
# This is the vectorised equivalent of
#     train_df.apply(lambda row: sent(row, start_of_sent), axis=1)
# which walks the whole start list once per row (rows x sentences comparisons).
train_df['Sent_ID'] = is_sentence_start.cumsum().where(is_sentence_start)

In [8]:
# move the sentence number column to the front
# adapted from: https://stackoverflow.com/questions/25122099/move-column-by-name-to-front-of-table-in-pandas
cols = list(train_df)
cols.insert(0, cols.pop(cols.index('Sent_ID')))
train_df = train_df.loc[:, cols]
# fill NaN values for all tokens that are not at the beginning of the sentence
# The forward-filled column is assigned back explicitly; calling
# ffill(inplace=True) on train_df.Sent_ID is a chained in-place update that
# does not reach train_df under pandas Copy-on-Write, which would leave NaN
# sentence IDs behind for the later groupby.
train_df = train_df.assign(Sent_ID=train_df['Sent_ID'].ffill())

In [9]:
# group data by sentences
# Group by the bare column name, not the one-element list ['Sent_ID']: with a
# length-1 list pandas >= 2.0 yields 1-tuples as group keys when iterating
# (so a check like `name == 1` never matches), and get_group(1) is deprecated
# in favour of get_group((1,)).
sentences = train_df.groupby('Sent_ID')

In [10]:
# Columns carried over unchanged into the split data. The single UP:ARGHEADS
# column is not listed here; it is added later.
header_split_df = [
    'Sent_ID',
    'ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
    'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC',
    'UP:PRED',
]

In [11]:
# Seed the frame that will hold the split data with the first sentence.
# That sentence is known to contain exactly one predicate, so its arguments
# are the ones in UP:ARGHEADS_1.
first = sentences.get_group(1)
# number of distinct UP:PRED values in the first sentence ('_' counts as one)
pred = first['UP:PRED'].nunique()
new_train = first.filter(items=header_split_df).assign(
    **{'UP:ARGHEADS': first['UP:ARGHEADS_1']}
)

In [12]:
# iterate through grouped sentences and emit one copy of each sentence per
# predicate it contains; copy k carries the arguments from UP:ARGHEADS_k.
# Sentences without any predicate are emitted once with UP:ARGHEADS = '_'.
# The pieces are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies the growing frame on every
# iteration, which is quadratic in the number of sentences.
sentence_copies = [new_train]
for name, sentence in sentences:
    # group keys arrive as 1-tuples when the groupby was built from a
    # one-element list, so unwrap before comparing
    sent_id = name[0] if isinstance(name, tuple) else name
    if sent_id == 1: # skip the first sentence, it already seeded new_train
        continue
    predicates_list = [value for value in sentence['UP:PRED'].tolist() if value != '_']
    if not predicates_list: # if empty bc there are no predicates in the sentence
        sentence_df = sentence.filter(items=header_split_df)
        sentence_df['UP:ARGHEADS'] = '_'
        sentence_copies.append(sentence_df)
        continue
    for ix in range(len(predicates_list)):
        sentence_df = sentence.filter(items=header_split_df)
        # the (ix+1)-th predicate of the sentence owns column UP:ARGHEADS_{ix+1}
        sentence_df['UP:ARGHEADS'] = sentence[f'UP:ARGHEADS_{ix+1}']
        sentence_copies.append(sentence_df)
new_train = pd.concat(sentence_copies)

In [13]:
# In every sentence copy keep the predicate label only on the row whose
# argument label is 'V'; all other rows get '_' in UP:PRED.
new_train.loc[new_train['UP:ARGHEADS'].ne('V'), 'UP:PRED'] = '_'

In [14]:
# Renumber the rows 0..n-1: each sentence copy brought its original index
# labels along, so the labels repeat after concatenation.
new_train = new_train.reset_index(drop=True)

In [15]:
# Copy_ID: a running sentence number that counts the per-predicate copies as
# sentences too.
# tokens that are at the beginning of each sentence (copy)
is_copy_start = new_train['ID'] == 1
start_of_sentence = new_train.index[is_copy_start].tolist()
# Number the copy-initial rows 1, 2, 3, ... and forward-fill the number onto
# the remaining tokens of each copy. This replaces a row-wise
# apply(sent(...)) that scanned the whole start list for every row, and a
# chained ffill(inplace=True) that does not update the frame under pandas
# Copy-on-Write.
new_train['Copy_ID'] = is_copy_start.cumsum().where(is_copy_start).ffill()

# move the sentence number column to the front
# adapted from: https://stackoverflow.com/questions/25122099/move-column-by-name-to-front-of-table-in-pandas
cols = list(new_train)
cols.insert(0, cols.pop(cols.index('Copy_ID')))
new_train = new_train.loc[:, cols]

In [16]:
# Gold argument labels: tokens without a label ('_') are relabelled 'O'.
new_train.loc[new_train['UP:ARGHEADS'].eq('_'), 'UP:ARGHEADS'] = 'O'

In [17]:
# Write the split training data as a tab-separated file without the index.
new_train.to_csv(
    '../data/train_split.tsv',
    sep='\t',
    index=False,
    encoding='utf-8',
    # this quotechar is needed to avoid misinterpretation of doublequote tokens
    quotechar='№',
)

### Test set split

In [18]:
# yes there are up to 35 predicates in one sentence in the training data
header = ['ID','FORM','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC','UP:PRED','UP:ARGHEADS_1','UP:ARGHEADS_2','UP:ARGHEADS_3','UP:ARGHEADS_4','UP:ARGHEADS_5','UP:ARGHEADS_6','UP:ARGHEADS_7','UP:ARGHEADS_8','UP:ARGHEADS_9','UP:ARGHEADS_10','UP:ARGHEADS_11','UP:ARGHEADS_12','UP:ARGHEADS_13','UP:ARGHEADS_14','UP:ARGHEADS_15','UP:ARGHEADS_16','UP:ARGHEADS_17','UP:ARGHEADS_18','UP:ARGHEADS_19','UP:ARGHEADS_20','UP:ARGHEADS_21','UP:ARGHEADS_22','UP:ARGHEADS_23','UP:ARGHEADS_24','UP:ARGHEADS_25','UP:ARGHEADS_26','UP:ARGHEADS_27','UP:ARGHEADS_28','UP:ARGHEADS_29','UP:ARGHEADS_30','UP:ARGHEADS_31','UP:ARGHEADS_32','UP:ARGHEADS_33','UP:ARGHEADS_34','UP:ARGHEADS_35']
# header names taken from: 
# https://universaldependencies.org/format.html
# https://universalpropositions.github.io/

In [19]:
# Columns carried over unchanged into the split data. The single UP:ARGHEADS
# column is not listed here; it is added later.
header_split_df = [
    'Sent_ID',
    'ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
    'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC',
    'UP:PRED',
]

In [20]:
# Load the comment-free test file; the column names are supplied via
# `header` (names=...).
test_path = '../data/test_without_comments.tsv'
test_df = pd.read_csv(
    test_path,
    sep='\t',
    names=header,
    encoding='utf-8',
    # this quotechar is needed to avoid misinterpretation of doublequote tokens
    quotechar='№',
)

  test_df = pd.read_csv(test_path, sep='\t', names=header,encoding='utf-8',quotechar='№')


In [21]:
# remove rows with CopyOf= in the MISC column
# 'CopyOf=' is matched as a literal substring (regex=False); rows whose MISC
# is NaN are kept (na=False makes them count as "does not contain").
# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
test_df = test_df[~test_df['MISC'].str.contains('CopyOf=', na=False, regex=False)].copy()

In [22]:
# replace NaN values in predicate column with '_'
# The filled column is assigned back through DataFrame.assign instead of
# calling fillna(inplace=True) on test_df['UP:PRED']: that chained in-place
# call operates on a temporary Series, is deprecated, and leaves test_df
# unchanged when pandas Copy-on-Write is active.
test_df = test_df.assign(**{'UP:PRED': test_df['UP:PRED'].fillna('_')})

In [23]:
def sent(row, list_firsts):
    """Return the 1-based sentence number of `row` if it opens a sentence.

    row: a DataFrame row as passed by DataFrame.apply(axis=1); only its index
        label (row.name) is used.
    list_firsts: index labels of the sentence-initial rows, in order.

    Returns the 1-based position of row.name in list_firsts, or None when the
    row is not the first token of a sentence.
    """
    for ix, first in enumerate(list_firsts):
        if row.name == first:
            return ix + 1
    return None


# tokens that are at the beginning of each sentence
is_sentence_start_test = test_df['ID'] == 1
start_of_sent_test = test_df.index[is_sentence_start_test].tolist()
# get sentence IDs for each sentence: sentence-initial rows are numbered
# 1, 2, 3, ... and every other row is left NaN.
# This is the vectorised equivalent of
#     test_df.apply(lambda row: sent(row, start_of_sent_test), axis=1)
# which walks the whole start list once per row (rows x sentences comparisons).
test_df['Sent_ID'] = is_sentence_start_test.cumsum().where(is_sentence_start_test)

In [24]:
# move the sentence number column to the front
# adapted from: https://stackoverflow.com/questions/25122099/move-column-by-name-to-front-of-table-in-pandas
cols = list(test_df)
cols.insert(0, cols.pop(cols.index('Sent_ID')))
test_df = test_df.loc[:, cols]
# fill NaN values for all tokens that are not at the beginning of the sentence
# The forward-filled column is assigned back explicitly; calling
# ffill(inplace=True) on test_df.Sent_ID is a chained in-place update that
# does not reach test_df under pandas Copy-on-Write, which would leave NaN
# sentence IDs behind for the later groupby.
test_df = test_df.assign(Sent_ID=test_df['Sent_ID'].ffill())

In [25]:
# group data by sentences
# Group by the bare column name, not the one-element list ['Sent_ID']: with a
# length-1 list pandas >= 2.0 yields 1-tuples as group keys when iterating
# (so a check like `name == 1` never matches), and get_group(1) is deprecated
# in favour of get_group((1,)).
sentences_test = test_df.groupby('Sent_ID')

In [26]:
# Seed the frame that will hold the split data with the first sentence.
# That sentence is known to contain exactly one predicate, so its arguments
# are the ones in UP:ARGHEADS_1.
first = sentences_test.get_group(1)
# number of distinct UP:PRED values in the first sentence ('_' counts as one)
pred = first['UP:PRED'].nunique()
new_test = first.filter(items=header_split_df).assign(
    **{'UP:ARGHEADS': first['UP:ARGHEADS_1']}
)

In [27]:
# iterate through grouped sentences and emit one copy of each sentence per
# predicate it contains; copy k carries the arguments from UP:ARGHEADS_k.
# Sentences without any predicate are emitted once with UP:ARGHEADS = '_'.
# The pieces are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies the growing frame on every
# iteration, which is quadratic in the number of sentences.
sentence_copies_test = [new_test]
for name, sentence in sentences_test:
    # group keys arrive as 1-tuples when the groupby was built from a
    # one-element list, so unwrap before comparing
    sent_id = name[0] if isinstance(name, tuple) else name
    if sent_id == 1: # skip the first sentence, it already seeded new_test
        continue
    predicates_list = [value for value in sentence['UP:PRED'].tolist() if value != '_']
    if not predicates_list: # if empty bc there are no predicates in the sentence
        sentence_df = sentence.filter(items=header_split_df)
        sentence_df['UP:ARGHEADS'] = '_'
        sentence_copies_test.append(sentence_df)
        continue
    for ix in range(len(predicates_list)):
        sentence_df = sentence.filter(items=header_split_df)
        # the (ix+1)-th predicate of the sentence owns column UP:ARGHEADS_{ix+1}
        sentence_df['UP:ARGHEADS'] = sentence[f'UP:ARGHEADS_{ix+1}']
        sentence_copies_test.append(sentence_df)
new_test = pd.concat(sentence_copies_test)

In [28]:
# In every sentence copy keep the predicate label only on the row whose
# argument label is 'V'; all other rows get '_' in UP:PRED.
new_test.loc[new_test['UP:ARGHEADS'].ne('V'), 'UP:PRED'] = '_'

In [29]:
# Renumber the rows 0..n-1: each sentence copy brought its original index
# labels along, so the labels repeat after concatenation.
new_test = new_test.reset_index(drop=True)

In [30]:
# Copy_ID: a running sentence number that counts the per-predicate copies as
# sentences too.
# tokens that are at the beginning of each sentence (copy)
is_copy_start_test = new_test['ID'] == 1
start_of_sentence = new_test.index[is_copy_start_test].tolist()
# Number the copy-initial rows 1, 2, 3, ... and forward-fill the number onto
# the remaining tokens of each copy. This replaces a row-wise
# apply(sent(...)) that scanned the whole start list for every row, and a
# chained ffill(inplace=True) that does not update the frame under pandas
# Copy-on-Write.
new_test['Copy_ID'] = is_copy_start_test.cumsum().where(is_copy_start_test).ffill()

# move the sentence number column to the front
# adapted from: https://stackoverflow.com/questions/25122099/move-column-by-name-to-front-of-table-in-pandas
cols = list(new_test)
cols.insert(0, cols.pop(cols.index('Copy_ID')))
new_test = new_test.loc[:, cols]

In [31]:
# Gold argument labels: tokens without a label ('_') are relabelled 'O'.
new_test.loc[new_test['UP:ARGHEADS'].eq('_'), 'UP:ARGHEADS'] = 'O'

In [32]:
# Write the split test data as a tab-separated file without the index.
new_test.to_csv(
    '../data/test_split.tsv',
    sep='\t',
    index=False,
    encoding='utf-8',
    # this quotechar is needed to avoid misinterpretation of doublequote tokens
    quotechar='№',
)