In [1]:
import pandas as pd

### Train set split

In [None]:
# Column names for the CoNLL-U / Universal Propositions files.
# header names taken from:
# https://universaldependencies.org/format.html
# https://universalpropositions.github.io/
# One argument-head column is reserved per predicate; 35 columns are needed
# because, yes, there are up to 35 predicates in one sentence in the training data.
header = [
    'ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC',
    'UP:PRED',
] + [f'UP:ARGHEADS_{n}' for n in range(1, 36)]

In [2]:
# Read the training split as a tab-separated table whose columns are named
# after `header`; everything from a '#' character to the end of a line is
# treated as a comment and not parsed.
# NOTE(review): because of comment='#', a token line that itself contains
# '#' would be cut off at that character -- confirm the data has none.
train_path = '../data/en_ewt-up-train.conllu'
train_df = pd.read_csv(
    train_path,
    names=header,
    sep='\t',
    comment='#',
)
print(train_df)

          ID      FORM     LEMMA   UPOS   XPOS  \
0        1.0        Al        Al  PROPN    NNP   
1        2.0         -         -  PUNCT   HYPH   
2        3.0     Zaman     Zaman  PROPN    NNP   
3        4.0         :         :  PUNCT      :   
4        5.0  American  american    ADJ     JJ   
...      ...       ...       ...    ...    ...   
202254  22.0        on        on    ADP     IN   
202255  23.0        my        my   PRON   PRP$   
202256  24.0       car       car   NOUN     NN   
202257  25.0         )         )  PUNCT  -RRB-   
202258  26.0         .         .  PUNCT      .   

                                             FEATS HEAD     DEPREL  \
0                                      Number=Sing    0       root   
1                                                _    1      punct   
2                                      Number=Sing    1       flat   
3                                                _    1      punct   
4                                       Degree=Po

  train_df = pd.read_csv(train_path, sep='\t', comment='#', names=header)


In [3]:
# row labels of the tokens that open a sentence (their token ID equals 1)
is_first_token = train_df['ID'] == 1
start_of_sent = train_df.index[is_first_token].tolist()
# get sentence IDs for each sentence
def sent(row, list_firsts):
    """Return the 1-based sentence number for a sentence-initial row.

    Parameters
    ----------
    row : object with a ``name`` attribute
        A DataFrame row as passed by ``DataFrame.apply(axis=1)``; ``row.name``
        is the row's index label.
    list_firsts : iterable
        Index labels of the rows that open a sentence, in sentence order.

    Returns
    -------
    int or None
        Position (counting from 1) of ``row.name`` in ``list_firsts``, or
        ``None`` when the row is not listed there.
    """
    return next(
        (position
         for position, first_label in enumerate(list_firsts, start=1)
         if row.name == first_label),
        None,
    )

# Number the sentences 1..N and attach each number to the row that opens the
# sentence. Assigning a Series aligns on the index, so every other row gets
# NaN (filled in later). This replaces a row-wise apply() that scanned the
# whole list of sentence starts for every single token (rows x sentences
# comparisons) with a single pass, and produces the same float column.
train_df['Sent_ID'] = pd.Series(
    range(1, len(start_of_sent) + 1),
    index=start_of_sent,
    dtype='float64',
)

In [4]:
# move the sentence number column to the front
# adapted from: https://stackoverflow.com/questions/25122099/move-column-by-name-to-front-of-table-in-pandas
cols = list(train_df)
cols.insert(0, cols.pop(cols.index('Sent_ID')))
train_df = train_df.loc[:, cols]
# fill NaN values for all tokens that are not at the beginning of the sentence.
# The filled column is assigned back explicitly: calling ffill(inplace=True)
# on `train_df.Sent_ID` operates on a column selection, which recent pandas
# warns about and which leaves train_df unchanged under Copy-on-Write.
train_df['Sent_ID'] = train_df['Sent_ID'].ffill()
print(train_df)

        Sent_ID    ID      FORM     LEMMA   UPOS   XPOS  \
0           1.0   1.0        Al        Al  PROPN    NNP   
1           1.0   2.0         -         -  PUNCT   HYPH   
2           1.0   3.0     Zaman     Zaman  PROPN    NNP   
3           1.0   4.0         :         :  PUNCT      :   
4           1.0   5.0  American  american    ADJ     JJ   
...         ...   ...       ...       ...    ...    ...   
202254  12456.0  22.0        on        on    ADP     IN   
202255  12456.0  23.0        my        my   PRON   PRP$   
202256  12456.0  24.0       car       car   NOUN     NN   
202257  12456.0  25.0         )         )  PUNCT  -RRB-   
202258  12456.0  26.0         .         .  PUNCT      .   

                                             FEATS HEAD     DEPREL  \
0                                      Number=Sing    0       root   
1                                                _    1      punct   
2                                      Number=Sing    1       flat   
3          

In [5]:
# group data by sentences
# Group by the column label itself, not a one-element list: with a list,
# pandas >= 2.0 yields 1-tuple group keys, so comparisons such as
# `name == 1` and lookups such as `get_group(1)` stop matching.
sentences = train_df.groupby('Sent_ID')

In [6]:
# columns that every per-predicate copy of a sentence keeps unchanged
header_split_df = [
    'Sent_ID',
    'ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
    'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC',
    'UP:PRED',
]

In [7]:
# Create the new df that will hold the split data.
# It is seeded with the first sentence (based on the fact that we know it
# only has one predicate in it), so only UP:ARGHEADS_1 is carried over.
first = sentences.get_group(1)
pred = first['UP:PRED'].nunique()
new_train = (
    first
    .filter(items=header_split_df)
    .assign(**{'UP:ARGHEADS': first['UP:ARGHEADS_1']})
)

In [8]:
# Iterate through the grouped sentences and emit one copy of each sentence per
# predicate, each copy carrying that predicate's argument column as UP:ARGHEADS.
# The copies are collected in a list and concatenated once at the end instead
# of re-concatenating the growing frame on every iteration.
train_parts = [new_train]  # new_train already holds the first sentence
for name, sentence in sentences:
    # Depending on the pandas version and on how `sentences` was grouped, the
    # key is either the bare Sent_ID or a 1-tuple wrapping it.
    sent_id = name[0] if isinstance(name, tuple) else name
    if sent_id == 1:  # skip the first sentence
        continue
    # Count predicate *tokens*, not distinct labels: nunique() collapses a
    # predicate sense that occurs twice in a sentence into one and so drops
    # the argument columns of the repeated predicates.
    # Assumes non-predicate tokens hold '_' or are missing in UP:PRED -- TODO confirm.
    pred_col = sentence['UP:PRED']
    predicates = int((pred_col.notna() & (pred_col != '_')).sum())
    sentence_df = sentence.filter(items=header_split_df)
    # A sentence without any predicate is still kept once (with UP:ARGHEADS_1),
    # as before.
    for i in range(max(predicates, 1)):
        train_parts.append(
            sentence_df.assign(**{'UP:ARGHEADS': sentence[f'UP:ARGHEADS_{i+1}']})
        )
new_train = pd.concat(train_parts)

In [9]:
# save to a tsv file (tab-separated, without a header row or the index column)
new_train.to_csv(
    '../data/train_split.tsv',
    sep='\t',
    header=False,
    index=False,
)

### Test set split

In [10]:
# Read the test split as a tab-separated table whose columns are named after
# `header`; everything from a '#' character to the end of a line is treated
# as a comment and not parsed.
# NOTE(review): because of comment='#', a token line that itself contains
# '#' would be cut off at that character -- confirm the data has none.
test_path = '../data/en_ewt-up-test.conllu'
test_df = pd.read_csv(
    test_path,
    names=header,
    sep='\t',
    comment='#',
)
print(test_df)

         ID        FORM     LEMMA   UPOS XPOS  \
0       1.0        What      what   PRON   WP   
1       2.0          if        if  SCONJ   IN   
2       3.0      Google    Google  PROPN  NNP   
3       4.0     Morphed     morph   VERB  VBD   
4       5.0        Into      into    ADP   IN   
...     ...         ...       ...    ...  ...   
25092  16.0  suggesting   suggest   VERB  VBG   
25093  17.0   exercises  exercise   NOUN  NNS   
25094  18.0          to        to   PART   TO   
25095  19.0         use       use   VERB   VB   
25096  20.0           .         .  PUNCT    .   

                                  FEATS HEAD DEPREL                   DEPS  \
0                          PronType=Int    0   root                 0:root   
1                                     _    4   mark                 4:mark   
2                           Number=Sing    4  nsubj                4:nsubj   
3      Mood=Ind|Tense=Past|VerbForm=Fin    1  advcl             1:advcl:if   
4                    

  test_df = pd.read_csv(test_path, sep='\t', comment='#', names=header)


In [11]:
# row labels of the tokens that open a sentence (their token ID equals 1)
is_first_token_test = test_df['ID'] == 1
start_of_sent_test = test_df.index[is_first_token_test].tolist()
# get sentence IDs for each sentence
# NOTE(review): this repeats the definition of `sent` from the train-split
# section; the later definition replaces the earlier one when both cells run.
def sent(row, list_firsts):
    """Return the 1-based sentence number for a sentence-initial row.

    Parameters
    ----------
    row : object with a ``name`` attribute
        A DataFrame row as passed by ``DataFrame.apply(axis=1)``; ``row.name``
        is the row's index label.
    list_firsts : iterable
        Index labels of the rows that open a sentence, in sentence order.

    Returns
    -------
    int or None
        Position (counting from 1) of ``row.name`` in ``list_firsts``, or
        ``None`` when the row is not listed there.
    """
    return next(
        (position
         for position, first_label in enumerate(list_firsts, start=1)
         if row.name == first_label),
        None,
    )

# Number the sentences 1..N and attach each number to the row that opens the
# sentence. Assigning a Series aligns on the index, so every other row gets
# NaN (filled in later). This replaces a row-wise apply() that scanned the
# whole list of sentence starts for every single token (rows x sentences
# comparisons) with a single pass, and produces the same float column.
test_df['Sent_ID'] = pd.Series(
    range(1, len(start_of_sent_test) + 1),
    index=start_of_sent_test,
    dtype='float64',
)

In [12]:
# move the sentence number column to the front
# adapted from: https://stackoverflow.com/questions/25122099/move-column-by-name-to-front-of-table-in-pandas
cols = list(test_df)
cols.insert(0, cols.pop(cols.index('Sent_ID')))
test_df = test_df.loc[:, cols]
# fill NaN values for all tokens that are not at the beginning of the sentence.
# The filled column is assigned back explicitly: calling ffill(inplace=True)
# on `test_df.Sent_ID` operates on a column selection, which recent pandas
# warns about and which leaves test_df unchanged under Copy-on-Write.
test_df['Sent_ID'] = test_df['Sent_ID'].ffill()
print(test_df)

       Sent_ID    ID        FORM     LEMMA   UPOS XPOS  \
0          1.0   1.0        What      what   PRON   WP   
1          1.0   2.0          if        if  SCONJ   IN   
2          1.0   3.0      Google    Google  PROPN  NNP   
3          1.0   4.0     Morphed     morph   VERB  VBD   
4          1.0   5.0        Into      into    ADP   IN   
...        ...   ...         ...       ...    ...  ...   
25092   2077.0  16.0  suggesting   suggest   VERB  VBG   
25093   2077.0  17.0   exercises  exercise   NOUN  NNS   
25094   2077.0  18.0          to        to   PART   TO   
25095   2077.0  19.0         use       use   VERB   VB   
25096   2077.0  20.0           .         .  PUNCT    .   

                                  FEATS HEAD DEPREL                   DEPS  \
0                          PronType=Int    0   root                 0:root   
1                                     _    4   mark                 4:mark   
2                           Number=Sing    4  nsubj                4:

In [13]:
# group data by sentences
# Group by the column label itself, not a one-element list: with a list,
# pandas >= 2.0 yields 1-tuple group keys, so comparisons such as
# `name == 1` and lookups such as `get_group(1)` stop matching.
sentences_test = test_df.groupby('Sent_ID')

In [14]:
# Create the new df that will hold the split data.
# It is seeded with the first sentence (based on the fact that we know it
# only has one predicate in it), so only UP:ARGHEADS_1 is carried over.
first = sentences_test.get_group(1)
pred = first['UP:PRED'].nunique()
new_test = (
    first
    .filter(items=header_split_df)
    .assign(**{'UP:ARGHEADS': first['UP:ARGHEADS_1']})
)

In [15]:
# Iterate through the grouped sentences and emit one copy of each sentence per
# predicate, each copy carrying that predicate's argument column as UP:ARGHEADS.
# The copies are collected in a list and concatenated once at the end instead
# of re-concatenating the growing frame on every iteration.
test_parts = [new_test]  # new_test already holds the first sentence
for name, sentence in sentences_test:
    # Depending on the pandas version and on how `sentences_test` was grouped,
    # the key is either the bare Sent_ID or a 1-tuple wrapping it.
    sent_id = name[0] if isinstance(name, tuple) else name
    if sent_id == 1:  # skip the first sentence
        continue
    # Count predicate *tokens*, not distinct labels: nunique() collapses a
    # predicate sense that occurs twice in a sentence into one and so drops
    # the argument columns of the repeated predicates.
    # Assumes non-predicate tokens hold '_' or are missing in UP:PRED -- TODO confirm.
    pred_col = sentence['UP:PRED']
    predicates = int((pred_col.notna() & (pred_col != '_')).sum())
    sentence_df = sentence.filter(items=header_split_df)
    # A sentence without any predicate is still kept once (with UP:ARGHEADS_1),
    # as before.
    for i in range(max(predicates, 1)):
        test_parts.append(
            sentence_df.assign(**{'UP:ARGHEADS': sentence[f'UP:ARGHEADS_{i+1}']})
        )
new_test = pd.concat(test_parts)

In [18]:
# save to a tsv file (tab-separated, without a header row or the index column)
new_test.to_csv(
    '../data/test_split.tsv',
    sep='\t',
    header=False,
    index=False,
)