In [4]:
import pandas as pd

def read_conll_file(file_path):
    """Read a tab-separated CoNLL-style file into a list of sentences.

    A sentence is a list of token rows; each row is the list of fields
    obtained by splitting one line on tabs. Blank lines terminate the
    current sentence and lines starting with '#' are ignored.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences (list of lists of field lists). Sentences
        without any token row are never emitted.
    """
    sentences = []
    current_sentence = []
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Flush only when tokens were collected: leading or repeated
                # blank lines must not produce empty sentences, which
                # downstream code cannot index into.
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
            elif not line.startswith('#'):
                current_sentence.append(line.split('\t'))
    # The file may end without a trailing blank line.
    if current_sentence:
        sentences.append(current_sentence)
    return sentences


In [5]:
def write_conll_file(sentences, file_path):
    """Write sentences to a tab-separated CoNLL-style file.

    Each token row is written as one line with its fields joined by tabs,
    and every sentence is followed by a blank line.

    Args:
        sentences: Iterable of sentences, each an iterable of token rows
            (iterables of field values).
        file_path: Destination path; the file is overwritten and written
            as UTF-8.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        for sentence in sentences:
            for fields in sentence:
                # Convert every value to str first: str.join raises
                # TypeError on non-string cells such as floats.
                f.write('\t'.join(str(value) for value in fields) + '\n')
            f.write('\n')

In [84]:
def repeat_multi_predicate_sentences(sentences):
    """Emit one copy of each sentence per predicate it contains.

    Field 10 (0-based) of a token row is treated as the predicate column,
    with '_' meaning "no predicate". For a sentence with more than one
    predicate, the k-th copy (k starting at 0) keeps fields 0-10 and takes
    field 11 + k as its single 12th field. Sentences with zero or one
    predicate are passed through unchanged.

    Args:
        sentences: List of sentences, each a list of token rows (lists of
            string fields).

    Returns:
        A new list of sentences. Empty sentences and sentences whose first
        row has fewer than 11 fields are dropped.

    Raises:
        IndexError: If a sentence has more predicates than columns after
            field 10.
    """
    final_sentences = []
    for sentence in sentences:
        # An empty sentence has no first row to inspect; skip it together
        # with sentences whose first row has fewer than 11 fields.
        if not sentence or len(sentence[0]) < 11:
            continue
        predicate_values = [
            fields[10]
            for fields in sentence
            if len(fields) >= 11 and fields[10] != '_'
        ]
        # Zero or one predicate: nothing to repeat.
        if len(predicate_values) <= 1:
            final_sentences.append(sentence)
            continue
        # Build the frame once per sentence; it does not change between
        # predicates.
        df = pd.DataFrame(sentence)
        shared_columns = df.iloc[:, :11]
        for offset in range(len(predicate_values)):
            per_predicate = shared_columns.copy()
            # Column 11 + offset is the column paired with this predicate.
            per_predicate[11] = df.iloc[:, 11 + offset]
            final_sentences.append(per_predicate.values.tolist())

    return final_sentences


In [85]:
# Load the dev split as a list of sentences (lists of token rows).
sentences = read_conll_file(file_path='../data/en_ewt-up-dev.conllu')


In [86]:
# Expand multi-predicate sentences; note this rebinds `sentences` to the
# expanded list, so the cell is meant to run once after loading.
sentences = repeat_multi_predicate_sentences(sentences=sentences)

In [87]:
# Persist the expanded sentences so they can be re-read as a flat table.
write_conll_file(sentences=sentences, file_path='output2.conllu')

In [88]:
# Read the tab-separated file into a DataFrame with 12 named columns.
df_dev = pd.read_csv(
    'output2.conllu',
    sep='\t',
    header=None,
    names=[
        "id", "word", "lemma", "posuniv", "pos", "morph",
        "head", "dep", "head_dep", "space", "predicate", "gold_label",
    ],
    # 3 == csv.QUOTE_NONE. With the default quoting a token that is a
    # literal '"' opens a quoted field and swallows the following tabs,
    # shifting or merging columns; the file has no quoting convention.
    quoting=3,
)


In [89]:
# Preview the first 40 rows.
df_dev.head(40)

Unnamed: 0,id,word,lemma,posuniv,pos,morph,head,dep,head_dep,space,predicate,gold_label
0,1.0,From,from,ADP,IN,_,3,case,3:case,_,_,_
1,2.0,the,the,DET,DT,Definite=Def|PronType=Art,3,det,3:det,_,_,_
2,3.0,AP,AP,PROPN,NNP,Number=Sing,4,obl,4:obl:from,_,_,ARG2
3,4.0,comes,come,VERB,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,0,root,0:root,_,come.03,V
4,5.0,this,this,DET,DT,Number=Sing|PronType=Dem,6,det,6:det,_,_,_
5,6.0,story,story,NOUN,NN,Number=Sing,4,nsubj,4:nsubj,_,_,ARG1
6,7.0,:,:,PUNCT,:,_,4,punct,4:punct,_,_,_
7,1.0,President,President,PROPN,NNP,Number=Sing,5,nsubj,5:nsubj,_,_,ARG0
8,2.0,Bush,Bush,PROPN,NNP,Number=Sing,1,flat,1:flat,_,_,_
9,3.0,on,on,ADP,IN,_,4,case,4:case,_,_,_
