In [210]:
import pandas as pd

def read_conll_file(file_path):
    sentences = []
    with open(file_path, encoding='utf-8') as f:
        current_sentence = []
        for line in f:
            line = line.strip()
            if not line:
                sentences.append(current_sentence)
                current_sentence = []
            elif not line.startswith('#'):
                fields = line.split('\t')
                current_sentence.append(fields)
    if current_sentence:
        sentences.append(current_sentence)
    return sentences


In [253]:
def repeat_multi_predicate_sentences(sentences):
    final_sentences = []
    for sentence in sentences:

        # Check if sentence has at least 11 fields
        if len(sentence[0]) < 11:
            continue
        # Get the values of all predicate columns in the sentence
        predicate_values = list([fields[10] for fields in sentence if len(fields) >= 11])
        predicate_values = [item for item in predicate_values if item != '_']
        # If there is only one predicate value or no predicates, don't repeat the sentence
        if len(predicate_values) <= 1:
            final_sentences.append(sentence)
        elif len(predicate_values) > 1:
            # Repeat sentence for each predicate value
            for i, pred in enumerate(predicate_values):
                b = i + 1
                # Convert sentence to DataFrame
                df = pd.DataFrame(sentence)
                # Create a new DataFrame with only the first 11 columns of the original DataFrame
                df_2 = df.iloc[:, :11].copy()
                new_col = df.iloc[:, (10 + b)]
                df_2[11] = new_col
                new_sentence = df_2.values.tolist()
                final_sentences.append(new_sentence)
                
    return final_sentences


In [263]:
def write_conll_file(sentences, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        for sentence in sentences:
            for fields in sentence:
                if None in fields:
                    continue
                else:
                    f.write('\t'.join([field.replace('\t', '') for field in fields]) + '\n')
            f.write('\n')


In [264]:
sentences = read_conll_file('../data/en_ewt-up-train.conllu')


In [265]:
sentences = repeat_multi_predicate_sentences(sentences)

In [299]:

def handle_bad_lines(line):
    with open("bad_lines.conllu", "w", encoding='utf-8') as f:
        f.write("\t".join(line))


df_train = pd.read_csv('output3.conllu', sep='\t',header = None, 
                       names = ["id", "word", "lemma", "posuniv", "pos", "morph","head", "dep", "head_dep" ,
                                "space", "predicate", "gold_label"], on_bad_lines=handle_bad_lines, engine='python')

In [301]:
df_badlines = pd.read_csv('bad_lines.conllu', sep='\t',header = None,)

In [302]:
df_badlines

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,37,”,,PUNCT,'',_,11,punct,11:punct,_,_,_
1,1,This,this,DET,DT,Number=Sing|PronType=Dem,2,det,2:det,_,_,_
2,2,right,right,NOUN,NN,Number=Sing,11,nsubj,11:nsubj,_,right.05,_
3,3,to,to,PART,TO,_,4,mark,4:mark,_,_,_
4,4,love,love,VERB,VB,VerbForm=Inf,2,acl,2:acl:to,_,love.01,_
5,5,and,and,CCONJ,CC,_,6,cc,6:cc,_,_,_
6,6,form,form,VERB,VB,VerbForm=Inf,4,conj,2:acl:to|4:conj:and,_,form.01,_
7,7,a,a,DET,DT,Definite=Ind|PronType=Art,8,det,8:det,_,_,_
8,8,family,family,NOUN,NN,Number=Sing,6,obj,6:obj,_,_,_
9,9,is,be,AUX,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,11,cop,11:cop,_,be.01,_


In [290]:
df_dev = pd.read_csv('output2.conllu', sep = '\t', header = None, names = ["id", "word", "lemma", "posuniv", "pos", "morph","head", "dep", "head_dep" , "space", "predicate", "gold_label"])


In [250]:
df_test = pd.read_csv('output4.conllu', sep = '\t', header = None, names = ["id", "word", "lemma", "posuniv", "pos", "morph","head", "dep", "head_dep" , "space", "predicate", "gold_label"])


In [304]:
df_test

Unnamed: 0,id,word,lemma,posuniv,pos,morph,head,dep,head_dep,space,predicate,gold_label
0,1,What,what,PRON,WP,PronType=Int,0,root,0:root,_,_,_
1,2,if,if,SCONJ,IN,_,4,mark,4:mark,_,_,_
2,3,Google,Google,PROPN,NNP,Number=Sing,4,nsubj,4:nsubj,_,_,ARG1
3,4,Morphed,morph,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,1,advcl,1:advcl:if,_,morph.01,V
4,5,Into,into,ADP,IN,_,6,case,6:case,_,_,_
...,...,...,...,...,...,...,...,...,...,...,...,...
103153,16,suggesting,suggest,VERB,VBG,VerbForm=Ger,7,conj,5:advcl:in|7:conj:and,_,suggest.01,_
103154,17,exercises,exercise,NOUN,NNS,Number=Plur,16,obj,16:obj,_,exercise.02,ARG1
103155,18,to,to,PART,TO,_,19,mark,19:mark,_,_,_
103156,19,use,use,VERB,VB,VerbForm=Inf,17,acl,17:acl:to,SpaceAfter=No,use.01,V


In [292]:
df_train

Unnamed: 0,id,word,lemma,posuniv,pos,morph,head,dep,head_dep,space,predicate,gold_label
0,1.0,Al,Al,PROPN,NNP,Number=Sing,0,root,0:root,SpaceAfter=No,_,_
1,2.0,-,-,PUNCT,HYPH,_,1,punct,1:punct,SpaceAfter=No,_,_
2,3.0,Zaman,Zaman,PROPN,NNP,Number=Sing,1,flat,1:flat,_,_,_
3,4.0,:,:,PUNCT,:,_,1,punct,1:punct,_,_,_
4,5.0,American,american,ADJ,JJ,Degree=Pos,6,amod,6:amod,_,_,_
...,...,...,...,...,...,...,...,...,...,...,...,...
1027737,22.0,on,on,ADP,IN,_,24,case,24:case,_,_,_
1027738,23.0,my,my,PRON,PRP$,Number=Sing|Person=1|Poss=Yes|PronType=Prs,24,nmod:poss,24:nmod:poss,_,_,_
1027739,24.0,car,car,NOUN,NN,Number=Sing,21,obl,21:obl:on,SpaceAfter=No,_,_
1027740,25.0,),),PUNCT,-RRB-,_,4,punct,4:punct,SpaceAfter=No,_,_


In [305]:
df_dev

Unnamed: 0,id,word,lemma,posuniv,pos,morph,head,dep,head_dep,space,predicate,gold_label
0,1,Al,Al,PROPN,NNP,Number=Sing,0,root,0:root,SpaceAfter=No,_,_
1,2,-,-,PUNCT,HYPH,_,1,punct,1:punct,SpaceAfter=No,_,_
2,3,Zaman,Zaman,PROPN,NNP,Number=Sing,1,flat,1:flat,_,_,_
3,4,:,:,PUNCT,:,_,1,punct,1:punct,_,_,_
4,5,American,american,ADJ,JJ,Degree=Pos,6,amod,6:amod,_,_,_
...,...,...,...,...,...,...,...,...,...,...,...,...
9280,4,are,be,AUX,VBP,Mood=Ind|Tense=Pres|VerbForm=Fin,5,aux:pass,5:aux:pass,_,be.03,V
9281,5,reported,report,VERB,VBN,Tense=Past|VerbForm=Part|Voice=Pass,0,root,0:root,_,report.01,_
9282,6,dead,dead,ADJ,JJ,Degree=Pos,5,xcomp,5:xcomp,_,_,_
9283,7,and,and,CCONJ,CC,_,8,cc,8:cc|8.1:cc,_,_,_
