In [1]:
import pandas as pd

def read_conll_file(file_path):
    '''Read a CoNLL-U style file and return all of its sentences.

    Blank lines separate sentences, lines starting with ``#`` are treated as
    comments and dropped, and every other line is split on tab characters.

    Args:
        file_path: path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences. Each sentence is a list of rows and each row is
        the list of tab-separated string fields of one line. Empty sentences
        are never part of the result.
    '''
    sentences = []
    current_sentence = []
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line marks a sentence boundary. Flush only when rows
                # were collected, so that a leading blank line, repeated blank
                # lines or a comment-only group do not yield an empty sentence.
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
            elif not line.startswith('#'):
                current_sentence.append(line.split('\t'))
    # The last sentence is not necessarily followed by a blank line.
    if current_sentence:
        sentences.append(current_sentence)
    return sentences

In [2]:
def repeat_multi_predicate_sentences(sentences):
    '''Split each sentence into one copy per predicate.

    Field index 10 of a row is read as the predicate column, where ``'_'``
    means "no predicate". The columns from index 11 onwards are taken to be
    one column per predicate, in order of appearance.

    * Empty sentences and sentences whose first row has fewer than 11 fields
      are dropped.
    * Sentences with zero or one predicate are passed through unchanged
      (the very same list object is appended).
    * A sentence with n > 1 predicates is emitted n times. Copy i keeps
      columns 0-10 of every row and appends the original column 11 + i as the
      new column 11, so each emitted row has exactly 12 entries.

    Rows that are too short for a column that is read get ``None`` in that
    position; ``write_conll_file`` skips rows containing ``None``.

    Args:
        sentences: list of sentences, each a list of rows (lists of strings).

    Returns:
        A new list with the pass-through sentences and the per-predicate copies.

    Raises:
        IndexError: if a sentence has more predicates than columns after the
            predicate column.
    '''
    predicate_col = 10              # index of the predicate column
    kept_width = predicate_col + 1  # columns 0..10 are copied to every output row

    final_sentences = []
    for sentence in sentences:
        # Guard against empty sentences before looking at the first row.
        if not sentence or len(sentence[0]) < kept_width:
            continue

        predicate_count = sum(
            1 for fields in sentence
            if len(fields) >= kept_width and fields[predicate_col] != '_'
        )
        # Zero or one predicate: nothing to split.
        if predicate_count <= 1:
            final_sentences.append(sentence)
            continue

        # Computed once per sentence; used to detect a missing column.
        widest_row = max(len(fields) for fields in sentence)
        for offset in range(predicate_count):
            source_col = kept_width + offset
            if source_col >= widest_row:
                raise IndexError(
                    'sentence has %d predicates but no column at index %d'
                    % (predicate_count, source_col)
                )
            new_sentence = []
            for fields in sentence:
                # Slicing copies, so the input rows are never modified.
                kept = fields[:kept_width]
                kept += [None] * (kept_width - len(kept))
                selected = fields[source_col] if source_col < len(fields) else None
                new_sentence.append(kept + [selected])
            final_sentences.append(new_sentence)

    return final_sentences

In [3]:
def write_conll_file(sentences, file_path):
    '''Write sentences to ``file_path`` as tab-separated rows.

    Each row becomes one line with its fields joined by tabs. Rows that
    contain ``None`` are left out. Every sentence, including one whose rows
    were all left out, is followed by a single blank line. The file is
    created or overwritten and encoded as UTF-8.
    '''
    with open(file_path, 'w', encoding='utf-8') as out_file:
        for token_rows in sentences:
            for row in token_rows:
                # Only rows without a missing field are written.
                if None not in row:
                    joined = '\t'.join(row)
                    out_file.write(joined + '\n')
            # Blank line closes the sentence.
            out_file.write('\n')

In [4]:
# Training split: read the original file, emit one copy of each sentence per
# predicate, and store the result as train.conll.
train_sentences = read_conll_file(file_path='data/original data/en_ewt-up-train.conllu')
train = repeat_multi_predicate_sentences(sentences=train_sentences)
write_conll_file(sentences=train, file_path='data/preprocessed data/train.conll')

In [5]:
# Test split: read the original file, emit one copy of each sentence per
# predicate, and store the result as test.conll.
test_sentences = read_conll_file(file_path='data/original data/en_ewt-up-test.conllu')
test = repeat_multi_predicate_sentences(sentences=test_sentences)
write_conll_file(sentences=test, file_path='data/preprocessed data/test.conll')