In [None]:
import os
from pathlib import Path

import pandas as pd

In [None]:
# Folder holding the train/dev/test spreadsheets. Set the MEDICAL_RE_DATA_DIR
# environment variable to point the notebook at another checkout instead of
# editing this cell; the fallback is the location that used to be hardcoded.
DATA_DIR = Path(os.environ.get(
    "MEDICAL_RE_DATA_DIR",
    "/Users/adamkovacs/data/Medical-Relation-Extraction/train_dev_test",
))

# sheet_name=None makes read_excel load every sheet of the workbook and return
# them as a mapping from sheet name to DataFrame.
dfs_treat = pd.read_excel(DATA_DIR / "ground_truth_treat.xlsx", sheet_name=None)

In [None]:
# Pull the three splits out of the sheet-name -> DataFrame mapping in one go.
treat_train, treat_dev, treat_test = (
    dfs_treat[sheet] for sheet in ('train', 'dev', 'test')
)

In [None]:
import re
import numpy as np

def extract_entities(df):
    """Mask both entity mentions in a row's sentence.

    Every case-insensitive occurrence of ``df.term1`` is replaced by ``XXX``
    and then, in the resulting text, every occurrence of ``df.term2`` by
    ``YYY``. Both terms are escaped first, so they are matched literally
    rather than interpreted as regular expressions.

    Args:
        df: A single row (the function is used with
            ``DataFrame.apply(..., axis=1)``) exposing the attributes
            ``sentence``, ``term1`` and ``term2``.

    Returns:
        The sentence with the two terms swapped for their placeholders.
    """
    first_term = re.compile(re.escape(df.term1), flags=re.IGNORECASE)
    masked = first_term.sub('XXX', df.sentence)
    # term2 is searched in the already-masked text, so the replacement order
    # matters whenever one term overlaps the other.
    second_term = re.compile(re.escape(df.term2), flags=re.IGNORECASE)
    return second_term.sub('YYY', masked)
    
def extract_labels(df):
    """Derive the binary label of a row from its expert and crowd annotations.

    The label is 1 when ``df.expert`` equals 1. When the expert value is
    missing (null), the label is 1 if ``df.crowd`` is greater than zero.
    Every other combination yields 0.

    Args:
        df: A single row (the function is used with
            ``DataFrame.apply(..., axis=1)``) exposing the attributes
            ``expert`` and ``crowd``.

    Returns:
        ``1`` for a positive example, ``0`` otherwise.
    """
    expert_vote, crowd_score = df.expert, df.crowd
    # The crowd score is only consulted when there is no expert judgement.
    is_positive = expert_vote == 1 or (pd.isnull(expert_vote) and crowd_score > 0)
    return 1 if is_positive else 0

# Add the model inputs to every split in place: the entity-masked sentence,
# the numeric label and its string name.
for split_frame in (treat_train, treat_dev, treat_test):
    split_frame['preprocessed_sen'] = split_frame.apply(extract_entities, axis=1)
    split_frame['label_id'] = split_frame.apply(extract_labels, axis=1)
    split_frame['label'] = split_frame.label_id.replace({1: 'TREAT', 0: 'NOT'})



In [None]:
from potato.dataset.dataset import Dataset
from potato.models.trainer import GraphTrainer

In [None]:
# DataFrame.iterrows() returns a one-shot generator. Materialise each one as a
# list of (index, row) pairs so that later cells can traverse the rows more
# than once; a bare generator would be empty on its second use.
train_rows = list(treat_train.iterrows())
dev_rows = list(treat_dev.iterrows())
test_rows = list(treat_test.iterrows())

In [None]:
def _to_labelled_pairs(rows):
    """Turn ``(index, row)`` pairs into ``(masked sentence, label)`` tuples."""
    return [(record.preprocessed_sen, record.label) for _index, record in rows]


def _parse_ud_dataset(sentences):
    """Wrap the pairs in a Dataset and attach the graphs parsed in "ud" format."""
    dataset = Dataset(sentences, label_vocab={"TREAT":1, "NOT": 0}, lang='en_bio')
    dataset.set_graphs(dataset.parse_graphs(graph_format="ud"))
    return dataset


# Collect the pairs of all three splits first, then parse them split by split.
train_sentences = _to_labelled_pairs(train_rows)
dev_sentences = _to_labelled_pairs(dev_rows)
test_sentences = _to_labelled_pairs(test_rows)

train_dataset = _parse_ud_dataset(train_sentences)
dev_dataset = _parse_ud_dataset(dev_sentences)
test_dataset = _parse_ud_dataset(test_sentences)

In [None]:
# Export every parsed split to a DataFrame before anything is written to disk.
train_df, dev_df, test_df = (
    dataset.to_dataframe()
    for dataset in (train_dataset, dev_dataset, test_dataset)
)

# Persist the UD-parsed splits, one pickle per split, in the working directory.
for frame, pickle_path in (
    (train_df, "crowdtruth_train_dataset_treat_ud.pickle"),
    (dev_df, "crowdtruth_dev_dataset_treat_ud.pickle"),
    (test_df, "crowdtruth_test_dataset_treat_ud.pickle"),
):
    frame.to_pickle(pickle_path)

## Fourlang

In [None]:
# Build the (masked sentence, label) pairs straight from the data frames
# rather than re-reading train_rows / dev_rows / test_rows: those would be
# empty here whenever they are one-shot iterrows() generators that an earlier
# cell has already consumed, which silently produces empty datasets.
train_sentences = list(zip(treat_train.preprocessed_sen, treat_train.label))
dev_sentences = list(zip(treat_dev.preprocessed_sen, treat_dev.label))
test_sentences = list(zip(treat_test.preprocessed_sen, treat_test.label))

# Wrap each split in a Dataset and attach the graphs parsed in "fourlang" format.
train_dataset = Dataset(train_sentences, label_vocab={"TREAT":1, "NOT": 0})
train_dataset.set_graphs(train_dataset.parse_graphs(graph_format="fourlang"))

dev_dataset = Dataset(dev_sentences, label_vocab={"TREAT":1, "NOT": 0})
dev_dataset.set_graphs(dev_dataset.parse_graphs(graph_format="fourlang"))

test_dataset = Dataset(test_sentences, label_vocab={"TREAT":1, "NOT": 0})
test_dataset.set_graphs(test_dataset.parse_graphs(graph_format="fourlang"))

In [None]:
# Export every parsed split to a DataFrame before anything is written to disk.
train_df, dev_df, test_df = (
    dataset.to_dataframe()
    for dataset in (train_dataset, dev_dataset, test_dataset)
)

# Persist the Fourlang-parsed splits, one pickle per split, in the working directory.
for frame, pickle_path in (
    (train_df, "crowdtruth_train_dataset_treat_fourlang.pickle"),
    (dev_df, "crowdtruth_dev_dataset_treat_fourlang.pickle"),
    (test_df, "crowdtruth_test_dataset_treat_fourlang.pickle"),
):
    frame.to_pickle(pickle_path)