In [None]:
# Download the CrowdTruth medical relation-extraction ground-truth files (CSV and XLSX)
# and the food-disease dataset into the working directory.
# wget flags: -nc skips a file that already exists, -q silences output, -O sets the local name.
!wget -nc -q -O "ground_truth_cause.csv" "https://raw.githubusercontent.com/CrowdTruth/Medical-Relation-Extraction/master/ground_truth_cause.csv"
!wget -nc -q -O "ground_truth_treat.csv" "https://raw.githubusercontent.com/CrowdTruth/Medical-Relation-Extraction/master/ground_truth_treat.csv"
!wget -nc -q -O "ground_truth_cause.xlsx" "https://github.com/CrowdTruth/Medical-Relation-Extraction/blob/master/train_dev_test/ground_truth_cause.xlsx?raw=true"
!wget -nc -q -O "ground_truth_treat.xlsx" "https://github.com/CrowdTruth/Medical-Relation-Extraction/blob/master/train_dev_test/ground_truth_treat.xlsx?raw=true"
!wget -nc -q -O "food_disease_dataset.csv" "https://raw.githubusercontent.com/gjorgjinac/food-disease-dataset/main/food_disease_dataset.csv"

In [None]:
import os

import pandas as pd

# Food Disease dataset

In [None]:
# Directory holding the fold-0 split files. Set the FOOD_DISEASE_FOLD_DIR
# environment variable to point at your own copy instead of editing the
# notebook; the default is the path the notebook was originally written with.
FOLD_DIR = os.environ.get(
    "FOOD_DISEASE_FOLD_DIR",
    "/Users/adamkovacs/data/food-disease-dataset/splits/cause_folds/fold0",
)
# Training split of fold 0.
df_train = pd.read_csv(os.path.join(FOLD_DIR, "train.csv"), sep=",", quotechar='"')

In [None]:
# Directory holding the fold-0 split files. Set the FOOD_DISEASE_FOLD_DIR
# environment variable to point at your own copy instead of editing the
# notebook; the default is the path the notebook was originally written with.
FOLD_DIR = os.environ.get(
    "FOOD_DISEASE_FOLD_DIR",
    "/Users/adamkovacs/data/food-disease-dataset/splits/cause_folds/fold0",
)
# Validation split of fold 0.
df_dev = pd.read_csv(os.path.join(FOLD_DIR, "val.csv"), sep=",", quotechar='"')

In [None]:
import re

def extract_entities(df):
    """Mask the two entity mentions of one row and strip non-ASCII characters.

    Despite its name, ``df`` is a single row (it is applied with ``axis=1``):
    any object exposing ``term1``, ``term2`` and ``sentence`` attributes.

    Every case-insensitive occurrence of ``term1`` in the sentence becomes
    ``XXX`` and every occurrence of ``term2`` becomes ``YYY``. Both terms are
    substituted in a single pass so that
      * ``term2`` can never match inside a freshly inserted ``XXX`` placeholder,
      * when one term contains the other, the longer mention is masked first
        instead of being broken up by the shorter one.
    An empty term is skipped (an empty pattern would match at every position).

    Returns the masked sentence with all non-ASCII characters dropped.
    """
    # (group name, literal term, placeholder); term1 is listed first so it wins
    # ties when both terms have the same length (the sort below is stable).
    candidates = [
        ("term1", df.term1, "XXX"),
        ("term2", df.term2, "YYY"),
    ]
    candidates = [cand for cand in candidates if cand[1] != ""]

    sen = df.sentence
    if candidates:
        # Regex alternation is ordered, so put the longest term first.
        candidates.sort(key=lambda cand: len(cand[1]), reverse=True)
        placeholder_by_group = {name: mask for name, _, mask in candidates}
        pattern = "|".join(
            "(?P<%s>%s)" % (name, re.escape(term)) for name, term, _ in candidates
        )
        # `lastgroup` names the alternative that matched at this position.
        sen = re.sub(
            pattern,
            lambda match: placeholder_by_group[match.lastgroup],
            sen,
            flags=re.IGNORECASE,
        )
    return sen.encode('ascii', errors='ignore').decode('ascii')
    

# Enrich both splits in the same way: the entity-masked sentence plus a
# string label column for each of the two binary relation flags.
for split_df in (df_train, df_dev):
    split_df['preprocessed_sen'] = split_df.apply(extract_entities, axis=1)
    split_df['treat_label'] = split_df.is_treat.replace({1: 'TREAT', 0: 'NOT'})
    split_df['cause_label'] = split_df.is_cause.replace({1: 'CAUSE', 0: 'NOT'})

In [None]:
from potato.dataset.dataset import Dataset
from potato.models.trainer import GraphTrainer

## Detecting treat

In [None]:
# Materialise the rows: a bare `iterrows()` generator is exhausted after one
# pass, so re-running the cell that consumes it would silently build empty
# datasets. Each element is still an (index, row) pair.
train_rows = list(df_train.iterrows())
dev_rows = list(df_dev.iterrows())

In [None]:
# Training split: (masked sentence, treat label) pairs -> Dataset -> fourlang graphs.
train_sentences = [
    (row.preprocessed_sen, row.treat_label) for _, row in train_rows
]
train_dataset = Dataset(train_sentences, label_vocab={"TREAT":1, "NOT": 0})
train_graphs = train_dataset.parse_graphs(graph_format="fourlang")
train_dataset.set_graphs(train_graphs)

# Dev split: same pipeline.
dev_sentences = [
    (row.preprocessed_sen, row.treat_label) for _, row in dev_rows
]
dev_dataset = Dataset(dev_sentences, label_vocab={"TREAT":1, "NOT": 0})
dev_graphs = dev_dataset.parse_graphs(graph_format="fourlang")
dev_dataset.set_graphs(dev_graphs)

In [None]:
# Convert both parsed treat datasets to DataFrames.
train_df, dev_df = train_dataset.to_dataframe(), dev_dataset.to_dataframe()

In [None]:
# Persist the treat DataFrames next to the notebook.
for frame, pickle_name in (
    (train_df, "food_train_dataset_treat_fourlang.pickle"),
    (dev_df, "food_dev_dataset_treat_fourlang.pickle"),
):
    frame.to_pickle(pickle_name)

## Detecting cause

In [None]:
# Fresh row iterators for this pass over the two splits.
train_rows = df_train.iterrows()
dev_rows = df_dev.iterrows()

# Pair every masked sentence with its cause label.
train_sentences = [
    (row.preprocessed_sen, row.cause_label) for _, row in train_rows
]
dev_sentences = [
    (row.preprocessed_sen, row.cause_label) for _, row in dev_rows
]

# Only the labels differ from the treat datasets, so the graphs already held
# by `train_dataset` / `dev_dataset` are attached instead of being re-parsed.
train_dataset_cause = Dataset(train_sentences, label_vocab={"CAUSE":1, "NOT": 0})
dev_dataset_cause = Dataset(dev_sentences, label_vocab={"CAUSE":1, "NOT": 0})
train_dataset_cause.set_graphs(train_dataset.graphs)
dev_dataset_cause.set_graphs(dev_dataset.graphs)

In [None]:
# Export the CAUSE-labelled datasets. The treat datasets (`train_dataset`,
# `dev_dataset`) were exported here before, which would have written treat
# labels into the cause pickles.
train_df = train_dataset_cause.to_dataframe()
dev_df = dev_dataset_cause.to_dataframe()

In [None]:
# Persist the cause DataFrames; the dev file name now uses "fourlang" (was
# misspelled "fourang"), matching the naming of the other pickles.
train_df.to_pickle("food_train_dataset_cause_fourlang.pickle")
dev_df.to_pickle("food_dev_dataset_cause_fourlang.pickle")