# Transformation of a tabular dataset into a graph

In [2]:
import pandas as pd
import numpy as np
import joblib
import re

In [3]:
# Load syn_data.csv, using its first column as the row index, then display the frame.
df = pd.read_csv('./syn_data.csv', index_col=0)
df

Unnamed: 0,hospital_stay_length,gcs,nb_acte,gender,entry,output,entry_code,ica,ttt,ica_therapy,...,ivh,age,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
0,27.408750,15.463376,16.876151,1,0,1.0,0,0,1,1,...,1,64.129138,19,-1,-1,-1,-1,-1,-1,46
1,99.555309,16.319167,65.199637,0,1,0.0,1,1,1,0,...,0,41.157506,25,76,-1,-1,-1,-1,-1,52
2,56.288059,24.284115,116.950650,0,3,0.0,2,3,0,0,...,0,25.843085,-1,109,80,56,-1,29,-1,143
3,46.042605,18.332474,24.961537,0,1,0.0,1,3,1,0,...,0,43.071809,23,-1,-1,-1,47,-1,-1,75
4,32.338527,16.943132,17.607540,0,1,0.0,2,4,1,0,...,0,42.114657,28,64,-1,-1,-1,44,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,12.304598,14.471077,16.788628,1,2,1.0,1,1,1,1,...,0,56.471927,-1,69,-1,-1,41,18,-1,85
9996,33.985456,14.730414,14.941905,0,3,0.0,2,2,0,0,...,0,28.714539,-1,-1,59,-1,-1,24,-1,-1
9997,25.174506,14.429871,16.328751,0,3,0.0,2,9,1,0,...,1,43.071809,25,47,-1,-1,94,77,-1,-1
9998,60.516443,18.268104,55.190213,1,3,1.0,2,2,1,1,...,1,61.257684,-1,-1,-1,-1,-1,-1,-1,19


In [4]:
# Number of leading rows treated as training patients: 80% of the table, truncated to an int.
# The split is purely positional (rows at positions < size_train); nothing is shuffled here.
size_train = int(len(df) * 0.8)

In [122]:
# Columns emitted as per-patient value nodes (<column>_<patient id> with a hasValue triple).
numerical = ['hospital_stay_length', 'gcs', 'nb_acte', 'age']
# Columns emitted as nodes named <column>_<int value>, so patients with the same value share a node.
categorical = ['gender', 'entry', 'entry_code', 'ica', 'ttt', 'ica_therapy', 'fever', 'o2_clinic', 'o2', 'hta', 'hct', 'tabagisme', 'etOH', 'diabete', 'headache', 'instable', 'vasospasme', 'ivh']
# Event columns: a value of -1 is filtered out by get_events; other values are used as the event time.
events = ['nimodipine',  'paracetamol', 'nad', 'corotrop', 'morphine', 'dve', 'atl', 'iot']

In [123]:
def get_events(d, columns=None, missing=-1):
    """Return the events recorded for one row, ordered by ascending time.

    Parameters
    ----------
    d : pandas.Series
        One row of the dataset, indexed by column name.
    columns : list of str, optional
        Event columns to read. Defaults to the notebook-level ``events`` list.
    missing : scalar, optional
        Sentinel value marking an event that was not recorded (default ``-1``).

    Returns
    -------
    list of tuple
        ``(column_name, time)`` pairs sorted by time. Events that share the
        same time keep the order in which their columns were listed.
    """
    if columns is None:
        columns = events
    s = d[columns]
    # Drop the events that carry the "not recorded" sentinel.
    clean = s[s != missing]
    # The default argsort algorithm gives no ordering guarantee for ties; a
    # stable sort keeps tied events in column order so the result (and every
    # relation derived from consecutive events) is reproducible.
    sort_idx = np.argsort(clean.values, kind="stable")
    return [(clean.index[i], clean.iloc[i]) for i in sort_idx]

def add_temporal_events(graph, d, id):
    """Append the event triples of one patient to ``graph`` and return it.

    For every event returned by ``get_events(d)`` a node ``<event>_<id>`` is
    emitted with ``state``, ``hasTime`` and ``rdf:type`` triples. Afterwards
    each node is linked to the next one with ``TIME.intervalMeets`` and to
    the node two positions later with ``TIME.intervalBefore``.

    Parameters
    ----------
    graph : str
        Tab-separated triples accumulated so far.
    d : pandas.Series
        Row of the patient being serialised.
    id : hashable
        Patient identifier used in the node names.

    Returns
    -------
    str
        ``graph`` followed by the new triples, one per line.
    """
    timeline = get_events(d)
    nodes = [f"{name}_{id}" for name, _ in timeline]

    triples = []
    for node, (name, when) in zip(nodes, timeline):
        triples.append(f"{node}\tstate\tP{id}")
        triples.append(f"{node}\thasTime\t{when}")
        triples.append(f"{node}\trdf:type\t{name}")

    # Consecutive events (i, i+1), then events two steps apart (i, i+2).
    triples.extend(
        f"{earlier}\tTIME.intervalMeets\t{later}"
        for earlier, later in zip(nodes, nodes[1:])
    )
    triples.extend(
        f"{earlier}\tTIME.intervalBefore\t{later}"
        for earlier, later in zip(nodes, nodes[2:])
    )

    return graph + "".join(triple + "\n" for triple in triples)

In [124]:
def create_graph_patient(graph, d, train):
    """Append every triple describing one patient to ``graph`` and return it.

    Parameters
    ----------
    graph : str
        Tab-separated triples accumulated so far.
    d : pandas.Series
        Row of the patient; ``d.name`` is used as the patient identifier.
    train : bool
        When true, the patient's outcome (looked up from ``d['output']``) is
        written as well; otherwise no outcome triple is emitted.

    Returns
    -------
    str
        ``graph`` extended with, in order: the optional outcome triples, the
        patient type, the numerical features, the categorical features and
        the temporal events.
    """
    pid = d.name
    patient = f"P{pid}"
    rows = []

    if train:
        # Labels are emitted verbatim as node names (spelling kept as is).
        out = ("Back2Home", "Reabilitation", "Death")[int(d['output'])]
        rows.append((patient, "hasOutput", out))
        rows.append((out, "output", patient))
        rows.append((out, "rdf:type", "output"))
    rows.append((patient, "rdf:type", "patient"))

    # Numerical features get a node of their own per patient.
    for feature in numerical:
        node = f"{feature}_{pid}"
        rows.append((node, "value_feature", patient))
        rows.append((node, "hasValue", d[feature]))
        rows.append((node, "rdf:type", feature))

    # Categorical features point at a node named after the (integer) value.
    for feature in categorical:
        node = f"{feature}_{int(d[feature])}"
        rows.append((node, "feature", patient))
        rows.append((node, "rdf:type", feature))

    graph += "".join(f"{subj}\t{pred}\t{obj}\n" for subj, pred, obj in rows)

    return add_temporal_events(graph, d, pid)

In [127]:
# Sanity check: print the triples generated for the row at position 2, outcome included.
print(create_graph_patient("", df.iloc[2], True))

P2	hasOutput	Back2Home
Back2Home	output	P2
Back2Home	rdf:type	output
P2	rdf:type	patient
hospital_stay_length_2	value_feature	P2
hospital_stay_length_2	hasValue	56.288059249050704
hospital_stay_length_2	rdf:type	hospital_stay_length
gcs_2	value_feature	P2
gcs_2	hasValue	24.28411462572017
gcs_2	rdf:type	gcs
nb_acte_2	value_feature	P2
nb_acte_2	hasValue	116.9506496526396
nb_acte_2	rdf:type	nb_acte
age_2	value_feature	P2
age_2	hasValue	25.84308527987733
age_2	rdf:type	age
gender_0	feature	P2
gender_0	rdf:type	gender
entry_3	feature	P2
entry_3	rdf:type	entry
entry_code_2	feature	P2
entry_code_2	rdf:type	entry_code
ica_3	feature	P2
ica_3	rdf:type	ica
ttt_0	feature	P2
ttt_0	rdf:type	ttt
ica_therapy_0	feature	P2
ica_therapy_0	rdf:type	ica_therapy
fever_1	feature	P2
fever_1	rdf:type	fever
o2_clinic_0	feature	P2
o2_clinic_0	rdf:type	o2_clinic
o2_1	feature	P2
o2_1	rdf:type	o2
hta_0	feature	P2
hta_0	rdf:type	hta
hct_0	feature	P2
hct_0	rdf:type	hct
tabagisme_1	feature	P2
tabagisme_1	rdf:type	tabag

In [128]:
# Build each patient's triples from an empty graph and join them once at the end.
# create_graph_patient only appends to the string it receives, so the result is
# the same text as threading one growing string through every call, but without
# re-copying the whole accumulated graph for each patient (quadratic work).
# Rows at positions below size_train are serialised with their outcome triples.
fragments = []
for i in range(len(df)):
    fragments.append(create_graph_patient("", df.iloc[i], i < size_train))

kg = "".join(fragments)

In [130]:
# Write the raw tab-separated triples to disk.
# NOTE(review): the content is one "subject<TAB>predicate<TAB>object" line per triple,
# not XML, despite the file extension; confirm what downstream readers expect.
with open("syn_data_graph.xml", "w") as file:
    file.write(kg)

In [131]:
# Replace the tab separators with single spaces; the lines are split on ' ' further down.
turtle = kg.replace("\t", " ")

def process_line(line):
    """Turn one space-separated triple into a Turtle statement.

    The line is split on single spaces. A token that is a complete numeric
    literal (optional sign, digits, optional fraction and exponent) is kept
    as is; every other token is prefixed with ':' so that it becomes a name
    in the default namespace. The tokens are re-joined with spaces and the
    statement terminator ' .' is appended.

    Signed numbers such as ``-1.5`` are recognised as literals, and a token
    that merely starts with a digit (e.g. ``3rd``) is treated as a name.

    NOTE(review): ``rdf:type`` becomes ``:rdf:type``, i.e. a name in the
    default ':' namespace rather than the W3C RDF vocabulary term; confirm
    this is intended.

    Parameters
    ----------
    line : str
        One triple with its terms separated by single spaces.

    Returns
    -------
    str
        The Turtle statement. An empty ``line`` yields ``': .'``.
    """
    # Split the line on single spaces
    parts = line.split(' ')
    # Insert ':' in front of each part except numeric literals
    processed_parts = []
    for part in parts:
        # The whole token must be a number: a leading digit alone is not enough,
        # and a leading sign must not turn a number into a prefixed name.
        if re.fullmatch(r'[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?', part):
            processed_parts.append(part)
        else:
            processed_parts.append(':' + part)
    # Join the parts back with spaces and append ' .'
    return ' '.join(processed_parts) + ' .'

# Break the space-separated text into its individual lines
lines = turtle.split('\n')

# Convert every line but the final element of the split, which is left out of the output
processed_lines = list(map(process_line, lines[:-1]))

# Stitch the statements back together, one per line
output_string = '\n'.join(processed_lines)

# Prepend the declaration of the default ':' prefix used by every name
turtle = f"@prefix : <http://example.org/prefix/> .\n{output_string}"

In [133]:
# Write the Turtle document (prefix declaration followed by one statement per line) to disk.
with open("syn_data_graph.ttl", "w") as file:
    file.write(turtle)

In [14]:
# Persist the 'output' column of every row (cast to int) as a plain Python list.
# NOTE(review): the file name is spelled 'ouput.joblib', presumably a typo for 'output';
# confirm what downstream loaders expect before renaming it.
joblib.dump(df['output'].astype(int).to_list(), 'ouput.joblib')

['ouput.joblib']