# TransE Graph Model Training

In [None]:
import numpy as np
import pandas as pd
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
from pykeen.predict import predict_target
import joblib
from sklearn.manifold import TSNE
from itertools import cycle
from sklearn.metrics import RocCurveDisplay, auc, roc_curve, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.preprocessing import LabelBinarizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from scipy.stats import mode
from scipy.special import softmax

In [None]:
# Load the outcome labels and split them positionally: the first
# `size_train` entries are used for training, the remainder for testing.
size_train = 8000
y = joblib.load('./ouput.joblib')
y_train, y_test = y[:size_train], y[size_train:]
size_test = len(y_test)

In [None]:
# Show how often each label value occurs in the full label vector.
pd.DataFrame(y).value_counts()

In [None]:
# Show how often each label value occurs in the training slice.
pd.DataFrame(y_train).value_counts()

In [None]:
# Show how often each label value occurs in the test slice.
pd.DataFrame(y_test).value_counts()

In [None]:
# Build the PyKEEN triples factory from the graph file on disk.
# NOTE(review): `TriplesFactory.from_path` presumably expects a tab-separated
# head/relation/tail file — confirm the file really has that layout despite
# its `.xml` extension.
tf = TriplesFactory.from_path("syn_data_graph.xml")

In [None]:
# Train the embedding model `n_runs` times and, after each run, score every
# test patient against the three outcome entities.
n_runs = 1
epochs = 10

# Outcome entity labels. A label's position in this list is the class index
# written to `y_pred` and the column index used in `y_probs`.
outcome_labels = ["Back2Home", "Reabilitation", "Death"]

# Per-run result buffers.
y_pred = np.zeros((n_runs, len(y_test)))
y_probs = np.zeros((n_runs, len(y_test), len(outcome_labels)))
log_loss = np.zeros((n_runs, epochs))
hits = np.zeros((n_runs, 4))  # columns: Hits@1, Hits@3, Hits@5, Hits@10

# The run configuration is the same for every run, so it is set once.
model_name = 'TransE'
dataset = 'RIA'
embedding_dim = 5

for run in range(n_runs):
    # NOTE(review): `testing=tf` evaluates on exactly the triples used for
    # training, so the Hits@k values are not held-out estimates. If the graph
    # also contains the `hasOutput` triples of the test patients, the
    # predictions below are affected by label leakage — verify.
    result = pipeline(
        model=model_name,
        training=tf,
        testing=tf,
        model_kwargs=dict(
            embedding_dim=embedding_dim,
            loss="softplus",
        ),
        optimizer_kwargs=dict(
            lr=0.001,
            weight_decay=1e-4,
        ),
        training_kwargs=dict(
            num_epochs=epochs,
            use_tqdm_batch=True,
        ),
        training_loop='sLCWA',
        negative_sampler='basic',
        device='gpu',
        use_tqdm=True,
    )

    # Plot the training-loss curve of this run.
    loss_plot = result.plot_losses()

    log_loss[run] = result.losses

    # Score all candidate tails of (patient, hasOutput, ?) per test patient.
    # Test patients are the entities named P<size_train>, P<size_train + 1>, ...
    preds = [
        predict_target(
            model=result.model,
            head=f"P{size_train + offset}",
            relation="hasOutput",
            triples_factory=tf,
        )
        for offset in range(len(y_test))
    ]

    for row, pred in enumerate(preds):
        # Reset the index once so that index values equal row positions.
        ranked = pred.df.reset_index(drop=True)
        tail_labels = ranked['tail_label']
        # Row position of each outcome entity in the prediction table.
        positions = [
            tail_labels[tail_labels == outcome].index[0]
            for outcome in outcome_labels
        ]
        # Predicted class = outcome listed earliest in the table
        # (presumably the table is sorted by descending score — confirm).
        y_pred[run, row] = np.argmin(positions)
        # Softmax over the three outcome scores only, as pseudo-probabilities.
        y_probs[run, row] = softmax(ranked['score'].iloc[positions].to_numpy())

    # Stored unrounded so the mean over runs is not computed from values that
    # were already rounded; rounding is applied where the values are printed.
    hits[run] = [result.get_metric(f'hits_at_{k}') for k in (1, 3, 5, 10)]


In [None]:
# Collapse the per-run results into a single result per test patient.
# NOTE: this cell rebinds `y_pred` and `hits`, so it must be executed exactly
# once after the training cell.

# Majority vote over runs. `np.ravel` flattens the (1, n) array that older
# SciPy versions return from `mode`, and the cast turns the float-valued
# class indices into integer labels.
y_pred = np.ravel(mode(y_pred, axis=0).mode).astype(int)
# Mean class scores over runs, shape (n_test, n_classes).
y_score = np.mean(y_probs, axis=0)
# Mean Hits@k over runs.
hits = np.mean(hits, axis=0)

In [None]:
# Inspect the prediction object of the first test patient (from the last
# run, since `preds` is rebuilt on every run).
preds[0]

In [None]:
# Report the Hits@k metrics, rounded to two decimals.
print("EVALUATION")
for k, value in zip((1, 3, 5, 10), hits):
    print(f"Hits@{k}", round(value, 2))

In [None]:
# Confusion matrix of true vs. predicted outcome classes on the test set.
# `labels` is pinned to the three class indices so the matrix is always 3x3
# and lines up with the display labels, even if a class never occurs in
# `y_test` or `y_pred`.
matrix = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
disp = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=["Back2Home", "Reabilitation", "Death"])
disp.plot()
plt.show()

In [None]:
# Show the raw confusion-matrix counts.
matrix

In [None]:
# Per-class precision / recall / F1 on the test set. `labels` is pinned to the
# three class indices so `target_names` always matches, even when a class is
# missing from `y_test` and `y_pred`.
print(classification_report(y_test, y_pred, labels=[0, 1, 2], target_names=["Back2Home", "Reabilitation", "Death"]))

In [None]:
# One-vs-rest multiclass ROC AUC on the averaged class scores.
# The result is stored as `auc_roc`, not `auc`, so that the `auc` function
# imported from sklearn.metrics stays callable in the ROC-curve cells below.
auc_roc = round(roc_auc_score(y_test, y_score, multi_class='ovr'), 2)
print("AUC ROC:", auc_roc)

In [None]:
# Pull the full entity and relation embedding matrices out of the trained
# model (last run) and move them to the CPU.
model = result.model

entity_representation = model.entity_representations[0]
relation_representation = model.relation_representations[0]

# `indices=None` requests the representations of all entities / relations.
entity_embedding_tensor = entity_representation(indices=None).cpu()
relation_embedding_tensor = relation_representation(indices=None).cpu()

In [None]:
# Colour per training patient, chosen by its outcome class index.
palette = ['g', 'b', 'r']
colors = [palette[label] for label in y_train]
labels = list(y_train)

# Embedding vectors of the training patients (entities P0 .. P<size_train - 1>).
patients_names = [f"P{idx}" for idx in range(size_train)]
train_pos = entity_embedding_tensor[tf.entities_to_ids(patients_names)].detach().numpy()

In [None]:
# Embedding vectors of the three outcome entities, with one marker colour
# per outcome (same order as `outcomes_names`).
outcomes_names = ["Back2Home", "Reabilitation", "Death"]
out_colors = ['g', 'b', 'r']
out_pos = entity_embedding_tensor[tf.entities_to_ids(outcomes_names)].detach().numpy()

In [None]:
# Fit a 2-component PCA on the training-patient embeddings and project both
# the patients and the outcome entities into that plane.
# NOTE: `train_pos` and `out_pos` are overwritten with their 2-D projections.
pca = PCA(n_components=2)
train_pos, out_pos = pca.fit_transform(train_pos), pca.transform(out_pos)

In [None]:
# Share of total variance captured by the two principal components.
# `explained_variance_ratio_` holds fractions of the total variance, so it is
# scaled by 100 to match the "%" in the message (`explained_variance_` holds
# absolute variances and is not a percentage).
print(f"{round(100 * sum(pca.explained_variance_ratio_), 2)}% variance explained")

In [None]:
# Embedding vectors of the two relations that are drawn as arrows.
relation_names = ["hasOutput", "output"]
rel_pos = relation_embedding_tensor[tf.relations_to_ids(relation_names)].detach().numpy()

# Arrow tails for quiver: first row = x coordinates, second row = y
# coordinates, i.e. both arrows start at (0, 0).
origin = np.zeros((2, 2), dtype=int)

In [None]:
# 2-D PCA view: training patients (dots), outcome entities (X markers) and
# relation vectors (arrows from the origin).

# `arrows` was referenced but never defined. Relation embeddings are
# translation vectors, so they are projected onto the PCA axes without the
# mean-centering that `pca.transform` applies to points. After the transpose,
# row 0 holds the x components and row 1 the y components of the arrows.
arrows = (rel_pos @ pca.components_.T).T

plt.scatter(train_pos[:,0], train_pos[:,1], c=colors)
plt.scatter(out_pos[:,0], out_pos[:,1], s=200, marker='X', edgecolors=out_colors, facecolors=['w'] * len(out_colors))
plt.quiver(*origin, arrows[0, :], arrows[1, :], angles='xy', scale_units='xy', scale=1, color=['black', 'black'])

# Empty scatters exist only to create the legend entries.
plt.scatter([], [], marker='X', edgecolors='g', facecolors=['w'], label='Back2Home')
plt.scatter([], [], marker='X', edgecolors='b', facecolors=['w'], label='Reabilitation')
plt.scatter([], [], marker='X', edgecolors='r', facecolors=['w'], label='Death')
plt.scatter([], [], color='black', label='hasOutput', marker='^')

plt.legend(loc='lower right')
plt.show()

In [None]:
# Re-read the raw (un-projected) embeddings, because `train_pos` and `out_pos`
# were overwritten by their PCA projections in an earlier cell.
train_pos = entity_embedding_tensor[tf.entities_to_ids(patients_names)].detach().numpy()
out_pos = entity_embedding_tensor[tf.entities_to_ids(outcomes_names)].detach().numpy()
rel_pos = relation_embedding_tensor[tf.relations_to_ids(relation_names)].detach().numpy()

In [None]:
# Stack patients, outcomes and relations (in that row order) and embed them
# jointly in 2-D with t-SNE.
tsne = TSNE(n_components=2)
points = np.vstack((train_pos, out_pos, rel_pos))
points_tsne = tsne.fit_transform(points)

In [None]:
# Split the t-SNE output back into its parts: the last 2 rows are the
# relations, the 3 rows before them the outcomes, everything else patients.
patient_xy = points_tsne[:-5]
outcome_xy = points_tsne[-5:-2]
relation_xy = points_tsne[-2:]

plt.scatter(patient_xy[:, 0], patient_xy[:, 1], c=colors)
plt.scatter(
    outcome_xy[:, 0],
    outcome_xy[:, 1],
    s=200,
    marker='X',
    edgecolors=out_colors,
    facecolors=['w'] * len(out_colors),
)
plt.quiver(*origin, relation_xy[:, 0], relation_xy[:, 1], scale=100, color=['b', 'r'])
plt.show()

In [None]:
# One-hot encode the test labels, using the classes seen in the training
# labels.
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape

In [None]:
# Containers for the ROC curves and areas of every class and averaging
# strategy, keyed by class index or strategy name.
fpr, tpr, roc_auc = {}, {}, {}

# Micro-average: pool all (sample, class) decisions into one binary problem.
micro_fpr, micro_tpr, _ = roc_curve(y_onehot_test.ravel(), y_score.ravel())
fpr["micro"], tpr["micro"] = micro_fpr, micro_tpr
roc_auc["micro"] = auc(micro_fpr, micro_tpr)

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")

In [None]:
n_classes = 3

# One-vs-rest ROC curve and area for every class.
for cls in range(n_classes):
    cls_fpr, cls_tpr, _ = roc_curve(y_onehot_test[:, cls], y_score[:, cls])
    fpr[cls], tpr[cls] = cls_fpr, cls_tpr
    roc_auc[cls] = auc(cls_fpr, cls_tpr)

# Common FPR grid on which all class curves are linearly interpolated.
fpr_grid = np.linspace(0.0, 1.0, 1000)

# Macro-average: unweighted mean of the interpolated per-class TPR curves.
mean_tpr = sum(
    (np.interp(fpr_grid, fpr[cls], tpr[cls]) for cls in range(n_classes)),
    np.zeros_like(fpr_grid),
) / n_classes

fpr["macro"], tpr["macro"] = fpr_grid, mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")

In [None]:
# Fixed per-class weights for the weighted average, in class-index order.
class_weights = [0.443396, 0.432075, 0.124529]

# Recompute each one-vs-rest ROC curve, passing the class weight as the
# weight of every sample. This overwrites the per-class entries of
# `fpr`, `tpr` and `roc_auc`.
# NOTE(review): the weight is identical for all samples of a class, so it
# presumably does not change the curve — confirm whether per-sample weights
# were intended.
for cls in range(n_classes):
    sample_weight = np.full(y_onehot_test.shape[0], class_weights[cls])
    cls_fpr, cls_tpr, _ = roc_curve(
        y_onehot_test[:, cls],
        y_score[:, cls],
        sample_weight=sample_weight,
    )
    fpr[cls], tpr[cls] = cls_fpr, cls_tpr
    roc_auc[cls] = auc(cls_fpr, cls_tpr)

# Common FPR grid on which all class curves are linearly interpolated.
fpr_grid = np.linspace(0.0, 1.0, 1000)

# Weighted sum of the interpolated per-class TPR curves.
weighted_mean_tpr = np.zeros_like(fpr_grid)
for cls in range(n_classes):
    weighted_mean_tpr += class_weights[cls] * np.interp(fpr_grid, fpr[cls], tpr[cls])

fpr["weighted_macro"], tpr["weighted_macro"] = fpr_grid, weighted_mean_tpr
roc_auc["weighted_macro"] = auc(fpr["weighted_macro"], tpr["weighted_macro"])

print(f"Weighted Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['weighted_macro']:.2f}")


In [None]:
# Draw all ROC curves (three averaging strategies plus one curve per class)
# on a single axis and save the figure to disk.
fig, ax = plt.subplots(figsize=(6, 6))

target_names = ["Back2Home", "Reabilitation", "Death"]

# (key in fpr / tpr / roc_auc, legend prefix, line colour) per averaged curve.
average_curves = [
    ("micro", "micro-average", "green"),
    ("macro", "macro-average", "navy"),
    ("weighted_macro", "weighted-average", "deeppink"),
]
for key, legend_name, line_color in average_curves:
    ax.plot(
        fpr[key],
        tpr[key],
        label=f"{legend_name} (AUC = {roc_auc[key]:.2f})",
        color=line_color,
        linestyle=":",
        linewidth=4,
    )

# Kept under its own name so the notebook-global `colors` (the per-patient
# scatter colours) is not replaced by a colour cycle, which would break
# re-running the scatter-plot cells.
curve_colors = ["aqua", "darkorange", "cornflowerblue"]
for class_id, color in zip(range(n_classes), cycle(curve_colors)):
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_score[:, class_id],
        name=f"{target_names[class_id]}",
        color=color,
        ax=ax,
        # Draw the chance diagonal only once, together with the last class.
        plot_chance_level=(class_id == n_classes - 1),
    )

_ = ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass",
)

fig.savefig("ROC_quadruples4.png")

In [None]:
# Colour per test patient, chosen by its true outcome class index.
# NOTE(review): this palette order (g, r, b) differs from the (g, b, r) order
# used for the training-set plot — confirm which class-to-colour mapping is
# intended.
colors = [['g', 'r', 'b'][i] for i in y_test]
# Labels of the patients handled in this cell (test patients, so `y_test`
# rather than `y_train`).
labels = list(y_test)
# Test patients are the entities P<size_train> .. P<size_train + size_test - 1>.
patients_names = [f"P{size_train + i}" for i in range(size_test)]
# The `train_pos` name is reused by the plotting cells below; from here on it
# holds the embeddings of the test patients.
train_pos = entity_embedding_tensor[tf.entities_to_ids(patients_names)]
train_pos = train_pos.detach().numpy()

In [None]:
# Embedding vectors of the three outcome entities, with one marker colour
# per outcome (same order as `outcomes_names`).
outcomes_names = ["Back2Home", "Reabilitation", "Death"]
out_colors = ['g', 'r', 'b']
out_pos = entity_embedding_tensor[tf.entities_to_ids(outcomes_names)].detach().numpy()

In [None]:
# Gather the raw embeddings for the current `patients_names`, the outcome
# entities and the relations, as NumPy arrays.
train_pos = entity_embedding_tensor[tf.entities_to_ids(patients_names)].detach().numpy()
out_pos = entity_embedding_tensor[tf.entities_to_ids(outcomes_names)].detach().numpy()
rel_pos = relation_embedding_tensor[tf.relations_to_ids(relation_names)].detach().numpy()

In [None]:
# Stack patients, outcomes and relations (in that row order) and embed them
# jointly in 2-D with t-SNE.
tsne = TSNE(n_components=2)
points = np.vstack((train_pos, out_pos, rel_pos))
points_tsne = tsne.fit_transform(points)

In [None]:
# Split the t-SNE output back into its parts: the last 2 rows are the
# relations, the 3 rows before them the outcomes, everything else patients.
patient_xy = points_tsne[:-5]
outcome_xy = points_tsne[-5:-2]
relation_xy = points_tsne[-2:]

plt.scatter(patient_xy[:, 0], patient_xy[:, 1], c=colors)
plt.scatter(
    outcome_xy[:, 0],
    outcome_xy[:, 1],
    s=200,
    marker='X',
    edgecolors=out_colors,
    facecolors=['w'] * len(out_colors),
)
plt.quiver(*origin, relation_xy[:, 0], relation_xy[:, 1], scale=100, color=['b', 'r'])
plt.show()