In [None]:
!pip install pykeen

In [1]:
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory

triples_factory = TriplesFactory.from_path('formatted_triples_FINAL.txt')
training, validation, testing = triples_factory.split([0.8, 0.1, 0.1])



result = pipeline(
    training=training,
    validation=validation,
    testing=testing,

    model='transE',
    model_kwargs={
        'embedding_dim': 20,
    },

    optimizer='Adam',
    optimizer_kwargs={
        'lr': 1e-3,
        'weight_decay': 1e-5
    },

    negative_sampler='basic',
    # negative_sampler_kwargs={
    #     'num_negs_per_pos': 1
    # },

    loss='SoftplusLoss',

    training_loop='sLCWA',

    training_kwargs={
        'num_epochs': 150,
        'batch_size': 32,
        'label_smoothing': 0.1
    },

    evaluator_kwargs=  {
        # 'batch_size': 64,
        "filtered": True
    },
    filter_validation_when_testing = True,
)

print(result)


  from .autonotebook import tqdm as notebook_tqdm
using automatically assigned random_state=183999714
No random seed is specified. Setting to 1461782988.
No cuda devices were available. The model runs on CPU
Training epochs on cpu: 100%|██████████| 150/150 [01:23<00:00,  1.80epoch/s, loss=0.492, prev_loss=0.493]
Evaluating on cpu: 100%|██████████| 2.21k/2.21k [00:02<00:00, 785triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 2.86s seconds


PipelineResult(random_seed=1461782988, model=TransE(
  (loss): SoftplusLoss(
    (margin_activation): Softplus(beta=1.0, threshold=20.0)
  )
  (interaction): TransEInteraction()
  (entity_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(7714, 20)
    )
  )
  (relation_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(5, 20)
    )
  )
  (weight_regularizers): ModuleList()
), training=TriplesFactory(num_entities=7714, num_relations=5, create_inverse_triples=False, num_triples=17674, path="/Users/bernardocosta/Desktop/EHRPipeline/EHRPipeline/semantic_completeness/formatted_triples_FINAL.txt"), training_loop=<pykeen.training.slcwa.SLCWATrainingLoop object at 0x30434ad70>, losses=[2.0380067797293497, 1.763980762867988, 1.6679241961208433, 1.6002589554726323, 1.537342095892425, 1.477626315292786, 1.4215251959685176, 1.3699311822060003, 1.3225472941321017, 1.2791590324146622, 1.239633094984196, 1.2037612033795706, 1.17081690179504

In [2]:
# Plot the training loss by epoch
result.plot_losses()

ModuleNotFoundError: No module named 'seaborn'

In [3]:
from pykeen import predict  # or pykeen.models.predict, depending on version

df_predictions = predict.predict_target(
    model=result.model,
    head="Diagnosis/10033/PATIENTS/112578",
    relation="hasCode",
    triples_factory=result.training
).df

# Inspect the top 10
df_predictions.head(10)


Unnamed: 0,tail_id,score,tail_label
7482,7482,-0.490115,icd9#25000
7494,7494,-0.530894,icd9#4019
7492,7492,-0.55132,icd9#311
7505,7505,-0.672903,icd9#5119
7488,7488,-0.674284,icd9#2859
7514,7514,-0.680279,icd9#5849
7507,7507,-0.690865,icd9#51881
7522,7522,-0.736446,icd9#7907
7480,7480,-0.752592,icd9#20300
7485,7485,-0.92146,icd9#2762


### Part of Fact Validation - Generate a csv with the previsions from the cell above

In [7]:
output_file = "../fact_validation/predictions.txt"

with open(output_file, "w", encoding="utf-8") as f:
    for idx, row in df_predictions.head(10).iterrows():
        predicted_code = row["tail_label"] 
        subject_uri = "<http://example.org/Diagnosis/10033/PATIENTS/112578>"
        predicate_uri = "<https://biomedit.ch/rdf/sphn-schema/sphn#hasCode>"
        object_uri = f"<http://example.org/Code/{predicted_code}>"

        triple_line = f"{subject_uri}  {predicate_uri}  {object_uri}"
        f.write(triple_line + "\n")

print(f"Wrote top-10 predictions to {output_file}")


Wrote top-10 predictions to ../fact_validation/predictions.txt
