In [48]:
import numpy as np
import pandas as pd

### 1. Load the dataset and remove rows with null values 

In [49]:
# Read the raw descriptions and keep only the rows without any missing value.
data_df = pd.read_csv("../data/mtsamples_descriptions_clean.csv")
nonmiss_df = data_df.dropna()

# `assign` returns a new frame, so `nonmiss_df` stays untouched; `length` is the
# number of characters in each text.
dataset = nonmiss_df.assign(length=nonmiss_df["text"].map(len))
dataset.head()

Unnamed: 0,id,id_description,medical_specialty_new,text,year,borough,length
0,0,0,Gastroenterology,EGD with photos and biopsies,2013.0,Merton,29
1,1,0,Gastroenterology,This is a 75-year-old female who presents wit...,2013.0,Merton,106
2,2,0,Gastroenterology,She has a previous history of hiatal hernia,2013.0,Merton,44
3,3,0,Gastroenterology,She was on Prevacid currently,2013.0,Merton,30
4,4,1,Urology,"Pelvic tumor, cystocele, rectocele, and uteri...",2013.0,Harrow,56


### 2. Transform the identifiers to strings

In [50]:
# Both identifier columns become strings; `id` is later sent to the key point
# service as the comment id and used as the merge key.
for id_col in ("id", "id_description"):
    dataset[id_col] = dataset[id_col].astype(str)

### 3. Reshape dataset to use with the API

### 4. Create a list where each element is an ordered dictionary of a row of the data set

In [51]:
from collections import OrderedDict

In [52]:
# One OrderedDict per row, keys in column order.
sentences = [OrderedDict(row) for row in dataset.to_dict(orient="records")]

In [53]:
# Inspect the first record to check its keys and values.
sentences[0]

OrderedDict([('id', '0'),
             ('id_description', '0'),
             ('medical_specialty_new', ' Gastroenterology'),
             ('text', ' EGD with photos and biopsies'),
             ('year', 2013.0),
             ('borough', 'Merton'),
             ('length', 29)])

### 5. Load the API key for the Project Debater

In [54]:
import pathlib

# The key lives in a text file one directory above the notebook; surrounding
# whitespace (e.g. a trailing newline) is trimmed.
apikey_path = pathlib.Path("../APIkey.txt")
with apikey_path.open() as key_file:
    api_key = key_file.read().strip()

### 6. Initialize clients for the two services from the debater API

In [55]:
from debater_python_api.api.debater_api import DebaterApi

debater_api = DebaterApi(apikey=api_key)

# Argument quality client: used below to score each sentence against the topic.
arg_quality_client = debater_api.get_argument_quality_client()

# Key point client: used below to upload the sentences and run the key point analysis job.
keypoints_client = debater_api.get_keypoints_client()

### 7. Set a topic and use the argument quality service to select the top 1000 sentences from the dataset most closely related to the chosen topic

In [56]:
topic = """
The patient is a 30-year-old who was admitted with symptoms including obstructions, failures, and pain that started four days ago.
"""

# Build one {"sentence", "topic"} payload per record; every record is paired
# with the same topic.
sentences_topic = []
for record in sentences:
    sentences_topic.append({"sentence": record["text"], "topic": topic})

# Score the payloads with the argument quality client.
scores = arg_quality_client.run(sentences_topic)

# Rank the records from the highest score to the lowest.
ranked_pairs = sorted(
    zip(sentences, scores),
    key=lambda pair: pair[1],
    reverse=True,
)
sentences_sorted = [record for record, _score in ranked_pairs]

# Keep the best-scoring records. The variable name says "100", but it holds up
# to `top_k` (1000) items; the name is kept because later cells refer to it.
top_k = 1000
sentences_top_100_aq = sentences_sorted[:top_k]

ArgumentQualityClient: 100%|████████████████| 3245/3245 [00:36<00:00, 88.08it/s]
2022-07-16 21:19:26,994 [INFO] argument_quality_client.py 21: argument_quality_client.run = 36901.381969451904ms.


### 8. Configure two parameters required for the key point analysis service

`mapping_threshold`: A float between 0 and 1 (by default 0.99) that sets the minimum score for a match to be considered.

`n_top_kps`: An integer that sets the number of key points to generate; when it is not given, the number is chosen by an internal algorithm.

### 9. Reshape the data into two structures `sentences_texts` and `sentences_ids`

In [63]:
domain = "medical_demo"

run_params = dict(mapping_threshold=0.95, n_top_kps=20)

# Walk the selected records once, filling two parallel lists: position i of
# `sentences_texts` and of `sentences_ids` belong to the same record.
sentences_texts = []
sentences_ids = []
for record in sentences_top_100_aq:
    sentences_texts.append(record["text"])
    sentences_ids.append(record["id"])

### 10. Clear the domain (in case the analysis has been run previously) and load the data for the key point analysis 

In [64]:
# Start from an empty domain. As the method name says, this deletion cannot be undone.
keypoints_client.delete_domain_cannot_be_undone(domain)

# Upload ids and texts as parallel lists.
# NOTE(review): dont_split=True presumably keeps each text as a single sentence
# (the log below reports 1000 comments and 1000 sentences) — confirm in the client docs.
keypoints_client.upload_comments(
    domain=domain,
    comments_ids=sentences_ids,
    comments_texts=sentences_texts,
    dont_split=True,
)

# Wait until the service reports the uploaded comments as processed.
keypoints_client.wait_till_all_comments_are_processed(domain=domain)

2022-07-16 21:26:17,356 [INFO] keypoints_client.py 245: client calls service (delete): https://keypoint-matching-backend.debater.res.ibm.com/data
2022-07-16 21:26:18,567 [INFO] keypoints_client.py 316: uploading 1000 comments in batches
2022-07-16 21:26:18,570 [INFO] keypoints_client.py 245: client calls service (post): https://keypoint-matching-backend.debater.res.ibm.com/comments
2022-07-16 21:26:24,360 [INFO] keypoints_client.py 333: uploaded 1000 comments, out of 1000
2022-07-16 21:26:24,363 [INFO] keypoints_client.py 245: client calls service (get): https://keypoint-matching-backend.debater.res.ibm.com/comments
2022-07-16 21:26:30,002 [INFO] keypoints_client.py 345: domain: medical_demo, comments status: {'processed_comments': 1000, 'processed_sentences': 1000, 'pending_comments': 0}


### 11. Run the key point analysis (KPA) job

In [65]:
# Launch the key point analysis job for the domain; the returned handle is used
# in the next cell to fetch the result.
future = keypoints_client.start_kp_analysis_job(
    domain=domain,
    # comments_ids=sentences_ids is left disabled, so no explicit id list is sent.
    run_params=run_params,
)

2022-07-16 21:26:46,019 [INFO] keypoints_client.py 245: client calls service (post): https://keypoint-matching-backend.debater.res.ibm.com/kp_extraction
2022-07-16 21:26:46,675 [INFO] keypoints_client.py 407: started a kp analysis job - domain: medical_demo, job_id: 62d373668e7da8a7796a92b1


In [66]:
# Fetch the job result through the handle returned when the job was started.
# NOTE(review): `polling_timout_secs` is spelled exactly as this call was written;
# confirm the client's parameter name before "correcting" the spelling.
kpa_result = future.get_result(
    high_verbosity=False,
    polling_timout_secs=5,
)

2022-07-16 21:26:49,192 [INFO] keypoints_client.py 245: client calls service (get): https://keypoint-matching-backend.debater.res.ibm.com/kp_extraction
2022-07-16 21:26:57,663 [INFO] keypoints_client.py 245: client calls service (get): https://keypoint-matching-backend.debater.res.ibm.com/kp_extraction
2022-07-16 21:27:03,691 [INFO] keypoints_client.py 245: client calls service (get): https://keypoint-matching-backend.debater.res.ibm.com/kp_extraction
2022-07-16 21:27:09,691 [INFO] keypoints_client.py 245: client calls service (get): https://keypoint-matching-backend.debater.res.ibm.com/kp_extraction
2022-07-16 21:27:15,751 [INFO] keypoints_client.py 245: client calls service (get): https://keypoint-matching-backend.debater.res.ibm.com/kp_extraction
2022-07-16 21:27:21,755 [INFO] keypoints_client.py 245: client calls service (get): https://keypoint-matching-backend.debater.res.ibm.com/kp_extraction
2022-07-16 21:27:27,775 [INFO] keypoints_client.py 245: client calls service (get): http

In [68]:
# Show the id of the analysis job (the same id appears in the "started a kp analysis job" log line).
future.get_job_id()

'62d373668e7da8a7796a92b1'

### 12. Examine the structure of one of the matching KPA results

In [69]:
# Print the first match of the first key point to see which fields a match carries.
print(kpa_result["keypoint_matchings"][0]["matching"][0])

{'domain': 'medical_demo', 'comment_id': '1220', 'sentence_id': 0, 'sents_in_comment': 1, 'span_start': 0, 'span_end': 157, 'num_tokens': 26, 'argument_quality': 0.650917649269104, 'sentence_text': '   The patient is a 1-year-old male with a history of chronic otitis media with effusion and conductive hearing loss refractory to outpatient medical therapy', 'score': 0}


### 13. Convert the KPA result to a pandas DataFrame and sample the results

In [76]:
# Column names of the flattened table, in output order. The first column is the
# key point text; the remaining ones are read from each match, with the match's
# "score" exposed as "match_score".
cols = [
    "kp",
    "sentence_text",
    "match_score",
    "comment_id",
    "sentence_id",
    "sents_in_comment",
    "span_start",
    "span_end",
    "num_tokens",
    "argument_quality",
]

# Keys read from every match, aligned with cols[1:].
match_fields = [
    "sentence_text",
    "score",
    "comment_id",
    "sentence_id",
    "sents_in_comment",
    "span_start",
    "span_end",
    "num_tokens",
    "argument_quality",
]

# One row per (key point, matched sentence) pair.
matchings_rows = []
for keypoint_matching in kpa_result["keypoint_matchings"]:
    kp = keypoint_matching["keypoint"]
    matchings_rows.extend(
        [kp] + [match[field] for field in match_fields]
        for match in keypoint_matching["matching"]
    )

match_df = pd.DataFrame(matchings_rows, columns=cols)
match_df.tail()

Unnamed: 0,kp,sentence_text,match_score,comment_id,sentence_id,sents_in_comment,span_start,span_end,num_tokens,argument_quality
887,Sepsis due to urinary tract infection.,"Fever, otitis media, and possible sepsis.",0.999713,56,0,1,0,42,6,0.614363
888,Sepsis due to urinary tract infection.,The patient was discovered to have a MRSA bac...,0.999438,1693,0,1,0,155,27,0.552165
889,Sepsis due to urinary tract infection.,Recurrent urinary tract infection in a patien...,0.998664,718,0,1,0,116,16,0.657505
890,Clinical correlation is recommended.,Clinical correlation is recommended.,1.0,929,0,1,0,37,4,0.614774
891,Clinical correlation is recommended.,Maculopapular rash in kind of a linear patter...,0.995213,3042,0,1,0,135,25,0.5533


### 14. Merge the KPA results with related sentences from the input dataset, and save to a csv

In [77]:
# Columns of the input dataset to attach to every match.
dataset_cols = ["id", "id_description", "medical_specialty_new"]

# Join each match to its source row: `comment_id` on the match side is the
# dataset `id`. validate="one_to_one" makes pandas raise if the key is
# duplicated on either side.
df_merge = pd.merge(
    match_df,
    dataset[dataset_cols],
    left_on="comment_id",
    right_on="id",
    validate="one_to_one",
)

df_merge.tail()

Unnamed: 0,kp,sentence_text,match_score,comment_id,sentence_id,sents_in_comment,span_start,span_end,num_tokens,argument_quality,id,id_description,medical_specialty_new
887,Sepsis due to urinary tract infection.,"Fever, otitis media, and possible sepsis.",0.999713,56,0,1,0,42,6,0.614363,56,49,Pediatrics - Neonatal
888,Sepsis due to urinary tract infection.,The patient was discovered to have a MRSA bac...,0.999438,1693,0,1,0,155,27,0.552165,1693,1354,Nephrology
889,Sepsis due to urinary tract infection.,Recurrent urinary tract infection in a patien...,0.998664,718,0,1,0,116,16,0.657505,718,739,Urology
890,Clinical correlation is recommended.,Clinical correlation is recommended.,1.0,929,0,1,0,37,4,0.614774,929,923,Cardiovascular / Pulmonary
891,Clinical correlation is recommended.,Maculopapular rash in kind of a linear patter...,0.995213,3042,0,1,0,135,25,0.5533,3042,2197,Dermatology


In [75]:
# Save the merged table to the working directory, without the DataFrame index.
df_merge.to_csv("df_merge.csv", index=False)