In [1]:
import numpy as np
import pandas as pd

### 1. Load the dataset and remove rows with null values 

In [2]:
data_df = pd.read_csv("../data/mtsamples_descriptions_clean.csv")
nonmiss_df = data_df.dropna()
dataset = nonmiss_df.copy()
dataset["length"] = nonmiss_df["text"].apply(lambda x: len(x))
dataset.head()

Unnamed: 0,id,id_description,medical_specialty_new,text,year,borough,length
0,0,0,Gastroenterology,EGD with photos and biopsies,2013.0,Merton,29
1,1,0,Gastroenterology,This is a 75-year-old female who presents wit...,2013.0,Merton,106
2,2,0,Gastroenterology,She has a previous history of hiatal hernia,2013.0,Merton,44
3,3,0,Gastroenterology,She was on Prevacid currently,2013.0,Merton,30
4,4,1,Urology,"Pelvic tumor, cystocele, rectocele, and uteri...",2013.0,Harrow,56


### 2. Transform the identifiers to strings

In [3]:
dataset["id"] = dataset["id"].astype(str)
dataset["id_description"] = dataset["id_description"].astype(str)

### 3. Reshape dataset to use with the API

### 4. Create a list where each element is an ordered dictionary of a row of the data set

In [4]:
from collections import OrderedDict

In [5]:
sentences = dataset.to_dict(into=OrderedDict, orient="records")

In [6]:
sentences[0]

OrderedDict([('id', '0'),
             ('id_description', '0'),
             ('medical_specialty_new', ' Gastroenterology'),
             ('text', ' EGD with photos and biopsies'),
             ('year', 2013.0),
             ('borough', 'Merton'),
             ('length', 29)])

### 5. Load the API key for the Project Debater

In [7]:
import pathlib

apikey_path = pathlib.Path("../APIkey.txt")
api_key = apikey_path.read_text().strip()

### 6. Initialize clients for the two services form the debater API

In [8]:
from debater_python_api.api.debater_api import DebaterApi

debater_api = DebaterApi(apikey=api_key)

#Performs clustering of the data to segment it
arg_quality_client = debater_api.get_argument_quality_client()

#Identifies key points from the semi structured text to for a supporting text.
keypoints_client = debater_api.get_keypoints_client()

### 7. Set a topic and use the argument quality service to select the top 1000 sentences from the dataset more closely related to the chosen topic

In [9]:
topic = """
The patient is a 30-year-old who was admitted with symptoms including obstructions, failures, and pain that started four days ago.
"""

#Pair every test with the topic in a dic
sentences_topic = [
    { "sentence": sentence["text"], "topic": topic, }
    for sentence in sentences
]

#Asign the scores
scores = arg_quality_client.run(sentences_topic)

#Sort the paired sentences by the score, from highest to lowest
sentences_sorted = [
    s
    for s, _ in sorted(zip(sentences,scores), key=lambda x: x[1], reverse=True)
]

#Keep the top 1000
top_k = 1000
sentences_top_1000_aq = sentences_sorted[:top_k]

ArgumentQualityClient: 100%|████████████████| 3245/3245 [00:39<00:00, 82.04it/s]


### 8. Configure two parameters required for the key point analysis service

`mapping_thershold`: A float within 0 and 1 (by default 0.99) that sets the minimum score for a match to be considered.

`n_top_kps`: An integer set by an internal algorithm that sets the number of key points to generate.

### 9. Reshape the data into two structures `sentences_texts` and `sentences_ids`

In [10]:
domain = "medical_demo"

run_params = {
    "mapping_threshold": 0.95,
    "n_top_kps": 20,
}

#List of texts...
sentences_texts = [
    sentence["text"]
    for sentence in sentences_top_1000_aq
]

#...and list with their respective ids
sentences_ids = [
    sentence["id"]
    for sentence in sentences_top_1000_aq
]

### 10. Clear the domain (in case the analysis has been run previously) and load the data for the key point analysis 

In [11]:
keypoints_client.delete_domain_cannot_be_undone(domain)

keypoints_client.upload_comments(
    domain=domain,
    comments_ids=sentences_ids,
    comments_texts=sentences_texts,
    dont_split=True,
)

keypoints_client.wait_till_all_comments_are_processed(domain=domain)

### 11. Run the key point analysis (KPA) job

In [12]:
future = keypoints_client.start_kp_analysis_job(
    domain=domain,
    #comments_ids=sentences_ids,
    run_params=run_params,
)

In [13]:
kpa_result = future.get_result(
    high_verbosity=False,
    polling_timout_secs=5,
)

In [14]:
future.get_job_id()

'62dd700b8e7da8a7796aa4e1'

### 12. Examine the structure of one of the matching SPA results

In [15]:
print(kpa_result["keypoint_matchings"][0]["matching"][0])

{'domain': 'medical_demo', 'comment_id': '1220', 'sentence_id': 0, 'sents_in_comment': 1, 'span_start': 0, 'span_end': 157, 'num_tokens': 26, 'argument_quality': 0.650917649269104, 'sentence_text': '   The patient is a 1-year-old male with a history of chronic otitis media with effusion and conductive hearing loss refractory to outpatient medical therapy', 'score': 0}


### 13. Convert the KPA result to a pandas DataFrame and sample the results

In [18]:
matchings_rows = []

for keypoint_matching in kpa_result["keypoint_matchings"]:
    kp = keypoint_matching["keypoint"]
    
    for match in keypoint_matching["matching"]:
        match_row = [
            kp,
            match["sentence_text"],
            match["score"],
            match["comment_id"],
            match["sentence_id"],
            match["sents_in_comment"],
            match["span_start"],
            match["span_end"],
            match["num_tokens"],
            match["argument_quality"],
        ]
        
    matchings_rows.append(match_row)
        
cols = [
    "kp",
    "sentence_text",
    "match_score",
    "comment_id",
    "sentence_id",
    "sents_in_comment",
    "span_start",
    "span_end",
    "num_tokens",
    "argument_quality",
]

match_df = pd.DataFrame(matchings_rows, columns=cols)
match_df.tail()

Unnamed: 0,kp,sentence_text,match_score,comment_id,sentence_id,sents_in_comment,span_start,span_end,num_tokens,argument_quality
16,"Dysphagia, possible stricture",The patient complained of globus sensation hi...,0.955405,752,0,1,0,107,17,0.523058
17,The patient has a history of malrotation.,An 86-year-old woman with a history of aortic...,0.957704,756,0,1,0,113,19,0.559989
18,"Palpitations, possibly related to anxiety",The patient was originally hospitalized secon...,0.962797,352,0,1,0,82,10,0.490882
19,Cognitive linguistic impairment secondary to ...,"Mild organic brain syndrome, presumably secon...",0.962278,723,0,1,0,89,12,0.565019
20,"Brachytherapy, iodine-125 seed implantation, ...",The placement was confirmed with indirect oph...,0.968803,2337,0,1,0,57,7,0.479318


### 14. Merge the KPA results with related sentences from the input dataset, and save to a csv

In [19]:
df_merge = match_df.merge(
    dataset[["id", "id_description", "medical_specialty_new"]],
    left_on = "comment_id",
    right_on = "id",
    validate = "one_to_one",
)

df_merge.tail()

Unnamed: 0,kp,sentence_text,match_score,comment_id,sentence_id,sents_in_comment,span_start,span_end,num_tokens,argument_quality,id,id_description,medical_specialty_new
16,"Dysphagia, possible stricture",The patient complained of globus sensation hi...,0.955405,752,0,1,0,107,17,0.523058,752,785,Gastroenterology
17,The patient has a history of malrotation.,An 86-year-old woman with a history of aortic...,0.957704,756,0,1,0,113,19,0.559989,756,788,Cardiovascular / Pulmonary
18,"Palpitations, possibly related to anxiety",The patient was originally hospitalized secon...,0.962797,352,0,1,0,82,10,0.490882,352,418,Cardiovascular / Pulmonary
19,Cognitive linguistic impairment secondary to ...,"Mild organic brain syndrome, presumably secon...",0.962278,723,0,1,0,89,12,0.565019,723,746,Psychiatry / Psychology
20,"Brachytherapy, iodine-125 seed implantation, ...",The placement was confirmed with indirect oph...,0.968803,2337,0,1,0,57,7,0.479318,2337,1668,Ophthalmology


In [None]:
df_merge.to_csv("df_merge.csv", index=False)