In [None]:
import colbert

In [None]:
from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection

## Index Configuration

In [None]:
# Index settings read by the indexing/search cell further down.
nbits, doc_maxlen = 2, 300   # 2 bits per embedding dimension; passages truncated at 300 tokens
max_id = 10000               # NOTE(review): not referenced in the visible cells — confirm it is still needed

index_name = 'Mimic.{}bits'.format(nbits)

## Data Cleaning

In [None]:
import os
import json
import random
import time
import re
import pickle
import traceback
from tqdm import tqdm
import pandas as pd

In [None]:
# Retrieval prompts, one per clinical attribute searched for in every note.
# NOTE(review): only the first prompt ends with a period — confirm that is intended.
_QUERY_PREFIX = "Identify the sentence discussing "
queries = [
    _QUERY_PREFIX + topic
    for topic in ("stage.", "histology", "tumor size", "recurrence")
]

In [None]:
## Functions to process the data

# Characters that a section header usually neither starts with nor has
# directly in front of its colon (digits, list bullets, brackets).
_NON_HEADER_CHARS = '0123456789-#[]'


def _is_section_header(line):
    """Return True when a stripped line looks like a ``Title: ...`` section header."""
    colon_at = line.find(':')
    if colon_at < 0:  # also covers the empty line
        return False
    return line[colon_at - 1] not in _NON_HEADER_CHARS and line[0] not in _NON_HEADER_CHARS


def _split_note(note_text):
    """Split one raw note into parallel lists of section titles and section bodies."""
    normalized = note_text.replace('\n\n\n\n', '\n').replace('\n\n\n', '\n').replace('     ', '\n')

    sections, current = [], ' '
    for paragraph in normalized.split('\n'):
        if _is_section_header(paragraph.strip()):
            sections.append(current.strip())
            current = paragraph + ' '
        else:
            current += paragraph + ' '
    # Strip the trailing section like every other one so that its title does
    # not keep leading whitespace (which would defeat the `delete` filter).
    sections.append(current.strip())
    # The first entry is whatever preceded the first header; it is discarded.
    sections.pop(0)

    titles, bodies = [], []
    for section in sections:
        # Every remaining section starts with a header line, so ':' is present.
        title, _, body = section.partition(':')
        titles.append(title)
        bodies.append(body.strip())
    return titles, bodies


def _load_label_map(labels_path):
    """Read the label file into ``{lower-cased keyword: 1-based line number}``.

    A line may list several '/'-separated keywords that share one line number.
    Blank keywords are skipped: an empty string is a substring of every title
    and would otherwise match everything.
    """
    label_map = {}
    with open(labels_path, 'r') as handle:
        for line_no, raw in enumerate(handle, start=1):
            for keyword in raw.strip('\n').lower().split('/'):
                if keyword.strip():
                    label_map[keyword] = line_no
    return label_map


def _assign_categories(titles, label_map):
    """Map each title to the first keyword it contains, carrying the previous match forward."""
    categories = []
    current = 'begin_title'  # category of leading sections that match no keyword
    for title in titles:
        lowered = title.lower()
        if len(lowered) > 2:  # very short titles never start a new category
            for keyword in label_map:
                if keyword in lowered:
                    current = keyword
                    break
        categories.append(current)
    return categories


def _merge_consecutive_sections(titles, bodies, categories):
    """Collapse runs of sections sharing a category into one row each."""
    merged_categories, merged_bodies, first_titles = [], [], []
    previous = None  # sentinel: the first section always opens a new run
    for title, body, category in zip(titles, bodies, categories):
        if category != previous:
            previous = category
            merged_categories.append(category)
            first_titles.append(title)
            merged_bodies.append(body)
        else:
            merged_bodies[-1] = merged_bodies[-1] + ' ' + title + ' ' + body
    return pd.DataFrame({'title': merged_categories, 'text': merged_bodies, 'label': first_titles})


def DataClean(data, labels_path, data_path=None, delete=None):
    """Split clinical notes into labelled sections.

    Parameters
    ----------
    data : pandas.DataFrame or None
        Notes with a ``text`` and a ``note_id`` column. When ``None`` or
        empty, the notes are read from ``data_path`` instead.
    labels_path : str
        Text file with one category per line; '/'-separated alternatives on a
        line belong to the same category.
    data_path : str, optional
        CSV file read with ``pd.read_csv`` when ``data`` is not usable.
    delete : list, optional
        Rows whose ``label`` value is in this list are removed from every
        returned frame.

    Returns
    -------
    dict
        ``{note_id: DataFrame}`` with columns ``title`` (the matched category
        keyword), ``text`` (the merged section text) and ``label`` (the
        original title of the first section of the run). Filtered frames keep
        their original row index.

    Notes
    -----
    Rows are addressed by position, so ``data`` may carry any index. The
    line number of a category in the label file is not part of the result,
    so a label file without a ``begin_title`` entry is accepted.
    """
    print('\n\nImporting and filtering database...')

    if data is not None and not data.empty:
        notes = data
    else:
        notes = pd.read_csv(data_path)

    print('\n\nSplitting each note into sections:\n\n')

    split_notes = {}
    for row in tqdm(range(notes.shape[0])):
        split_notes[notes['note_id'].iloc[row]] = _split_note(notes['text'].iloc[row])

    label_map = _load_label_map(labels_path)

    # Plain lists are used instead of writing into DataFrame cells one by one:
    # chained assignment (frame['col'][i] = x) does not update the frame under
    # pandas Copy-on-Write.
    categorized = {}
    for key in tqdm(list(split_notes.keys())):
        titles, bodies = split_notes[key]
        categorized[key] = (titles, bodies, _assign_categories(titles, label_map))

    notes_sections = {}
    for key in tqdm(categorized.keys()):
        notes_sections[key] = _merge_consecutive_sections(*categorized[key])

    if delete is not None:
        for key, frame in notes_sections.items():
            notes_sections[key] = frame[~frame['label'].isin(delete)]

    return notes_sections

In [None]:
# Load the exported notes and keep only the first ten rows for this experiment.
df = pd.read_csv("/content/malignant_neoplasm_updated_first_100_rows.csv").head(10)


In [None]:
# Sections whose original title is listed here are removed from every note.
dropped_titles = ['Name', 'Admission Date', 'Discharge Date', 'Date of Birth', 'Followup Instructions']

NotesSections = DataClean(df, "/content/labels.txt", delete=dropped_titles)



Importing and filtering database...


Splitting each note into sections:




100%|██████████| 10/10 [00:00<00:00, 641.76it/s]
100%|██████████| 10/10 [00:00<00:00, 208.45it/s]
100%|██████████| 10/10 [00:00<00:00, 1390.73it/s]


## Index & Search

In [None]:
import re

In [None]:
checkpoint = 'colbert-ir/colbertv2.0'

# Sentence boundary pattern, compiled once instead of once per paragraph.
# It splits after '.', '?' or '!' unless the lookbehinds see what looks like
# an abbreviation or initial right before the boundary.
sentence_splitter = re.compile(
    r'(?<!\w\.\w.)(?<![A-Z]\.)(?<![A-Z][a-z]\.)(?<! [a-z]\.)(?<![A-Z][a-z][a-z]\.)(?<=\.|\?|\!)\"*\s*\s*(?:\W*)(?<![A-Z])'
)

# Data maps each note id to {'content': <sentences joined>, 'qas': [{query: [top passages]}, ...]}.
Data = {}

for key, sections in NotesSections.items():

    # Drop sections whose text is empty
    paragraphs = [text for text in sections['text'].tolist() if text != ""]

    # One retrieval passage per sentence; fragments of 10 characters or fewer are dropped.
    collection = []
    for para in paragraphs:
        collection.extend(s for s in sentence_splitter.split(para) if len(s) > 10)

    # The splitter consumes the whitespace between sentences, so they are
    # joined with a space to keep them from being glued together.
    Data[key] = {'content': ' '.join(collection), 'qas': []}

    if not collection:
        # Nothing to index for this note: keep the output schema and move on.
        Data[key]['qas'] = [{query: []} for query in queries]
        continue

    # NOTE(review): rank=1 together with nranks=1 looks off if ranks are 0-based — confirm.
    with Run().context(RunConfig(nranks=1, rank=1, experiment='notebook')):  # nranks specifies the number of GPUs to use

        # kmeans_niters specifies the number of iterations of k-means clustering; 4 is a good and fast default.
        # Consider larger numbers for small datasets.
        config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4)
        indexer = Indexer(checkpoint=checkpoint, config=config)
        # Every note reuses index_name, so each iteration overwrites the previous note's index.
        indexer.index(name=index_name, collection=collection, overwrite=True)

    with Run().context(RunConfig(experiment='notebook')):
        searcher = Searcher(index=index_name, collection=collection)

    for query in queries:

        print(f"#> {query}")

        qas = {query: []}

        # Find the top-k passages for this query
        results = searcher.search(query, k=5)

        # Print out the top-k retrieved passages
        for passage_id, passage_rank, passage_score in zip(*results):
            print(f"\t [{passage_rank}] \t\t {passage_score:.1f} \t\t {searcher.collection[passage_id]}")
            qas[query].append(searcher.collection[passage_id])

        Data[key]['qas'].append(qas)




[Jul 05, 12:17:01] #> Note: Output directory /content/experiments/notebook/indexes/Mimic.2bits already exists


[Jul 05, 12:17:01] #> Will delete 10 files already at /content/experiments/notebook/indexes/Mimic.2bits in 20 seconds...
#> Joined...
[Jul 05, 12:17:36] #> Loading codec...
[Jul 05, 12:17:36] #> Loading IVF...
[Jul 05, 12:17:36] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 5698.78it/s]

[Jul 05, 12:17:36] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 1150.39it/s]

#> Identify the sentence discussing stage.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Identify the sentence discussing stage., 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  6709,  1996,  6251, 10537,  2754,  1012,   102,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

	 [1] 		 8.1 		 No separate primary lesion identified.
	 [2] 		 7.7 		 No pelvic masses are identified.
	 [3] 		 7.0 		 Bilateral symmetrical sclerosis is identified on the iliac side of the sacroiliac joints, consistent with osteitis condensans ilii.
	 [4] 		 6.6 		 Assess for extent of lesions.
	 [5] 		 6.2 		 Calcification is noted within the right pleural cavity (sequ




#> Starting...
#> Joined...
[Jul 05, 12:18:11] #> Loading codec...
[Jul 05, 12:18:11] #> Loading IVF...
[Jul 05, 12:18:11] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 4993.22it/s]

[Jul 05, 12:18:11] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 1160.89it/s]

#> Identify the sentence discussing stage.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Identify the sentence discussing stage., 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  6709,  1996,  6251, 10537,  2754,  1012,   102,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

	 [1] 		 6.2 		 There may be mild pulmonary vascular congestion.
	 [2] 		 5.4 		 The patient is status post right lower lobectomy.
	 [3] 		 5.2 		 Innumerable pulmonary metastases.
	 [4] 		 4.6 		 The mediastinal and hilar contours are relatively unremarkable.
	 [5] 		 4.4 		 Possible mild pulmonary vascular congestion.
#> Identify the sentence discussing histology
	 [1] 




#> Starting...
#> Joined...
[Jul 05, 12:18:46] #> Loading codec...
[Jul 05, 12:18:46] #> Loading IVF...
[Jul 05, 12:18:46] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 4999.17it/s]

[Jul 05, 12:18:46] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 1111.96it/s]

#> Identify the sentence discussing stage.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Identify the sentence discussing stage., 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  6709,  1996,  6251, 10537,  2754,  1012,   102,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

	 [1] 		 5.1 		 The airways are patent to the subsegmental level.
	 [2] 		 5.1 		 Coronal and sagittal reformations are performed.
	 [3] 		 4.9 		 Shortness of breath, evaluate for pulmonary embolism.
	 [4] 		 4.5 		 No pericardial effusion.
	 [5] 		 4.4 		 There are innumerable pulmonary nodules bilaterally, similar in appearance to prior study in most cases although som




#> Starting...
#> Joined...
[Jul 05, 12:19:21] #> Loading codec...
[Jul 05, 12:19:21] #> Loading IVF...
[Jul 05, 12:19:21] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 5482.75it/s]

[Jul 05, 12:19:21] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 352.97it/s]

#> Identify the sentence discussing stage.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Identify the sentence discussing stage., 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  6709,  1996,  6251, 10537,  2754,  1012,   102,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

	 [1] 		 5.1 		 The liver is diffusely involved with innumerable metastatic nodules throughout all portions of the liver and replacing much of the liver parenchyma.
	 [2] 		 3.7 		 There is no evidence of ascites or splenomegaly.
	 [3] 		 3.6 		 ___ female with known metastatic cancer, unknown primary with liver metastases and elevated liver function tests.
	 [4] 		 3.3 	




#> Starting...
#> Joined...
[Jul 05, 12:19:56] #> Loading codec...
[Jul 05, 12:19:56] #> Loading IVF...
[Jul 05, 12:19:56] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 5562.74it/s]

[Jul 05, 12:19:56] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 373.76it/s]

#> Identify the sentence discussing stage.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Identify the sentence discussing stage., 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  6709,  1996,  6251, 10537,  2754,  1012,   102,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

	 [1] 		 7.3 		 No mass is identified.
	 [2] 		 5.3 		 MRI is more sensitive for evaluation of metastases.
	 [3] 		 5.3 		 No contrast was administered.
	 [4] 		 4.5 		 The visualized paranasal sinuses and mastoid air cells are well aerated.
	 [5] 		 4.5 		 Newly diagnosed metastatic colon cancer, evaluate for brain metastases, altered mental status.
#> Identify the sente




#> Starting...
#> Joined...
[Jul 05, 12:20:31] #> Loading codec...
[Jul 05, 12:20:31] #> Loading IVF...
[Jul 05, 12:20:31] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 5053.38it/s]

[Jul 05, 12:20:31] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 1151.96it/s]

#> Identify the sentence discussing stage.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Identify the sentence discussing stage., 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  6709,  1996,  6251, 10537,  2754,  1012,   102,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

	 [1] 		 6.7 		 No fracture is identified.
	 [2] 		 6.7 		 NOTE ADDED AT ATTENDING REVIEW Although I agree there is no evidence of hemorrhage, the mass effect and edema associated with the right frontal lesion have increased since ___.
	 [3] 		 6.4 		 Follow up post-op changes.
	 [4] 		 4.2 		 Basal cisterns remain patent.
	 [5] 		 2.7 		 Axial bone algorithm reformats in




#> Starting...
#> Joined...
[Jul 05, 12:21:07] #> Loading codec...
[Jul 05, 12:21:07] #> Loading IVF...
[Jul 05, 12:21:07] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 1509.29it/s]

[Jul 05, 12:21:07] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 1209.08it/s]

#> Identify the sentence discussing stage.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Identify the sentence discussing stage., 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  6709,  1996,  6251, 10537,  2754,  1012,   102,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

	 [1] 		 7.8 		 After a detailed discussion, informed written consent was obtained.
	 [2] 		 6.0 		 The site was marked.
	 [3] 		 5.6 		 5) Spiral Acquisition 12.
	 [4] 		 5.5 		 Dr. ___ supervised the trainee during the key components of the procedure and reviewed and agrees with the trainee's findings.
	 [5] 		 5.5 		 4) Spiral Acquisition 12.
#> Identify the sentence d




#> Starting...
#> Joined...
[Jul 05, 12:21:42] #> Loading codec...
[Jul 05, 12:21:42] #> Loading IVF...
[Jul 05, 12:21:42] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 4843.31it/s]

[Jul 05, 12:21:42] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 1115.51it/s]

#> Identify the sentence discussing stage.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Identify the sentence discussing stage., 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  6709,  1996,  6251, 10537,  2754,  1012,   102,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

	 [1] 		 5.7 		 Oral contrast was not administered.
	 [2] 		 5.5 		 Small bowel loops demonstrate normal caliber, wall thickness and enhancement throughout.
	 [3] 		 5.1 		 There is 0.
	 [4] 		 5.0 		 ___ year old woman with hx bladder cancer, s/p cystectomy and ileal conduit, post op course c/b pelvic fluid collections and PE, now with frankly blood drain output and wors




#> Starting...
#> Joined...
[Jul 05, 12:22:17] #> Loading codec...
[Jul 05, 12:22:17] #> Loading IVF...
[Jul 05, 12:22:17] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 5562.74it/s]

[Jul 05, 12:22:17] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 1099.42it/s]

#> Identify the sentence discussing stage.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Identify the sentence discussing stage., 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  6709,  1996,  6251, 10537,  2754,  1012,   102,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

	 [1] 		 7.2 		 A fatty structure appears adherent to the lateral right atrial wall abutting an area of atrial calcification could be an interatrial lipoma.
	 [2] 		 6.1 		 The case was posted to the critical findings dashboard for direct notification of the referring physician.
	 [3] 		 5.4 		 The airways are widely patent to subsegmental level bilaterally.
	 [4] 		 5.1 




#> Starting...
#> Joined...
[Jul 05, 12:22:53] #> Loading codec...
[Jul 05, 12:22:53] #> Loading IVF...
[Jul 05, 12:22:53] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 5275.85it/s]

[Jul 05, 12:22:53] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 1072.16it/s]

#> Identify the sentence discussing stage.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Identify the sentence discussing stage., 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  6709,  1996,  6251, 10537,  2754,  1012,   102,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

	 [1] 		 6.4 		 There is a heterogeneous 3.
	 [2] 		 6.3 		 BONE WINDOWS No suspicious lytic or sclerotic osseous lesion is identified.
	 [3] 		 6.0 		 The opacified stomach and intra-abdominal loops of bowel are unremarkable.
	 [4] 		 4.8 		 Two calcified densities within the left buttock are noted, likely injection granulomas.
	 [5] 		 4.7 		 Airways are patent to the s




In [None]:
file_path = 'data.json'

# Persist the per-note retrieval results as JSON indented by four spaces.
with open(file_path, mode='w') as file:
    file.write(json.dumps(Data, indent=4))