# This notebook adds cross-reference email IDs, the RxNorm drug list, and Qwen-enriched content to the JSON structure
- This notebook imports the necessary functions from the utility file emailProcessor.py

### STEP 1: Adding cross-reference email IDs to the JSONL file obtained after the OpenAPI process

In [None]:
from emailProcessor import add_cross_references_emailIds

# Input and output JSONL paths for the cross-reference step.
input_file = "output_data/all_emails_structured.jsonl"
output_file = "output_data/json_with_crossRefs.jsonl"

# NOTE(review): the third positional argument (0.25) is presumably a matching
# threshold -- confirm against emailProcessor.add_cross_references_emailIds.
cross_ref_result = add_cross_references_emailIds(input_file, output_file, 0.25)
data, crossRefIds = cross_ref_result

Extracted 649 items with email bodies
Skipped 3 items without bodies: ['mrwf0232', 'xxpw0232', 'nynw0232']

Cross-references added to 649 items!


### STEP 2: Adding the RxNorm drug list to the JSONL obtained after STEP 1
- Install spaCy, scispaCy, and the biomedical NER model

In [2]:
!pip install "spacy>=3.7.0,<3.8.0" "scispacy>=0.5.3,<0.6.0"
!pip install \
  https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz

Collecting spacy<3.8.0,>=3.7.0
  Downloading spacy-3.7.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting scispacy<0.6.0,>=0.5.3
  Downloading scispacy-0.5.5-py3-none-any.whl.metadata (18 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy<3.8.0,>=3.7.0)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy<3.8.0,>=3.7.0)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy<3.8.0,>=3.7.0)
  Downloading murmurhash-1.0.15-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (2.3 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy<3.8.0,>=3.7.0)
  Downloading cymem-2.0.13-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (9.7 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy<3.8.0,>=3.7.0)
  Downloading preshed-3.0.12-cp312-cp312-manylinux1_x86_64.many

In [None]:
import importlib
import emailProcessor

# Re-execute the emailProcessor module so that the `from emailProcessor import ...`
# statements in later cells bind to its current source. The call is left as the
# cell's last expression, so the reloaded module object is shown as the output.
importlib.reload(emailProcessor)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


<module 'emailProcessor' from '/home/jovyan/work/Group-3/emailProcessor.py'>

In [None]:
from emailProcessor import extractRXnormDrugs

# STEP 1 output is the input here; the result is written to a separate JSONL.
input_file, output_file = (
    'output_data/json_with_crossRefs.jsonl',
    'output_data/json_with_crossRefs_rxnorm.jsonl',
)

extractor = extractRXnormDrugs(input_file, output_file)
extractor.add_rxnorm_drugs_name()

### STEP 3: Extracting additional semantic information using Qwen API and adding these details to JSONL obtained after STEP 2

In [20]:
import os

from dotenv import load_dotenv

# Build the path with os.path.join instead of string concatenation so the
# separator is correct on every platform.
env_path = os.path.join(os.getcwd(), '.env')
print(env_path)
# Kept as the last expression so the notebook displays load_dotenv's return value.
load_dotenv(env_path)

/home/jovyan/work/Group-3/.env


True

#### *Prerequisite* : 
1. Qwen API key and Model name - Stored here in .env file
2. Create two folders to store the batch files before and after processing

#### **STEP 3a**. Splitting the JSONL file into batches

In [None]:
from emailProcessor import QwenEntityExtractor

# The API key and model name are read from the environment (the .env file
# loaded in the previous cell).
api_key2 = os.getenv('QWEN_API_KEY')
model = os.getenv('QWEN_MODEL')

# Fail fast with a clear message instead of handing None to the extractor and
# discovering the problem only when the API calls start failing.
if not api_key2 or not model:
    raise RuntimeError(
        "QWEN_API_KEY and QWEN_MODEL must both be set in the environment "
        "(see the .env prerequisite above)."
    )

input_file = "output_data/json_with_crossRefs_rxnorm.jsonl"  # add path to your input file
extractor = QwenEntityExtractor(api_key=api_key2, model=model)
batch_dir = 'output_data/enriched_batches'  # add path to your folder where you want to store the batch files.

# splitting into batches
batch_files = extractor.split_into_batches(
    input_file=input_file,
    output_dir=batch_dir,
)


Batch Planning:
   Total items: 652
   Items per batch: 10
   Total batches needed: 66

Created 66 batch files in 'enriched_batches/' directory



#### **STEP 3b**. Processing each batch one by one using the LLM

In [None]:

import os

output_batch_dir = 'output_data/processed_batches'  # add path to your folder where you want to store the batch files after processing
# Create the output folder up front so processing does not depend on it having
# been created by hand; exist_ok makes re-running the cell safe.
os.makedirs(output_batch_dir, exist_ok=True)

total_api_calls = 0
# Batch files are addressed by their 1-based, zero-padded index.
# NOTE(review): assumes split_into_batches wrote them as batch_001.json, batch_002.json, ... -- confirm.
for i in range(1, len(batch_files) + 1):
    print(f'Processing batch {i:03d}\n')
    api_calls = extractor.process_batch(
        batch_file=f'{batch_dir}/batch_{i:03d}.json',
        output_file=f'{output_batch_dir}/processed_batch_{i:03d}.json',
    )
    total_api_calls += api_calls
    print(f'Api calls this batch: {api_calls}')
print(f'Total api calls: {total_api_calls}\n')

### STEP 4: Find the failed batches and re-process them

In [None]:
from emailProcessor import reprocessFailedBatch

batch_dir = 'output_data/enriched_batches'            # add path to your input batch directory
enriched_batch_dir = 'output_data/processed_batches'  # add path to your output batch directory which contains the processed failed batches

reprocessor = reprocessFailedBatch(api_key2)

# Look through the processed batches for failures and report how many were found.
errors = reprocessor.find_error_inBatches(enriched_batch_dir)
failed_count = len(errors)
print('Number of failed batches', failed_count)

# Re-run only when at least one failure was reported.
if failed_count:
    print('\nRe-processing failed batch')
    reprocessor.reprocess_failed_batches(batch_dir, enriched_batch_dir)

### STEP 5: Merge all the JSON files in the processed-batches folder into a single JSONL file

In [None]:
# Combine every processed batch file into one JSONL file.
from emailProcessor import merge_batches_to_jsonl

# Folder that holds all processed batches.
enriched_batch = 'output_data/processed_batches'
# Destination of the final merged output.
op_path = 'output_data/enriched_output.jsonl'

merged_items = merge_batches_to_jsonl(enriched_folder=enriched_batch, output_file=op_path)

print(f"All {len(merged_items)} items merged to JSONL!")