# Medical Coding Assignment - RAG Solution

This notebook implements a RAG-based solution for medical coding using:
- **Falcon 7B**: Local LLM (`tiiuae/falcon-7b-instruct`) used to generate medical code suggestions
- **ChromaDB**: Vector database for storing medical knowledge
- **Sentence Transformers**: For generating embeddings
- **Real ICD and CPT codes**: Loaded from provided Excel files

## Problem Overview
The task involves extracting and mapping medical codes (ICD and CPT) from clinical text using RAG with local LLMs and vector databases.


In [3]:
# Import required libraries
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import spacy
import json
import re
from typing import List, Dict, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")


  from .autonotebook import tqdm as notebook_tqdm


Libraries imported successfully!


In [2]:
# Read the ICD and CPT reference tables from the Excel workbooks in the working directory
print("Loading medical codes from Excel files...")

# ICD table
icd_df = pd.read_excel('ICD_code_Assignment.xlsx')
print(f"Loaded {icd_df.shape[0]} ICD codes")

# CPT table
cpt_df = pd.read_excel('cpt_code_assignment.xlsx')
print(f"Loaded {cpt_df.shape[0]} CPT codes")

# Preview the first rows of each table (heading line, then the frame)
print("\nSample ICD codes:", icd_df.head(), sep="\n")
print("\nSample CPT codes:", cpt_df.head(), sep="\n")


Loading medical codes from Excel files...
Loaded 295 ICD codes
Loaded 20 CPT codes

Sample ICD codes:
  ICD Code                                        Description
0    A04.8    Other specified bacterial intestinal infections
1    A63.0                        Anogenital (venereal) warts
2   B37.81                               Candidal esophagitis
3    B82.9                 Intestinal parasitism, unspecified
4   B96.81  Helicobacter pylori [H. pylori] as the cause o...

Sample CPT codes:
   CPT Code                                      Description
0     43200      Esophagoscopy Flexible Transoral Diagnostic
1     43202     Esophagoscopy Flexible Transoral With Biopsy
2     43220     Esophagoscopy Flex Balloon Dilat <30 Mm Diam
3     43235  Esophagogastroduodenoscopy Transoral Diagnostic
4     43236  Esophagogastroduodenoscopy Submucosal Injection


In [5]:
# Load the Falcon 7B instruct checkpoint and wrap it in a text-generation pipeline
print("Loading Falcon 7B model...")
model_name = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Keyword arguments for the checkpoint load.
# `dtype` is used here in place of the older `torch_dtype` keyword.
# NOTE(review): trust_remote_code=True lets code shipped with the model repo run locally.
model_load_options = {
    "trust_remote_code": True,
    "dtype": torch.bfloat16,
    "device_map": "auto",
    "low_cpu_mem_usage": True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)

# Sampling settings applied to every call of the pipeline
generation_options = {
    "max_new_tokens": 512,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
}

# Create text generation pipeline
falcon_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_options
)
print("Falcon 7B model loaded successfully!")


Loading Falcon 7B model...


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Fetching 2 files: 100%|██████████| 2/2 [00:31<00:00, 15.86s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.17it/s]
Some parameters are on the meta device because they were offloaded to the cpu and disk.
Device set to use cpu


Falcon 7B model loaded successfully!


In [7]:
import pandas as pd
import json
import re
from typing import List, Dict
import chromadb
from sentence_transformers import SentenceTransformer
from datetime import datetime

class MedicalCodeExtractor:
    """Enhanced rule-based and RAG-based medical code extraction"""
    
    def __init__(self, icd_excel_path: str, cpt_excel_path: str):
        # Initialize ChromaDB
        self.client = chromadb.Client()
        
        # Initialize embedding model
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        
        # Clear existing collections and create new ones
        try:
            self.client.delete_collection("icd_codes")
        except:
            pass
        try:
            self.client.delete_collection("cpt_codes")
        except:
            pass
            
        # Create collections
        self.icd_collection = self.client.create_collection(name="icd_codes")
        self.cpt_collection = self.client.create_collection(name="cpt_codes")
        
        # Load data from Excel files
        self._load_medical_codes(icd_excel_path, cpt_excel_path)
        
        # Enhanced entity lexicon for rule-based extraction
        self.entity_lexicon = self._create_entity_lexicon()
    
    def _create_entity_lexicon(self):
        """Create comprehensive entity lexicon for rule-based extraction"""
        return {
            "CLINICAL": [
                # Multi-word phrases FIRST (order matters for precedence)
                "colon cancer screening",
                "personal history of colonic polyps", 
                "right upper quadrant abdominal pain",
                "atypical chest pain",
                "rectal bleeding",
                "no immediate complications",
                "minimal estimated blood loss", 
                "good bowel preparation",
                "no polyps found",
                "polyp removal",
                "sessile polyp",
                "abdominal pain",
                "chest pain",
                # Single words
                "screening", "bleeding", "hemorrhoids", "history"
            ],
            
            "ANATOMY": [
                # Multi-word phrases FIRST
                "right upper quadrant",
                "2nd portion of duodenum", 
                "appendiceal orifice",
                "sigmoid colon",
                "ascending colon",
                "descending colon",
                "transverse colon",
                "splenic flexure",
                "hepatic flexure", 
                "terminal ileum",
                "distal rectum",
                "proximal colon",
                "distal esophagus",
                "duodenal bulb",
                "ileocecal valve",
                "anal canal",
                "anal verge",
                "stomach body", 
                "left colon",
                "right colon",
                # Single words
                "rectum", "sigmoid", "colon", "cecum", "ileum", "anus",
                "esophagus", "stomach", "duodenum", "antrum", "pylorus", "z-line"
            ],
            
            "DIAGNOSIS": [
                # Multi-word phrases FIRST  
                "moderate sigmoid diverticulosis",
                "sigmoid diverticulosis",
                "internal hemorrhoids",
                "barrett's esophagus",
                "mild antral and body gastritis", 
                "ulcer of anus and rectum",
                "hemorrhage of anus and rectum",
                "localized erosion",
                "sessile polyp",
                "colon polyps",
                "colonic polyps",
                "melanosis coli",
                "colorectal cancer",
                "mild gastritis",
                # Two word combos
                "colon cancer",
                # Single words
                "diverticulosis", "hemorrhoids", "gastritis", "proctitis", 
                "erosion", "polyp", "polyps", "ulcer", "cancer"
            ],
            
            "PROCEDURE": [
                # Multi-word phrases FIRST
                "cold snare polypectomy",
                "esophagogastroduodenoscopy", 
                "boston bowel preparation scoring",
                "monitored anesthesia care",
                "intravenous medication administration",
                "narrow band imaging",
                "retroflexion in rectum",
                "scope passage to cecum",
                "bowel preparation",
                "cold forceps", 
                "cold snare",
                # Single words
                "colonoscopy", "polypectomy", "retroflexion", "sigmoidoscopy",
                "endoscopy", "biopsy", "biopsies", "egd", "dilation", "suctioning"
            ]
        }
    
    def _load_medical_codes(self, icd_path: str, cpt_path: str):
        """Load both ICD and CPT codes into vector database"""
        print("Loading medical codes into vector database...")
        
        # Load ICD codes
        try:
            icd_df = pd.read_excel(icd_path)
            icd_data = []
            for _, row in icd_df.iterrows():
                code = str(row.iloc[0]).strip()
                description = str(row.iloc[1]).strip()
                if code and description and code != 'nan' and len(code) > 1:
                    icd_data.append({
                        "code": code,
                        "description": description,
                        "type": "ICD-10"
                    })
            print(f"📊 Found {len(icd_data)} ICD codes")
        except Exception as e:
            print(f"❌ Error loading ICD codes: {e}")
            icd_data = []
        
        # Load CPT codes
        try:
            cpt_df = pd.read_excel(cpt_path)
            cpt_data = []
            for _, row in cpt_df.iterrows():
                code = str(row.iloc[0]).strip()
                description = str(row.iloc[1]).strip()
                if code and description and code != 'nan' and len(code) > 1:
                    clean_code = re.sub(r'\D', '', code).zfill(5)
                    if len(clean_code) == 5:  # Ensure valid CPT code
                        cpt_data.append({
                            "code": clean_code,
                            "description": description,
                            "type": "CPT"
                        })
            print(f"📊 Found {len(cpt_data)} CPT codes")
        except Exception as e:
            print(f"❌ Error loading CPT codes: {e}")
            cpt_data = []
        
        # Add to ChromaDB
        if icd_data:
            self._add_codes_to_collection(self.icd_collection, icd_data, "icd")
        if cpt_data:
            self._add_codes_to_collection(self.cpt_collection, cpt_data, "cpt")
        
        print(f"✅ Loaded {len(icd_data)} ICD codes and {len(cpt_data)} CPT codes")
    
    def _add_codes_to_collection(self, collection, code_data, code_type: str):
        """Add codes to ChromaDB collection"""
        documents = []
        metadatas = []
        ids = []
        
        for i, item in enumerate(code_data):
            enhanced_doc = f"{item['code']}: {item['description']}"
            documents.append(enhanced_doc)
            metadatas.append(item)
            ids.append(f"{code_type}_{i}")
        
        # Process in batches
        batch_size = 100
        for i in range(0, len(documents), batch_size):
            batch_docs = documents[i:i+batch_size]
            batch_metas = metadatas[i:i+batch_size]
            batch_ids = ids[i:i+batch_size]
            
            embeddings = self.embedding_model.encode(batch_docs).tolist()
            
            collection.add(
                embeddings=embeddings,
                documents=batch_docs,
                metadatas=batch_metas,
                ids=batch_ids
            )
    
    def extract_entities_rule_based(self, text: str) -> Dict:
        """Enhanced rule-based extraction using lexicon approach"""
        text_lower = text.lower()
        
        found_entities = {
            "Clinical_Terms": set(),
            "Anatomical_Locations": set(),
            "Diagnosis": set(),
            "Procedure": set()
        }
        
        # Map lexicon categories to output categories
        category_map = {
            "CLINICAL": "Clinical_Terms",
            "ANATOMY": "Anatomical_Locations", 
            "DIAGNOSIS": "Diagnosis",
            "PROCEDURE": "Procedure"
        }
        
        # Extract using lexicon - process multi-word phrases first
        for entity_type, terms in self.entity_lexicon.items():
            output_category = category_map[entity_type]
            
            # Sort by length (longest first) to handle nested phrases correctly
            sorted_terms = sorted(terms, key=len, reverse=True)
            
            for term in sorted_terms:
                # Create pattern with word boundaries
                pattern = r'\b' + re.escape(term.lower()) + r'\b'
                
                # Find matches
                if re.search(pattern, text_lower):
                    found_entities[output_category].add(term)
        
        # Enhanced section-based extraction for diagnoses
        self._extract_structured_sections(text, found_entities)
        
        # Clean and format results
        return self._clean_entities(found_entities)
    
    def _extract_structured_sections(self, text: str, entities: Dict):
        """Extract entities from structured report sections"""
        text_lower = text.lower()
        
        # Extract from diagnosis sections
        diagnosis_patterns = [
            r'diagnosis\s*:([^•]+?)(?=procedure|impression|$)',
            r'pre-operative diagnosis\s*:([^•]+?)(?=procedure|post-operative|$)',
            r'post-operative diagnosis\s*:([^•]+?)(?=procedure|$)',
            r'impression\s*:([^•]+?)(?=plan|$)',
            r'indication\s*:([^•]+?)(?=diagnosis codes|procedure|$)'
        ]
        
        for pattern in diagnosis_patterns:
            matches = re.findall(pattern, text_lower, re.IGNORECASE | re.DOTALL)
            for match in matches:
                # Extract individual diagnoses
                diagnoses = re.split(r'[,;]', match)
                for diag in diagnoses:
                    diag_clean = diag.strip()
                    if len(diag_clean) > 5:
                        entities["Diagnosis"].add(diag_clean)
        
        # Extract from procedure sections  
        procedure_patterns = [
            r'procedure\s*:([^•]+?)(?=findings|impression|$)',
            r'procedures\s*:([^•]+?)(?=findings|impression|$)',
            r'colonoscopy procedure\s*:([^•]+?)(?=findings|impression|$)',
            r'egd procedure\s*:([^•]+?)(?=findings|impression|$)'
        ]
        
        for pattern in procedure_patterns:
            matches = re.findall(pattern, text_lower, re.IGNORECASE | re.DOTALL)
            for match in matches:
                procedures = re.split(r'[,;]', match)
                for proc in procedures:
                    proc_clean = proc.strip()
                    if len(proc_clean) > 5:
                        entities["Procedure"].add(proc_clean)
    
    def _clean_entities(self, entities: Dict) -> Dict:
        """Clean and format extracted entities"""
        cleaned = {}
        
        for category, entity_set in entities.items():
            # Convert to list and remove empty strings
            entity_list = [item for item in entity_set if item and len(item.strip()) > 2]
            
            # Remove duplicates (case-insensitive)
            seen = set()
            unique_entities = []
            for entity in entity_list:
                entity_lower = entity.lower()
                if entity_lower not in seen:
                    seen.add(entity_lower)
                    unique_entities.append(entity)
            
            # Sort and limit
            cleaned[category] = sorted(unique_entities)[:15]
        
        return cleaned
    
    def extract_clinical_context(
        self,
        report_text: str,
        max_context_chars: int = 1500,
        max_fallback_chars: int = 1000,
    ) -> str:
        """Build the query text for code retrieval from a report.

        Whitespace runs are collapsed to single spaces, then the bodies of
        diagnosis-, procedure- and findings-like sections are captured and
        joined with spaces. When no section is found the beginning of the
        normalised report is used instead.

        Args:
            report_text: Raw report text.
            max_context_chars: Cap applied to the joined section text
                (default 1500).
            max_fallback_chars: Cap applied when falling back to the whole
                report (default 1000).

        Returns:
            The context string, possibly empty.
        """
        # Collapse newlines / repeated spaces. The section regexes below run
        # on this normalised text; previously they ran on the raw report, so
        # the normalisation only affected the fallback branch and captured
        # sections kept their raw whitespace.
        text = re.sub(r'\s+', ' ', report_text)
        regex_flags = re.IGNORECASE | re.DOTALL

        section_patterns = [
            # Diagnosis-like sections
            r'Diagnosis\s*:([^•]+?)(?=Procedure|IMPRESSION|$)',
            r'Pre-operative Diagnosis\s*:([^•]+?)(?=Procedure|Post-operative|$)',
            r'Post-operative Diagnosis\s*:([^•]+?)(?=Procedure|$)',
            r'IMPRESSION\s*:([^•]+?)(?=PLAN|$)',
            r'Indication\s*:([^•]+?)(?=Diagnosis Codes|Procedure|$)',
            # Procedure-like sections
            r'Procedure\s*:([^•]+?)(?=Findings|IMPRESSION|$)',
            r'Procedures\s*:([^•]+?)(?=Findings|IMPRESSION|$)',
            r'Colonoscopy PROCEDURE\s*:([^•]+?)(?=Findings|IMPRESSION|$)',
            r'EGD PROCEDURE\s*:([^•]+?)(?=Findings|IMPRESSION|$)',
            # Findings section
            r'Findings\s*:([^•]+?)(?=IMPRESSION|PLAN|$)',
        ]

        sections = []
        for pattern in section_patterns:
            sections.extend(re.findall(pattern, text, regex_flags))

        # If no structured sections were found, use the start of the report
        if not sections:
            return text[:max_fallback_chars]

        context = " ".join(section.strip() for section in sections)
        return context[:max_context_chars]
    
    def retrieve_codes_with_rag(self, report_text: str, top_k: int = 5) -> Dict:
        """Retrieve relevant codes using RAG approach"""
        
        # Extract clinical context
        clinical_context = self.extract_clinical_context(report_text)
        query_embedding = self.embedding_model.encode([clinical_context]).tolist()
        
        # Retrieve ICD codes
        try:
            icd_results = self.icd_collection.query(
                query_embeddings=query_embedding,
                n_results=top_k
            )
        except Exception as e:
            print(f"❌ Error retrieving ICD codes: {e}")
            icd_results = {'documents': [[]], 'metadatas': [[]], 'distances': [[]]}
        
        # Retrieve CPT codes
        try:
            cpt_results = self.cpt_collection.query(
                query_embeddings=query_embedding,
                n_results=top_k
            )
        except Exception as e:
            print(f"❌ Error retrieving CPT codes: {e}")
            cpt_results = {'documents': [[]], 'metadatas': [[]], 'distances': [[]]}
        
        # Format results
        icd_codes = []
        for i, doc in enumerate(icd_results['documents'][0]):
            confidence = 1 - icd_results['distances'][0][i]
            icd_codes.append({
                "code": icd_results['metadatas'][0][i]['code'],
                "description": icd_results['metadatas'][0][i]['description'],
                "confidence": round(confidence, 3)
            })
        
        cpt_codes = []
        for i, doc in enumerate(cpt_results['documents'][0]):
            confidence = 1 - cpt_results['distances'][0][i]
            cpt_codes.append({
                "code": cpt_results['metadatas'][0][i]['code'],
                "description": cpt_results['metadatas'][0][i]['description'],
                "confidence": round(confidence, 3)
            })
        
        return {
            "ICD_10": sorted(icd_codes, key=lambda x: x['confidence'], reverse=True)[:3],
            "CPT": sorted(cpt_codes, key=lambda x: x['confidence'], reverse=True)[:3]
        }
    
    def process_report(self, report_text: str) -> Dict:
        """Complete pipeline for medical report processing"""
        try:
            # Step 1: Extract clinical entities using enhanced rule-based patterns
            print("🔍 Extracting clinical entities with enhanced rule-based approach...")
            clinical_entities = self.extract_entities_rule_based(report_text)
            
            # Step 2: Retrieve codes using RAG
            print("🔍 Retrieving medical codes with RAG...")
            medical_codes = self.retrieve_codes_with_rag(report_text)
            
            # Step 3: Prepare final output in exact requested format
            result = {
                "Clinical_Terms": clinical_entities["Clinical_Terms"],
                "Anatomical_Locations": clinical_entities["Anatomical_Locations"],
                "Diagnosis": clinical_entities["Diagnosis"],
                "Procedure": clinical_entities["Procedure"],
                "ICD_10": [code["code"] for code in medical_codes["ICD_10"]],
                "CPT": [code["code"] for code in medical_codes["CPT"]],
                "HCPCS": []  # Empty for now as requested
            }
            
            return result
            
        except Exception as e:
            print(f"❌ Error processing report: {e}")
            return {
                "Clinical_Terms": [],
                "Anatomical_Locations": [],
                "Diagnosis": [],
                "Procedure": [],
                "ICD_10": [],
                "CPT": [],
                "HCPCS": []
            }

def save_results_to_json(results: List[Dict], output_path: str = "medical_report_analysis.json"):
    """Write the per-report results plus a metadata header to a JSON file.

    The file holds {"metadata": {...}, "results": results}, written as UTF-8
    with an indent of 2. After writing, a short summary (number of reports,
    entities and codes) is printed. Any exception raised while writing or
    summarising is caught and printed.
    """
    metadata = {
        "generated_at": datetime.now().isoformat(),
        "total_reports_processed": len(results),
        "system": "Medical Code Extraction Pipeline",
        "methodology": "Rule-based entity extraction + RAG for medical codes"
    }
    payload = {"metadata": metadata, "results": results}

    def count_items(field_names):
        # Total length of the named list fields across all reports
        return sum(
            len(report.get(field, []))
            for report in results
            for field in field_names
        )

    try:
        with open(output_path, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, indent=2, ensure_ascii=False)

        print(f"\n💾 Results successfully saved to: {output_path}")
        print(f"📊 Processed {len(results)} medical reports")

        # Summary line
        total_entities = count_items(
            ("Clinical_Terms", "Anatomical_Locations", "Diagnosis", "Procedure")
        )
        total_codes = count_items(("ICD_10", "CPT"))

        print(f"📈 Summary: {total_entities} entities extracted, {total_codes} medical codes found")

    except Exception as e:
        print(f"❌ Error saving results to JSON: {e}")

# Sample medical reports processed by main().
# Each entry is a dict with an "id" label and the raw report "text". The texts
# are run-together extracts: section labels such as "Diagnosis:" and
# "Procedure:" are fused to the neighbouring words, and reports 3 and 4 contain
# an embedded line break.
# NOTE(review): the texts include a clinician name and administration dates —
# confirm they are synthetic / de-identified before sharing the notebook.
sample_reports = [
    {
        "id": "report_1",
        "text": """Report 1:Diagnosis:Z86.0100 History of colon polypsZ86.0100 -Personal history of colonic polyps K64.8 -Internal hemorrhoids K57.90 –DiverticulosisProcedure:Procedure Code ColonoscopyAnesthesia Type : Monitored AnesthesiaCare ASA Class : II Lactated Ringers -Solution, Intravenous as directed -350 00 , Last Administered By: Smith, George At 1041 on 07/07/2025 Lidocaine HCI 2 % Solution, IV -20 00 , Last Administered By: Smith, George At 1023 on 07/07/2025 Propofol 500 MG/50ML Emulsion, Intravenous -240 00 , Last Administered By: Smith, George At 1041 on 07/07/2025Colonoscopy PROCEDURE : There was nothing precluding endoscopy on history or physical exam. Informed consent was obtained with risks and benefits explained to the patient. The patient tolerated the procedure well. There no immediate complications. The patient was placed in left lateral decubitus position. A rectal exam was performed. The pediatric colonoscope was inserted into the rectum and carefully advanced to the cecum. The cecum was identified by the ileocecal valve, the triradiate fold and appendicealorifice. Careful inspection was made as the colonoscope was removed including retroflexion in the rectum. Findings-The preparation was good. There was melanosis coli in the proximal colon. There was moderate sigmoid diverticulosis. Internal hemorrhoids were seen. IMPRESSION : The patient is an 82-year-old female with history of colon polyps. Today's exam did not reveal any polyps. she did have melanosis coli, diverticulosis and internal hemorrhoids. PLAN : No routine colonoscopyColonoscopy The patient tolerated the procedure without complications .The colonoscopy was uneventful"""
    },
    {
        "id": "report_2", 
        "text": """Report 2:Diagnosis:Pre-operative Diagnosis: · Z12.11 [Colon cancer screening]Procedure:Procedure: Colonoscopy -Anal Canal: K64.9 -Hemorrhoids, unspecified (without mention of degree) -Rectum: Normal -Sigmoid Colon: Normal -Descending Colon: Normal -Splenic Flexure: Normal -Transverse Colon: Normal -Hepatic Flexure: Normal -Ascending Colon: Normal -Cecum: Normal -Terminal Ileum: Normal -Polyps: -Site: Descending Colon -Size: 6 mm -Type: Sessile Polyp -Device/Method: Cold Snare -Polyp completely removed and retrievedComplications: No Immediate Complication."""
    },
    {
        "id": "report_3",
        "text": """Report 3:Diagnosis:Indication : Last colonoscopy 3 years ago, Rectal bleedingDiagnosis Codes :K62.6, Ulcer of anus and rectum K64.8, Other hemorrhoids K62.5, Hemorrhage of anus and rectumProcedure:Colonoscopy. Before the procedure, time out was performed, the patient was identified, and the procedure was verified. After I obtained informed consent, the scope was passed under direct vision. Throughout the procedure, the patient's blood pressure, pulse, and oxygensaturations were monitored continuously. The Colonoscope was introduced through the anus and advanced to the cecum, identified by appendiceal orifice and ileocecal valve. The colonoscopy was performed without diiiculty. The patient tolerated the procedure well. Procedure and risks explained to patient which include but not limited to medication reaction, bleeding, perforation, aspiration and misssed lesions. Prep was good. Washes and suctioning done as needed also so overall good visualization of mucosa. Slow withdrawal on of scope with careful inspection of mucosa for abnormalities. Judicious gas insuilation used on way in. Gas removal was done as much as possible on way out. . Retroflexion done in the rectum. Scope only advanced when lumen identified. The quality of the bowel preparation was evaluated using the BBPS (Boston Bowel Preparation Scale) with scores of: Right Colon = 3 (entire mucosa seen well with no residual staining, small fragments of stool or opaque liquid), Transverse Colon = 3 (entire mucosa seen well with no residual staining, small fragments of stool or opaque liquid) and Left Colon = 3 (entire mucosa seen well with no residual staining, small fragments of stool or opaque liquid). The total BBPS score equals 9.Findings :-A single localized erosion (proctitis) was found in the distal rectum at the anal verge .. Biopsies were taken with a cold forceps for histology. Estimated blood loss was minimal. -Internal hemorrhoids were found during retroflexion. 
The hemorrhoids were moderate. -The exam was otherwise without abnormality on direct and retroflexion views.Impression :-A single erosion in the distal rectum. Biopsied. -Internal hemorrhoids. -The examination was otherwise normal on direct and retroflexion views.Procedure Codes :45380, Colonoscopy, flexible; with biopsy, single or multiple"""
    },
    {
        "id": "report_4", 
        "text": """Report 4:Diagnosis:Pre-operative DiagnosisR07.89 -Atypical chest pain R10.11 -Right upper quadrant abdominal painPost-operative DiagnosisR07.89 -Atypical chest pain R10.11 -RUQ pain K29.70 -GastritisProcedures:Procedure Code EGD w/BiopsyAnesthesia Type : Monitored Anesthesia Care Lactated Ringers -Solution, Intravenous as directed -200 00 , Last Administered By: Smith, George At 1419 on 07/07/2025 Lidocaine HCI 2 % Solution, IV -60 00 , Last Administered By: Smith, George At 1408 on 07/07/2025 Propofol 500 MG/50ML Emulsion, Intravenous -350 00 , Last Administered By: Smith, George At 1419 on 07/07/2025EGD PROCEDURE : There was nothing precluding endoscopy on history or physical exam. Informed consent was obtained with risks and benefits explained to the patient. The patient tolerated the procedure well. There were no immediate complications The patient was placed in the left lateraldecubitus position. The Olympus endoscope was inserted into the esophagus under direct visualization. It was advanced through the esophagus, into the stomach and through the pylorus to the duodenal bulb and 2nd portion of the duodenum. Careful inspection was made as the endoscope was removed including retroflexion in the stomach. Findings-In the distal esophagus there was an irregular Z-line from 38-39 cm suggestive of Barrett's esophagus. This was examined both white light and narrow band imaging. Biopsies were obtained of the distal esophagus. In the stomach there was mild antral and body gastritis. Biopsies were obtained for H.pylori. There were no ulcers or masses seen. The visualized portion of the duodenal appeared normal without ulcers or inflammation. IMPRESSION : The patient is 55-year-old male with atypical chest pain and right upper quadrant pain. Biopsies today were obtained for H.pylori and Barrett's esophagus. 
If the patient does Barrett's esophagus and a repeat EGD with biopsy in 6 months will be recommended.Post Operative ImpressionEGD The patient tolerated the procedure without complications. The EGD was uneventful."""
    }
]

def main(
    icd_excel_path: str = "ICD_code_Assignment.xlsx",
    cpt_excel_path: str = "cpt_code_assignment.xlsx",
    output_path: str = "medical_report_analysis_results.json",
    reports=None,
):
    """Process a list of reports, print each result and save them as JSON.

    Args:
        icd_excel_path: ICD workbook passed to MedicalCodeExtractor. The
            default is relative to the working directory, like the paths
            used by the earlier loading cell, instead of a machine-specific
            absolute path.
        cpt_excel_path: CPT workbook passed to MedicalCodeExtractor.
        output_path: JSON file the collected results are written to.
        reports: Iterable of dicts with "id" and "text"; defaults to the
            module-level `sample_reports`.

    Any exception is caught, reported with its traceback, and not re-raised.
    """
    try:
        if reports is None:
            reports = sample_reports

        # Initialize the extractor
        print("🔄 Initializing Enhanced Medical Code Extractor...")
        extractor = MedicalCodeExtractor(
            icd_excel_path=icd_excel_path,
            cpt_excel_path=cpt_excel_path
        )

        print("\n" + "="*60)
        print("ENHANCED MEDICAL REPORT ANALYSIS RESULTS")
        print("="*60)

        # Fields echoed to the console, in display order
        printed_fields = (
            "Clinical_Terms", "Anatomical_Locations", "Diagnosis",
            "Procedure", "ICD_10", "CPT", "HCPCS",
        )

        # Process each report and collect results
        all_results = []

        for report in reports:
            print(f"\n📄 Processing Report: {report['id']}")
            print("-" * 40)

            result = extractor.process_report(report['text'])

            # Add report ID to result
            result["ReportID"] = report["id"]

            # Print results to console (every field but the last is
            # followed by a comma, matching the original layout)
            print("{\n")
            for position, field in enumerate(printed_fields):
                separator = "," if position < len(printed_fields) - 1 else ""
                print(f'  "{field}": {json.dumps(result[field], indent=4)}{separator}')
            print("\n}")

            # Add to results collection
            all_results.append(result)

        # Save all results to JSON file
        save_results_to_json(all_results, output_path)

    except Exception as e:
        print(f"❌ Fatal error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

🔄 Initializing Enhanced Medical Code Extractor...
Loading medical codes into vector database...
📊 Found 295 ICD codes
📊 Found 20 CPT codes
✅ Loaded 295 ICD codes and 20 CPT codes

ENHANCED MEDICAL REPORT ANALYSIS RESULTS

📄 Processing Report: report_1
----------------------------------------
🔍 Extracting clinical entities with enhanced rule-based approach...
🔍 Retrieving medical codes with RAG...
{

  "Clinical_Terms": [
    "hemorrhoids",
    "history",
    "no immediate complications",
    "personal history of colonic polyps"
],
  "Anatomical_Locations": [
    "cecum",
    "colon",
    "ileocecal valve",
    "proximal colon",
    "rectum",
    "sigmoid"
],
  "Diagnosis": [
    "colon polyps",
    "colonic polyps",
    "diverticulosis",
    "diverticulosis and internal hemorrhoids.",
    "hemorrhoids",
    "internal hemorrhoids",
    "melanosis coli",
    "moderate sigmoid diverticulosis",
    "polyps",
    "sigmoid diverticulosis",
    "the patient is an 82-year-old female with histo