In [None]:
# install libraries 
# !pip install openai python-dotenv tqdm thefuzz sentence-transformers pandas numpy matplotlib

In [6]:
# load keys and import libraries
import os
import openai
import dotenv
from openai import OpenAI
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from thefuzz import fuzz
from sentence_transformers import SentenceTransformer, util
import csv
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import random

# Load variables from a local .env file into the process environment, then
# read the OpenAI key from it. os.getenv returns None when the variable is
# missing, so a missing key is not reported at this point.
dotenv.load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
# NOTE(review): OpenAI() is constructed without arguments, so it presumably
# picks the key up from the environment on its own — confirm against the SDK.
client = OpenAI()

In [None]:
# system prompt for the medical NER task

# System prompt sent as `instructions` with every forum post. The worked
# example uses 0-based character offsets with an exclusive end, i.e.
# text[START:END] must reproduce the entity text exactly.
prompt = """
You are a specialized medical text analysis system for identifying and extracting medical entities from patient forum posts and clinical narratives using Named Entity Recognition with BIO tagging methodology.

OBJECTIVE: Perform precise extraction and classification of medical entities from unstructured medical text, focusing on patient-reported experiences, clinical observations, and drug-related discussions.

TARGET ENTITY CATEGORIES:
ADR (Adverse Drug Reactions): Unwanted or harmful reactions experienced after medication administration. This encompasses side effects, allergic reactions, drug intolerance, toxicity symptoms, and any negative physiological responses directly attributable to pharmaceutical interventions. Include both immediate and delayed reactions, mild to severe manifestations.
Drug: Pharmaceutical substances including generic names, brand names, trade names, abbreviations, combination drugs, dosage forms, and colloquial medication references. This category contains generic names, trade names, abbreviations, and dosage forms adjacent to the drug. Include over-the-counter medications, prescription drugs, supplements, and herbal remedies.
Disease: Medical conditions, disorders, illnesses, diagnoses, pathological states, and chronic conditions. This encompasses confirmed diagnoses, suspected conditions, medical history items, and both acute and chronic health states requiring medical intervention or monitoring.
Symptom: Physical manifestations, subjective experiences, clinical signs, and patient-reported sensations that indicate illness or medical conditions. Distinguished from ADRs by their relationship to underlying pathology rather than medication effects.

ANNOTATION METHODOLOGY:
Step 1 - BIO Sequence Labeling: Apply BIO (Beginning-Inside-Outside) tagging where each word receives labels: B-[ENTITY] for entity beginnings, I-[ENTITY] for entity continuations, and O for non-entities. Annotate entities with start and end character positions for precise boundary identification.
Step 2 - Structured Output Generation: Transform BIO annotations into standardized format: T[ID] [LABEL] [START] [END] [TEXT]
* T[ID]: Sequential identifier (T1, T2, T3...)
* [LABEL]: Entity category (ADR, Drug, Disease, Symptom)
* [START] [END]: Character-level positions in original text
* [TEXT]: Exact extracted entity span

ANNOTATION PRINCIPLES:
Contextual Disambiguation: Distinguish between similar terms based on medical context. For example, "pain relief" indicates therapeutic effect rather than symptom, while "severe pain" represents a symptom requiring attention.
Multi-word Entity Handling: Complex medical terms spanning multiple tokens receive B- labels for initial words and I- labels for subsequent components, ensuring complete entity capture.
Patient Language Recognition: Medical forum posts contain patient-reported adverse drug events using colloquial expressions. Recognize informal descriptions like "feeling weird," "brain fog," or "zonked out" as valid ADR mentions.
Boundary Precision: Calculate character positions accurately, accounting for whitespace and punctuation to enable exact text reconstruction and downstream processing applications.

EXAMPLE PROCESSING:
Input: "Started Lexapro last week but experiencing terrible nausea and dizziness from anxiety disorder treatment"
BIO Sequence:
Started O | Lexapro B-Drug | last O | week O | but O | experiencing O | terrible O | nausea B-ADR | and O | dizziness B-ADR | from O | anxiety B-Disease | disorder I-Disease | treatment O
Structured Output:
T1 Drug 8 15 Lexapro
T2 ADR 52 58 nausea  
T3 ADR 63 72 dizziness
T4 Disease 78 94 anxiety disorder

Dont use ### or any other markdown formatting in the output. Keep it in simple text format.
Return both the BIO sequence and structured output in a single response.

QUALITY REQUIREMENTS:
* Maintain high precision in entity boundary detection
* Preserve original text character positions for traceability
* Handle complex pharmaceutical nomenclature and medical terminology
* Recognize both formal medical language and patient vernacular
* Ensure consistent annotation across similar contexts
This systematic approach enables robust extraction of medical entities for pharmacovigilance applications, clinical decision support, and biomedical research initiatives.
"""

In [None]:
# Generate NER tags and annotated text using OpenAI's API

"""
Problem 2:
Medical Named Entity Recognition (NER) using OpenAI's API
This code processes a directory of text files containing medical forum posts, extracting and annotating medical entities
using OpenAI's API. It applies a specialized prompt for medical NER, handling multiple files in batches to optimize processing time.
It generates BIO tags and structured outputs for each file, saving results to a specified output directory.
"""

class MedicalNERProcessor:
    """
    Runs the medical NER prompt over every ``.txt`` file in a directory.

    Each input file is sent to the model as one request and the raw response
    text is saved under the same filename in ``output_dir``. Files whose
    output already exists are skipped, so an interrupted run can be resumed.
    """

    def __init__(self, client, prompt, input_dir, output_dir, batch_size=5):
        """
        Args:
            client: Object exposing ``responses.create(model=, instructions=, input=)``.
            prompt: Instruction text sent with every request.
            input_dir: Directory containing the ``.txt`` posts to annotate.
            output_dir: Directory receiving one response file per post
                (created if missing).
            batch_size: Number of files per batch; also the number of worker
                threads used for a batch.
        """
        self.client = client
        self.prompt = prompt
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.batch_size = batch_size
        self.error_files = []

        # Create the output directory if it doesn't exist
        os.makedirs(self.output_dir, exist_ok=True)
        # Sorted so that batches are composed the same way on every run.
        self.files = sorted(f for f in os.listdir(self.input_dir) if f.endswith(".txt"))

    def medical_ner(self, text_input):
        """Send one post to the model and return the response text."""
        response = self.client.responses.create(
            model="gpt-4o-mini",
            instructions=self.prompt,
            input=text_input
        )
        return response.output_text

    def process_file(self, filename):
        """
        Annotate a single file and store the response.

        Any failure is reported, recorded in ``self.error_files`` and
        swallowed so that the remaining files in the batch still run.
        """
        temp_filepath = None
        try:
            output_filepath = os.path.join(self.output_dir, filename)
            # Skip processing if file is already processed
            if os.path.exists(output_filepath):
                print(f"\nFile {filename} already processed. Skipping...")
                return
            input_filepath = os.path.join(self.input_dir, filename)
            with open(input_filepath, "r", encoding="utf-8") as file:
                text_input = file.read()

            # Generate output through the medical_ner method
            output_text = self.medical_ner(text_input)

            # Write to a temporary name first and rename on success: the
            # "already processed" check above only looks at existence, so a
            # half-written final file would otherwise be skipped on reruns.
            # The ".tmp" suffix keeps leftovers out of any "*.txt" listing.
            temp_filepath = output_filepath + ".tmp"
            with open(temp_filepath, "w", encoding="utf-8") as outfile:
                outfile.write(output_text)
            os.replace(temp_filepath, output_filepath)

            print(f"\nProcessed {filename} and saved output to {output_filepath}")
        except Exception as e:
            # Remove the partial temp file, if one was created.
            if temp_filepath is not None:
                try:
                    os.remove(temp_filepath)
                except OSError:
                    pass
            print(f"\nError processing {filename}: {e}")
            self.error_files.append(filename)

    def process_batch(self, batch):
        """Process the given filenames concurrently and wait for all of them."""
        with ThreadPoolExecutor(max_workers=self.batch_size) as executor:
            futures = [executor.submit(self.process_file, filename) for filename in batch]
            for future in as_completed(futures):
                future.result()

    def run(self):
        """Process all discovered files batch by batch, then list failures."""
        for i in tqdm(range(0, len(self.files), self.batch_size), desc="Processing batches of files"):
            batch = self.files[i:i + self.batch_size]
            self.process_batch(batch)

        if self.error_files:
            print("\nFiles with errors:")
            for fname in self.error_files:
                print(f"- {fname}")

# Build a fresh client for this cell; the right-hand `prompt` simply rebinds
# the existing module-level prompt to itself.
client, prompt = OpenAI(), prompt
# Machine-specific locations: raw posts in, one model response per post out.
input_dir = "/Users/thyag/Desktop/Assignement/assignment-miimansa/dataset/CADEC.v2/data/cadec/text"
output_dir = "/Users/thyag/Desktop/Assignement/assignment-miimansa/dataset/CADEC.v2/data/cadec/processed"
# Default batch_size (5) is used; run() issues the API requests.
processor = MedicalNERProcessor(client, prompt, input_dir, output_dir)
processor.run()

In [None]:
# file processing script to extract BIO sequence and structured output from text files

class FileProcessor:
    """
    Splits saved model responses into their BIO and structured parts.

    For every ``<name>.txt`` in ``input_dir`` a directory
    ``<output_base_dir>/<name>/`` is created containing ``bio.txt`` and
    ``structured.txt``.
    """

    def __init__(self, input_dir, output_base_dir):
        """
        Initializes the FileProcessor with the input directory containing .txt files
        and the output base directory where processed files will be stored.
        The output base directory is created if it does not exist.
        """
        self.input_dir = input_dir
        self.output_base_dir = output_base_dir
        os.makedirs(self.output_base_dir, exist_ok=True)

    def extract_sections(self, content):
        """
        Extracts the BIO sequence and Structured Output sections from the content.

        When both the "BIO Sequence:" and "Structured Output:" markers are
        present, the text is split at the first "Structured Output:" and the
        "BIO Sequence:" marker is removed from the first half. Otherwise the
        whole content is returned as the BIO section and the structured
        section is empty.

        Returns:
            Tuple ``(bio_section, structured_section)``, both stripped.
        """
        if "BIO Sequence:" in content and "Structured Output:" in content:
            before, after = content.split("Structured Output:", 1)
            bio_section = before.replace("BIO Sequence:", "").strip()
            structured_section = after.strip()
        else:
            bio_section = content.strip()
            structured_section = ""
        return bio_section, structured_section

    def process_file(self, filename):
        """
        Processes a single file:
          - Reads its content
          - Extracts the relevant sections
          - Creates a subdirectory named after the file (without extension)
          - Writes the extracted sections to bio.txt and structured.txt

        Files not ending in ".txt" are ignored.
        """
        if not filename.endswith(".txt"):
            return

        # The responses were saved as UTF-8, so read and write them the same
        # way instead of relying on the platform's default encoding.
        file_path = os.path.join(self.input_dir, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        bio_section, structured_section = self.extract_sections(content)

        # Create a subdirectory for this file (named after the file without extension)
        file_sub_dir = os.path.join(self.output_base_dir, os.path.splitext(filename)[0])
        os.makedirs(file_sub_dir, exist_ok=True)

        # Write the BIO Sequence content
        bio_file = os.path.join(file_sub_dir, "bio.txt")
        with open(bio_file, "w", encoding="utf-8") as bf:
            bf.write(bio_section)

        # Write the Structured Output content
        structured_file = os.path.join(file_sub_dir, "structured.txt")
        with open(structured_file, "w", encoding="utf-8") as sf:
            sf.write(structured_section)

    def process_all_files(self):
        """Iterates over all .txt files in the input directory and processes them."""
        for filename in os.listdir(self.input_dir):
            self.process_file(filename)
        print("File processing completed.")



# Input here is the directory the NER step wrote its responses to; each
# response is split into <output_base_dir>/<post name>/{bio,structured}.txt.
input_dir = "/Users/thyag/Desktop/Assignement/assignment-miimansa/dataset/CADEC.v2/data/cadec/processed"
output_base_dir = "/Users/thyag/Desktop/Assignement/assignment-miimansa/dataset/cadec_processed_output"

# Rebinds the module-level name `processor` to a FileProcessor.
processor = FileProcessor(input_dir, output_base_dir)
processor.process_all_files()

In [None]:
"""
Problem 1:
Enumerate the distinct entities of each label type - that is ADR, Drug, Disease, Symptom 
- in the entire dataset. Also, give the total number of distinct entities of each label type.
"""
class AnnotationProcessor:
    """Collects the distinct entity texts per label from a folder of .ann files."""

    def __init__(self, directory):
        """
        Remember where the .ann files live and prepare one empty set per
        tracked label (ADR, Drug, Disease, Symptom).
        """
        self.directory = directory
        self.entities = {name: set() for name in ('ADR', 'Drug', 'Disease', 'Symptom')}

    def process_files(self):
        """
        Read every ``*.ann`` file in the directory and add each annotation's
        text to the set belonging to its label. Lines that are empty, start
        with '#', have fewer than three tab-separated columns, or carry a
        label that is not tracked are ignored.
        """
        pattern = os.path.join(self.directory, '*.ann')
        for ann_path in glob.glob(pattern):
            with open(ann_path, 'r', encoding='utf-8') as handle:
                for raw in handle:
                    record = raw.strip()
                    if record == '' or record.startswith('#'):
                        continue  # blank line or comment/note line

                    columns = record.split('\t')
                    if len(columns) < 3:
                        continue  # not a complete annotation row

                    # Second column looks like "ADR 9 19"; its first token is the label.
                    header = columns[1].split()
                    if not header:
                        continue

                    bucket = self.entities.get(header[0])
                    if bucket is not None:
                        bucket.add(columns[2].strip())

    def print_results(self):
        """
        Print, for each label, how many distinct entity texts were collected.
        """
        for label in self.entities:
            distinct = self.entities[label]
            print(f"Label: {label}")
            print(f"Total unique {label} entities: {len(distinct)}\n")

# Directory holding the ground-truth .ann files to be counted.
directory = '/Users/thyag/Desktop/Assignement/assignment-miimansa/dataset/input-data/original'
# Rebinds the module-level name `processor` to an AnnotationProcessor.
processor = AnnotationProcessor(directory)
processor.process_files()
# Prints one count per label; the entity texts themselves stay in processor.entities.
processor.print_results()

"""
Expected Output:

Label: ADR
Total unique ADR entities: 3681

Label: Drug
Total unique Drug entities: 391

Label: Disease
Total unique Disease entities: 181

Label: Symptom
Total unique Symptom entities: 150
"""

Label: ADR
Total unique ADR entities: 3681

Label: Drug
Total unique Drug entities: 391

Label: Disease
Total unique Disease entities: 181

Label: Symptom
Total unique Symptom entities: 150



In [None]:
"""
Problem 3, 4, 5:

3. Measure the performance of the labelling in part 2 against the ground truth for 
the same post given in the sub-directory original. There are multiple ways in which 
performance can be measured. Choose one and justify that choice in your comments in the code. 

4. Repeat the performance calculation in 3 but now only for the label type ADR where the ground 
truth is now chosen from the sub-directory meddra.

5. Use your code in 3 to measure performance on 50 randomly selected forum posts 
from sub-directory text.
"""


class UnifiedAnnotationEvaluator:
    """
    A unified evaluator for medical text annotation performance across different tasks:
    1. Full entity evaluation against original annotations
    2. ADR-only evaluation against MedDRA annotations  
    3. Random sample evaluation for scalability testing
    
    Performance metrics include exact match (precision/recall/F1), fuzzy matching,
    semantic similarity (cosine), and boundary overlap to provide comprehensive evaluation.
    """
    
    def __init__(self, processed_base_dir, result_dir):
        """
        Store the two working directories, load the sentence-embedding model
        used by the semantic-similarity metric, and make sure the result
        directory exists.
        """
        self.processed_base_dir = processed_base_dir
        self.result_dir = result_dir

        # Model loading can take a while, so announce it first.
        print("Loading sentence transformer model for semantic similarity...")
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

        # CSVs and plots are written here later on.
        os.makedirs(self.result_dir, exist_ok=True)
    
    def parse_original_annotations(self, file_path):
        """
        Parses original .ann files with format:
        T1    ADR 9 19    bit drowsy
        Extracts label, spans, and text for each annotation.
        """
        annotations = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                
                parts = line.split('\t')
                if len(parts) < 3:
                    continue
                
                ann_id = parts[0]
                label_info = parts[1].split()
                if len(label_info) < 3:
                    continue
                
                label = label_info[0]
                spans = []
                i = 1
                while i < len(label_info) - 1 and label_info[i].isdigit() and label_info[i+1].isdigit():
                    spans.append((int(label_info[i]), int(label_info[i+1])))
                    i += 2
                
                text = parts[2].strip()
                if spans:
                    annotations.append({
                        'id': ann_id,
                        'label': label,
                        'start': spans[0][0],
                        'end': spans[0][1],
                        'text': text
                    })
        return annotations
    
    def parse_meddra_annotations(self, file_path):
        """
        Parses MedDRA .ann files where format is:
        TT1    10028294 53 71    excessive cramping
        All annotations are ADR type, so label is set to "ADR".
        """
        annotations = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                
                parts = line.split('\t')
                if len(parts) < 3:
                    continue
                
                ann_id = parts[0]
                label_info = parts[1].split()
                if len(label_info) < 3:
                    continue
                
                # All MedDRA annotations are ADR type
                label = "ADR"
                spans = []
                i = 1
                while i < len(label_info) - 1 and label_info[i].isdigit() and label_info[i+1].isdigit():
                    spans.append((int(label_info[i]), int(label_info[i+1])))
                    i += 2
                
                text = parts[2].strip()
                if spans:
                    annotations.append({
                        'id': ann_id,
                        'label': label,
                        'start': spans[0][0],
                        'end': spans[0][1],
                        'text': text
                    })
        return annotations
    
    def parse_processed_annotations(self, file_path, filter_label=None):
        """
        Parses structured.txt files with format:
        T1 ADR 9 19 bit drowsy
        Optionally filters by specific label type.
        """
        annotations = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                
                parts = line.split()
                if len(parts) < 5:
                    continue
                
                try:
                    ann_id = parts[0]
                    label = parts[1]
                    
                    # Filter by label if specified
                    if filter_label and label != filter_label:
                        continue
                    
                    start = int(parts[2])
                    end = int(parts[3])
                    text = " ".join(parts[4:])
                    
                    annotations.append({
                        'id': ann_id,
                        'label': label,
                        'start': start,
                        'end': end,
                        'text': text
                    })
                except (ValueError, IndexError):
                    continue
        return annotations
    
    def compute_exact_match(self, original, processed):
        """
        Computes exact match metrics using (label, text) pairs.
        This is the primary metric as it measures both entity identification
        and classification accuracy simultaneously.
        """
        orig_set = {(ann['label'], ann['text'].strip().lower()) for ann in original}
        proc_set = {(ann['label'], ann['text'].strip().lower()) for ann in processed}
        
        common = orig_set.intersection(proc_set)
        precision = len(common) / len(proc_set) if proc_set else 0
        recall = len(common) / len(orig_set) if orig_set else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        
        return precision, recall, f1
    
    def compute_fuzzy_match(self, original, processed, threshold=80):
        """Computes fuzzy string matching to handle minor text variations."""
        if not processed:
            return {'avg_score': 0, 'pct_above_threshold': 0}
        
        scores = []
        for proc_ann in processed:
            best_ratio = 0
            for orig_ann in original:
                if proc_ann['label'] == orig_ann['label']:
                    ratio = fuzz.ratio(proc_ann['text'].lower(), orig_ann['text'].lower())
                    if ratio > best_ratio:
                        best_ratio = ratio
            scores.append(best_ratio)
        
        avg_score = np.mean(scores) if scores else 0
        above_threshold = sum(1 for s in scores if s >= threshold)
        pct_above_threshold = above_threshold / len(scores) if scores else 0
        
        return {
            'avg_score': avg_score,
            'pct_above_threshold': pct_above_threshold
        }
    
    def compute_semantic_similarity(self, original, processed):
        """
        Uses sentence transformers to measure semantic similarity between entities.
        Computes cosine similarity between processed and original entity texts,
        taking the best match for each processed entity.
        This metric captures semantic equivalence even when exact text differs.
        """
        if not original or not processed:
            return {'avg_similarity': 0, 'max_similarity': 0, 'min_similarity': 0}
        
        # Extract all texts
        orig_texts = [ann['text'] for ann in original]
        proc_texts = [ann['text'] for ann in processed]
        
        # Encode texts using sentence transformer
        emb_orig = self.model.encode(orig_texts, convert_to_tensor=True)
        emb_proc = self.model.encode(proc_texts, convert_to_tensor=True)
        
        # Compute cosine similarity matrix (proc_texts x orig_texts)
        cosine_scores = util.cos_sim(emb_proc, emb_orig)
        
        # For each processed annotation, find the best matching original annotation
        best_similarities = cosine_scores.max(dim=1)[0]
        
        # Return comprehensive similarity metrics
        return {
            'avg_similarity': best_similarities.mean().item(),
            'max_similarity': best_similarities.max().item(),
            'min_similarity': best_similarities.min().item()
        }
    
    def compute_boundary_overlap(self, original, processed):
        """Measures boundary overlap using Jaccard similarity for partial matches."""
        if not processed:
            return {'avg_overlap': 0, 'pct_with_overlap': 0}
        
        scores = []
        for proc_ann in processed:
            best_overlap = 0
            proc_start, proc_end = proc_ann['start'], proc_ann['end']
            
            for orig_ann in original:
                if proc_ann['label'] == orig_ann['label']:
                    orig_start, orig_end = orig_ann['start'], orig_ann['end']
                    
                    if proc_end > orig_start and orig_end > proc_start:
                        intersection = min(proc_end, orig_end) - max(proc_start, orig_start)
                        union = max(proc_end, orig_end) - min(proc_start, orig_start)
                        overlap = intersection / union if union > 0 else 0
                        if overlap > best_overlap:
                            best_overlap = overlap
            scores.append(best_overlap)
        
        return {
            'avg_overlap': np.mean(scores) if scores else 0,
            'pct_with_overlap': sum(1 for s in scores if s > 0) / len(scores) if scores else 0
        }
    
    def evaluate_file_pair(self, orig_file, proc_file, filter_label=None, is_meddra=False):
        """Evaluates a single file pair and returns comprehensive metrics."""
        # Parse annotations based on type
        if is_meddra:
            orig_anns = self.parse_meddra_annotations(orig_file)
        else:
            orig_anns = self.parse_original_annotations(orig_file)
        
        proc_anns = self.parse_processed_annotations(proc_file, filter_label)
        
        if not proc_anns:
            return None
        
        # Compute all metrics
        precision, recall, f1 = self.compute_exact_match(orig_anns, proc_anns)
        fuzzy_metrics = self.compute_fuzzy_match(orig_anns, proc_anns)
        semantic_metrics = self.compute_semantic_similarity(orig_anns, proc_anns)
        boundary_metrics = self.compute_boundary_overlap(orig_anns, proc_anns)
        
        return {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'fuzzy_score': fuzzy_metrics['avg_score'],
            'cosine_similarity': semantic_metrics['avg_similarity'],  # Renamed for clarity
            'max_cosine_similarity': semantic_metrics['max_similarity'],
            'min_cosine_similarity': semantic_metrics['min_similarity'],
            'boundary_overlap': boundary_metrics['avg_overlap'],
            'orig_count': len(orig_anns),
            'proc_count': len(proc_anns)
        }
    
    def get_file_mappings(self, original_dir):
        """Creates mappings between original and processed files."""
        orig_files = {os.path.splitext(os.path.basename(f))[0]: f 
                      for f in glob.glob(os.path.join(original_dir, "*.ann"))}
        
        proc_files = {}
        for dirname in os.listdir(self.processed_base_dir):
            dir_path = os.path.join(self.processed_base_dir, dirname)
            if os.path.isdir(dir_path):
                structured_file = os.path.join(dir_path, "structured.txt")
                if os.path.exists(structured_file):
                    proc_files[dirname] = structured_file
        
        common_keys = set(orig_files.keys()).intersection(proc_files.keys())
        return orig_files, proc_files, common_keys
    
    def task1_full_evaluation(self, original_dir):
        """
        Task 1: Measure performance against ground truth in 'original' directory.
        Uses exact match as primary metric for comprehensive entity evaluation.
        """
        print("=== Task 1: Full Entity Evaluation ===")
        
        orig_files, proc_files, common_keys = self.get_file_mappings(original_dir)
        
        if not common_keys:
            print("No matching file pairs found.")
            return None
        
        print(f"Evaluating {len(common_keys)} file pairs...")
        
        results = []
        entity_results = defaultdict(lambda: defaultdict(list))
        
        for key in common_keys:
            file_result = self.evaluate_file_pair(orig_files[key], proc_files[key])
            if file_result:
                file_result['file'] = key
                results.append(file_result)
                
                # Compute per-entity metrics for detailed analysis
                orig_anns = self.parse_original_annotations(orig_files[key])
                proc_anns = self.parse_processed_annotations(proc_files[key])
                
                for entity_type in {'ADR', 'Drug', 'Disease', 'Symptom'}:
                    orig_filtered = [ann for ann in orig_anns if ann['label'] == entity_type]
                    proc_filtered = [ann for ann in proc_anns if ann['label'] == entity_type]
                    
                    if orig_filtered or proc_filtered:
                        p, r, f = self.compute_exact_match(orig_filtered, proc_filtered)
                        entity_results[entity_type]['precision'].append(p)
                        entity_results[entity_type]['recall'].append(r)
                        entity_results[entity_type]['f1'].append(f)
        
        # Save results
        if results:
            df = pd.DataFrame(results)
            csv_path = os.path.join(self.result_dir, 'task1_full_evaluation.csv')
            df.to_csv(csv_path, index=False)
            
            # Create entity-wise performance chart
            if entity_results:
                entity_f1s = {et: np.mean(metrics['f1']) for et, metrics in entity_results.items() 
                             if metrics['f1']}
                
                plt.figure(figsize=(10, 6))
                plt.bar(entity_f1s.keys(), entity_f1s.values())
                plt.title('Task 1: F1 Score by Entity Type')
                plt.ylabel('F1 Score')
                plt.ylim(0, 1)
                plt.grid(axis='y', alpha=0.3)
                
                plot_path = os.path.join(self.result_dir, 'task1_entity_performance.png')
                plt.savefig(plot_path, dpi=300, bbox_inches='tight')
                plt.close()
            
            # Final summary
            summary = {
                'overall_precision': df['precision'].mean(),
                'overall_recall': df['recall'].mean(),
                'overall_f1': df['f1'].mean(),
                'fuzzy_score': df['fuzzy_score'].mean(),
                'cosine_similarity': df['cosine_similarity'].mean(),  # Your good metric!
                'max_cosine_similarity': df['max_cosine_similarity'].mean(),
                'boundary_overlap': df['boundary_overlap'].mean(),
                'files_evaluated': len(results)
            }
            
            print(f"Task 1 Results:")
            print(f"  Precision: {summary['overall_precision']:.3f}")
            print(f"  Recall: {summary['overall_recall']:.3f}")
            print(f"  F1 Score: {summary['overall_f1']:.3f}")
            print(f"  Cosine Similarity: {summary['cosine_similarity']:.3f}")  # Highlighted!
            print(f"  Files evaluated: {summary['files_evaluated']}")
            
            return summary
    
    def task2_adr_evaluation(self, meddra_dir):
        """
        Task 2: ADR-only evaluation against MedDRA ground truth.

        Pairs every ``<key>.ann`` file found in ``meddra_dir`` with
        ``<self.processed_base_dir>/<key>/structured.txt`` and scores each pair
        through ``self.evaluate_file_pair(..., filter_label="ADR",
        is_meddra=True)``. Pairs are processed in sorted key order so the CSV
        rows and the chart's "File Index" axis are reproducible between runs.

        Side effects: writes ``task2_adr_evaluation.csv`` and
        ``task2_adr_performance.png`` into ``self.result_dir`` when at least
        one pair produced a result.

        Args:
            meddra_dir: Directory containing the MedDRA ``.ann`` files.

        Returns:
            A dict with the mean of each per-file metric (``precision``,
            ``recall``, ``f1``, ``fuzzy_score``, ``cosine_similarity``,
            ``boundary_overlap``) plus ``files_evaluated``, or ``None`` when
            no file pairs were found or no pair yielded a result.
        """
        print("\n=== Task 2: ADR-Only Evaluation (MedDRA) ===")

        # Reference side: one "<key>.ann" per document
        orig_files = {os.path.splitext(os.path.basename(f))[0]: f
                      for f in glob.glob(os.path.join(meddra_dir, "*.ann"))}

        # Processed side: one sub-directory per document holding structured.txt
        proc_files = {}
        for dirname in os.listdir(self.processed_base_dir):
            dir_path = os.path.join(self.processed_base_dir, dirname)
            if os.path.isdir(dir_path):
                structured_file = os.path.join(dir_path, "structured.txt")
                if os.path.exists(structured_file):
                    proc_files[dirname] = structured_file

        # Sorted: iterating the raw set gives an order that is not stable
        # across interpreter runs, which would shuffle the CSV and the chart.
        common_keys = sorted(orig_files.keys() & proc_files.keys())

        if not common_keys:
            print("No matching ADR file pairs found.")
            return None

        print(f"Evaluating {len(common_keys)} ADR file pairs...")

        results = []
        for key in common_keys:
            file_result = self.evaluate_file_pair(orig_files[key], proc_files[key],
                                                  filter_label="ADR", is_meddra=True)
            # Pairs for which the evaluator returns nothing are left out
            if file_result:
                file_result['file'] = key
                results.append(file_result)

        if not results:
            return None

        # Save per-file results
        df = pd.DataFrame(results)
        csv_path = os.path.join(self.result_dir, 'task2_adr_evaluation.csv')
        df.to_csv(csv_path, index=False)

        # Per-file ADR F1 bar chart; the figure is always released, even if
        # saving fails.
        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            ax.bar(range(len(df)), df['f1'])
            ax.set_title('Task 2: ADR F1 Score per File')
            ax.set_ylabel('F1 Score')
            ax.set_xlabel('File Index')
            ax.set_ylim(0, 1)
            ax.grid(axis='y', alpha=0.3)

            plot_path = os.path.join(self.result_dir, 'task2_adr_performance.png')
            fig.savefig(plot_path, dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

        # Macro averages: every evaluated file contributes equally
        summary = {
            'precision': df['precision'].mean(),
            'recall': df['recall'].mean(),
            'f1': df['f1'].mean(),
            'fuzzy_score': df['fuzzy_score'].mean(),
            'cosine_similarity': df['cosine_similarity'].mean(),
            'boundary_overlap': df['boundary_overlap'].mean(),
            'files_evaluated': len(results)
        }

        print("Task 2 Results:")
        print(f"  ADR Precision: {summary['precision']:.3f}")
        print(f"  ADR Recall: {summary['recall']:.3f}")
        print(f"  ADR F1 Score: {summary['f1']:.3f}")
        print(f"  ADR Cosine Similarity: {summary['cosine_similarity']:.3f}")
        print(f"  Files evaluated: {summary['files_evaluated']}")

        return summary
    
    def task3_random_sample_evaluation(self, original_dir, sample_size=50, seed=None):
        """
        Task 3: Evaluate performance on a random sample of file pairs.

        File pairs come from ``self.get_file_mappings(original_dir)``; up to
        ``sample_size`` of them are drawn without replacement and each is
        scored with ``self.evaluate_file_pair``.

        Side effects: writes ``task3_random_sample_evaluation.csv`` and
        ``task3_performance_distribution.png`` into ``self.result_dir`` when
        at least one sampled pair produced a result.

        Args:
            original_dir: Directory passed through to ``get_file_mappings``.
            sample_size: Maximum number of file pairs to evaluate.
            seed: Optional seed. When given, a private ``random.Random(seed)``
                draws the sample so the same files are picked on every run;
                when ``None`` the module-level ``random`` state is used, as
                before.

        Returns:
            A dict with the mean of each per-file metric, the standard
            deviation of ``f1`` and ``cosine_similarity``, and
            ``files_evaluated``; or ``None`` when no file pairs were found or
            no sampled pair yielded a result.
        """
        print(f"\n=== Task 3: Random Sample Evaluation (n={sample_size}) ===")

        orig_files, proc_files, common_keys = self.get_file_mappings(original_dir)

        if not common_keys:
            print("No matching file pairs found.")
            return None

        # Sample from a sorted population: the order of a raw set is not
        # stable across runs, so even a seeded RNG would pick different files.
        population = sorted(common_keys)
        rng = random if seed is None else random.Random(seed)
        sample_keys = rng.sample(population, min(sample_size, len(population)))

        print(f"Evaluating {len(sample_keys)} randomly selected files...")

        results = []
        for key in sample_keys:
            file_result = self.evaluate_file_pair(orig_files[key], proc_files[key])
            # Pairs for which the evaluator returns nothing are left out
            if file_result:
                file_result['file'] = key
                results.append(file_result)

        if not results:
            return None

        # Save per-file results
        df = pd.DataFrame(results)
        csv_path = os.path.join(self.result_dir, 'task3_random_sample_evaluation.csv')
        df.to_csv(csv_path, index=False)

        # Side-by-side histograms of F1 and cosine similarity; the figure is
        # always released, even if saving fails.
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        try:
            ax1.hist(df['f1'], bins=20, alpha=0.7, edgecolor='black')
            ax1.set_title(f'F1 Score Distribution (n={len(results)})')
            ax1.set_xlabel('F1 Score')
            ax1.set_ylabel('Frequency')
            ax1.grid(axis='y', alpha=0.3)

            ax2.hist(df['cosine_similarity'], bins=20, alpha=0.7, edgecolor='black', color='green')
            ax2.set_title(f'Cosine Similarity Distribution (n={len(results)})')
            ax2.set_xlabel('Cosine Similarity')
            ax2.set_ylabel('Frequency')
            ax2.grid(axis='y', alpha=0.3)

            fig.tight_layout()
            plot_path = os.path.join(self.result_dir, 'task3_performance_distribution.png')
            fig.savefig(plot_path, dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

        # Macro averages plus spread of the two headline metrics
        summary = {
            'precision': df['precision'].mean(),
            'recall': df['recall'].mean(),
            'f1': df['f1'].mean(),
            'f1_std': df['f1'].std(),
            'fuzzy_score': df['fuzzy_score'].mean(),
            'cosine_similarity': df['cosine_similarity'].mean(),
            'cosine_similarity_std': df['cosine_similarity'].std(),
            'boundary_overlap': df['boundary_overlap'].mean(),
            'files_evaluated': len(results)
        }

        print("Task 3 Results:")
        print(f"  Precision: {summary['precision']:.3f}")
        print(f"  Recall: {summary['recall']:.3f}")
        print(f"  F1 Score: {summary['f1']:.3f} ± {summary['f1_std']:.3f}")
        print(f"  Cosine Similarity: {summary['cosine_similarity']:.3f} ± {summary['cosine_similarity_std']:.3f}")
        print(f"  Files evaluated: {summary['files_evaluated']}")

        return summary
    
    def run_all_evaluations(self, original_dir, meddra_dir):
        """
        Run tasks 1-3 in order and write a plain-text summary of their results.

        Args:
            original_dir: Directory handed to task 1 and task 3.
            meddra_dir: Directory handed to task 2.

        Returns:
            A dict mapping each task label to whatever that task returned
            (tasks that returned a falsy value stay in the dict but are left
            out of the written summary).
        """
        print("Starting comprehensive annotation evaluation...")

        # Dict values are evaluated top to bottom, so the tasks run 1 -> 2 -> 3
        final_report = {
            'task1_full_evaluation': self.task1_full_evaluation(original_dir),
            'task2_adr_evaluation': self.task2_adr_evaluation(meddra_dir),
            'task3_random_sample': self.task3_random_sample_evaluation(original_dir),
        }

        # Consolidated report: one section per task that produced results
        report_path = os.path.join(self.result_dir, 'evaluation_summary.txt')
        with open(report_path, 'w') as out:
            out.write("=== ANNOTATION EVALUATION SUMMARY ===\n\n")

            for section, metrics in final_report.items():
                if not metrics:
                    continue
                out.write(f"{section.upper()}:\n")
                for metric, value in metrics.items():
                    # Floats get four decimals; everything else is written as-is
                    shown = f"{value:.4f}" if isinstance(value, float) else f"{value}"
                    out.write(f"  {metric}: {shown}\n")
                out.write("\n")

        print("\n=== EVALUATION COMPLETE ===")
        print(f"Results saved to: {self.result_dir}")
        print(f"Summary report: {report_path}")

        return final_report

# Configuration (absolute, machine-specific locations; edit before running elsewhere)
# -- reference annotation inputs
original_dir = "/Users/thyag/Desktop/Assignement/assignment-miimansa/dataset/input-data/original"
meddra_dir = "/Users/thyag/Desktop/Assignement/assignment-miimansa/dataset/input-data/meddra"
# -- processed output to be scored, and the folder that receives CSVs, plots and the summary
processed_base_dir = "/Users/thyag/Desktop/Assignement/assignment-miimansa/dataset/processed-output"
result_dir = "/Users/thyag/Desktop/Assignement/assignment-miimansa/result"

# Build the evaluator over the processed output
evaluator = UnifiedAnnotationEvaluator(processed_base_dir, result_dir)

# Execute every evaluation task and keep the combined per-task summaries
final_results = evaluator.run_all_evaluations(original_dir, meddra_dir)

"""
Expected output:

=== Task 1: Full Entity Evaluation ===
Evaluating 1240 file pairs...
Task 1 Results:
  Precision: 0.163
  Recall: 0.252
  F1 Score: 0.190
  Cosine Similarity: 0.675
  Files evaluated: 1227

=== Task 2: ADR-Only Evaluation (MedDRA) ===
Evaluating 1240 ADR file pairs...
Task 2 Results:
  ADR Precision: 0.285
  ADR Recall: 0.154
  ADR F1 Score: 0.181
  ADR Cosine Similarity: 0.526
  Files evaluated: 565

=== Task 3: Random Sample Evaluation (n=50) ===
Evaluating 50 randomly selected files...
Task 3 Results:
  Precision: 0.142
  Recall: 0.216
  F1 Score: 0.165 ± 0.182
  Cosine Similarity: 0.660 ± 0.238
  Files evaluated: 50
"""

Loading sentence transformer model for semantic similarity...
Starting comprehensive annotation evaluation...
=== Task 1: Full Entity Evaluation ===
Evaluating 1240 file pairs...
Task 1 Results:
  Precision: 0.163
  Recall: 0.252
  F1 Score: 0.190
  Cosine Similarity: 0.675
  Files evaluated: 1227

=== Task 2: ADR-Only Evaluation (MedDRA) ===
Evaluating 1240 ADR file pairs...
Task 2 Results:
  ADR Precision: 0.285
  ADR Recall: 0.154
  ADR F1 Score: 0.181
  ADR Cosine Similarity: 0.526
  Files evaluated: 565

=== Task 3: Random Sample Evaluation (n=50) ===
Evaluating 50 randomly selected files...
Task 3 Results:
  Precision: 0.142
  Recall: 0.216
  F1 Score: 0.165 ± 0.182
  Cosine Similarity: 0.660 ± 0.238
  Files evaluated: 50

=== EVALUATION COMPLETE ===
Results saved to: /Users/thyag/Desktop/Assignement/assignment-miimansa/result
Summary report: /Users/thyag/Desktop/Assignement/assignment-miimansa/result/evaluation_summary.txt

🎉 COSINE SIMILARITY RESULTS (Your strong metric!):
  Ta

In [None]:
# Locations used by the ADR matching step below
# (absolute, machine-specific paths; edit before running elsewhere)
output_csv_path = "/Users/thyag/Desktop/Assignement/assignment-miimansa/result/matching_result.csv"  # where the match table is written
sct_dir = "/Users/thyag/Desktop/Assignement/assignment-miimansa/dataset/input-data/sct"  # coded counterpart of each .ann file
original_dir = "/Users/thyag/Desktop/Assignement/assignment-miimansa/dataset/input-data/original"  # source .ann files

def parse_original(filepath):
    """
    Read a tab-separated annotation file into a list of dicts.

    Each kept line has at least three tab-separated columns: an id, a
    whitespace-separated "label start end" field, and the annotated text.
    Blank lines, lines starting with '#', and lines that are too short or
    whose offsets are not integers are skipped. When an offset token contains
    ';', only the part before the first ';' is used.

    Returns:
        A list of dicts with keys 'id', 'label', 'start', 'end', 'text',
        in file order.
    """
    parsed = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw in handle:
            row = raw.strip()
            if row == '' or row.startswith('#'):
                continue

            columns = row.split('\t')
            if len(columns) < 3:
                continue

            span_fields = columns[1].split()
            if len(span_fields) < 3:
                continue

            try:
                # Offsets may look like "102;105"; keep the leading number only
                begin, finish = (int(token.split(';')[0]) for token in span_fields[1:3])
            except ValueError:
                continue

            parsed.append(dict(
                id=columns[0],
                label=span_fields[0],
                start=begin,
                end=finish,
                text=columns[2].strip(),
            ))
    return parsed

def parse_sct(filepath):
    """
    Read a tab-separated SCT file into a list of dicts.

    Each kept line has at least three tab-separated columns: a record id, a
    '|'-separated field whose first two parts are taken as the standard code
    and standard text, and the record text. Lines with fewer columns, or whose
    second column has no '|', are skipped. Undecodable bytes are replaced
    rather than raising.

    Returns:
        A list of dicts with keys 'id', 'std_code', 'std_text', 'text',
        in file order.
    """
    parsed = []
    with open(filepath, 'r', encoding='utf-8', errors='replace') as handle:
        for raw in handle:
            # A blank line splits into a single empty column and is dropped below
            columns = raw.strip().split('\t')
            if len(columns) < 3:
                continue

            concept = columns[1].split('|')
            if len(concept) < 2:
                continue

            parsed.append({
                'id': columns[0],
                'std_code': concept[0].strip(),
                'std_text': concept[1].strip(),
                'text': columns[2].strip(),
            })
    return parsed

def match_approx(original_text, sct_records):
    """
    Pick the SCT record whose text is closest to ``original_text`` by
    case-insensitive ``fuzz.ratio``.

    Returns:
        ``(record, ratio)`` for the best-scoring record; on ties the earliest
        record wins. ``(None, -1)`` when ``sct_records`` is empty.
    """
    scored = [
        (fuzz.ratio(original_text.lower(), candidate['text'].lower()), candidate)
        for candidate in sct_records
    ]
    # max() keeps the first maximal item, matching a strict ">" scan
    top_score, top_record = max(scored, key=lambda pair: pair[0], default=(-1, None))
    return top_record, top_score

def match_embedding(original_text, sct_records, model):
    """
    Pick the SCT record whose text embedding has the highest cosine
    similarity to the embedding of ``original_text``.

    Args:
        original_text: Query string to match.
        sct_records: Sequence of dicts with a 'text' key.
        model: Object whose ``encode(..., convert_to_tensor=True)`` returns
            embeddings accepted by ``util.cos_sim``.

    Returns:
        ``(record, cosine_score)`` for the best-scoring record, or
        ``(None, -1.0)`` when ``sct_records`` is empty. Check the record for
        ``None`` rather than the score, since -1.0 is also a valid cosine.
    """
    if not sct_records:
        # Same "no candidates" contract as match_approx, instead of failing
        # inside encode/argmax on an empty batch.
        return None, -1.0

    texts = [rec['text'] for rec in sct_records]
    emb_orig = model.encode(original_text, convert_to_tensor=True)
    emb_sct = model.encode(texts, convert_to_tensor=True)
    # cos_sim returns a (1, len(texts)) matrix here; row 0 is the query
    cosine_scores = util.cos_sim(emb_orig, emb_sct)[0]
    best_idx = cosine_scores.argmax().item()
    best_score = cosine_scores[best_idx].item()
    return sct_records[best_idx], best_score

def process_file(filename, model):
    """
    Match every ADR annotation of one original file against the SCT records
    of the same-named file, using both fuzzy and embedding similarity.

    Reads ``filename`` from the module-level ``original_dir`` and ``sct_dir``.

    Args:
        filename: File name present in ``original_dir`` (and, if it is to
            yield anything, in ``sct_dir``).
        model: Embedding model whose ``encode(..., convert_to_tensor=True)``
            output is accepted by ``util.cos_sim``.

    Returns:
        A list with one result row (dict) per ADR annotation. The list is
        empty when the SCT file is missing, either file parses to nothing, or
        the original file has no ADR annotations.
    """
    results = []
    original_filepath = os.path.join(original_dir, filename)
    sct_filepath = os.path.join(sct_dir, filename)

    if not os.path.exists(sct_filepath):
        return results

    original_anns = parse_original(original_filepath)
    sct_records = parse_sct(sct_filepath)
    if not original_anns or not sct_records:
        return results

    adr_original = [ann for ann in original_anns if ann['label'] == "ADR"]
    if not adr_original:
        return results

    # The candidate set is the same for every annotation in this file, so
    # embed it once here instead of once per annotation.
    sct_embeddings = model.encode([rec['text'] for rec in sct_records],
                                  convert_to_tensor=True)

    for ann in adr_original:
        orig_text = ann['text']
        approx_match, approx_score = match_approx(orig_text, sct_records)

        # Embedding match: highest cosine similarity against the cached
        # candidate embeddings (row 0 of the 1 x N similarity matrix).
        query_embedding = model.encode(orig_text, convert_to_tensor=True)
        cosine_scores = util.cos_sim(query_embedding, sct_embeddings)[0]
        best_idx = cosine_scores.argmax().item()
        emb_match = sct_records[best_idx]
        emb_score = cosine_scores[best_idx].item()

        results.append({
            "Filename": filename,
            "Original ADR Text": orig_text,
            "Approx Match - Standard Code": approx_match['std_code'],
            "Approx Match - Standard Text": approx_match['std_text'],
            "Approx Match - SCT Text": approx_match['text'],
            "Approx Match - Fuzzy Similarity": approx_score,
            "Embedding Match - Standard Code": emb_match['std_code'],
            "Embedding Match - Standard Text": emb_match['std_text'],
            "Embedding Match - SCT Text": emb_match['text'],
            "Embedding Match - Cosine Similarity": round(emb_score, 4)
        })
    return results

# Main execution
model = SentenceTransformer('all-MiniLM-L6-v2')
all_results = []
# Sorted so the CSV row order is reproducible; os.listdir order is arbitrary.
original_files = sorted(f for f in os.listdir(original_dir) if f.endswith(".ann"))

for filename in original_files:
    file_results = process_file(filename, model)
    all_results.extend(file_results)

# Write to CSV (skipped entirely when nothing was matched)
if all_results:
    with open(output_csv_path, mode='w', newline='', encoding='utf-8') as csvfile:
        fieldnames = [
            "Filename",
            "Original ADR Text",
            "Approx Match - Standard Code",
            "Approx Match - Standard Text",
            "Approx Match - SCT Text",
            "Approx Match - Fuzzy Similarity",
            "Embedding Match - Standard Code",
            "Embedding Match - Standard Text",
            "Embedding Match - SCT Text",
            "Embedding Match - Cosine Similarity"
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_results)

# Summary statistics
# num_files counts every .ann file listed, including files that produced no rows.
num_files = len(original_files)
num_records = len(all_results)
if num_records:
    avg_fuzzy = sum(res["Approx Match - Fuzzy Similarity"] for res in all_results) / num_records
    avg_cosine = sum(res["Embedding Match - Cosine Similarity"] for res in all_results) / num_records
else:
    # No rows: report NaN instead of dividing by zero
    avg_fuzzy = avg_cosine = float("nan")
same_match_count = sum(
    1 for res in all_results
    if res["Approx Match - Standard Code"] == res["Embedding Match - Standard Code"]
)
diff_match_count = num_records - same_match_count

# Print summary
print("\n--- Matching Summary ---")
print(f"Total Files Processed: {num_files}")
print(f"Total ADR Annotations Processed: {num_records}")
print(f"Average Fuzzy Similarity Score: {avg_fuzzy:.2f}")
print(f"Average Cosine Similarity Score: {avg_cosine:.4f}")
print(f"Number of Matches where Approx and Embedding gave the same standard code: {same_match_count}")
print(f"Number of Matches where Approx and Embedding differed: {diff_match_count}")


"""
Expected output:
Total Files Processed: 1250
Total ADR Annotations Processed: 6313
Average Fuzzy Similarity Score: 97.81
Average Cosine Similarity Score: 0.9773
Number of Matches where Approx and Embedding gave the same standard code: 6154
Number of Matches where Approx and Embedding differed: 159
"""


--- Matching Summary ---
Total Files Processed: 1250
Total ADR Annotations Processed: 6313
Average Fuzzy Similarity Score: 97.81
Average Cosine Similarity Score: 0.9773
Number of Matches where Approx and Embedding gave the same standard code: 6154
Number of Matches where Approx and Embedding differed: 159
