In [66]:
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import networkx as nx
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/zainab/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [67]:
class EEGPreprocessor:
    def __init__(self):
        self.eeg_terms = {
            'gtc': 'generalized tonic-clonic',
            'sw': 'sharp wave',
            'psw': 'polyspike wave',
            'pdr': 'posterior dominant rhythm',
            'pda': 'photoparoxysmal response',
            'pleds': 'periodic lateralized epileptiform discharges',
            'gsw': 'generalized spike wave',
            'peds': 'periodic epileptiform discharges',
            'swd': 'spike wave discharge',
            'pnd': 'periodic lateralized epileptiform discharge'
        }
        
        self.stop_words = set(stopwords.words('english'))
        self.clinical_stopwords = {'patient', 'study', 'eeg', 'record', 'recording', 'findings', 'report'}
        
    def preprocess(self, text):
        text = text.lower()
        
        for abbr, full in self.eeg_terms.items():
            text = re.sub(rf'\b{abbr}\b', full, text)
            
        text = re.sub(r'[^\w\s\-/]', ' ', text)
        
        tokens = text.split()
        filtered_tokens = [word for word in tokens 
                         if word not in self.stop_words 
                         and word not in self.clinical_stopwords
                         and len(word) > 2]
        
        return ' '.join(filtered_tokens)


In [68]:
class EEGKnowledgeGraph:
    def __init__(self):
        self.graph = nx.DiGraph()
        self.build_base_graph()
        
    def build_base_graph(self):
        waveforms = {
            'epileptiform': {
                'spike': 0.9,
                'sharp wave': 0.85,
                'polyspike': 0.95,
                'spike wave complex': 0.92,
                'generalized spike wave': 0.93
            },
            'non_epileptiform': {
                'slow wave': 0.6,
                'delta': 0.5,
                'theta': 0.4,
                'alpha': 0.3,
                'beta': 0.3
            },
            'ictal': {
                'electrographic seizure': 1.0,
                'electrodecremental': 0.95
            },
            'interictal': {
                'interictal epileptiform': 0.88,
                'periodic patterns': 0.82
            }
        }
        
        locations = ['frontal', 'temporal', 'parietal', 'occipital', 'central', 'generalized']
        
        for category, members in waveforms.items():
            self.graph.add_node(category, type='category')
            for member, significance in members.items():
                self.graph.add_node(member, type='waveform', significance=significance)
                self.graph.add_edge(category, member)
                
        for loc in locations:
            self.graph.add_node(loc, type='location')
            
    def get_waveform_significance(self, waveform):
        try:
            return self.graph.nodes[waveform]['significance']
        except KeyError:
            return 0.5
            
    def get_semantic_distance(self, term1, term2):
        try:
            return nx.shortest_path_length(self.graph, term1, term2)
        except:
            return float('inf')


In [69]:
class EEGReportSimilarity:
    def __init__(self):
        self.preprocessor = EEGPreprocessor()
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.knowledge_graph = EEGKnowledgeGraph()
        
    def extract_key_phrases(self, text):
        patterns = {
            'background': r'(background|awake|sleep|posterior dominant rhythm|pdr|alpha|beta|theta|delta)',
            'epileptiform': r'(spike|sharp wave|polyspike|epileptiform|spike[\-\s]wave|sw|psw|gsw|swd)',
            'slowing': r'(slow|delta|theta|slowing|focal slow|generalized slow)',
            'seizure': r'(seizure|ictal|electrographic seizure|clinical correlation)',
            'location': r'(frontal|temporal|parietal|occipital|central|generalized|bilateral)'
        }
        
        results = defaultdict(list)
        for category, pattern in patterns.items():
            matches = re.finditer(pattern, text.lower())
            for match in matches:
                matched_term = match.group().lower()
                if matched_term in self.preprocessor.eeg_terms:
                    matched_term = self.preprocessor.eeg_terms[matched_term]
                results[category].append(matched_term)
                
        return dict(results)
        
    def calculate_similarity(self, report1, report2):
        preprocessed1 = self.preprocessor.preprocess(report1)
        preprocessed2 = self.preprocessor.preprocess(report2)
        
        emb1 = self.embedder.encode([preprocessed1])
        emb2 = self.embedder.encode([preprocessed2])
        
        base_sim = cosine_similarity(emb1, emb2)[0][0]
        
        phrases1 = self.extract_key_phrases(preprocessed1)
        phrases2 = self.extract_key_phrases(preprocessed2)
        
        concept_sim = self.calculate_concept_similarity(phrases1, phrases2)
        
        struct_sim = self.calculate_structural_similarity(report1, report2)
        
        final_score = 0.4 * base_sim + 0.4 * concept_sim + 0.2 * struct_sim
        
        return {
            'final_score': final_score,
            'base_similarity': base_sim,
            'concept_similarity': concept_sim,
            'structural_similarity': struct_sim,
            'extracted_phrases_1': phrases1,
            'extracted_phrases_2': phrases2
        }
        
    def calculate_concept_similarity(self, phrases1, phrases2):
        category_weights = {
            'background': 0.3,
            'epileptiform': 0.9,
            'slowing': 0.6,
            'seizure': 1.0,
            'location': 0.7
        }
        
        total_sim = 0
        total_weight = 0
        
        for category in category_weights.keys():
            set1 = set(phrases1.get(category, []))
            set2 = set(phrases2.get(category, []))
            
            if not set1 and not set2:
                continue
                
            intersection = set1.intersection(set2)
            union = set1.union(set2)
            
            jaccard = len(intersection) / len(union) if union else 0

            if category in ['epileptiform', 'seizure']:
                max_sig = max([self.knowledge_graph.get_waveform_significance(term) 
                             for term in intersection], default=0)
                jaccard *= (1 + max_sig) 
                
            category_sim = jaccard * category_weights[category]
            total_sim += category_sim
            total_weight += category_weights[category]
            
        return total_sim / total_weight if total_weight > 0 else 0
        
    def calculate_structural_similarity(self, report1, report2):
        sections = ['background', 'description', 'impression']
        section_sims = []
        
        for section in sections:
            text1 = self.extract_section(report1, section)
            text2 = self.extract_section(report2, section)
            
            if text1 and text2:
                emb1 = self.embedder.encode([text1])
                emb2 = self.embedder.encode([text2])
                sim = cosine_similarity(emb1, emb2)[0][0]
                section_sims.append(sim)
                
        return np.mean(section_sims) if section_sims else 0
        
    def extract_section(self, text, section):
        section_keywords = {
            'background': ['background', 'clinical history', 'technician notes'],
            'description': ['description', 'findings', 'observed', 'during the recording'],
            'impression': ['impression', 'interpretation', 'conclusion', 'summary']
        }
        
        keywords = section_keywords.get(section, [])
        for keyword in keywords:
            if keyword in text.lower():
                paragraphs = text.split('\n')
                for para in paragraphs:
                    if keyword in para.lower():
                        return para
        return ''

In [70]:
if __name__ == "__main__":
    actual_report = """
    TECHNIQUE:   This is a multichannel digital EEG recording using the estimated international 10-20 electrode 
    placement system. EEG started with machine calibration the patient was awake and cooperative during the procedure.

    FACTUAL REPORT:

    Background rhythm during awake stage shows well-organized, well-developed, average voltage alpha activity
    in the posterior regions which is appropriate for age. It blocks with eye opening and it is bilaterally synchronous 
    and symmetrical. Beta activity in the frontal or central areas is seen with average voltage and amplitude.
        Hyperventilation & Photic stimulation was performed.
        During recording intermittent spike and wave discharges were noted in bilateral frontal region.
    Muscle movement, eye movement, electrode pop artifacts were also noted. 

    IMPRESSION:


    This is an abnormal EEG with:
    Intermittent spike and wave discharges seen in bilateral frontal region.
    Further clinical correlation is advised.

    """ 
    
    generated_report = """
    TECHNIQUE:
    This is a multichannel digital EEG recording utilizing the international 10-20 electrode placement system. The patient remained awake and cooperative throughout the session. Only the first 3 minutes of a longer 29-minute study were provided for review.

    FACTUAL REPORT:
    The background rhythm demonstrates well-organized, posterior-dominant alpha activity appropriate for the awake state, with preservation of symmetry and reactivity. Beta frequencies are superimposed, especially anteriorly. Notably, there is frequent, persistent, rhythmic high-amplitude artifact over the bifrontal regions (Fp1, Fp2), suggestive of muscle or eye movement, which partially obscures underlying activity in these channels but leaves other areas relatively interpretable. No epileptiform discharges, sharp waves, or focal slowing are evident elsewhere in the record. No periodic or paroxysmal abnormalities are observed within these segments.

    IMPRESSION:

    Abnormal study due to persistent, rhythmic, high-amplitude artifact predominantly localized to bifrontal regions (Fp1, Fp2), potentially obscuring underlying cerebral activity in these leads.
    Background alpha rhythm remains well-preserved and symmetric posteriorly, with no clear evidence of epileptiform activity or focal cortical dysfunction.
    No generalized or focal epileptiform discharges, periodic discharges, or clear abnormal slowing apparent outside the artifact-prone regions.
    Artifacts (most likely muscle or eye movement) significantly limit evaluation of bifrontal activity, but posterior and midline regions are interpretable and appear within normal limits.
    Recommend clinical correlation and, if clinically indicated, a repeat EEG with attention to minimizing frontal artifacts for optimal assessment of frontal lobe activity.    """
    
    similarity_engine = EEGReportSimilarity()
    results = similarity_engine.calculate_similarity(actual_report, generated_report)
    
    print(f"Final Similarity Score: {results['final_score']:.3f}")
    print(f"Base Semantic Similarity: {results['base_similarity']:.3f}")
    print(f"Clinical Concept Similarity: {results['concept_similarity']:.3f}")
    print(f"Structural Similarity: {results['structural_similarity']:.3f}")
    
    print("\nExtracted Phrases from Report 1:")
    for category, phrases in results['extracted_phrases_1'].items():
        print(f"{category.upper()}: {', '.join(phrases)}")
    
    print("\nExtracted Phrases from Report 2:")
    for category, phrases in results['extracted_phrases_2'].items():
        print(f"{category.upper()}: {', '.join(phrases)}")


Final Similarity Score: 0.674
Base Semantic Similarity: 0.692
Clinical Concept Similarity: 0.564
Structural Similarity: 0.858

Extracted Phrases from Report 1:
BACKGROUND: awake, background, awake, alpha, beta
EPILEPTIFORM: spike, spike
SEIZURE: clinical correlation
LOCATION: bilateral, frontal, central, bilateral, frontal, bilateral, frontal

Extracted Phrases from Report 2:
BACKGROUND: awake, background, alpha, awake, beta, background, alpha
EPILEPTIFORM: epileptiform, sharp wave, epileptiform, epileptiform
SLOWING: focal slow, slow
SEIZURE: clinical correlation
LOCATION: frontal, frontal, generalized, frontal, frontal, frontal
