# Enhanced Medical Term Standardization
    
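This notebook standardizes the medical terms and dates in a healthcare dataset. It relies on:
- spaCy for entity recognition (the general-purpose `en_core_web_sm` model is loaded during setup)
- ICD-10 code standardization (the `icd10` package is imported during setup)
- RapidFuzz for fuzzy matching of misspelled terms
- Comprehensive date format standardization with `dateutil`, producing ISO `YYYY-MM-DD` dates

**Note:** the cells shown below apply abbreviation expansion, fuzzy matching, and date standardization; the spaCy and ICD-10 components are set up but do not appear to be applied yet.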

## Setup and Imports

In [5]:
import pandas as pd
import numpy as np
import spacy
from rapidfuzz import fuzz, process as rapidfuzz_process
from fuzzywuzzy import fuzz as fuzzywuzzy_fuzz, process as fuzzywuzzy_process
from datetime import datetime
import re
from dateutil import parser
import matplotlib.pyplot as plt
import seaborn as sns
from icd10 import database

# Load spaCy's small English pipeline. `spacy.load` needs the 'en_core_web_sm'
# model package to be installed in the kernel's environment beforehand.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook —
# confirm whether the model is still needed before paying the load cost.
print("Loading spaCy model...")
nlp = spacy.load('en_core_web_sm')

ModuleNotFoundError: No module named 'pandas'

## 1. Load and Examine Data

In [None]:
# Load the raw healthcare dataset from the project-relative data directory
print("Loading dataset...")
df = pd.read_csv('data/healthcare_dataset.csv')
print(f"Dataset loaded with {len(df)} records and {len(df.columns)} columns")

# Display sample and data info.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare; wrapping it in print() would emit a stray "None" line.
print("Dataset Info:")
df.info()

print("Sample Records:")
print(df.head())

## 2. Medical Term Standardization

In [None]:
def standardize_medical_terms(text):
    """Title-case a medical term and expand known abbreviations.

    The input is title-cased first; every whole-word occurrence of a known
    abbreviation (matched case-insensitively) is then replaced by its full
    name.

    Args:
        text: Raw term. Values that ``pd.isna`` reports as missing are
            returned unchanged; any other non-string value is converted
            with ``str`` before processing.

    Returns:
        The standardized string, or the original missing value.
    """
    if pd.isna(text):
        return text

    # Common medical abbreviations, keyed in upper case
    medical_abbrev = {
        'HTN': 'Hypertension',
        'DM': 'Diabetes Mellitus',
        'T2DM': 'Type 2 Diabetes Mellitus',
        'CAD': 'Coronary Artery Disease',
        'CHF': 'Congestive Heart Failure',
        'COPD': 'Chronic Obstructive Pulmonary Disease',
        'UTI': 'Urinary Tract Infection',
        'MI': 'Myocardial Infarction',
        'CVA': 'Cerebrovascular Accident',
        'RA': 'Rheumatoid Arthritis',
        'CKD': 'Chronic Kidney Disease',
        'GERD': 'Gastroesophageal Reflux Disease'
    }

    # Standardize text (str() keeps stray numeric cells from raising)
    standardized = str(text).title()

    # Replace abbreviations. Keys are escaped so they are always matched
    # literally inside the alternation.
    pattern = r'\b(' + '|'.join(re.escape(key) for key in medical_abbrev) + r')\b'

    def expand(match):
        # title() has already turned e.g. 'HTN' into 'Htn', so the matched
        # text must be upper-cased before it can be used as a dictionary key.
        return medical_abbrev[match.group().upper()]

    return re.sub(pattern, expand, standardized, flags=re.IGNORECASE)

# Apply standardization to both free-text clinical columns
print("Standardizing medical terms...")
df['Medical Condition'] = df['Medical Condition'].apply(standardize_medical_terms)
df['Medication'] = df['Medication'].apply(standardize_medical_terms)

# Display sample results. The leading "\n" is an escape sequence: a raw line
# break inside a single-quoted string literal is a syntax error.
print("\nSample standardized medical conditions:")
print(df[['Medical Condition', 'Medication']].head())

## 3. Advanced Fuzzy Matching for Misspellings

In [None]:
def correct_misspellings(text, reference_terms, min_score=80):
    """Snap a term to its closest reference term using RapidFuzz.

    Args:
        text: Term to check. Missing values are handed back untouched.
        reference_terms: Candidate terms offered to the matcher.
        min_score: Score cutoff passed to RapidFuzz; candidates scoring
            below it are not accepted.

    Returns:
        The best-scoring reference term when one meets the cutoff,
        otherwise ``text`` itself.
    """
    if not pd.isna(text):
        # extractOne yields a result whose first element is the matched
        # choice, or None when nothing reaches the cutoff.
        best = rapidfuzz_process.extractOne(
            text, reference_terms, scorer=fuzz.ratio, score_cutoff=min_score
        )
        if best:
            return best[0]
    return text

# Get unique terms for reference. Missing values are dropped so that only real
# strings are offered to the fuzzy matcher as choices.
# NOTE(review): each reference list is drawn from the very column it is used to
# correct, so every value finds itself among the choices as an exact match and
# this pass appears to leave the data unchanged. A curated list of canonical
# terms is needed here for real corrections — confirm the intended source.
medical_conditions = df['Medical Condition'].dropna().unique().tolist()
medications = df['Medication'].dropna().unique().tolist()

# Apply misspelling correction
print("Correcting misspellings...")
df['Medical Condition'] = df['Medical Condition'].apply(
    lambda x: correct_misspellings(x, medical_conditions)
)
df['Medication'] = df['Medication'].apply(
    lambda x: correct_misspellings(x, medications)
)

# The leading "\n" is an escape sequence: a raw line break inside a
# single-quoted string literal is a syntax error.
print("\nSample corrected terms:")
print(df[['Medical Condition', 'Medication']].head())

## 4. Date Format Standardization

In [None]:
def standardize_date(date_str):
    """Convert a date in any format dateutil recognizes to ``YYYY-MM-DD``.

    Args:
        date_str: Raw date value. It is converted with ``str`` before being
            handed to ``dateutil.parser.parse``.

    Returns:
        The date formatted with ``'%Y-%m-%d'``. The input is returned
        unchanged when it is missing or cannot be parsed, so one bad cell
        never aborts a column-wide ``apply``.
    """
    if pd.isna(date_str):
        return date_str

    try:
        # Parse date using dateutil for flexible format recognition.
        # NOTE(review): ambiguous numeric dates such as 01/02/2023 are read
        # with dateutil's default settings — confirm that matches the data.
        parsed_date = parser.parse(str(date_str))
        return parsed_date.strftime('%Y-%m-%d')
    except (ValueError, TypeError, OverflowError):
        # OverflowError is what dateutil raises when a numeric field is too
        # large to fit a C integer (e.g. a long run of digits); treat it like
        # any other unparseable value instead of crashing.
        return date_str

# Apply date standardization to the admission and discharge columns
print("Standardizing dates...")
df['Date of Admission'] = df['Date of Admission'].apply(standardize_date)
df['Discharge Date'] = df['Discharge Date'].apply(standardize_date)

# The leading "\n" is an escape sequence: a raw line break inside a
# single-quoted string literal is a syntax error.
print("\nSample standardized dates:")
print(df[['Date of Admission', 'Discharge Date']].head())

## 5. Data Quality Analysis

In [None]:
# Summarize the standardized medical-term columns: distinct-value counts and
# the ten most frequent values of each. Blank lines between sections come from
# "\n" escapes; a raw line break inside a single-quoted string literal is a
# syntax error.
print("Medical Conditions - Unique Values:")
print(df['Medical Condition'].nunique())
print("\nTop 10 Medical Conditions:")
print(df['Medical Condition'].value_counts().head(10))

print("\nMedications - Unique Values:")
print(df['Medication'].nunique())
print("\nTop 10 Medications:")
print(df['Medication'].value_counts().head(10))

# Visualize standardization results: bar chart of the ten most frequent
# medical conditions
plt.figure(figsize=(12, 6))
df['Medical Condition'].value_counts().head(10).plot(kind='bar')
plt.title('Top 10 Standardized Medical Conditions')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 6. Save Standardized Dataset

In [None]:
# Save the standardized dataset.
# index=False keeps the DataFrame's row index out of the CSV. The file is
# written under data/, the same directory the input was read from, so that
# directory must already exist.
output_file = 'data/healthcare_dataset_standardized.csv'
df.to_csv(output_file, index=False)
print(f"Standardized dataset saved to {output_file}")