# Injury Diagnosis Extraction

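This notebook extracts diagnosis labels from the free-text `Notes` column of the injury list data (loaded from `./data/ID_IL_movement.csv`) by matching keyword patterns. A single note can receive several labels.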

**Diagnosis types extracted:**
- `break` (broken bones, breaks)
- `chip` (chip, chipped bones/bone chips)
- `fracture` (fractures, stress fractures, hairline fractures)
- `tear` (torn ligaments/tendons/muscles, ruptures, lacerations)
- `sprain` (sprains, ligament damage/injury)
- `pulled` (pulled muscles)
- `strain` (strains, strained muscles)
- `dislocation` (dislocations, subluxations, separations)
- `inflammation` (tendinitis, bursitis, swelling, inflammation)
- `bruise` (bruises, contusions, hematomas)
- `nerve` (nerve damage, neuropathy, pinched nerves)
- `infection` (infections, viral illness, flu, pneumonia)
- `soreness` (soreness, pain, discomfort, tenderness)
- `spasm` (spasms, charley horse)
- `concussion` (concussions, head injuries)
- `hernia` (hernias, herniated)
- `degeneration` (degenerative conditions, arthritis, bone spurs)
- `blood-clot` (blood clots, thrombosis, embolism)
- `hemmorhage` (hemorrhage; the label is spelled this way in the code and in the output)

The labelled data is saved to `./data/Diagnosis_IL_movement.csv`; the input CSV is not modified.
Running this notebook also overwrites the `diagnosis` column of the `injury_list` table in `../BALL.db`.

In [None]:
import pandas as pd
import sqlite3
import re
from collections import Counter, OrderedDict

In [None]:
# Input data: the injury-list CSV whose free-text notes are labelled below.
CSV_PATH = "./data/ID_IL_movement.csv"

# Columns that later cells index on `df`: 'Notes' is the text that gets
# labelled and 'injury_id' is the key used when updating the database.
REQUIRED_COLUMNS = ("injury_id", "Notes")

df = pd.read_csv(CSV_PATH)

# Fail fast with a readable message instead of a bare KeyError several cells
# later if the export is missing a column this notebook depends on.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"{CSV_PATH} is missing required column(s): {missing_columns}")

print(f"Loaded {len(df)} rows")

In [None]:
# Maps each diagnosis label to the regexes that trigger it.
#
# extract_diagnosis() tests every category independently, so a single note can
# receive several labels (e.g. "chip fracture" -> chip,fracture). The insertion
# order here only fixes the order of the labels in the output string.
#
# Conventions:
#   * Patterns are matched against lower-cased text.
#   * `s?` lets the plural of a noun match too ("stress fractures", "bone spurs").
#   * Compound phrases such as "hairline fracture", "bone bruise" or "blood clot"
#     need no pattern of their own: the bare keyword already matches inside them.
DIAGNOSIS_PATTERNS = OrderedDict([
    # Breaks ("break up" is excluded by the negative lookahead)
    ('break', [r'\bbroken\b', r'\bbreaks?\b(?!\s+up)']),

    # Chips (bone chips, chipped bones, chip fractures)
    ('chip', [r'\bchips?\b', r'\bchipped\b']),

    # Fractures (stress, hairline, avulsion, comminuted, ...)
    ('fracture', [r'\bfractures?\b', r'\bfractured\b', r'\bcomminuted\b']),

    # Tears (ligament, tendon, muscle, cartilage), ruptures and lacerations
    ('tear', [r'\btorn\b', r'\btears?\b', r'\bruptures?\b', r'\bruptured\b',
              r'\blacerated\b', r'\blacerations?\b']),

    # Sprains (ligament injuries)
    ('sprain', [r'\bsprains?\b', r'\bsprained\b', r'\bligament damage\b',
                r'\bligament injur(?:y|ies)\b', r'\bstretched ligament\b']),

    # Pulled muscles
    ('pulled', [r'\bpulled\b', r'\bpull\b']),

    # Strains (muscle/tendon injuries)
    ('strain', [r'\bstrains?\b', r'\bstrained\b']),

    # Dislocations ("separation from ..." is excluded by the negative lookahead)
    ('dislocation', [r'\bdislocations?\b', r'\bdislocated\b', r'\bsubluxations?\b',
                     r'\bsubluxated\b', r'\bseparated\b', r'\bseparations?\b(?!\s+from)']),

    # Inflammation/Tendinitis
    ('inflammation', [r'\btendinitis\b', r'\btendonitis\b', r'\btendinosis\b', r'\btendinopathy\b',
                      r'\binflammation\b', r'\binflamed\b', r'\bbursitis\b', r'\bsynovitis\b',
                      r'\bswelling\b', r'\bswollen\b', r'\beffusion\b']),

    # Bruises/Contusions
    ('bruise', [r'\bbruises?\b', r'\bbruised\b', r'\bcontusions?\b', r'\bhematomas?\b']),

    # Nerve/Neurological
    ('nerve', [r'\bnerves?\b', r'\bneuropathy\b', r'\bneurological\b']),

    # Infections/Illness
    ('infection', [r'\binfections?\b', r'\binfected\b', r'\bviral\b', r'\bvirus\b', r'\bflu\b',
                   r'\binfluenza\b', r'\bpneumonia\b', r'\bbronchitis\b', r'\billness\b',
                   r'\bsick\b', r'\bdisease\b', r'\bstrep\b', r'\basthma\b']),

    # Soreness/Pain (general)
    ('soreness', [r'\bsore\b', r'\bsoreness\b', r'\bpain\b', r'\bpainful\b', r'\baching\b',
                  r'\bdiscomfort\b', r'\btenderness\b']),

    # Spasms
    ('spasm', [r'\bspasms?\b', r'\bcharley horse\b']),

    # Concussion
    ('concussion', [r'\bconcussions?\b', r'\bconcussed\b', r'\bhead injur(?:y|ies)\b',
                    r'\bhead trauma\b']),

    # Hernia
    ('hernia', [r'\bhernias?\b', r'\bherniated\b']),

    # Degeneration/Chronic. "osteoarthritis" needs its own pattern because the
    # leading \b of the arthritis pattern cannot match in the middle of a word.
    ('degeneration', [r'\bdegenerat\w+\b', r'\barthriti\w*\b', r'\bosteoarthritis\b',
                      r'\bchronic\b', r'\bspurs?\b']),

    # Blood clots
    ('blood-clot', [r'\bclots?\b', r'\bthrombosis\b', r'\bembolism\b']),

    # Hemorrhage. The label keeps its historical (misspelled) form because it is
    # written into the output data; the patterns now match the correct spelling
    # and the British "haemorrhage" as well as the two misspellings matched before.
    ('hemmorhage', [r'\bha?emorrhag\w*\b', r'\bhemmorhage\b', r'\bhemorrage\b']),
])

In [None]:
def extract_diagnosis(notes, patterns=None):
    """
    Extract diagnosis labels from a single Notes value.

    Every category is tested independently, so one note can yield several
    labels. Labels are returned in the iteration order of the pattern mapping,
    and each label appears at most once.

    Args:
        notes: The note text. Non-string values are converted with str().
            None and float NaN (how pandas represents an empty cell) are
            treated as "no note".
        patterns: Optional mapping of label -> list of regex strings. When
            omitted, the module-level DIAGNOSIS_PATTERNS is used.

    Returns:
        A comma-separated string of detected diagnosis labels, or None when
        nothing matches or the note is missing.
    """
    if patterns is None:
        patterns = DIAGNOSIS_PATTERNS

    # Missing values would otherwise be stringified to "none"/"nan" and run
    # through the regexes; `notes != notes` is true only for NaN.
    if notes is None or (isinstance(notes, float) and notes != notes):
        return None

    notes_str = str(notes).lower()

    # The mapping's keys are unique and any() stops at the first hit, so each
    # label is collected at most once -- no separate de-duplication is needed.
    # IGNORECASE is kept so caller-supplied patterns containing upper-case
    # letters still match the lower-cased text.
    detected_diagnoses = [
        diagnosis
        for diagnosis, diagnosis_patterns in patterns.items()
        if any(re.search(pattern, notes_str, re.IGNORECASE) for pattern in diagnosis_patterns)
    ]

    return ','.join(detected_diagnoses) if detected_diagnoses else None

In [None]:
# Destination for the labelled copy of the data (a different file from the input CSV).
OUTPUT_CSV_PATH = "./data/Diagnosis_IL_movement.csv"

# Label each row from its note; extract_diagnosis yields None when nothing matches.
df['diagnosis'] = df['Notes'].map(extract_diagnosis)

df.to_csv(path_or_buf=OUTPUT_CSV_PATH, index=False)
print(f"Saved new CSV to {OUTPUT_CSV_PATH}")
print(f"Columns inside Diagnosis_IL_movement.csv: {df.columns.tolist()}")

In [None]:
DB_PATH = "../BALL.db"

# Name of the throw-away column that carries the string form of injury_id
# during the merge; it is never written to the database.
MERGE_KEY = "_injury_id_merge_key"

# `with sqlite3.connect(...)` only commits or rolls back the transaction; it
# does not close the connection. Close it explicitly so the database file is
# released even if the update fails.
conn = sqlite3.connect(DB_PATH)
try:
    with conn:  # commit on success, roll back on error
        # Read existing data from database
        existing_df = pd.read_sql("SELECT * FROM injury_list", conn)

        # Prepare the diagnosis lookup from the processed CSV data.
        diagnosis_df = df[['injury_id', 'diagnosis']].copy()
        diagnosis_df['injury_id'] = diagnosis_df['injury_id'].astype(str)

        # A repeated injury_id on this side would multiply rows of injury_list
        # in the left merge below, so keep a single diagnosis per id.
        duplicate_ids = diagnosis_df['injury_id'].duplicated(keep='last')
        if duplicate_ids.any():
            print(f"Warning: {duplicate_ids.sum()} duplicate injury_id row(s) in the CSV; "
                  "keeping the last diagnosis for each id")
            diagnosis_df = diagnosis_df[~duplicate_ids]

        # Drop the old diagnosis column if it exists
        if 'diagnosis' in existing_df.columns:
            print("Dropping old 'diagnosis' column and getting the new one ready")
            existing_df = existing_df.drop(columns=['diagnosis'])

        # Merge on a temporary string copy of injury_id. Casting the real
        # column to str (as was done before) got written back by to_sql and
        # silently changed the key's type in the database.
        existing_df = (
            existing_df
            .assign(**{MERGE_KEY: existing_df['injury_id'].astype(str)})
            .merge(
                diagnosis_df.rename(columns={'injury_id': MERGE_KEY}),
                on=MERGE_KEY,
                how='left',
                validate='many_to_one',  # guard: the row count of injury_list must not change
            )
            .drop(columns=[MERGE_KEY])
        )

        # Write back to database.
        # NOTE(review): if_exists='replace' drops and recreates the table, so any
        # indexes or constraints defined on injury_list are not preserved -- confirm
        # that is acceptable for this database.
        existing_df.to_sql('injury_list', conn, if_exists='replace', index=False)
finally:
    conn.close()

# Summary statistics for the table that was just written.
print("\nUpdated database with diagnosis values")
print(f"Rows with diagnosis: {existing_df['diagnosis'].notna().sum()}")
print(f"Rows without diagnosis: {existing_df['diagnosis'].isna().sum()}")

# Diagnosis distribution over the CSV rows; a row with several labels counts once per label.
print("\nDiagnosis distribution:")
all_diagnoses = [
    label
    for diagnoses in df['diagnosis'].dropna()
    for label in diagnoses.split(',')
]
diagnosis_counts = Counter(all_diagnoses)
for diagnosis, count in diagnosis_counts.most_common():
    print(f"  {diagnosis}: {count}")


print("\nDatabase update complete!")