# Injury Body Region Extraction

This notebook is meant to extract all the body region information from the Notes column in the injury_list table.

**Body regions extracted:**
- Catches MOST body regions. The dataset is quite large, and it contains some very obscure injury regions that will not be precisely detected.

Note: Some injuries involve multiple body regions. When detected, the body_region value will be formatted as "body_region1,body_region2,body_region3,..."

Output is saved to `Body_Region_IL_movement.csv`; the original CSV file is not modified.
The `body_region` field of the `injury_list` table in `BALL.db` is also updated directly when this notebook is run.

In [None]:
import pandas as pd
import sqlite3
import re
from collections import Counter
from collections import OrderedDict

In [None]:
# Input CSV of IL movements. Later cells read its `Notes` and `injury_id` columns.
CSV_PATH = "./data/ID_IL_movement.csv"

# Load the whole file into a DataFrame; a later cell adds a `body_region` column to it.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)

# Quick sanity check on how much data was read.
print(f"Loaded {len(df)} rows")

In [None]:
# Ordered mapping: body-region label -> list of regex patterns that signal it.
#
# extract_body_parts() walks this mapping in insertion order, records a label as
# soon as any one of its patterns matches the note text, and joins the recorded
# labels in that same order. Reordering entries therefore changes the order of
# labels in the comma-separated output.
#
# Patterns are applied with re.IGNORECASE, so upper-case tokens such as TMJ, ACL,
# MCL, PCL and LCL also match lower-case text. Every pattern is anchored with \b
# so that only whole words / phrases match.
BODY_PART_PATTERNS = OrderedDict([
    
    # back terms
    # 'cervical spine' is listed here; the 'neck' entry below deliberately skips it.
    ('back', [r'\blumbro[- ]?sacral\b', r'\blumbar\b', r'\bsacrum\b', r'\bcoccyx\b', r'\btailbone\b',
              r'\bback\b', r'\blower back\b', r'\bupper back\b', r'\bmid[- ]?back\b', 
              r'\bspine\b', r'\bspinal\b', r'\bvertebra\b', r'\bvertebrae\b', r'\bcervical spine\b',
              r'\bthoracic\b', r'\bdisk\b', r'\bdisc\b', r'\bsciatica\b', r'\blats\b', r'\blatissimus\b',
              r'\bparavertebral\b', r'\bfacet\b']),
    
    # head / face terms
    ('eye', [r'\beye\b', r'\beyes\b', r'\borbital\b', r'\beye socket\b', r'\borbit\b', r'\bretina\b', r'\bcornea\b']),
    ('nose', [r'\bnose\b', r'\bnasal\b', r'\bseptum\b']),
    ('jaw', [r'\bjaw\b', r'\bmandible\b', r'\btemporomandibular\b', r'\bTMJ\b']),
    ('face', [r'\bface\b', r'\bfacial\b', r'\bcheek\b', r'\bcheekbone\b', r'\bforehead\b', r'\bzygomatic\b']),
    ('head', [r'\bhead\b', r'\bskull\b', r'\bconcussion\b', r'\bconcussed\b', r'\btemple\b', r'\bcranial\b', r'\bbrain\b']),
    ('mouth', [r'\bmouth\b', r'\blip\b', r'\blips\b', r'\btooth\b', r'\bteeth\b', r'\bdental\b', r'\btongue\b', r'\bgum\b', r'\bgums\b', r'\boral\b']),
    ('ear', [r'\bear\b', r'\bears\b', r'\beardrum\b']),
    # The negative lookahead stops 'cervical spine' from being labelled as neck.
    # NOTE(review): 'cervican' looks like a misspelling of 'cervical' — confirm it is
    # deliberate (e.g. to catch a typo present in the Notes data).
    ('neck', [r'\bneck\b', r'\bcervical\b(?!\s+spine)', r'\bthroat\b', r'\btrachea\b', r'\blarynx\b', r'\bpharynx\b', r'\bcervican\b']),
    
    # Upper body specific
    ('finger', [r'\bthumb\b', r'\bpinky\b', r'\bpinkie\b', r'\bindex finger\b', r'\bmiddle finger\b', 
                r'\bring finger\b', r'\bfinger\b', r'\bfingers\b', r'\bdigit\b', r'\bphalanges\b', r'\bphalangeal\b']),
    # NOTE(review): 'radial' / 'ulnar' map to wrist while 'radius' / 'ulna' map to arm,
    # so e.g. "ulnar collateral ligament" is labelled wrist — confirm this is intended.
    ('wrist', [r'\bwrist\b', r'\bwrists\b', r'\bcarpal\b', r'\bradial\b', r'\bulnar\b']),
    ('hand', [r'\bhand\b', r'\bhands\b', r'\bpalm\b', r'\bmetacarpal\b']),
    ('elbow', [r'\belbow\b', r'\belbows\b', r'\bolecranon\b']),
    ('shoulder', [r'\bshoulder\b', r'\bshoulders\b', r'\brotator cuff\b', r'\bclavicle\b', 
                  r'\bcollarbone\b', r'\bscapula\b', r'\bac joint\b', r'\bacromioclavicular\b',
                  r'\bdeltoid\b', r'\bsupraspinatus\b', r'\binfraspinatus\b', r'\bsubscapularis\b', r'\bteres\b']),
    ('arm', [r'\barm\b', r'\barms\b', r'\bupper arm\b', r'\bforearm\b', r'\bbicep\b', r'\bbiceps\b', 
             r'\btricep\b', r'\btriceps\b', r'\bhumerus\b', r'\bulna\b', r'\bradius\b', r'\bbrachial\b']),
    ('chest', [r'\bchest\b', r'\bpectoral\b', r'\bpec\b', r'\bpecs\b', r'\bsternum\b', r'\bbreastbone\b']),
    ('ribs', [r'\brib\b', r'\bribs\b', r'\bcostal\b', r'\bintercostal\b']),
    
    # Internal organs/body systems
    # NOTE(review): 'side' sits in this group although it is not an organ; the grouping
    # comments are organisational only and do not affect matching.
    ('side', [r'\bside muscle\b', r'\bside\b(?!\s+muscle)', r'\bsides\b', r'\bflank\b']),
    ('respiratory', [r'\blung\b', r'\blungs\b', r'\bpulmonary\b', r'\bbronchial\b', r'\bbronchitis\b', 
                     r'\brespiratory\b', r'\bbreathing\b', r'\bpneumonia\b']),
    ('heart', [r'\bheart\b', r'\bcardiac\b', r'\bheart[- ]?beat\b', r'\birregular heart\b', r'\bcardiovascular\b', r'\bheart beat\b', r'\bheartbeat\b']),
    ('stomach', [r'\bstomach\b', r'\bgastric\b', r'\bgastrointestinal\b', r'\bgastritis\b', 
                 r'\bulcer\b', r'\bgastro\b', r'\bepigastric\b']),
    ('intestine', [r'\bintestine\b', r'\bintestinal\b', r'\bbowel\b', r'\bcolon\b', r'\bcolitis\b']),
    ('kidney', [r'\bkidney\b', r'\bkidneys\b', r'\brenal\b']),
    ('liver', [r'\bliver\b', r'\bhepatic\b', r'\bhepatitis\b']),
    ('spleen', [r'\bspleen\b', r'\bsplenic\b']),
    ('pancreas', [r'\bpancreas\b', r'\bpancreatitis\b', r'\bpancreatic\b']),
    ('gallbladder', [r'\bgall\s*bladder\b', r'\bgallbladder\b']),
    
    # Core/Torso
    # 'inguinal hernia' matches here and also matches 'inguinal' under groin.
    ('abdomen', [r'\babdomen\b', r'\babdominal\b', r'\babs\b', r'\bcore\b', r'\boblique\b', 
                 r'\bobliques\b', r'\bhernia\b', r'\binguinal hernia\b', r'\bumbilical\b']),
    # 'hip flexor' also matches the plain 'hip' pattern further down; the exclusion
    # rules in extract_body_parts() drop 'hip' (and 'abdomen') whenever 'groin' is detected.
    ('groin', [r'\bgroin\b', r'\badductor\b', r'\binguinal\b', r'\bhip flexor\b', r'\biliopsoas\b']),
    ('buttocks', [r'\bbuttocks\b', r'\bglute\b', r'\bglutes\b', r'\bgluteal\b', r'\bgluteus\b', 
                  r'\bpiriformis\b']),
    # NOTE(review): 'hip pointer' also matches the plain 'hip' pattern and there is no
    # exclusion rule keyed on 'pelvis', so such notes yield both 'pelvis' and 'hip' — confirm.
    ('pelvis', [r'\bpelvis\b', r'\bpelvic\b', r'\bhip pointer\b', r'\bsacroiliac\b', r'\bsi joint\b']),
    
    # Lower body
    ('toe', [r'\btoe\b', r'\btoes\b', r'\bbig toe\b', r'\bmetatarsalgia\b', r'\bhallux\b', r'\bturf toe\b']),
    # The negative lookbehind keeps "achilles heel(s)" from being labelled as heel.
    ('heel', [r'(?<!achilles\s)\bheel\b', r'(?<!achilles\s)\bheels\b', r'\bcalcaneus\b', r'\bcalcaneal\b']),
    ('achilles', [r'\bachilles\b', r'\bachilles tendon\b']),
    ('foot', [r'\bfoot\b', r'\bfeet\b', r'\barch\b', r'\bmetatarsal\b', r'\bplantar\b', 
              r'\bfasciitis\b', r'\bfascia\b', r'\bforefoot\b', r'\bmidfoot\b', r'\bhindfoot\b', 
              r'\bnavicular\b', r'\btarsal\b', r'\bsesamoid\b', r'\bsesamoiditis\b', r'\bcuneiform\b',
              r'\bcuboid\b', r'\btalus\b', r'\binstep\b']),
    ('ankle', [r'\bankle\b', r'\bankles\b', r'\bmalleolus\b']),
    ('shin', [r'\bshin\b', r'\bshins\b', r'\btibia\b', r'\btibial\b', r'\bfibula\b', r'\bfibular\b', 
              r'\bshin splints\b', r'\bperiostitis\b', r'\bcompartment syndrome\b']),
    ('calf', [r'\bcalf\b', r'\bcalves\b', r'\bgastrocnemius\b', r'\bsoleus\b']),
    # ACL, MCL and meniscus get their own labels; PCL / LCL fall under the generic 'knee' label.
    ('acl', [r'\bACL\b', r'\banterior cruciate\b']),
    ('mcl', [r'\bMCL\b', r'\bmedial collateral\b']),
    ('meniscus', [r'\bmeniscus\b', r'\bmeniscal\b']),
    ('knee', [r'\bknee\b', r'\bknees\b', r'\bpatella\b', r'\bpatellar\b', 
              r'\bPCL\b', r'\bLCL\b', r'\bkneecap\b',
              r'\bposterior cruciate\b', 
              r'\blateral collateral\b', r'\bchondromalacia\b', r'\bsynovitis\b', r'\binfra[- ]?patella(r)?\b']),
    # 'biceps femoris' also matches the 'biceps' pattern under arm.
    ('hamstring', [r'\bhamstring\b', r'\bhamstrings\b', r'\bbiceps femoris\b', r'\bsemitendinosus\b', 
                   r'\bsemimembranosus\b']),
    ('quadriceps', [r'\bquad\b', r'\bquads\b', r'\bquadriceps\b', r'\bquadricep\b', 
                    r'\brectus femoris\b', r'\bvastus\b']),
    ('thigh', [r'\bthigh\b', r'\bthighs\b', r'\bfemur\b', r'\bfemoral\b', r'\bfemur bone\b']),
    ('hip', [r'\bhip\b', r'\bhips\b', r'\btrochanter\b', r'\bacetabulum\b', r'\blabrum\b', r'\blabral\b', r'\babductor\b']),
    # Most generic lower-body label; the exclusion rules in extract_body_parts() drop
    # 'leg' whenever one of several more specific lower-body labels is detected.
    ('leg', [r'\bleg\b', r'\blegs\b', r'\blower leg\b', r'\bupper leg\b', r'\blimb\b']),
])

In [None]:
# Hierarchy rules: when the key (a specific body part) is detected, the broader
# parts listed as its value are dropped from the result.
# Built once at definition time instead of on every call.
_EXCLUSION_RULES = {
    'finger': ['hand'],
    'wrist': ['hand', 'arm'],
    'toe': ['foot'],
    'heel': ['foot'],
    'achilles': ['foot', 'calf', 'leg'],
    'ankle': ['foot', 'leg'],
    'shin': ['leg'],
    'calf': ['leg'],
    'acl': ['knee', 'leg'],
    'mcl': ['knee', 'leg'],
    'meniscus': ['knee', 'leg'],
    'knee': ['leg'],
    'hamstring': ['thigh', 'leg'],
    'quadriceps': ['thigh', 'leg'],
    'thigh': ['leg'],
    'elbow': ['arm'],
    'shoulder': ['arm', 'chest'],
    'groin': ['hip', 'abdomen'],
}


def extract_body_parts(notes, patterns=None):
    """Return the body regions mentioned in a free-text injury note.

    Parameters
    ----------
    notes : object
        The note text. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) yield ``None``.
    patterns : mapping of str -> iterable of str, optional
        Body-part label -> regex patterns that signal it. Defaults to the
        notebook-level ``BODY_PART_PATTERNS``. Labels are reported in the
        mapping's iteration order.

    Returns
    -------
    str or None
        Comma-separated labels (no spaces), e.g. ``"finger,knee"``, or
        ``None`` when nothing is detected.

    Notes
    -----
    A label is detected when any of its patterns matches, case-insensitively.
    Every detected label that has an entry in the exclusion rules removes the
    broader labels listed there, even if that label is itself removed by a
    more specific one (e.g. ``acl`` removes ``knee``, and ``knee`` still
    removes ``leg``).
    """
    # Handle missing notes explicitly rather than relying on the strings
    # "none" / "nan" happening not to match any pattern.
    if notes is None or (isinstance(notes, float) and notes != notes):
        return None

    if patterns is None:
        patterns = BODY_PART_PATTERNS

    text = str(notes)

    # One pass over the table; each label is recorded at most once, so no
    # separate de-duplication step is needed. re.IGNORECASE alone covers case
    # folding (several patterns such as ACL/TMJ are written in upper case).
    detected_parts = [
        body_part
        for body_part, part_patterns in patterns.items()
        if any(re.search(pattern, text, re.IGNORECASE) for pattern in part_patterns)
    ]

    # Collect every broader label made redundant by a detected specific one.
    excluded = {
        general_part
        for part in detected_parts
        for general_part in _EXCLUSION_RULES.get(part, ())
    }

    remaining_parts = [part for part in detected_parts if part not in excluded]
    return ','.join(remaining_parts) if remaining_parts else None

In [None]:
# extract body regions
# Each Notes value is scanned on its own; rows where nothing is detected get None.
df['body_region'] = df['Notes'].apply(extract_body_parts)

# save CSV
# Written to a different path than the input CSV, so the input file is left untouched.
OUTPUT_CSV_PATH = "./data/Body_Region_IL_movement.csv"
df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"Saved new CSV to {OUTPUT_CSV_PATH}")
# Report the path that was actually written; the previous message named
# "Body_IL_movement.csv", which is not the file this cell produces.
print(f"Columns inside {OUTPUT_CSV_PATH}: {df.columns.tolist()}")

In [None]:
# Update the database
#
# The body_region column of injury_list is updated in place. The table is NOT
# dropped and recreated (DataFrame.to_sql(if_exists='replace') would discard its
# declared column types, keys and indexes, and would rewrite injury_id as text).
DB_PATH = "../BALL.db"

# One (injury_id, body_region) pair per injury. Ids are compared as strings so
# that an integer id in one source still matches a text id in the other.
body_region_df = df[['injury_id', 'body_region']].copy()
body_region_df['injury_id'] = body_region_df['injury_id'].astype(str)

# Duplicate ids would make the update ambiguous, so only the first occurrence is kept.
# NOTE(review): keeping the first occurrence is an assumption — confirm which row should win.
duplicate_id_count = int(body_region_df['injury_id'].duplicated().sum())
if duplicate_id_count:
    print(f"Ignoring {duplicate_id_count} duplicate injury_id rows (keeping the first occurrence)")
    body_region_df = body_region_df.drop_duplicates(subset='injury_id', keep='first')

# Plain Python tuples for sqlite3; any non-string region (None / NaN) is stored as NULL.
region_rows = [
    (injury_id, region if isinstance(region, str) else None)
    for injury_id, region in body_region_df.itertuples(index=False, name=None)
]

conn = sqlite3.connect(DB_PATH)
try:
    # `with conn` commits on success and rolls back on error; it does not close
    # the connection, hence the surrounding try/finally.
    with conn:
        existing_columns = [row[1] for row in conn.execute("PRAGMA table_info(injury_list)")]
        if 'body_region' in existing_columns:
            print("Overwriting existing 'body_region' values")
        else:
            conn.execute("ALTER TABLE injury_list ADD COLUMN body_region TEXT")

        # Stage the new values in a temp table (gone when the connection closes).
        # Its primary key gives the UPDATE below an indexed lookup per row.
        conn.execute(
            "CREATE TEMP TABLE new_body_regions ("
            "injury_id TEXT PRIMARY KEY, body_region TEXT)"
        )
        conn.executemany(
            "INSERT INTO new_body_regions (injury_id, body_region) VALUES (?, ?)",
            region_rows,
        )

        # Rows with no matching id get NULL, mirroring a left join against the new values.
        conn.execute(
            "UPDATE injury_list "
            "SET body_region = ("
            "    SELECT n.body_region FROM temp.new_body_regions AS n "
            "    WHERE n.injury_id = CAST(injury_list.injury_id AS TEXT)"
            ")"
        )

    # Re-read the table so the summary below reflects what is actually stored.
    existing_df = pd.read_sql("SELECT * FROM injury_list", conn)
finally:
    conn.close()

# print some of the summary statistics
print("\nUpdated database with body_region values")
print(f"Rows with body_region: {existing_df['body_region'].notna().sum()}")
print(f"Rows without body_region: {existing_df['body_region'].isna().sum()}")

# Show body_region distribution
# A note with several regions ("a,b") counts once towards each of them.
print("\nBody region distribution:")
part_counts = Counter(
    part
    for parts in df['body_region'].dropna()
    for part in parts.split(',')
)
for part, count in part_counts.most_common():
    print(f"  {part}: {count}")

print("\nDatabase update completed")