# Batch 4 DEXA Data Cleaning

**Objective**: Extract and organize DEXA scan data from Batch 4 text files

**Data Source**: /Sample Data/DEXA Scans/Batch 4

**Output**: Organized Excel file with all measurements by subject, gender, and timepoint

**Process**: Scan → Parse → Organize → Export

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
import re

# Data paths
batch4_path = Path("/Users/aviado/Documents/GDG WashU Medicine/Sample Data/DEXA Scans/Batch 4")
output_dir = Path("../../../cleaned_output")
output_dir.mkdir(exist_ok=True)

## Data Scanning

In [2]:
# Scan all txt files in Batch 4
def scan_batch4_files():
    txt_files = []
    
    # Define timepoint directories for Batch 4
    timepoints = {
        'Pre-Scan': 'Pre_Scan',
        '1 week post-treatment': 'Week_1',
        '2 weeks post-treatment': 'Week_2', 
        '3 weeks post-treatment': 'Week_3',
        'Post-scan': 'Post_Scan'
    }
    
    # Scan each timepoint directory
    for timepoint_dir, timepoint_name in timepoints.items():
        timepoint_path = batch4_path / timepoint_dir
        if timepoint_path.exists():
            # Check Male and Female subdirectories
            for gender in ['Male', 'Female']:
                gender_path = timepoint_path / gender
                if gender_path.exists():
                    # Get all txt files
                    for txt_file in gender_path.glob('*.txt'):
                        txt_files.append({
                            'file_path': txt_file,
                            'timepoint': timepoint_name,
                            'gender': gender,
                            'filename': txt_file.name
                        })
    
    # Also check root directory for any txt files
    for txt_file in batch4_path.glob('*.txt'):
        txt_files.append({
            'file_path': txt_file,
            'timepoint': 'Root',
            'gender': 'Unknown',
            'filename': txt_file.name
        })
    
    return txt_files

# Scan files
batch4_files = scan_batch4_files()
print(f"Found {len(batch4_files)} txt files in Batch 4")

# Show sample of found files
for i, file_info in enumerate(batch4_files[:5]):
    print(f"{i+1}. {file_info['timepoint']} - {file_info['gender']} - {file_info['filename']}")
if len(batch4_files) > 5:
    print(f"... and {len(batch4_files) - 5} more files")

Found 47 txt files in Batch 4
1. Pre_Scan - Male - B4_M_2.txt
2. Pre_Scan - Male - B4_M_3.txt
3. Pre_Scan - Male - B4_M_1.txt
4. Pre_Scan - Male - B4_M_0.txt
5. Pre_Scan - Male - B4_M_4.txt
... and 42 more files


## Data Parsing

In [3]:
# Parse DEXA txt file content
def parse_dexa_txt(file_path):
    """Extract measurements from DEXA txt file"""
    try:
        with open(file_path, 'r') as f:
            content = f.read()
        
        # Extract subject ID from filename (adapt for Batch 4 naming)
        filename = file_path.name
        subject_match = re.search(r'(B4_[MF]_\d+)', filename)
        if not subject_match:
            # Try alternative patterns for Batch 4
            subject_match = re.search(r'([MF]\d+)', filename)
        subject_id = subject_match.group(1) if subject_match else filename.replace('.txt', '')
        
        # Extract measurements using regex patterns
        measurements = {'subject_id': subject_id}
        
        # Find WHOLE TISSUE STATISTICS section (more comprehensive data)
        whole_section = re.search(r'WHOLE TISSUE STATISTICS:(.*?)(?=\n\s*-|$)', content, re.DOTALL)
        if whole_section:
            section_text = whole_section.group(1)
        else:
            # Fallback to INSIDE ROI if WHOLE not found
            section_text = content
        
        # Define patterns for key measurements
        patterns = {
            'sample_area': r'Sample Area:\s*([\d.]+)\s*cm',
            'bone_area': r'Bone Area:\s*([\d.]+)\s*cm',
            'total_weight': r'Total Weight:\s*([\d.]+)\s*g',
            'soft_weight': r'Soft Weight:\s*([\d.]+)\s*g',
            'lean_weight': r'Lean Weight:\s*([\d.]+)\s*g',
            'fat_weight': r'Fat Weight:\s*([\d.]+)\s*g',
            'fat_percent': r'Fat Percent:\s*([\d.]+)',
            'bmc': r'BMC:\s*([\d.]+)\s*g',
            'bmd': r'BMD:\s*([\d.]+)\s*mg/cm'
        }
        
        # Extract each measurement
        for key, pattern in patterns.items():
            match = re.search(pattern, section_text)
            if match:
                measurements[key] = float(match.group(1))
            else:
                measurements[key] = None
        
        return measurements
        
    except Exception as e:
        print(f"Error parsing {file_path}: {e}")
        return None

# Test parsing with first file
if batch4_files:
    sample_file = batch4_files[0]['file_path']
    sample_data = parse_dexa_txt(sample_file)
    print(f"Sample parsing result:")
    print(f"File: {sample_file.name}")
    for key, value in sample_data.items():
        print(f"  {key}: {value}")

Sample parsing result:
File: B4_M_2.txt
  subject_id: B4_M_2
  sample_area: 31.079
  bone_area: 8.944
  total_weight: 31.8933
  soft_weight: 31.2364
  lean_weight: 23.2806
  fat_weight: 7.9558
  fat_percent: 25.47
  bmc: 0.65686
  bmd: 73.446


In [4]:
# Process all Batch 4 files
def process_all_batch4_files():
    all_data = []
    
    for file_info in batch4_files:
        # Parse the txt file
        measurements = parse_dexa_txt(file_info['file_path'])
        
        if measurements:
            # Add metadata
            measurements.update({
                'batch': 'Batch_4',
                'timepoint': file_info['timepoint'], 
                'gender': file_info['gender'],
                'filename': file_info['filename']
            })
            all_data.append(measurements)
    
    return pd.DataFrame(all_data)

# Process all files
batch4_df = process_all_batch4_files()

print(f"Processed {len(batch4_df)} files")
print(f"Columns: {list(batch4_df.columns)}")
print(f"Shape: {batch4_df.shape}")

# Show sample data
batch4_df.head()

Processed 47 files
Columns: ['subject_id', 'sample_area', 'bone_area', 'total_weight', 'soft_weight', 'lean_weight', 'fat_weight', 'fat_percent', 'bmc', 'bmd', 'batch', 'timepoint', 'gender', 'filename']
Shape: (47, 14)


Unnamed: 0,subject_id,sample_area,bone_area,total_weight,soft_weight,lean_weight,fat_weight,fat_percent,bmc,bmd,batch,timepoint,gender,filename
0,B4_M_2,31.079,8.944,31.8933,31.2364,23.2806,7.9558,25.47,0.65686,73.446,Batch_4,Pre_Scan,Male,B4_M_2.txt
1,B4_M_3,32.921,9.455,34.7664,34.0095,25.1797,8.8298,25.963,0.75694,80.054,Batch_4,Pre_Scan,Male,B4_M_3.txt
2,B4_M_1,33.178,10.401,33.241,32.255,24.6999,7.5551,23.423,0.98604,94.799,Batch_4,Pre_Scan,Male,B4_M_1.txt
3,B4_M_0,32.612,11.143,35.5652,34.4501,25.0691,9.381,27.231,1.11502,100.066,Batch_4,Pre_Scan,Male,B4_M_0.txt
4,B4_M_4,30.297,9.402,31.5731,30.8248,22.4249,8.3999,27.25,0.74839,79.599,Batch_4,Pre_Scan,Male,B4_M_4.txt


## Data Analysis

In [5]:
# Analyze the organized data
print("Batch 4 Data Summary:")
print(f"Total subjects: {batch4_df['subject_id'].nunique()}")
print(f"Timepoints: {batch4_df['timepoint'].unique()}")
print(f"Gender distribution: {batch4_df['gender'].value_counts().to_dict()}")

# Check for missing values
missing_data = batch4_df.isnull().sum()
numeric_cols = batch4_df.select_dtypes(include=[np.number]).columns
print(f"\nMissing values in key measurements:")
for col in numeric_cols:
    if missing_data[col] > 0:
        print(f"  {col}: {missing_data[col]} missing")

# Subject tracking across timepoints
subject_timepoints = batch4_df.groupby('subject_id')['timepoint'].nunique().sort_values(ascending=False)
print(f"\nSubject longitudinal tracking:")
print(f"Subjects with multiple timepoints: {len(subject_timepoints[subject_timepoints > 1])}")
print(f"Most tracked subject has {subject_timepoints.iloc[0]} timepoints")

# Show distribution by timepoint and gender
timepoint_gender = batch4_df.groupby(['timepoint', 'gender']).size().unstack(fill_value=0)
print(f"\nScans by timepoint and gender:")
print(timepoint_gender)

Batch 4 Data Summary:
Total subjects: 10
Timepoints: ['Pre_Scan' 'Week_1' 'Week_2' 'Week_3' 'Post_Scan']
Gender distribution: {'Female': 25, 'Male': 22}

Missing values in key measurements:

Subject longitudinal tracking:
Subjects with multiple timepoints: 10
Most tracked subject has 5 timepoints

Scans by timepoint and gender:
gender     Female  Male
timepoint              
Post_Scan       5     4
Pre_Scan        5     5
Week_1          5     5
Week_2          5     4
Week_3          5     4


## Data Cleaning

In [6]:
# Clean the Batch 4 dataset
def clean_batch4_data(df):
    df_clean = df.copy()
    
    # Fill missing numeric values with median (more appropriate for DEXA measurements)
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df_clean[col].isnull().sum() > 0:
            if df_clean[col].notna().sum() > 0:
                median_val = df_clean[col].median()
                df_clean[col] = df_clean[col].fillna(median_val)
            else:
                df_clean[col] = df_clean[col].fillna(0)
    
    # Fill missing categorical values
    categorical_cols = df_clean.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if df_clean[col].isnull().sum() > 0:
            df_clean[col] = df_clean[col].fillna("Unknown")
    
    # Organize columns in logical order
    column_order = [
        'batch', 'subject_id', 'timepoint', 'gender', 'filename',
        'total_weight', 'soft_weight', 'lean_weight', 'fat_weight', 'fat_percent',
        'bmc', 'bmd', 'bone_area', 'sample_area'
    ]
    
    # Reorder columns (keep any extra columns at the end)
    available_cols = [col for col in column_order if col in df_clean.columns]
    extra_cols = [col for col in df_clean.columns if col not in column_order]
    df_clean = df_clean[available_cols + extra_cols]
    
    return df_clean

# Clean the data
batch4_cleaned = clean_batch4_data(batch4_df)

print(f"Data cleaning complete")
print(f"Missing values remaining: {batch4_cleaned.isnull().sum().sum()}")
print(f"Final shape: {batch4_cleaned.shape}")

# Show cleaned data sample
batch4_cleaned.head()

Data cleaning complete
Missing values remaining: 0
Final shape: (47, 14)


Unnamed: 0,batch,subject_id,timepoint,gender,filename,total_weight,soft_weight,lean_weight,fat_weight,fat_percent,bmc,bmd,bone_area,sample_area
0,Batch_4,B4_M_2,Pre_Scan,Male,B4_M_2.txt,31.8933,31.2364,23.2806,7.9558,25.47,0.65686,73.446,8.944,31.079
1,Batch_4,B4_M_3,Pre_Scan,Male,B4_M_3.txt,34.7664,34.0095,25.1797,8.8298,25.963,0.75694,80.054,9.455,32.921
2,Batch_4,B4_M_1,Pre_Scan,Male,B4_M_1.txt,33.241,32.255,24.6999,7.5551,23.423,0.98604,94.799,10.401,33.178
3,Batch_4,B4_M_0,Pre_Scan,Male,B4_M_0.txt,35.5652,34.4501,25.0691,9.381,27.231,1.11502,100.066,11.143,32.612
4,Batch_4,B4_M_4,Pre_Scan,Male,B4_M_4.txt,31.5731,30.8248,22.4249,8.3999,27.25,0.74839,79.599,9.402,30.297


## Export Results

In [7]:
# Export to Excel with multiple sheets
excel_output_path = output_dir / "batch4_dexa_cleaned.xlsx"

with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
    # Main data sheet
    batch4_cleaned.to_excel(writer, sheet_name='Batch4_All_Data', index=False)
    
    # Summary by timepoint
    timepoint_summary = batch4_cleaned.groupby(['timepoint', 'gender']).agg({
        'subject_id': 'nunique',
        'total_weight': 'mean',
        'fat_percent': 'mean',
        'bmd': 'mean',
        'lean_weight': 'mean'
    }).round(3)
    timepoint_summary.to_excel(writer, sheet_name='Timepoint_Summary')
    
    # Subject tracking sheet
    subject_summary = batch4_cleaned.groupby('subject_id').agg({
        'timepoint': 'nunique',
        'gender': 'first',
        'total_weight': ['min', 'max', 'mean'],
        'fat_percent': ['min', 'max', 'mean']
    }).round(3)
    subject_summary.columns = ['_'.join(col).strip() for col in subject_summary.columns]
    subject_summary.to_excel(writer, sheet_name='Subject_Summary')

print(f"Excel file exported: {excel_output_path}")
print(f"Sheets created: Batch4_All_Data, Timepoint_Summary, Subject_Summary")

# Also save as CSV for backup
csv_output_path = output_dir / "batch4_dexa_cleaned.csv"
batch4_cleaned.to_csv(csv_output_path, index=False)
print(f"CSV backup saved: {csv_output_path}")

print(f"\nFinal Results Summary:")
print(f"- Total records: {len(batch4_cleaned)}")
print(f"- Unique subjects: {batch4_cleaned['subject_id'].nunique()}")
print(f"- Timepoints: {list(batch4_cleaned['timepoint'].unique())}")
print(f"- Gender distribution: {batch4_cleaned['gender'].value_counts().to_dict()}")
print(f"- Key measurements: total_weight, fat_percent, bmd, lean_weight, fat_weight")

Excel file exported: ../../../cleaned_output/batch4_dexa_cleaned.xlsx
Sheets created: Batch4_All_Data, Timepoint_Summary, Subject_Summary
CSV backup saved: ../../../cleaned_output/batch4_dexa_cleaned.csv

Final Results Summary:
- Total records: 47
- Unique subjects: 10
- Timepoints: ['Pre_Scan', 'Week_1', 'Week_2', 'Week_3', 'Post_Scan']
- Gender distribution: {'Female': 25, 'Male': 22}
- Key measurements: total_weight, fat_percent, bmd, lean_weight, fat_weight
