# Batch 3 DEXA Data Cleaning

**Objective**: Extract and organize DEXA scan data from Batch 3 text files

**Data Source**: /Sample Data/DEXA Scans/Batch 3

**Output**: Organized Excel file with all measurements by subject, gender, and timepoint

**Process**: Scan → Parse → Organize → Export

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
import re

# Data paths
batch3_path = Path("/Users/aviado/Documents/GDG WashU Medicine/Sample Data/DEXA Scans/Batch 3")
output_dir = Path("../../../cleaned_output")
output_dir.mkdir(exist_ok=True)

## Data Scanning

In [2]:
# Scan all txt files in Batch 3
def scan_batch3_files():
    txt_files = []
    
    # Define timepoint directories for Batch 3
    timepoints = {
        'Pre-Scan': 'Pre_Scan',
        '1 week post-treatment': 'Week_1',
        '2 week post-treatment': 'Week_2', 
        '3 week post-treatment': 'Week_3',
        'Post-scan': 'Post_Scan'
    }
    
    # Scan each timepoint directory
    for timepoint_dir, timepoint_name in timepoints.items():
        timepoint_path = batch3_path / timepoint_dir
        if timepoint_path.exists():
            # Check Male and Female subdirectories
            for gender in ['Male', 'Female']:
                gender_path = timepoint_path / gender
                if gender_path.exists():
                    # Get all txt files
                    for txt_file in gender_path.glob('*.txt'):
                        txt_files.append({
                            'file_path': txt_file,
                            'timepoint': timepoint_name,
                            'gender': gender,
                            'filename': txt_file.name
                        })
    
    # Also check root directory for any txt files
    for txt_file in batch3_path.glob('*.txt'):
        txt_files.append({
            'file_path': txt_file,
            'timepoint': 'Root',
            'gender': 'Unknown',
            'filename': txt_file.name
        })
    
    return txt_files

# Scan files
batch3_files = scan_batch3_files()
print(f"Found {len(batch3_files)} txt files in Batch 3")

# Show sample of found files
for i, file_info in enumerate(batch3_files[:5]):
    print(f"{i+1}. {file_info['timepoint']} - {file_info['gender']} - {file_info['filename']}")
if len(batch3_files) > 5:
    print(f"... and {len(batch3_files) - 5} more files")

Found 48 txt files in Batch 3
1. Pre_Scan - Male - B3_M_0.txt
2. Pre_Scan - Male - B3_M_1.txt
3. Pre_Scan - Male - B3_M_3.txt
4. Pre_Scan - Male - B3_M_2.txt
5. Pre_Scan - Male - B3_M_4.txt
... and 43 more files


## Data Parsing

In [3]:
# Parse DEXA txt file content
def parse_dexa_txt(file_path):
    """Extract measurements from DEXA txt file"""
    try:
        with open(file_path, 'r') as f:
            content = f.read()
        
        # Extract subject ID from filename (adapt for Batch 3 naming)
        filename = file_path.name
        subject_match = re.search(r'(B3_[MF]_\d+)', filename)
        if not subject_match:
            # Try alternative patterns for Batch 3
            subject_match = re.search(r'([MF]\d+)', filename)
        subject_id = subject_match.group(1) if subject_match else filename.replace('.txt', '')
        
        # Extract measurements using regex patterns
        measurements = {'subject_id': subject_id}
        
        # Find WHOLE TISSUE STATISTICS section (more comprehensive data)
        whole_section = re.search(r'WHOLE TISSUE STATISTICS:(.*?)(?=\n\s*-|$)', content, re.DOTALL)
        if whole_section:
            section_text = whole_section.group(1)
        else:
            # Fallback to INSIDE ROI if WHOLE not found
            section_text = content
        
        # Define patterns for key measurements
        patterns = {
            'sample_area': r'Sample Area:\s*([\d.]+)\s*cm',
            'bone_area': r'Bone Area:\s*([\d.]+)\s*cm',
            'total_weight': r'Total Weight:\s*([\d.]+)\s*g',
            'soft_weight': r'Soft Weight:\s*([\d.]+)\s*g',
            'lean_weight': r'Lean Weight:\s*([\d.]+)\s*g',
            'fat_weight': r'Fat Weight:\s*([\d.]+)\s*g',
            'fat_percent': r'Fat Percent:\s*([\d.]+)',
            'bmc': r'BMC:\s*([\d.]+)\s*g',
            'bmd': r'BMD:\s*([\d.]+)\s*mg/cm'
        }
        
        # Extract each measurement
        for key, pattern in patterns.items():
            match = re.search(pattern, section_text)
            if match:
                measurements[key] = float(match.group(1))
            else:
                measurements[key] = None
        
        return measurements
        
    except Exception as e:
        print(f"Error parsing {file_path}: {e}")
        return None

# Test parsing with first file
if batch3_files:
    sample_file = batch3_files[0]['file_path']
    sample_data = parse_dexa_txt(sample_file)
    print(f"Sample parsing result:")
    print(f"File: {sample_file.name}")
    for key, value in sample_data.items():
        print(f"  {key}: {value}")

Sample parsing result:
File: B3_M_0.txt
  subject_id: B3_M_0
  sample_area: 35.091
  bone_area: 10.221
  total_weight: 35.5679
  soft_weight: 34.7521
  lean_weight: 23.7603
  fat_weight: 10.9918
  fat_percent: 31.629
  bmc: 0.81583
  bmd: 79.823


In [4]:
# Process all Batch 3 files
def process_all_batch3_files():
    all_data = []
    
    for file_info in batch3_files:
        # Parse the txt file
        measurements = parse_dexa_txt(file_info['file_path'])
        
        if measurements:
            # Add metadata
            measurements.update({
                'batch': 'Batch_3',
                'timepoint': file_info['timepoint'], 
                'gender': file_info['gender'],
                'filename': file_info['filename']
            })
            all_data.append(measurements)
    
    return pd.DataFrame(all_data)

# Process all files
batch3_df = process_all_batch3_files()

print(f"Processed {len(batch3_df)} files")
print(f"Columns: {list(batch3_df.columns)}")
print(f"Shape: {batch3_df.shape}")

# Show sample data
batch3_df.head()

Processed 48 files
Columns: ['subject_id', 'sample_area', 'bone_area', 'total_weight', 'soft_weight', 'lean_weight', 'fat_weight', 'fat_percent', 'bmc', 'bmd', 'batch', 'timepoint', 'gender', 'filename']
Shape: (48, 14)


Unnamed: 0,subject_id,sample_area,bone_area,total_weight,soft_weight,lean_weight,fat_weight,fat_percent,bmc,bmd,batch,timepoint,gender,filename
0,B3_M_0,35.091,10.221,35.5679,34.7521,23.7603,10.9918,31.629,0.81583,79.823,Batch_3,Pre_Scan,Male,B3_M_0.txt
1,B3_M_1,37.581,11.712,37.4815,36.3929,24.4471,11.9458,32.824,1.08861,92.95,Batch_3,Pre_Scan,Male,B3_M_1.txt
2,B3_M_3,34.005,10.678,35.2593,34.2374,21.9613,12.276,35.856,1.0219,95.698,Batch_3,Pre_Scan,Male,B3_M_3.txt
3,B3_M_2,35.085,11.21,37.12,36.1086,22.4425,13.6662,37.847,1.01141,90.226,Batch_3,Pre_Scan,Male,B3_M_2.txt
4,B3_M_4,34.495,11.583,35.7049,34.6436,23.5031,11.1406,32.158,1.06125,91.62,Batch_3,Pre_Scan,Male,B3_M_4.txt


## Data Analysis

In [5]:
# Analyze the organized data
print("Batch 3 Data Summary:")
print(f"Total subjects: {batch3_df['subject_id'].nunique()}")
print(f"Timepoints: {batch3_df['timepoint'].unique()}")
print(f"Gender distribution: {batch3_df['gender'].value_counts().to_dict()}")

# Check for missing values
missing_data = batch3_df.isnull().sum()
numeric_cols = batch3_df.select_dtypes(include=[np.number]).columns
print(f"\nMissing values in key measurements:")
for col in numeric_cols:
    if missing_data[col] > 0:
        print(f"  {col}: {missing_data[col]} missing")

# Subject tracking across timepoints
subject_timepoints = batch3_df.groupby('subject_id')['timepoint'].nunique().sort_values(ascending=False)
print(f"\nSubject longitudinal tracking:")
print(f"Subjects with multiple timepoints: {len(subject_timepoints[subject_timepoints > 1])}")
print(f"Most tracked subject has {subject_timepoints.iloc[0]} timepoints")

# Show distribution by timepoint and gender
timepoint_gender = batch3_df.groupby(['timepoint', 'gender']).size().unstack(fill_value=0)
print(f"\nScans by timepoint and gender:")
print(timepoint_gender)

Batch 3 Data Summary:
Total subjects: 10
Timepoints: ['Pre_Scan' 'Week_1' 'Week_2' 'Week_3' 'Post_Scan']
Gender distribution: {'Male': 25, 'Female': 23}

Missing values in key measurements:

Subject longitudinal tracking:
Subjects with multiple timepoints: 10
Most tracked subject has 5 timepoints

Scans by timepoint and gender:
gender     Female  Male
timepoint              
Post_Scan       4     4
Pre_Scan        5     5
Week_1          5     5
Week_2          5     6
Week_3          4     5


## Data Cleaning

In [6]:
# Clean the Batch 3 dataset
def clean_batch3_data(df):
    df_clean = df.copy()
    
    # Fill missing numeric values with median (more appropriate for DEXA measurements)
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df_clean[col].isnull().sum() > 0:
            if df_clean[col].notna().sum() > 0:
                median_val = df_clean[col].median()
                df_clean[col] = df_clean[col].fillna(median_val)
            else:
                df_clean[col] = df_clean[col].fillna(0)
    
    # Fill missing categorical values
    categorical_cols = df_clean.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if df_clean[col].isnull().sum() > 0:
            df_clean[col] = df_clean[col].fillna("Unknown")
    
    # Organize columns in logical order
    column_order = [
        'batch', 'subject_id', 'timepoint', 'gender', 'filename',
        'total_weight', 'soft_weight', 'lean_weight', 'fat_weight', 'fat_percent',
        'bmc', 'bmd', 'bone_area', 'sample_area'
    ]
    
    # Reorder columns (keep any extra columns at the end)
    available_cols = [col for col in column_order if col in df_clean.columns]
    extra_cols = [col for col in df_clean.columns if col not in column_order]
    df_clean = df_clean[available_cols + extra_cols]
    
    return df_clean

# Clean the data
batch3_cleaned = clean_batch3_data(batch3_df)

print(f"Data cleaning complete")
print(f"Missing values remaining: {batch3_cleaned.isnull().sum().sum()}")
print(f"Final shape: {batch3_cleaned.shape}")

# Show cleaned data sample
batch3_cleaned.head()

Data cleaning complete
Missing values remaining: 0
Final shape: (48, 14)


Unnamed: 0,batch,subject_id,timepoint,gender,filename,total_weight,soft_weight,lean_weight,fat_weight,fat_percent,bmc,bmd,bone_area,sample_area
0,Batch_3,B3_M_0,Pre_Scan,Male,B3_M_0.txt,35.5679,34.7521,23.7603,10.9918,31.629,0.81583,79.823,10.221,35.091
1,Batch_3,B3_M_1,Pre_Scan,Male,B3_M_1.txt,37.4815,36.3929,24.4471,11.9458,32.824,1.08861,92.95,11.712,37.581
2,Batch_3,B3_M_3,Pre_Scan,Male,B3_M_3.txt,35.2593,34.2374,21.9613,12.276,35.856,1.0219,95.698,10.678,34.005
3,Batch_3,B3_M_2,Pre_Scan,Male,B3_M_2.txt,37.12,36.1086,22.4425,13.6662,37.847,1.01141,90.226,11.21,35.085
4,Batch_3,B3_M_4,Pre_Scan,Male,B3_M_4.txt,35.7049,34.6436,23.5031,11.1406,32.158,1.06125,91.62,11.583,34.495


## Export Results

In [7]:
# Export to Excel with multiple sheets
excel_output_path = output_dir / "batch3_dexa_cleaned.xlsx"

with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
    # Main data sheet
    batch3_cleaned.to_excel(writer, sheet_name='Batch3_All_Data', index=False)
    
    # Summary by timepoint
    timepoint_summary = batch3_cleaned.groupby(['timepoint', 'gender']).agg({
        'subject_id': 'nunique',
        'total_weight': 'mean',
        'fat_percent': 'mean',
        'bmd': 'mean',
        'lean_weight': 'mean'
    }).round(3)
    timepoint_summary.to_excel(writer, sheet_name='Timepoint_Summary')
    
    # Subject tracking sheet
    subject_summary = batch3_cleaned.groupby('subject_id').agg({
        'timepoint': 'nunique',
        'gender': 'first',
        'total_weight': ['min', 'max', 'mean'],
        'fat_percent': ['min', 'max', 'mean']
    }).round(3)
    subject_summary.columns = ['_'.join(col).strip() for col in subject_summary.columns]
    subject_summary.to_excel(writer, sheet_name='Subject_Summary')

print(f"Excel file exported: {excel_output_path}")
print(f"Sheets created: Batch3_All_Data, Timepoint_Summary, Subject_Summary")

# Also save as CSV for backup
csv_output_path = output_dir / "batch3_dexa_cleaned.csv"
batch3_cleaned.to_csv(csv_output_path, index=False)
print(f"CSV backup saved: {csv_output_path}")

print(f"\nFinal Results Summary:")
print(f"- Total records: {len(batch3_cleaned)}")
print(f"- Unique subjects: {batch3_cleaned['subject_id'].nunique()}")
print(f"- Timepoints: {list(batch3_cleaned['timepoint'].unique())}")
print(f"- Gender distribution: {batch3_cleaned['gender'].value_counts().to_dict()}")
print(f"- Key measurements: total_weight, fat_percent, bmd, lean_weight, fat_weight")

Excel file exported: ../../../cleaned_output/batch3_dexa_cleaned.xlsx
Sheets created: Batch3_All_Data, Timepoint_Summary, Subject_Summary
CSV backup saved: ../../../cleaned_output/batch3_dexa_cleaned.csv

Final Results Summary:
- Total records: 48
- Unique subjects: 10
- Timepoints: ['Pre_Scan', 'Week_1', 'Week_2', 'Week_3', 'Post_Scan']
- Gender distribution: {'Male': 25, 'Female': 23}
- Key measurements: total_weight, fat_percent, bmd, lean_weight, fat_weight
