# Batch 1 DEXA Data Cleaning

**Objective**: Extract and organize DEXA scan data from Batch 1 text files

**Data Source**: /Sample Data/DEXA Scans/Batch 1

**Output**: Organized Excel file with all measurements by subject, gender, and timepoint

**Process**: Scan → Parse → Organize → Export

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
import re

# Data paths
batch1_path = Path("/Users/aviado/Documents/GDG WashU Medicine/Sample Data/DEXA Scans/Batch 1")
output_dir = Path("../../../cleaned_output")
output_dir.mkdir(exist_ok=True)

## Data Scanning

In [2]:
# Scan all txt files in Batch 1
def scan_batch1_files():
    txt_files = []
    
    # Define timepoint directories
    timepoints = {
        'Week 0 DEXA': 'Week_0',
        'Week 1 DEXA': 'Week_1', 
        'Week 2 DEXA': 'Week_2',
        'Week 3 DEXA (Named Week 4)': 'Week_3',
        'Post-Scan': 'Post_Scan'
    }
    
    # Scan each timepoint directory
    for timepoint_dir, timepoint_name in timepoints.items():
        timepoint_path = batch1_path / timepoint_dir
        if timepoint_path.exists():
            # Check Male and Female subdirectories
            for gender in ['Male', 'Female']:
                gender_path = timepoint_path / gender
                if gender_path.exists():
                    # Get all txt files
                    for txt_file in gender_path.glob('*.txt'):
                        txt_files.append({
                            'file_path': txt_file,
                            'timepoint': timepoint_name,
                            'gender': gender,
                            'filename': txt_file.name
                        })
    
    # Also check root directory for any txt files
    for txt_file in batch1_path.glob('*.txt'):
        txt_files.append({
            'file_path': txt_file,
            'timepoint': 'Root',
            'gender': 'Unknown',
            'filename': txt_file.name
        })
    
    return txt_files

# Scan files
batch1_files = scan_batch1_files()
print(f"Found {len(batch1_files)} txt files in Batch 1")

# Show sample of found files
for i, file_info in enumerate(batch1_files[:5]):
    print(f"{i+1}. {file_info['timepoint']} - {file_info['gender']} - {file_info['filename']}")
if len(batch1_files) > 5:
    print(f"... and {len(batch1_files) - 5} more files")

Found 51 txt files in Batch 1
1. Week_0 - Male - B1_M_1 Week -1.txt
2. Week_0 - Male - B1_M_4 Week -1.txt
3. Week_0 - Male - B1_M_2 Week -1.txt
4. Week_0 - Male - B1_M_0 Week -1.txt
5. Week_0 - Male - B1_M_3 Week -1.txt
... and 46 more files


## Data Parsing

In [3]:
# Parse DEXA txt file content
def parse_dexa_txt(file_path):
    """Extract measurements from DEXA txt file"""
    try:
        with open(file_path, 'r') as f:
            content = f.read()
        
        # Extract subject ID from filename
        filename = file_path.name
        subject_match = re.search(r'(B1_[MF]_\d+)', filename)
        subject_id = subject_match.group(1) if subject_match else filename.replace('.txt', '')
        
        # Extract measurements using regex patterns
        measurements = {'subject_id': subject_id}
        
        # Find WHOLE TISSUE STATISTICS section (more comprehensive data)
        whole_section = re.search(r'WHOLE TISSUE STATISTICS:(.*?)(?=\n\s*-|$)', content, re.DOTALL)
        if whole_section:
            section_text = whole_section.group(1)
        else:
            # Fallback to INSIDE ROI if WHOLE not found
            section_text = content
        
        # Define patterns for key measurements
        patterns = {
            'sample_area': r'Sample Area:\s*([\d.]+)\s*cm',
            'bone_area': r'Bone Area:\s*([\d.]+)\s*cm',
            'total_weight': r'Total Weight:\s*([\d.]+)\s*g',
            'soft_weight': r'Soft Weight:\s*([\d.]+)\s*g',
            'lean_weight': r'Lean Weight:\s*([\d.]+)\s*g',
            'fat_weight': r'Fat Weight:\s*([\d.]+)\s*g',
            'fat_percent': r'Fat Percent:\s*([\d.]+)',
            'bmc': r'BMC:\s*([\d.]+)\s*g',
            'bmd': r'BMD:\s*([\d.]+)\s*mg/cm'
        }
        
        # Extract each measurement
        for key, pattern in patterns.items():
            match = re.search(pattern, section_text)
            if match:
                measurements[key] = float(match.group(1))
            else:
                measurements[key] = None
        
        return measurements
        
    except Exception as e:
        print(f"Error parsing {file_path}: {e}")
        return None

# Test parsing with first file
if batch1_files:
    sample_file = batch1_files[0]['file_path']
    sample_data = parse_dexa_txt(sample_file)
    print(f"Sample parsing result:")
    print(f"File: {sample_file.name}")
    for key, value in sample_data.items():
        print(f"  {key}: {value}")

Sample parsing result:
File: B1_M_1 Week -1.txt
  subject_id: B1_M_1
  sample_area: 34.235
  bone_area: 9.621
  total_weight: 33.6111
  soft_weight: 32.8408
  lean_weight: 24.3109
  fat_weight: 8.5299
  fat_percent: 25.974
  bmc: 0.77028
  bmd: 80.063


In [4]:
# Process all Batch 1 files
def process_all_batch1_files():
    all_data = []
    
    for file_info in batch1_files:
        # Parse the txt file
        measurements = parse_dexa_txt(file_info['file_path'])
        
        if measurements:
            # Add metadata
            measurements.update({
                'batch': 'Batch_1',
                'timepoint': file_info['timepoint'], 
                'gender': file_info['gender'],
                'filename': file_info['filename']
            })
            all_data.append(measurements)
    
    return pd.DataFrame(all_data)

# Process all files
batch1_df = process_all_batch1_files()

print(f"Processed {len(batch1_df)} files")
print(f"Columns: {list(batch1_df.columns)}")
print(f"Shape: {batch1_df.shape}")

# Show sample data
batch1_df.head()

Processed 51 files
Columns: ['subject_id', 'sample_area', 'bone_area', 'total_weight', 'soft_weight', 'lean_weight', 'fat_weight', 'fat_percent', 'bmc', 'bmd', 'batch', 'timepoint', 'gender', 'filename']
Shape: (51, 14)


Unnamed: 0,subject_id,sample_area,bone_area,total_weight,soft_weight,lean_weight,fat_weight,fat_percent,bmc,bmd,batch,timepoint,gender,filename
0,B1_M_1,34.235,9.621,33.6111,32.8408,24.3109,8.5299,25.974,0.77028,80.063,Batch_1,Week_0,Male,B1_M_1 Week -1.txt
1,B1_M_4,35.829,10.153,36.7926,36.0004,27.309,8.6915,24.143,0.79218,78.021,Batch_1,Week_0,Male,B1_M_4 Week -1.txt
2,B1_M_2,33.848,10.45,34.6375,33.8267,25.9896,7.8371,23.168,0.81076,77.582,Batch_1,Week_0,Male,B1_M_2 Week -1.txt
3,B1_M_0,34.561,10.895,34.4279,33.5655,25.2839,8.2817,24.673,0.86234,79.147,Batch_1,Week_0,Male,B1_M_0 Week -1.txt
4,B1_M_3,34.703,10.176,36.2557,35.4778,28.3093,7.1685,20.205,0.77788,76.444,Batch_1,Week_0,Male,B1_M_3 Week -1.txt


## Data Analysis

In [5]:
# Analyze the organized data
print("Batch 1 Data Summary:")
print(f"Total subjects: {batch1_df['subject_id'].nunique()}")
print(f"Timepoints: {batch1_df['timepoint'].unique()}")
print(f"Gender distribution: {batch1_df['gender'].value_counts().to_dict()}")

# Check for missing values
missing_data = batch1_df.isnull().sum()
numeric_cols = batch1_df.select_dtypes(include=[np.number]).columns
print(f"\nMissing values in key measurements:")
for col in numeric_cols:
    if missing_data[col] > 0:
        print(f"  {col}: {missing_data[col]} missing")

# Subject tracking across timepoints
subject_timepoints = batch1_df.groupby('subject_id')['timepoint'].nunique().sort_values(ascending=False)
print(f"\nSubject longitudinal tracking:")
print(f"Subjects with multiple timepoints: {len(subject_timepoints[subject_timepoints > 1])}")
print(f"Most tracked subject has {subject_timepoints.iloc[0]} timepoints")

# Show distribution by timepoint and gender
timepoint_gender = batch1_df.groupby(['timepoint', 'gender']).size().unstack(fill_value=0)
print(f"\nScans by timepoint and gender:")
print(timepoint_gender)

Batch 1 Data Summary:
Total subjects: 10
Timepoints: ['Week_0' 'Week_1' 'Week_2' 'Week_3' 'Post_Scan' 'Root']
Gender distribution: {'Male': 25, 'Female': 25, 'Unknown': 1}

Missing values in key measurements:

Subject longitudinal tracking:
Subjects with multiple timepoints: 10
Most tracked subject has 6 timepoints

Scans by timepoint and gender:
gender     Female  Male  Unknown
timepoint                       
Post_Scan       5     5        0
Root            0     0        1
Week_0          5     5        0
Week_1          5     5        0
Week_2          5     5        0
Week_3          5     5        0


## Data Cleaning

In [6]:
# Clean the Batch 1 dataset
def clean_batch1_data(df):
    df_clean = df.copy()
    
    # Fill missing numeric values with median (more appropriate for DEXA measurements)
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df_clean[col].isnull().sum() > 0:
            if df_clean[col].notna().sum() > 0:
                median_val = df_clean[col].median()
                df_clean[col] = df_clean[col].fillna(median_val)
            else:
                df_clean[col] = df_clean[col].fillna(0)
    
    # Fill missing categorical values
    categorical_cols = df_clean.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if df_clean[col].isnull().sum() > 0:
            df_clean[col] = df_clean[col].fillna("Unknown")
    
    # Organize columns in logical order
    column_order = [
        'batch', 'subject_id', 'timepoint', 'gender', 'filename',
        'total_weight', 'soft_weight', 'lean_weight', 'fat_weight', 'fat_percent',
        'bmc', 'bmd', 'bone_area', 'sample_area'
    ]
    
    # Reorder columns (keep any extra columns at the end)
    available_cols = [col for col in column_order if col in df_clean.columns]
    extra_cols = [col for col in df_clean.columns if col not in column_order]
    df_clean = df_clean[available_cols + extra_cols]
    
    return df_clean

# Clean the data
batch1_cleaned = clean_batch1_data(batch1_df)

print(f"Data cleaning complete")
print(f"Missing values remaining: {batch1_cleaned.isnull().sum().sum()}")
print(f"Final shape: {batch1_cleaned.shape}")

# Show cleaned data sample
batch1_cleaned.head()

Data cleaning complete
Missing values remaining: 0
Final shape: (51, 14)


Unnamed: 0,batch,subject_id,timepoint,gender,filename,total_weight,soft_weight,lean_weight,fat_weight,fat_percent,bmc,bmd,bone_area,sample_area
0,Batch_1,B1_M_1,Week_0,Male,B1_M_1 Week -1.txt,33.6111,32.8408,24.3109,8.5299,25.974,0.77028,80.063,9.621,34.235
1,Batch_1,B1_M_4,Week_0,Male,B1_M_4 Week -1.txt,36.7926,36.0004,27.309,8.6915,24.143,0.79218,78.021,10.153,35.829
2,Batch_1,B1_M_2,Week_0,Male,B1_M_2 Week -1.txt,34.6375,33.8267,25.9896,7.8371,23.168,0.81076,77.582,10.45,33.848
3,Batch_1,B1_M_0,Week_0,Male,B1_M_0 Week -1.txt,34.4279,33.5655,25.2839,8.2817,24.673,0.86234,79.147,10.895,34.561
4,Batch_1,B1_M_3,Week_0,Male,B1_M_3 Week -1.txt,36.2557,35.4778,28.3093,7.1685,20.205,0.77788,76.444,10.176,34.703


## Export Results

In [7]:
# Export to Excel with multiple sheets
excel_output_path = output_dir / "batch1_dexa_cleaned.xlsx"

with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
    # Main data sheet
    batch1_cleaned.to_excel(writer, sheet_name='Batch1_All_Data', index=False)
    
    # Summary by timepoint
    timepoint_summary = batch1_cleaned.groupby(['timepoint', 'gender']).agg({
        'subject_id': 'nunique',
        'total_weight': 'mean',
        'fat_percent': 'mean',
        'bmd': 'mean',
        'lean_weight': 'mean'
    }).round(3)
    timepoint_summary.to_excel(writer, sheet_name='Timepoint_Summary')
    
    # Subject tracking sheet
    subject_summary = batch1_cleaned.groupby('subject_id').agg({
        'timepoint': 'nunique',
        'gender': 'first',
        'total_weight': ['min', 'max', 'mean'],
        'fat_percent': ['min', 'max', 'mean']
    }).round(3)
    subject_summary.columns = ['_'.join(col).strip() for col in subject_summary.columns]
    subject_summary.to_excel(writer, sheet_name='Subject_Summary')

print(f"Excel file exported: {excel_output_path}")
print(f"Sheets created: Batch1_All_Data, Timepoint_Summary, Subject_Summary")

# Also save as CSV for backup
csv_output_path = output_dir / "batch1_dexa_cleaned.csv"
batch1_cleaned.to_csv(csv_output_path, index=False)
print(f"CSV backup saved: {csv_output_path}")

print(f"\nFinal Results Summary:")
print(f"- Total records: {len(batch1_cleaned)}")
print(f"- Unique subjects: {batch1_cleaned['subject_id'].nunique()}")
print(f"- Timepoints: {list(batch1_cleaned['timepoint'].unique())}")
print(f"- Gender distribution: {batch1_cleaned['gender'].value_counts().to_dict()}")
print(f"- Key measurements: total_weight, fat_percent, bmd, lean_weight, fat_weight")

Excel file exported: ../../../cleaned_output/batch1_dexa_cleaned.xlsx
Sheets created: Batch1_All_Data, Timepoint_Summary, Subject_Summary
CSV backup saved: ../../../cleaned_output/batch1_dexa_cleaned.csv

Final Results Summary:
- Total records: 51
- Unique subjects: 10
- Timepoints: ['Week_0', 'Week_1', 'Week_2', 'Week_3', 'Post_Scan', 'Root']
- Gender distribution: {'Male': 25, 'Female': 25, 'Unknown': 1}
- Key measurements: total_weight, fat_percent, bmd, lean_weight, fat_weight
