# CSV Data Cleaning - Review SY-08002944

**Objective**: Clean and organize data from Review_SY-08002944_4_3_2025 10_31_21.csv

**Data Source**: Code/DSCwashumed/backend/data/Review_SY-08002944_4_3_2025 10_31_21.csv

**Output**: Cleaned data files in cleadned_data folder

**Process**: Load → Analyze → Clean → Export

In [8]:
# Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Data paths
data_file = Path("Review_SY-08002944_4_3_2025 10_31_21.csv")
output_dir = Path("cleadned_data")
output_dir.mkdir(exist_ok=True)

print(f"Data file: {data_file}")
print(f"Output directory: {output_dir}")
print(f"Data file exists: {data_file.exists()}")

Data file: Review_SY-08002944_4_3_2025 10_31_21.csv
Output directory: cleadned_data
Data file exists: True


## Data Loading and Initial Analysis

In [None]:
# Load the CSV data
def load_csv_data():
    try:
        df = pd.read_csv(data_file)
        print(f"Shape: {df.shape}")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# Load data
raw_df = load_csv_data()

if raw_df is not None:
    raw_df.head()

Data loaded successfully!
Shape: (25, 43)
Columns: ['Sample ID', 'Patient ID', 'Patient', 'WBC (10^3/uL)', 'Neu # (10^3/uL)', 'Lym # (10^3/uL)', 'Mon # (10^3/uL)', 'Eos # (10^3/uL)', 'Bas # (10^3/uL)', 'Neu % (%)', 'Lym % (%)', 'Mon % (%)', 'Eos % (%)', 'Bas % (%)', 'RBC (10^6/uL)', 'HGB (g/dL)', 'HCT (%)', 'MCV (fL)', 'MCH (pg)', 'MCHC (g/dL)', 'RDW-CV (%)', 'PLT (10^3/uL)', 'MPV (fL)', 'Species', 'Sample State', 'Owner Last Name', 'Mode', 'Date', 'Time', 'Gender', 'Age', 'Ref. Group', 'Draw Date', 'Draw Time', 'Delivery Date', 'Delivery Time', 'Veterinarian', 'Operator', 'Comments', 'WBC Message', 'RBC Message', 'PLT Message', 'Unnamed: 42']

First few rows:


Unnamed: 0,Sample ID,Patient ID,Patient,WBC (10^3/uL),Neu # (10^3/uL),Lym # (10^3/uL),Mon # (10^3/uL),Eos # (10^3/uL),Bas # (10^3/uL),Neu % (%),...,Draw Time,Delivery Date,Delivery Time,Veterinarian,Operator,Comments,WBC Message,RBC Message,PLT Message,Unnamed: 42
0,5410,,,H 13.42,2.50,H 9.81,0.66,0.34,0.11,18.7,...,,,,,Schuettpelz,,Leukocytosis,,,
1,5409,,,H 11.14,1.41,H 8.99,0.50,0.18,0.06,12.7,...,,,,,Schuettpelz,,,,,
2,5408,,,H 12.75,2.46,H 9.14,0.84,0.22,0.09,19.3,...,,,,,Schuettpelz,,Leukocytosis,,,
3,5407,,,H 13.50,3.31,H 9.56,0.41,0.13,0.09,24.5,...,,,,,Schuettpelz,,Leukocytosis,,,
4,5406,,,H 20.72,H 4.26,H 14.39,H 1.60,0.36,0.11,20.6,...,,,,,Schuettpelz,,Leukocytosis\nLymphocytosis,,,



Data types:
Sample ID           object
Patient ID         float64
Patient            float64
WBC (10^3/uL)       object
Neu # (10^3/uL)     object
Lym # (10^3/uL)     object
Mon # (10^3/uL)     object
Eos # (10^3/uL)     object
Bas # (10^3/uL)     object
Neu % (%)           object
Lym % (%)           object
Mon % (%)           object
Eos % (%)           object
Bas % (%)           object
RBC (10^6/uL)       object
HGB (g/dL)          object
HCT (%)             object
MCV (fL)            object
MCH (pg)            object
MCHC (g/dL)         object
RDW-CV (%)          object
PLT (10^3/uL)       object
MPV (fL)            object
Species             object
Sample State        object
Owner Last Name    float64
Mode                object
Date                object
Time                object
Gender             float64
Age                float64
Ref. Group          object
Draw Date          float64
Draw Time          float64
Delivery Date      float64
Delivery Time      float64
Veterinarian   

## Data Quality Assessment

In [None]:
# Analyze data quality
def analyze_data_quality(df):
    missing_counts = df.isnull().sum()
    missing_percent = (missing_counts / len(df)) * 100
    duplicates = df.duplicated().sum()
    
    print(f"Missing values by column:")
    for col in df.columns:
        if missing_counts[col] > 0:
            print(f"  {col}: {missing_counts[col]} ({missing_percent[col]:.1f}%)")
    
    print(f"Duplicate rows: {duplicates}")
    
    return missing_counts, missing_percent

if raw_df is not None:
    missing_counts, missing_percent = analyze_data_quality(raw_df)

=== DATA QUALITY ASSESSMENT ===

Missing Values:
  Patient ID: 25 (100.0%)
  Patient: 25 (100.0%)
  Owner Last Name: 25 (100.0%)
  Gender: 25 (100.0%)
  Age: 25 (100.0%)
  Draw Date: 25 (100.0%)
  Draw Time: 25 (100.0%)
  Delivery Date: 25 (100.0%)
  Delivery Time: 25 (100.0%)
  Veterinarian: 25 (100.0%)
  Comments: 25 (100.0%)
  WBC Message: 10 (40.0%)
  RBC Message: 20 (80.0%)
  PLT Message: 21 (84.0%)
  Unnamed: 42: 25 (100.0%)

Duplicate rows: 0

Column Analysis:
  Sample ID: 24 unique values, dtype: object
  Patient ID: 0 unique values, dtype: float64
  Patient: 0 unique values, dtype: float64
  WBC (10^3/uL): 25 unique values, dtype: object
  Neu # (10^3/uL): 25 unique values, dtype: object
  Lym # (10^3/uL): 24 unique values, dtype: object
  Mon # (10^3/uL): 22 unique values, dtype: object
  Eos # (10^3/uL): 18 unique values, dtype: object
    Values: [' 0.34 ', ' 0.18 ', ' 0.22 ', ' 0.13 ', ' 0.36 ', ' 0.00 ', ' 0.30 ', ' 0.12 ', ' 0.21 ', ' 0.24 ', ' 0.06 ', ' 0.11 ', 'H 0.72 

## Data Cleaning

In [11]:
# Clean the data
def clean_csv_data(df):
    df_clean = df.copy()
    
    # Remove duplicate rows
    df_clean = df_clean.drop_duplicates()
    
    # Clean column names
    df_clean.columns = df_clean.columns.str.strip().str.replace(' ', '_').str.replace('[^a-zA-Z0-9_]', '', regex=True)
    
    # Handle missing values for lab data appropriately
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    categorical_cols = df_clean.select_dtypes(include=['object']).columns
    
    # For numeric lab values, use median imputation
    for col in numeric_cols:
        if df_clean[col].isnull().sum() > 0:
            if df_clean[col].notna().sum() > 0:
                median_val = df_clean[col].median()
                df_clean[col] = df_clean[col].fillna(median_val)
            else:
                df_clean[col] = df_clean[col].fillna(0)
    
    # For categorical values, use "Unknown"
    for col in categorical_cols:
        if df_clean[col].isnull().sum() > 0:
            df_clean[col] = df_clean[col].fillna("Unknown")
    
    # Convert data types where appropriate
    for col in df_clean.columns:
        if df_clean[col].dtype == 'object':
            try:
                numeric_series = pd.to_numeric(df_clean[col], errors='coerce')
                if not numeric_series.isnull().all():
                    df_clean[col] = numeric_series
            except:
                pass
    
    return df_clean

if raw_df is not None:
    cleaned_df = clean_csv_data(raw_df)
    cleaned_df.head()

## Data Validation

In [12]:
# Validate cleaned data
def validate_cleaned_data(df):
    missing_values = df.isnull().sum().sum()
    duplicate_rows = df.duplicated().sum()
    
    print(f"Missing values: {missing_values}")
    print(f"Duplicate rows: {duplicate_rows}")
    print(f"Shape: {df.shape}")
    
    return df.describe()

if 'cleaned_df' in locals() and cleaned_df is not None:
    summary_stats = validate_cleaned_data(cleaned_df)
    summary_stats

Missing values: 119
Duplicate rows: 0
Shape: (25, 43)


## Export Cleaned Data

In [13]:
# Export cleaned data to cleadned_data folder
def export_cleaned_data(df):
    base_name = "Review_SY-08002944_cleaned"
    csv_output = output_dir / f"{base_name}.csv"
    excel_output = output_dir / f"{base_name}.xlsx"
    
    # Export to CSV
    df.to_csv(csv_output, index=False)
    
    # Export to Excel with multiple sheets
    with pd.ExcelWriter(excel_output, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Cleaned_Data', index=False)
        
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            summary_stats = df[numeric_cols].describe()
            summary_stats.to_excel(writer, sheet_name='Summary_Statistics')
        
        info_data = {
            'Column': df.columns,
            'Data_Type': [str(dtype) for dtype in df.dtypes],
            'Non_Null_Count': [df[col].count() for col in df.columns],
            'Null_Count': [df[col].isnull().sum() for col in df.columns],
            'Unique_Values': [df[col].nunique() for col in df.columns]
        }
        info_df = pd.DataFrame(info_data)
        info_df.to_excel(writer, sheet_name='Data_Info', index=False)
    
    print(f"Files exported to {output_dir}")
    return csv_output, excel_output

if 'cleaned_df' in locals() and cleaned_df is not None:
    csv_path, excel_path = export_cleaned_data(cleaned_df)

Files exported to cleadned_data
