# DISCLAIMER AND LIABILITY

This website provides clinical test data for informational purposes and the convenience of the public. CDISC does not control or guarantee the currency, accuracy, relevance, or completeness of the data. The data has been analyzed, cleansed, and aggregated where appropriate to facilitate use and discussion in research. By downloading / using this data, you agree to the Terms of use.

## Terms of Use

You shall not (and will not allow or assist any third party to), under any circumstances: (a) mislead, confuse, or cause misapprehension or confusion among users of the data as to the features, functionality, origin, capabilities, or other aspects of the data; (b) disassemble, reverse engineer, decompile, modify, or alter any part of the Data; (c) use the Data in any manner or for any purpose that violates any law or regulation; (d) use the Data in order to compete with CDISC; (e) sublicense or distribute the data for a fee; or (f) sublicense or distribute the data without an attribution back to CDISC.

UNLESS REQUIRED BY APPLICABLE LAW, ACCESS AND USE OF THE DATA IS PROVIDED BY CDISC AND ITS CONSTITUENT PARTS (INCLUDING, BUT NOT LIMITED TO THE CDISC BOARD OF DIRECTORS, CDISC EMPLOYEES, AND CDISC MEMBERS, PARTICIPANTS, CONTRACTORS, AND REPRESENTATIVES) "AS IS" AND WITHOUT ANY WARRANTIES WHATSOEVER, WHETHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, AND CDISC AND ITS CONSTITUENT PARTS (INCLUDING, BUT NOT LIMITED TO THE CDISC BOARD OF DIRECTORS, CDISC EMPLOYEES, AND CDISC MEMBERS, PARTICIPANTS, CONTRACTORS, AND REPRESENTATIVES) EXPRESSLY DISCLAIM ANY WARRANTY OF  MERCHANTABILITY, TITLE, NONINFRINGEMENT, FITNESS FOR A PARTICULAR OR INTENDED PURPOSE, OR ANY OTHER WARRANTY OTHERWISE ARISING OUT OF THIS LETTER AGREEMENT, INCLUDING ACCESS OR USE OF THE DATA. You are solely responsible for determining the appropriateness of accessing and/or using the data and assume any risks associated with your access and/or use.

IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, UNLESS REQUIRED BY APPLICABLE LAW (SUCH AS DELIBERATE AND GROSSLY NEGLIGENT ACTS) OR AGREED TO IN WRITING, SHALL CDISC, ANY OF CDISC’S CONSTITUENT PARTS (INCLUDING, BUT NOT LIMITED TO THE CDISC BOARD OF DIRECTORS, THE CDISC EMPLOYEES, OR CDISC MEMBERS, PARTICIPANTS, CONTRACTORS, OR REPRESENTATIVES) BE LIABLE FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, EXEMPLARY, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER ARISING IN ANY WAY AS A RESULT OF THIS LETTER AGREEMENT OR OUT OF THE USE OR INABILITY TO USE THE DATA (INCLUDING DAMAGES FOR LOSS OF GOODWILL, LOSS OF PROFITS, LOSS OF USE, OR BUSINESS INTERRUPTION), EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.  THIS LIMITATION OF DAMAGES AND CLAIMS IS INTENDED TO APPLY TO ALL CLAIMS WITHOUT REGARD TO WHICH OTHER PROVISIONS OF THIS LETTER AGREEMENT HAVE BEEN BREACHED OR PROVEN INEFFECTIVE.

---

# CDISC Oncology ETL: Transformations for DM, AE, and LB Datasets

### Imports

In [1]:
import pandas as pd
import pyreadstat
import os

### Load raw DM.xpt

In [2]:
def load_dm_data(data_path=None):
    """Load the raw DM dataset from ``dm.xpt``.

    Args:
        data_path: Directory that contains ``dm.xpt``. When omitted, the
            pilot submission package's SDTM tabulations folder is used,
            resolved relative to the current working directory.

    Returns:
        The DataFrame returned by ``pyreadstat.read_xport``. The metadata
        object returned alongside it is discarded.
    """
    if data_path is None:
        # Assemble the default from its components so the separator matches
        # the host OS; a literal backslash path only resolves on Windows.
        data_path = os.path.join(
            'updated-pilot-submission-package', '900172', 'm5', 'datasets',
            'cdiscpilot01', 'tabulations', 'sdtm',
        )
    dm_df, _dm_meta = pyreadstat.read_xport(os.path.join(data_path, 'dm.xpt'))
    return dm_df

dm_df = load_dm_data()
dm_df.head(5)

PyreadstatError: File updated-pilot-submission-package\900172\m5\datasets\cdiscpilot01\tabulations\sdtm\dm.xpt does not exist!

### 1. Missing Value Imputation

In [None]:
def impute_missing(df):
    """Step 1: drop rows without a subject ID and fill gaps in key columns.

    Rows lacking ``USUBJID`` are removed *before* the fill values are
    computed so that records which are about to be discarded cannot
    influence the imputed mode/median.

    Args:
        df: Frame with ``USUBJID``, ``RACE``, ``ETHNIC``, ``COUNTRY`` and
            ``DMDY`` columns. It is not modified.

    Returns:
        A copy of ``df`` in which ``RACE``/``ETHNIC`` gaps are ``'Unknown'``,
        ``COUNTRY`` gaps are the most frequent country (``'Unknown'`` when
        no country is present at all) and ``DMDY`` gaps are the median.
        The original index labels of the surviving rows are kept.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    df = df.copy()

    # NOTE(review): XPT readers typically return blank character values as ''
    # rather than NaN -- confirm against the loader in use. Blank or
    # whitespace-only strings are normalised to missing so that the fills
    # (and the USUBJID drop) below also apply to them.
    for col in ('USUBJID', 'RACE', 'ETHNIC', 'COUNTRY'):
        is_blank = (
            df[col]
            .astype(object)
            .map(lambda value: isinstance(value, str) and not value.strip())
            .astype(bool)
        )
        df[col] = df[col].mask(is_blank)

    # Safe to drop in place: ``df`` is already a private copy.
    df.dropna(subset=['USUBJID'], inplace=True)

    df['RACE'] = df['RACE'].fillna('Unknown')
    df['ETHNIC'] = df['ETHNIC'].fillna('Unknown')

    country_modes = df['COUNTRY'].mode()
    default_country = country_modes.iloc[0] if not country_modes.empty else 'Unknown'
    df['COUNTRY'] = df['COUNTRY'].fillna(default_country)

    # If every DMDY is missing the median is NaN and the column stays empty.
    df['DMDY'] = df['DMDY'].fillna(df['DMDY'].median())
    return df

impute_missing_df = impute_missing(dm_df)
impute_missing_df.head(5)
    

### 2. Date Standardization

In [None]:
def standardize_dates(df):
    """Step 2: parse the DM date/time columns into pandas datetimes.

    Args:
        df: Frame that may contain any of the ``*DTC`` columns listed
            below. It is not modified.

    Returns:
        A copy of ``df`` in which every listed column that is present has
        been converted with ``pd.to_datetime``; values that cannot be parsed
        become ``NaT``. Columns that are absent are skipped.
    """
    df = df.copy()
    date_cols = ['RFSTDTC', 'RFENDTC', 'RFXSTDTC', 'RFXENDTC', 'RFICDTC', 'RFPENDTC', 'DTHDTC', 'DMDTC']
    for col in date_cols:
        # Read from the frame that was passed in, not from a notebook-level
        # global, so the result reflects the caller's data and the function
        # works on any frame.
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
    return df

standardize_df = standardize_dates(impute_missing_df)
standardize_df.head(5)

### 3. Data Type Enforcement (per CDISC specs)

In [None]:
def enforce_data_types(df):
    """Step 3: cast DM columns to their target dtypes.

    ``AGE`` and ``DMDY`` become nullable ``Int64`` (values that cannot be
    read as numbers turn into missing), ``AGEU``/``SEX``/``RACE``/``ETHNIC``/
    ``DTHFL`` become categoricals and ``STUDYID`` becomes a string column.

    Args:
        df: Frame holding all of the columns named above. It is not modified.

    Returns:
        A converted copy of ``df``.
    """
    typed = df.copy()

    for int_col in ('AGE', 'DMDY'):
        numeric = pd.to_numeric(typed[int_col], errors='coerce')
        typed[int_col] = numeric.astype('Int64')

    for cat_col in ('AGEU', 'SEX', 'RACE', 'ETHNIC', 'DTHFL'):
        typed[cat_col] = typed[cat_col].astype('category')

    typed['STUDYID'] = typed['STUDYID'].astype(str)
    return typed

enforce_data_df = enforce_data_types(standardize_df)
enforce_data_df.head(5)

### 4. Derived Variables (insights-focused for oncology)

In [None]:
def derive_variables(df):
    """Step 4: add ``AGE_GROUP``, ``STUDY_DURATION`` and ``ARM_TYPE``.

    * ``AGE_GROUP`` -- ``'Pediatric'`` (< 18), ``'Adult'`` (18-65),
      ``'Senior'`` (> 65) or ``'Unknown'`` when ``AGE`` is missing.
    * ``STUDY_DURATION`` -- ``DMDY``; where that is missing, the number of
      days between ``RFXSTDTC`` and ``RFXENDTC`` (when both columns exist);
      where that is missing too, ``0``.
    * ``ARM_TYPE`` -- ``'Control'`` when ``ARM`` mentions "placebo" or
      "control" (case-insensitive), ``'Unknown'`` when ``ARM`` is missing,
      otherwise ``'Treatment'``.

    Args:
        df: Frame with ``AGE``, ``DMDY`` and ``ARM`` columns; ``RFXSTDTC`` /
            ``RFXENDTC`` are optional and must already be datetimes when
            present. It is not modified.

    Returns:
        A copy of ``df`` with the three derived columns added.
    """
    df = df.copy()

    def age_group(age):
        if pd.isna(age):
            return 'Unknown'
        if age < 18:
            return 'Pediatric'
        if age <= 65:
            return 'Adult'
        return 'Senior'

    df['AGE_GROUP'] = df['AGE'].apply(age_group)

    # Apply the fallbacks in order of preference. Filling with 0 first would
    # leave nothing for the date-based fallback to fill.
    duration = df['DMDY']
    if 'RFXENDTC' in df.columns and 'RFXSTDTC' in df.columns:
        exposure_days = (df['RFXENDTC'] - df['RFXSTDTC']).dt.days
        duration = duration.fillna(exposure_days)
    df['STUDY_DURATION'] = duration.fillna(0)

    # Record which ARM values are missing before matching: ``na=False`` turns
    # them into False, which would otherwise label them 'Treatment'.
    arm_missing = df['ARM'].isna()
    is_control = df['ARM'].str.lower().str.contains('placebo|control', na=False)
    arm_type = is_control.map({True: 'Control', False: 'Treatment'})
    df['ARM_TYPE'] = arm_type.mask(arm_missing, 'Unknown')
    return df

derive_variables_df = derive_variables(enforce_data_df)
derive_variables_df.head(5)

### 5. Outlier Detection/Cleaning

In [None]:
def clean_outliers(df):
    """Step 5: drop out-of-range rows and duplicate subjects.

    Filters are applied one after another, and the frame's shape is printed
    after each stage:

    1. ``AGE`` present and within 0..120 (inclusive).
    2. ``DMDY`` present and non-negative (its unique values are printed too).
    3. ``STUDY_DURATION`` present and non-negative.
    4. Only the first row per ``USUBJID`` is kept.

    Args:
        df: Frame with ``AGE``, ``DMDY``, ``STUDY_DURATION`` and ``USUBJID``
            columns. It is not modified.

    Returns:
        The filtered frame; surviving rows keep their original index labels.
    """
    frame = df.copy()
    print("Before filters:", frame.shape)

    # Stage 1: plausible, non-missing ages only.
    age = frame['AGE']
    age_ok = age.notna() & (age >= 0) & (age <= 120)
    frame = frame[age_ok]
    print("After AGE filter:", frame.shape)

    # Stage 2: non-missing, non-negative DMDY.
    dmdy = frame['DMDY']
    dmdy_ok = dmdy.notna() & (dmdy >= 0)
    frame = frame[dmdy_ok]
    print("After DMDY filter:", frame.shape)
    print("DMDY unique values:", frame['DMDY'].unique())

    # Stage 3: non-missing, non-negative STUDY_DURATION.
    duration = frame['STUDY_DURATION']
    duration_ok = duration.notna() & (duration >= 0)
    frame = frame[duration_ok]
    print("After STUDY_DURATION filter:", frame.shape)

    # Stage 4: one row per subject.
    frame = frame.drop_duplicates(subset=['USUBJID'])
    print("After duplicates:", frame.shape)
    return frame
    
clean_outliers_df = clean_outliers(derive_variables_df)
clean_outliers_df.head(5)

### 6. Consistency Checks (CDISC controlled terms)

In [None]:
def consistency_checks(df):
    """Step 6: Consistency Checks (SEX restriction, RACE mapping, risk flag).

    * ``SEX`` values other than ``'M'``, ``'F'`` or ``'U'`` (including
      missing ones) are replaced with ``'U'``.
    * ``RACE`` values that appear as keys in the mapping below are replaced
      by their mapped label; all other values are left untouched.
    * ``HIGH_RISK`` is 1 when ``AGE`` > 70, or ``RACE`` contains ``ASIAN`` or
      ``BLACK``, or ``ARM_TYPE`` is ``'Treatment'``; otherwise 0. A missing
      ``AGE`` does not count as > 70.
    * ``DOMAIN`` and ``SUBJID`` are dropped when present.

    Categorical ``SEX``/``RACE`` inputs stay categorical in the result.

    Args:
        df: Frame with ``SEX``, ``RACE``, ``AGE`` and ``ARM_TYPE`` columns.
            It is not modified.

    Returns:
        A checked copy of ``df``.
    """
    df = df.copy()

    # SEX restriction. A categorical column can only hold values that are
    # among its categories, so make sure 'U' is one before substituting it.
    sex = df['SEX']
    if isinstance(sex.dtype, pd.CategoricalDtype) and 'U' not in sex.cat.categories:
        sex = sex.cat.add_categories(['U'])
    df['SEX'] = sex.where(sex.isin(['M', 'F', 'U']), 'U')

    # RACE mapping
    race_map = {
        'AMERICAN INDIAN OR ALASKA NATIVE': 'AMERICAN INDIAN/ALASKA NATIVE',
        'ASIAN': 'ASIAN',
        'BLACK OR AFRICAN AMERICAN': 'BLACK/AFRICAN AMERICAN',
        'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': 'NATIVE HAWAIIAN/OTHER PACIFIC ISLANDER',
        'WHITE': 'WHITE'
    }
    race = df['RACE']
    if isinstance(race.dtype, pd.CategoricalDtype):
        # Map on plain values and rebuild the categorical; dict-based
        # replace directly on a categorical is deprecated in recent pandas.
        race_values = race.astype(object).replace(race_map)
        df['RACE'] = race_values.astype('category')
    else:
        race_values = race.replace(race_map)
        df['RACE'] = race_values

    # High-Risk flag. With a nullable AGE the comparison yields <NA> for
    # missing ages, which cannot be cast to int -- treat those as False.
    over_70 = (df['AGE'] > 70).fillna(False).astype(bool)
    race_flag = race_values.str.contains('ASIAN|BLACK', na=False)
    on_treatment = df['ARM_TYPE'] == 'Treatment'
    df['HIGH_RISK'] = (over_70 | race_flag | on_treatment).astype(int)

    # Drop constants/redundants
    drop_cols = ['DOMAIN', 'SUBJID']
    df = df.drop(columns=[col for col in drop_cols if col in df.columns])
    return df

    
consistency_df = consistency_checks(clean_outliers_df)
consistency_df.head(5)

### 7. Export Prep (reorder for DB, add audit)

In [None]:
def prepare_export(df):
    """Step 7: Export Prep (column ordering and audit timestamp).

    Columns named in the preferred order come first, in that order, when
    they exist in ``df``; every other column follows in its original
    relative order. An ``ETL_TIMESTAMP`` column holding the current
    ``pd.Timestamp.now()`` value is then set on every row.

    Args:
        df: Frame to prepare. It is not modified.

    Returns:
        The reordered, timestamped copy.
    """
    frame = df.copy()
    preferred = (
        'USUBJID', 'STUDYID', 'SITEID', 'AGE', 'AGEU', 'SEX', 'RACE', 'ETHNIC', 'COUNTRY',
        'ARMCD', 'ARM', 'ACTARMCD', 'ACTARM', 'RFXSTDTC', 'RFXENDTC', 'DMDY', 'DMDTC',
        'AGE_GROUP', 'ARM_TYPE', 'STUDY_DURATION', 'HIGH_RISK',
    )
    present = frame.columns
    leading = [name for name in preferred if name in present]
    trailing = [name for name in present if name not in preferred]
    frame = frame.reindex(columns=leading + trailing)
    frame['ETL_TIMESTAMP'] = pd.Timestamp.now()
    return frame
    
prepare_export_df = prepare_export(consistency_df)
prepare_export_df.head(5)