# 2nd CMI-PB Prediction Challenge
## Team Advisors: Barry Grant, Jason Hsiao
## Team Members: Peng Cheng, Javier Garcia, Brian Qian, Weikang Guan
## Part 1: Data Integration

In [1]:
# Import necessary Python libraries.
import os  # Library for interacting with the operating system
import numpy as np  # Library for numerical operations on large arrays and matrices
import pandas as pd  # Library for data manipulation and analysis
from sklearn.preprocessing import OneHotEncoder  # Tool for converting categorical data into a format that can be provided to ML algorithms

# Make sure the 'data' output directory exists before any files are written to it.
# exist_ok=True turns the call into a no-op when the directory is already present,
# so no separate existence check is needed (and there is no gap between the check
# and the creation in which another process could create the directory).
os.makedirs('data', exist_ok=True)

In [2]:
## Download the raw tab-separated challenge tables into DataFrames.
## pd.read_table parses tab-delimited text by default, so no separator is passed.

# 2020 cohort (training data)
df_2020_specimen = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2020LD_specimen.tsv")
df_2020_subject = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2020LD_subject.tsv")
df_2020_titer = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2020LD_plasma_ab_titer.tsv")
df_2020_cell_freq = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2020LD_pbmc_cell_frequency.tsv")
df_2020_gene = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2020LD_pbmc_gene_expression.tsv")
#df_2020_cytokine = pd.read_csv("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2020LD_plasma_cytokine_concentration.tsv", sep='\t')

# 2021 cohort (training data)
df_2021_specimen = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2021LD_specimen.tsv")
df_2021_subject = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2021LD_subject.tsv")
df_2021_titer = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2021LD_plasma_ab_titer.tsv")
df_2021_cell_freq = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2021LD_pbmc_cell_frequency.tsv")
df_2021_gene = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2021LD_pbmc_gene_expression.tsv")
#df_2021_cytokine = pd.read_csv("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2021LD_plasma_cytokine_concentration.tsv", sep='\t')

# 2022 cohort (prediction data)
df_2022_specimen = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/prediction_data/2022BD_specimen.tsv")
df_2022_subject = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/prediction_data/2022BD_subject.tsv")
df_2022_titer = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/prediction_data/2022BD_plasma_ab_titer.tsv")
df_2022_cell_freq = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/prediction_data/2022BD_pbmc_cell_frequency.tsv")
df_2022_gene = pd.read_table("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/prediction_data/2022BD_pbmc_gene_expression.tsv")
#df_2022_cytokine = pd.read_csv("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/prediction_data/2022BD_plasma_cytokine_concentration.tsv", sep='\t')

In [3]:
def clean_df_subject(df):
    """
    Cleans the subject DataFrame by computing an 'Age' column and one-hot encoding the categorical columns.

    'Age' is the first four characters of 'date_of_boost' minus the first four characters of 'year_of_birth',
    both converted to numbers. The columns 'infancy_vac', 'biological_sex', 'ethnicity' and 'race' are replaced
    by one-hot columns; the encoder is fitted on the DataFrame passed in, so the set of output columns depends
    on the categories present in that DataFrame. The input DataFrame is not modified.

    Args:
    df (DataFrame): The subject data as a pandas DataFrame. Must contain 'date_of_boost', 'year_of_birth',
                    'dataset' and the four categorical columns listed above.

    Returns:
    DataFrame: A new DataFrame with 'Age' and the one-hot columns added, and with the encoded source columns,
               'year_of_birth', 'date_of_boost' and 'dataset' removed.
    """
    # Work on a copy so that adding 'Age' does not leak into the caller's DataFrame
    df = df.copy()

    # Calculate age from the leading four characters of date_of_boost and year_of_birth
    df['Age'] = pd.to_numeric(df['date_of_boost'].str[:4]) - pd.to_numeric(df['year_of_birth'].str[:4])

    # Use OneHotEncoder for 'infancy_vac', 'biological_sex', 'ethnicity' and 'race' columns
    columns_to_encode = ['infancy_vac', 'biological_sex', 'ethnicity', 'race']
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    df_encoded = pd.DataFrame(
        encoder.fit_transform(df[columns_to_encode]),
        columns=encoder.get_feature_names_out(columns_to_encode),
        # Reuse the input's index so the column-wise concat below lines rows up
        # even when df does not carry a default 0..n-1 index
        index=df.index,
    )

    # Concatenate the encoded columns with the original DataFrame
    result_df = pd.concat([df, df_encoded], axis=1)

    # Drop the original columns that were encoded and unnecessary columns
    result_df = result_df.drop(columns=columns_to_encode)
    result_df = result_df.drop(columns=['year_of_birth', 'date_of_boost', "dataset"])

    return result_df

def clean_df_specimen(df):
    """
    Reduces the specimen table to planned days 0, 1, 3 and 14 for subjects that have all four of those days,
    and adds one date-difference column per day via create_date_diff_col.

    Args:
    df (DataFrame): The specimen data as a pandas DataFrame. Must contain 'specimen_id', 'subject_id',
                    'actual_day_relative_to_boost' and 'planned_day_relative_to_boost'.

    Returns:
    DataFrame: Rows for the retained subjects with columns 'specimen_id', 'subject_id',
               'planned_day_relative_to_boost' and 'date_diff_D0', 'date_diff_D1', 'date_diff_D3',
               'date_diff_D14'. 'actual_day_relative_to_boost' is dropped after the differences are computed.
    """
    required_days = [0, 1, 3, 14]

    # Keep only specimens planned for one of the required days, and only the columns used downstream
    on_required_day = df['planned_day_relative_to_boost'].isin(required_days)
    kept_columns = ['specimen_id', 'subject_id', 'actual_day_relative_to_boost', 'planned_day_relative_to_boost']
    result_df = df[on_required_day][kept_columns]

    def has_every_required_day(subject_rows):
        # True only when this subject has at least one specimen for each required planned day
        observed_days = subject_rows['planned_day_relative_to_boost'].values
        return all(day in observed_days for day in required_days)

    # Remove subjects that are missing any of the required planned days
    result_df = result_df.groupby('subject_id').filter(has_every_required_day)

    # One date-difference column per required day, added in the same order as the days above
    date_diff_columns = (
        ('date_diff_D0', 0),
        ('date_diff_D1', 1),
        ('date_diff_D3', 3),
        ('date_diff_D14', 14),
    )
    for diff_col, day in date_diff_columns:
        result_df = create_date_diff_col(result_df, diff_col, day)

    # The actual day is only needed to compute the differences
    return result_df.drop(columns=["actual_day_relative_to_boost"])

def clean_df_titer(df):
    """
    Transforms the titer DataFrame from long format into wide format: one row per specimen and one column per
    "<isotype>-<antigen>" combination holding the 'MFI_normalised' value.

    Columns are generated for every pairing of the unique isotypes with the unique antigens found in `df`.
    Combinations that were not measured for a specimen are left at 0. If a specimen has several rows for the
    same isotype/antigen pair, the last one in `df` wins. Rows appear in order of first occurrence of each
    specimen.

    Args:
    df (DataFrame): The titer data in long format with columns 'specimen_id', 'isotype', 'antigen' and
                    'MFI_normalised'.

    Returns:
    DataFrame: The wide-format DataFrame with the isotype-antigen columns followed by an integer
               'specimen_id' column. An input without rows yields an empty DataFrame that has only the
               'specimen_id' column.
    """
    # Create a new column name list based on isotype and antigen
    isotypes = df['isotype'].unique()
    antigens = df['antigen'].unique()
    new_columns = [f"{isotype}-{antigen}" for isotype in isotypes for antigen in antigens]

    # Map specimen_id -> row dict. A dict lookup replaces scanning the list of rows for every
    # input row, and dicts keep insertion order so specimens stay in order of first appearance.
    rows_by_specimen = {}

    measurements = zip(df['specimen_id'], df['isotype'], df['antigen'], df['MFI_normalised'])
    for specimen_id, isotype, antigen, mfi_normalised in measurements:
        row = rows_by_specimen.get(specimen_id)
        if row is None:
            # Create a new row with all columns initialized to zero
            row = dict.fromkeys(new_columns, 0)
            row['specimen_id'] = specimen_id
            rows_by_specimen[specimen_id] = row
        row[f"{isotype}-{antigen}"] = mfi_normalised

    if not rows_by_specimen:
        # No measurements at all: return an empty frame that still has the key column,
        # so that callers can concatenate or merge on 'specimen_id'
        return pd.DataFrame({'specimen_id': pd.Series(dtype=int)})

    # Convert the collected rows to a DataFrame
    result_df = pd.DataFrame(list(rows_by_specimen.values()))
    result_df['specimen_id'] = result_df['specimen_id'].astype(int)

    return result_df

def clean_df_cell_freq(df):
    """
    Reshapes the cell frequency table from long to wide format: one row per specimen, one column per cell type.

    The reshaping uses pivot_table with its default aggregation, so repeated measurements of the same cell type
    for one specimen are averaged. Cell types that were not measured for a specimen are NaN.

    Args:
    df (DataFrame): The cell frequency data in long format with columns 'specimen_id', 'cell_type_name'
                    and 'percent_live_cell'.

    Returns:
    DataFrame: The wide-format DataFrame with a 'specimen_id' column followed by one 'percent_live_cell'
               column per cell type.
    """
    wide = pd.pivot_table(
        df,
        values='percent_live_cell',
        index=['specimen_id'],
        columns=['cell_type_name'],
    )
    # Turn the specimen_id index back into a regular column
    return wide.reset_index()

def clean_df_gene(df):
    """
    Reshapes the gene expression table from long to wide format: one row per specimen, one column per gene ID.

    The reshaping uses pivot_table with its default aggregation, so repeated tpm values for the same gene and
    specimen are averaged. After pivoting, the column labels are replaced by a plain list of names with
    surrounding whitespace stripped.

    Args:
    df (DataFrame): The gene expression data in long format with columns 'specimen_id',
                    'versioned_ensembl_gene_id' and 'tpm'.

    Returns:
    DataFrame: The wide-format DataFrame with a 'specimen_id' column followed by one tpm column per gene ID.
    """
    wide = df.pivot_table(
        values='tpm',
        index=['specimen_id'],
        columns=['versioned_ensembl_gene_id'],
    )
    wide = wide.reset_index()

    # Rebuild the column labels as plain, whitespace-trimmed names
    flat_names = []
    for label in wide.columns.values:
        flat_names.append(''.join(label).strip())
    wide.columns = flat_names

    return wide

def clean_df_cytokine(df):
    """
    Reshapes the cytokine concentration table from long to wide format: one row per specimen, one column per
    protein ID.

    The reshaping uses pivot_table with its default aggregation, so repeated measurements of the same protein
    for one specimen are averaged. Proteins that were not measured for a specimen are NaN.

    Args:
    df (DataFrame): The cytokine data in long format with columns 'specimen_id', 'protein_id'
                    and 'protein_expression'.

    Returns:
    DataFrame: The wide-format DataFrame with a 'specimen_id' column followed by one 'protein_expression'
               column per protein ID.
    """
    wide = pd.pivot_table(
        df,
        values='protein_expression',
        index=['specimen_id'],
        columns=['protein_id'],
    )
    # Turn the specimen_id index back into a regular column
    return wide.reset_index()

def drop_nan_col(df, cols):
    """
    Removes every column that contains at least one NaN value, except for the column(s) named in `cols`,
    which are kept even if they contain NaN values.

    Args:
    df (DataFrame): The DataFrame from which columns will be removed. It is not modified.
    cols (str or list/set of str): The column name to preserve regardless of missing values. A list or set
                                   of column names may be given to preserve several columns.

    Returns:
    DataFrame: A new DataFrame without the NaN-containing columns, except for the preserved column(s).
    """
    # A list or set means "several labels to preserve"; anything else is one column label
    if isinstance(cols, (list, set, frozenset)):
        preserved = set(cols)
    else:
        preserved = {cols}

    # Only columns that are not preserved are candidates for removal
    candidate_columns = [col for col in df.columns if col not in preserved]

    # Find the candidates that contain at least one NaN value
    candidates = df[candidate_columns]
    columns_with_nan = candidates.columns[candidates.isna().any()].tolist()

    # Drop columns containing NaN values
    result_df = df.drop(columns=columns_with_nan)
    return result_df

def create_date_diff_col(df, col_name, date_num):
    """
    Adds a column holding, for every row of a subject, the deviation (actual minus planned day relative to
    boost) observed on that subject's specimen planned for day `date_num`.

    The deviation is first computed per row, blanked out on rows planned for any other day, and then spread to
    all rows of the same subject by forward- and backward-filling within each 'subject_id' group. Subjects
    without a specimen planned for `date_num` end up with missing values in the new column.

    Args:
    df (DataFrame): The specimen data as a pandas DataFrame. Must contain 'subject_id',
                    'actual_day_relative_to_boost' and 'planned_day_relative_to_boost'.
    col_name (str): The name of the new column to create.
    date_num (int): The planned day relative to boost whose deviation is stored in the new column.

    Returns:
    DataFrame: A copy of `df` with the new column added; the input DataFrame is left unchanged.
    """
    result = df.copy()

    # Per-row deviation: actual day minus planned day
    result[col_name] = result['actual_day_relative_to_boost'] - result['planned_day_relative_to_boost']

    # Keep the deviation only on rows planned for date_num; all other rows become missing
    is_other_day = result['planned_day_relative_to_boost'] != date_num
    result.loc[is_other_day, col_name] = pd.NA

    def fill_within_subject(values):
        # Forward-fill first, then backward-fill whatever is still missing at the start of the group
        return values.ffill().bfill()

    # Spread each subject's deviation for date_num across all of that subject's rows
    result[col_name] = result.groupby('subject_id')[col_name].transform(fill_within_subject)

    return result

def create_target_col(df, col_name, date_num):
    """
    Appends two target columns derived from the value of `col_name` on planned day `date_num`.

    For each row, the subject's value of `col_name` on the row planned for `date_num` is looked up by
    'subject_id'. If a subject has several rows for that day, the last one is used; subjects without such a
    row get NaN.

    Args:
    df (DataFrame): The original DataFrame. Must contain 'subject_id', 'planned_day_relative_to_boost'
                    and `col_name`.
    col_name (str): The name of the column from which the target values are derived.
    date_num (int): The planned day relative to boost whose value is used as the target.

    Returns:
    DataFrame: `df` with two new columns appended:
        - "<col_name>_D<date_num>": the subject's `col_name` value on the specified day.
        - "<col_name>_D<date_num>_FC": log2 of that value divided by the row's own `col_name` value.
    """
    # Rows measured on the target day, reduced to the subject key and the measured value
    on_target_day = df['planned_day_relative_to_boost'] == date_num
    target_rows = df.loc[on_target_day, ['subject_id', col_name]]

    # Lookup table: subject_id -> value on the target day
    value_by_subject = pd.Series(target_rows[col_name].values, index=target_rows['subject_id']).to_dict()

    target_name = f"{col_name}_D{date_num}"
    fold_change_name = f"{target_name}_FC"

    # Build the two new columns in a separate frame, aligned on df's index
    additions = pd.DataFrame()
    additions[target_name] = df['subject_id'].map(value_by_subject)
    additions[fold_change_name] = np.log2(additions[target_name] / df[col_name])

    # Append the new columns to the right of the original ones
    return pd.concat([df, additions], axis=1)

In [4]:
# Clean each 2020 and 2021 table, then stack the two years into one training table per data type
# (ignore_index=True gives the stacked table a fresh 0..n-1 index).
# For the specimen and subject tables, any NaN left after stacking is replaced with 0.0
# (e.g. a column that exists for only one of the two years).
df_specimen = pd.concat([clean_df_specimen(df_2020_specimen), clean_df_specimen(df_2021_specimen)], ignore_index=True).fillna(0.0)
df_subject = pd.concat([clean_df_subject(df_2020_subject), clean_df_subject(df_2021_subject)], ignore_index=True).fillna(0.0)
df_titer = pd.concat([clean_df_titer(df_2020_titer), clean_df_titer(df_2021_titer)], ignore_index=True)
df_cell_freq = pd.concat([clean_df_cell_freq(df_2020_cell_freq), clean_df_cell_freq(df_2021_cell_freq)], ignore_index=True)
# Drop every cell-type column that has a NaN anywhere, but always keep "Monocytes" (a target column below)
df_cell_freq = drop_nan_col(df_cell_freq, "Monocytes")
df_gene = pd.concat([clean_df_gene(df_2020_gene), clean_df_gene(df_2021_gene)], ignore_index=True)
# df_cytokine = pd.concat([clean_df_cytokine(df_2020_cytokine), clean_df_cytokine(df_2021_cytokine)], ignore_index=True)
# df_cytokine = drop_nan_col(df_cytokine, "P10147")

# Build the training table: start from the specimens and left-join everything else onto them.
# Subject data is joined per subject; titer, cell-frequency and gene data are joined per specimen.
# NOTE: the fillna(1.0) runs on the whole table as merged so far (specimen + subject + titer columns),
# not only on the titer columns. The cell-frequency and gene columns merged afterwards keep their NaNs.
df_train = df_specimen.merge(df_subject, on='subject_id', how='left')
df_train = df_train.merge(df_titer, on='specimen_id', how='left').fillna(1.0)
df_train = df_train.merge(df_cell_freq, on='specimen_id', how='left')
df_train = df_train.merge(df_gene, on='specimen_id', how='left')
# df_train = df_train.merge(df_cytokine, on='specimen_id', how='left')

# Prepare the prediction dataset in a similar fashion using 2022 data.
# Differences from the training path: there is a single year, so nothing is concatenated,
# no fillna(0.0) is applied to the specimen/subject tables, and drop_nan_col is not called.
df_pred = clean_df_specimen(df_2022_specimen).merge(clean_df_subject(df_2022_subject), on='subject_id', how='left')
df_pred = df_pred.merge(clean_df_titer(df_2022_titer), on='specimen_id', how='left').fillna(1.0)
df_pred = df_pred.merge(clean_df_cell_freq(df_2022_cell_freq), on='specimen_id', how='left')
df_pred = df_pred.merge(clean_df_gene(df_2022_gene), on='specimen_id', how='left')
# df_pred = df_pred.merge(clean_df_cytokine(df_2022_cytokine), on='specimen_id', how='left')

# Align columns in the training and prediction datasets to ensure they are consistent:
# only columns present in both tables are kept, in the same order for both.
common_columns = df_train.columns.intersection(df_pred.columns)
df_train = df_train.loc[:, common_columns]
df_pred = df_pred.loc[:, common_columns]

# Create target columns for the training data. Each call appends "<column>_D<day>" (the subject's value
# on that planned day) and "<column>_D<day>_FC" (log2 of that value over the row's own value).
# This must happen before the day-0 filter below, because the target values come from the later-day rows.
df_train = create_target_col(df_train, "IgG-PT", 14)
df_train = create_target_col(df_train, "Monocytes", 1)
df_train = create_target_col(df_train, "ENSG00000277632.1", 3)

# Keep only the day-0 row(s) of each subject as model input, then drop the identifier columns,
# the now-constant planned-day column and the 'race_Unknown or Not Reported' one-hot column.
# NOTE(review): drop() raises KeyError if 'race_Unknown or Not Reported' is absent from the common columns.
df_train = df_train[df_train['planned_day_relative_to_boost'] == 0]
df_train = df_train.drop(['subject_id', 'specimen_id', 'planned_day_relative_to_boost', 'race_Unknown or Not Reported'], axis=1)

# Rename the target columns to task names: task<N>1 is the value on the target day,
# task<N>2 is the corresponding log2 fold-change column.
df_train = df_train.rename(columns={
    'IgG-PT_D14': 'task11',
    'IgG-PT_D14_FC': 'task12',
    'Monocytes_D1': 'task21',
    'Monocytes_D1_FC': 'task22',
    'ENSG00000277632.1_D3': 'task31',
    'ENSG00000277632.1_D3_FC': 'task32'
})

# Prepare one training table per task. Each table drops the other five target columns and the
# date_diff columns of the other tasks' days (date_diff_D0 and the task's own day are kept),
# then drops the rows whose own target value is NaN.
df_train_task11 = df_train.drop(columns=['date_diff_D1', 'date_diff_D3', 'task12', 'task21', 'task22', 'task31', 'task32']).dropna(subset=['task11'])
df_train_task12 = df_train.drop(columns=['date_diff_D1', 'date_diff_D3', 'task11', 'task21', 'task22', 'task31', 'task32']).dropna(subset=['task12'])
df_train_task21 = df_train.drop(columns=['date_diff_D3', 'date_diff_D14', 'task11', 'task12', 'task22', 'task31', 'task32']).dropna(subset=['task21'])
df_train_task22 = df_train.drop(columns=['date_diff_D3', 'date_diff_D14', 'task11', 'task12', 'task21', 'task31', 'task32']).dropna(subset=['task22'])
df_train_task31 = df_train.drop(columns=['date_diff_D1', 'date_diff_D14', 'task11', 'task12', 'task21', 'task22', 'task32']).dropna(subset=['task31'])
df_train_task32 = df_train.drop(columns=['date_diff_D1', 'date_diff_D14', 'task11', 'task12', 'task21', 'task22', 'task31']).dropna(subset=['task32'])

# Same day-0 filter and column removal for the prediction table. The rows are sorted by subject_id
# before the identifier columns are dropped, so the row order follows the subject order.
df_pred = df_pred[df_pred['planned_day_relative_to_boost'] == 0].sort_values(by='subject_id')
df_pred = df_pred.drop(['subject_id', 'specimen_id', 'planned_day_relative_to_boost', 'race_Unknown or Not Reported'], axis=1)

# Per-task prediction tables: same date_diff selection as the matching training tables.
# There are no target columns to drop here, so both tables of a task pair are identical.
df_pred_task11 = df_pred.drop(columns=['date_diff_D1', 'date_diff_D3'])
df_pred_task12 = df_pred.drop(columns=['date_diff_D1', 'date_diff_D3'])
df_pred_task21 = df_pred.drop(columns=['date_diff_D3', 'date_diff_D14'])
df_pred_task22 = df_pred.drop(columns=['date_diff_D3', 'date_diff_D14'])
df_pred_task31 = df_pred.drop(columns=['date_diff_D1', 'date_diff_D14'])
df_pred_task32 = df_pred.drop(columns=['date_diff_D1', 'date_diff_D14'])

# Save the final training and prediction datasets to CSV files (without the index column).
# Only the combined tables are written here; the per-task tables above are not saved in this cell.
# NOTE(review): the f-string prefixes below have no placeholders and could be plain strings.
df_train.to_csv(f"data/df_train.csv", index=False)
df_pred.to_csv(f"data/df_pred.csv", index=False)

# Print the final datasets for verification
print('Training dataset:')
print(df_train)
print('\n')
print('Prediction dataset:')
print(df_pred)

Training dataset:
     date_diff_D0  date_diff_D1  date_diff_D3  date_diff_D14  Age  \
0            -3.0           0.0           0.0           -3.0   30   
7            -3.0           0.0           0.0            0.0   33   
11           -7.0           0.0           0.0            0.0   28   
15           -5.0           0.0           0.0            0.0   25   
19           -6.0           0.0           0.0            0.0   28   
..            ...           ...           ...            ...  ...   
360           0.0           0.0           0.0            0.0   19   
364           0.0           0.0           0.0            0.0   23   
368           0.0           0.0           0.0            0.0   20   
372           0.0           0.0           0.0            0.0   21   
376           0.0           0.0           0.0            0.0   19   

     infancy_vac_wP  biological_sex_Male  ethnicity_Not Hispanic or Latino  \
0               1.0                  0.0                               1.0 