# 2nd CMI-PB Prediction Challenge
## Team Advisors: Barry Grant, Jason Hsiao
## Team Members: Peng Cheng, Javier Garcia, Brian Qian, Weikang Guan
## Part 3: Prediction

In [1]:
# Import necessary Python libraries.
import os  # Library for interacting with the operating system
import csv  # Library for handling CSV files
import numpy as np  # Library for numerical operations on large arrays and matrices
import pandas as pd  # Library for data manipulation and analysis
from joblib import load  # Library for loading saved models
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer, RobustScaler, MinMaxScaler  # Preprocessing tools from sklearn

# Make sure the directory for prediction results exists.
# exist_ok=True makes the call idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs('prediction_result', exist_ok=True)

# Load the prediction data into a DataFrame
df_pred = pd.read_csv("data/df_pred.csv")

In [2]:
def make_prediction(df_pred_task, task_name):
    """
    Make predictions using the trained model and the best scaler for the given task.

    Three artifacts are read from the ``training_result`` directory:
    ``{task_name}_best_scaler.txt`` (name of the scaler to apply),
    ``{task_name}_feature_selected.csv`` (header row, then one feature name in the
    first column of each row) and ``{task_name}_best_model.joblib`` (the model).

    Args:
    df_pred_task (DataFrame): The prediction data. It is not modified in place.
    task_name (str): The name of the task for which the predictions are being made.

    Returns:
    The output of the loaded model's ``predict`` call (expected to be a numpy array).

    Raises:
    ValueError: If the scaler name is not recognized, or if the feature file
        lists no features.
    """
    # Read the best scaler name from the file
    with open(f'training_result/{task_name}_best_scaler.txt', 'r') as file:
        scaler_name = file.read().strip()

    # Map scaler names to classes; only the scaler actually needed is instantiated
    scaler_classes = {
        'StandardScaler': StandardScaler,
        'MinMaxScaler': MinMaxScaler,
        'Normalizer': Normalizer,
        'RobustScaler': RobustScaler,
    }

    if scaler_name in scaler_classes:
        # NOTE(review): the scaler is fitted on the prediction data itself, so the
        # scaling statistics may differ from those the model saw during training.
        # If the training step persists its fitted scaler, load it and call
        # transform() here instead -- TODO confirm against the training notebook.
        scaler = scaler_classes[scaler_name]()
        df_pred_task = pd.DataFrame(
            scaler.fit_transform(df_pred_task),
            columns=df_pred_task.columns,
            index=df_pred_task.index,  # keep the caller's row labels
        )
    elif scaler_name != 'None':
        raise ValueError(f"Unrecognized scaler: {scaler_name}")

    # Read the selected features from the file
    with open(f'training_result/{task_name}_feature_selected.csv', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header row (no StopIteration on an empty file)
        # csv.reader yields [] for blank lines; skip them instead of failing on row[0]
        selected_features = [row[0] for row in reader if row]

    if not selected_features:
        raise ValueError(f"No selected features found for task: {task_name}")

    # Apply the selected features to the prediction data
    df_pred_task = df_pred_task[selected_features]

    # Load the trained model.
    # NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
    model = load(f'training_result/{task_name}_best_model.joblib')

    # Make predictions using the model
    return model.predict(df_pred_task)

def remove_rank_from_column_names(df):
    """
    Return a copy of the DataFrame whose column names no longer contain '-Rank'.

    Every occurrence of the substring '-Rank' is removed from each column name
    (not only a trailing one). The input DataFrame is left untouched.

    Args:
    df (DataFrame): The DataFrame whose column names will be cleaned.

    Returns:
    DataFrame: A copy of ``df`` with the cleaned column names.
    """
    cleaned_names = []
    for name in df.columns:
        cleaned_names.append(name.replace('-Rank', ''))

    renamed = df.copy()
    renamed.columns = cleaned_names
    return renamed

def value_to_rank(df, rank_columns=None, method='average'):
    """
    Convert values in specific columns to descending integer ranks.

    The largest value in a column receives rank 1. The input DataFrame is not
    modified; a ranked copy is returned.

    Args:
    df (DataFrame): The DataFrame whose values will be converted to ranks.
    rank_columns (str or iterable of str, optional): Column name(s) to rank.
        Defaults to the columns at positions 4 through 9 (the 5th to 10th columns).
    method (str, optional): Tie-breaking method forwarded to ``Series.rank``.
        With the default 'average', tied values get fractional ranks that the
        integer cast truncates; pass 'min' or 'first' for ranks that are whole
        numbers by construction.

    Returns:
    DataFrame: A copy of ``df`` with the selected columns replaced by integer ranks.
    """
    result_df = df.copy()

    if rank_columns is None:
        # Default: the 5th to 10th columns
        rank_columns = result_df.columns[4:10]
    elif isinstance(rank_columns, str):
        # A bare string would otherwise be iterated character by character
        rank_columns = [rank_columns]

    for col in rank_columns:
        # The cast to int raises if the column holds NaN (its rank is NaN too)
        result_df[col] = result_df[col].rank(ascending=False, method=method).astype(int)
    return result_df

def drop_nan_col(df, cols):
    """
    Drop every column that contains at least one NaN, except one protected column.

    The protected column is kept even when it has missing values. The input
    DataFrame is not modified; a new DataFrame is returned.

    Args:
    df (DataFrame): The DataFrame to clean.
    cols (str): Name of the single column to keep regardless of NaN values.

    Returns:
    DataFrame: ``df`` without the NaN-containing columns (other than ``cols``).
    """
    # Per-column flag: True when the column holds at least one NaN
    nan_flags = df.isna().any()

    # Collect NaN-containing columns, skipping the protected one
    to_remove = [
        name
        for name, contains_nan in zip(df.columns, nan_flags)
        if contains_nan and name != cols
    ]

    return df.drop(columns=to_remove)

In [3]:
# Fill missing values in the prediction DataFrame with the median value
df_pred_task = df_pred.fillna(df_pred.median())

# Load the prediction value template
df_pred_value = pd.read_csv('https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/2ndChallengeSubmissionTemplate.tsv', delimiter='\t')

# Make predictions for each task and fill the corresponding columns in the prediction value template
df_pred_value.iloc[:, 4] = make_prediction(df_pred_task, 'task11')
df_pred_value.iloc[:, 5] = make_prediction(df_pred_task, 'task12')
df_pred_value.iloc[:, 6] = make_prediction(df_pred_task, 'task21')
df_pred_value.iloc[:, 7] = make_prediction(df_pred_task, 'task22')
df_pred_value.iloc[:, 8] = make_prediction(df_pred_task, 'task31')
df_pred_value.iloc[:, 9] = make_prediction(df_pred_task, 'task32')

# Remove '-Rank' from column names and save the cleaned prediction values to a CSV file
remove_rank_from_column_names(df_pred_value).to_csv(f'prediction_result/Prediction_value.csv', index=False)

# Convert values to ranks and save the ranked predictions to a CSV file
value_to_rank(df_pred_value).to_csv(f'prediction_result/Prediction_rank.csv', index=False)

# Display the prediction values DataFrame
df_pred_value

Unnamed: 0,Subject ID,Age,Biological Sex at Birth,Vaccine Priming Status,1.1) IgG-PT-D14-titer-Rank,1.2) IgG-PT-D14-FC-Rank,2.1) Monocytes-D1-Rank,2.2) Monocytes-D1-FC-Rank,3.1) CCL3-D3-Rank,3.2) CCL3-D3-FC-Rank
0,97,35,Male,wP,6.307777,1.394135,18.250394,0.226054,33.51281,0.44624
1,98,28,Female,wP,6.603358,1.443299,15.996157,0.798805,37.01685,0.195854
2,99,22,Female,aP,6.299454,1.507053,31.906312,0.652306,33.261766,0.737446
3,100,20,Female,aP,6.404223,2.414949,20.890465,0.496706,41.018895,-0.394985
4,101,18,Male,aP,6.782641,0.951839,23.048669,0.678535,36.23538,0.084791
5,102,18,Male,aP,6.683911,0.794266,37.864428,0.562568,35.031306,0.891409
6,103,27,Female,wP,6.660135,3.5631,20.357848,0.535244,37.277945,0.297983
7,104,32,Female,wP,6.625619,3.592939,19.029895,0.708539,37.143112,1.144763
8,105,27,Female,wP,6.641648,2.110331,35.582716,0.071251,34.867932,0.222857
9,106,25,Female,aP,6.713152,3.525173,19.635514,0.224763,33.509574,0.108258
