# Label propagation

Goal: determine the number of non-diagnosed / non-tested individuals in the phase 2 population. To extend the knowledge on this unlabelled set from the small labelled subset, we will establish the following setup:

1. Baseline model > just run the validated transfer-learning-model (validated on phase 2 labelled data) on the unlabelled data
2. A dual approach combining self-supervised learning (SSL) and semi-supervised learning (Semi-SL) [1] will be implemented.
3. Classic approach on label propagation [2]

An open question that needs to be answered to succeed in this task:
- Shall we use labelled data from both phase 1 and phase 2, or exclusively phase 1 data?

[1]: Yoon, J., Zhang, Y., Jordon, J., & van der Schaar, M. (2020). Vime: Extending the success of self-and semi-supervised learning to tabular domain. Advances in Neural Information Processing Systems, 33, 11033-11043.
[2]: Zhu, X., & Ghahramani, Z. (2002). Learning from labeled and unlabeled data with label propagation. Technical Report CMU-CALD-02-107, Carnegie Mellon University.

## Loading of libraries and data

In [79]:
import os
import numpy as np
import pandas as pd
import importlib
import joblib
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split


# import from custom package
from auxFuns.EDA import *
from auxFuns.modelling import *
from auxFuns.class_overlap import *
from auxFuns.transfer_learning import *

In [78]:
# Make sure every auxFuns submodule is loaded, then re-execute each of them so
# that edits made to the package source are picked up without a kernel restart.
import auxFuns.EDA
import auxFuns.modelling
import auxFuns.class_overlap
import auxFuns.transfer_learning

# NOTE(review): names that were pulled into the notebook namespace with
# `from auxFuns.<module> import *` keep referring to the pre-reload objects
# until that star import is executed again.
for _reloaded_module in (
    auxFuns.EDA,
    auxFuns.modelling,
    auxFuns.class_overlap,
    auxFuns.transfer_learning,
):
    importlib.reload(_reloaded_module)

<module 'auxFuns.transfer_learning' from 'c:\\Users\\angel\\Documents\\VSCode\\rsv_modelling_transfer_learning\\auxFuns\\transfer_learning.py'>

In [5]:
# Load the phase 1 / phase 2 datasets and build the labelled phase 2 subset.

# Dataset folders are resolved relative to the current working directory.
_cwd = os.getcwd()
raw_datasets_path = _cwd + '/datasets/raw'
processed_datasets_path = _cwd + '/datasets/processed'

# Phase 1 data: read the processed predictors, then apply the project's
# categorical conversion.
rsv_predictors_df_v2 = pd.read_csv(
    processed_datasets_path + '/rsv_predictors_phase1_daysDedup_seasons_prevTest_v2.csv',
    low_memory=False,
)
rsv_predictors_phase1_df = make_it_categorical_v2(rsv_predictors_df_v2)

# Phase 2 data: same conversion, flagged as not being phase 1.
rsv_phase2_df = make_it_categorical_v2(
    pd.read_csv(processed_datasets_path + '/rsv_phase2_all_features.csv', low_memory=False),
    is_phase1=False,
)

# Small subset of labelled data of phase 2. The test date is renamed so that it
# can serve as a merge key together with `patient_id`.
labels_phase2_df = pd.read_csv(
    raw_datasets_path + '/rsv_test_phase2.csv', low_memory=False
).rename(columns={'RSV_test_date': 'index_date'})

# Compatibility issue detected: merging labelled and unlabelled data is
# challenging because the date columns are not directly comparable, so both are
# converted to pandas datetimes first.
for _frame in (rsv_phase2_df, labels_phase2_df):
    _frame['index_date'] = pd.to_datetime(_frame['index_date'])

# Attach the phase 2 features to every label row (left join on patient and
# date), then discard every row that still contains a missing value.
rsv_phase2_labelled_df = (
    labels_phase2_df
    .merge(rsv_phase2_df, how='left', on=['patient_id', 'index_date'])
    .dropna()
)

rsv_predictors_phase1_df.shape, rsv_phase2_df.shape, rsv_phase2_labelled_df.shape

((86058, 64), (291938, 63), (2867, 64))

In [18]:
# Candidate feature sets used across the modelling notebooks.
# `selected_features` is the full list; v1-v4 are progressively revised subsets.
selected_features = ['sex', 'marital_status', 'race','patient_regional_location', 'age_group',
                     'Acute_upper_respiratory_infection','Influenza','Pneumonia','Bronchitis','Symptoms_and_signs__digestive_system_and_abdomen','General_symptoms_and_signs','any_symptom',
                     'COPD','AIDS','Asthma_chronic','CCI',
                     'sine','cosine','calendar_year', 
                     'healthcare_seeking', 'influenza_vaccine',
                     'n_symptoms','prev_positive_rsv','previous_test_daydiff','n_immunodeficiencies', 
                     'tumor_indicator','tumor_last_year',
                     'season',
                     'n_tests_that_day']
selected_features_v1 = ['n_tests_that_day', 'sine','cosine', 'previous_test_daydiff',
                     'Bronchitis', 'CCI',
                     'Acute_upper_respiratory_infection', 'n_immunodeficiencies', 'n_symptoms',
                     'healthcare_seeking', 
                     'General_symptoms_and_signs', 'prev_positive_rsv', 'Influenza',
                     'season','multiple_tests']
selected_features_v2 = ['n_tests_that_day', 'sine','cosine', 'previous_test_daydiff',
                     'Bronchitis', 'CCI',
                     'Acute_upper_respiratory_infection', 'n_immunodeficiencies', 'n_symptoms',
                     'healthcare_seeking', 
                     'General_symptoms_and_signs', 'prev_positive_rsv', 'Influenza',
                     'key_comorbidities','Pneumonia',
                     'season','month_of_the_test','multiple_tests',
                     'BPA','BPAI']
# v3 = v2 plus the demographic / calendar features
selected_features_v3 = selected_features_v2 + ['race', 'age_group','marital_status','sex',
                                                    'patient_regional_location','calendar_year']

# v4 = v2 without the features that describe the RSV testing itself.
# Built with a filtering comprehension (order preserved) instead of a
# comprehension that was only executed for its `.remove()` side effects.
rsv_test_related_features = ['n_tests_that_day', 'previous_test_daydiff', 'multiple_tests']
selected_features_v4 = [feature for feature in selected_features_v2
                        if feature not in rsv_test_related_features]

# Independent copy of v4, taken before the label column is appended below.
selected_features_v4_aux = selected_features_v4.copy()

# The label column is appended to v1-v3 only.
# NOTE(review): `selected_features_v4` never receives 'RSV_test_result', so it
# stays identical to `selected_features_v4_aux` -- confirm this is intended.
for _feature_list in (selected_features_v1, selected_features_v2, selected_features_v3):
    _feature_list.append('RSV_test_result')

In [68]:
# Restrict every dataset to the modelling features.
df_modelling_phase1 = rsv_predictors_phase1_df[selected_features_v4]
df_modelling_phase2 = rsv_phase2_labelled_df[selected_features_v4]
df_modelling_all_phase2 = rsv_phase2_df[selected_features_v4_aux]

# differentiate between labelled and non-labelled data in phase 2
#
# `rsv_phase2_labelled_df` comes from `labels_phase2_df.merge(rsv_phase2_df, ...)`,
# so its index numbers the rows of the *label* table and says nothing about
# which rows of `rsv_phase2_df` are labelled. Labelled records are therefore
# identified through the merge keys themselves.
_key_columns = ['patient_id', 'index_date']
_labelled_keys = rsv_phase2_labelled_df[_key_columns].drop_duplicates()
# A left merge keeps one output row per row of `rsv_phase2_df`, in the same
# order, because the right-hand keys are unique; `_merge == 'both'` marks the
# records whose key also appears in the labelled subset.
_key_membership = rsv_phase2_df[_key_columns].merge(
    _labelled_keys, how='left', on=_key_columns, indicator=True
)
labelled_mask = (_key_membership['_merge'] == 'both').to_numpy()
non_labelled_mask = ~labelled_mask
# Index labels (within rsv_phase2_df) of the labelled records
labelled_data_phase2_mask = rsv_phase2_df.index[labelled_mask]

df_phase2_labelled = df_modelling_phase2.copy()
df_phase2_unlabelled = df_modelling_all_phase2.loc[non_labelled_mask,:]

df_modelling_phase1.shape, df_phase2_labelled.shape, df_phase2_unlabelled.shape

((86058, 17), (2867, 17), (289071, 17))

## Baseline model

## Label propagation: classical / graph approach

## SSL and Semi-SL for label propagation

Guideline:

**1. Self-supervised learning**

(SSL: produce 'pseudolabels' on the purely unlabelled data)

**Goal of 1**: learn relevant representations of the unlabelled data, transferable to the semi-supervised learning phase

1.1 Implementing the Pretext Tasks:
- Mask Vector Generation: mask some of the features
- Sample corruption: create corrupted/noisy samples *^x^*

1.2. Encoder and pretext predictive models:

- encoder: to map corrupted samples *^x^* to a latent representation *z*
- mask vector estimator: predicts which features of *^x^* had been masked (binary cross entropy loss)
- feature vector estimator: predicts original sample *x* from *^x^* (autoencoder-like loss function)


**2. Semi-supervised learning**

**Goal of 2**: extend the prediction model on the unlabelled data, taking the small labelled subset as well as the representations provided by the autoencoder.

2.1. Use of the encoder

- To map the samples to a common latent space *Z*

2.2. Training of the predictive model

- Training on labelled data on the **supervised loss Ls** and on unlabelled data using **consistency loss Lu**
- Consistency loss Lu: for a sample *x*, build its corrupted counterpart *^x^* K times. Lu measures how much predictions for the original sample and the corrupted counterparts differ.


The training step will involve both the labelled and unlabelled sets (unlabelled for the self-supervised, both for the semi-supervised)
The evaluation of the model needs to be made on a held-out validation set


### Self-supervised learning: building the i) encoder, ii) mask and iii) feature estimators

Reminder: all of this part is done with UNLABELLED data only

In [124]:
def generate_mask_vector(n_features, pm):
    """
    Draw a random binary mask, one independent Bernoulli(pm) trial per feature.

    Parameters:
    n_features (int): number of entries of the mask (one per feature)
    pm (float): probability that an entry equals 1, i.e. that the feature is masked

    Returns:
    mask (np.ndarray): integer array of 0s and 1s of length `n_features`
    """
    return np.random.binomial(n=1, p=pm, size=n_features)

def generate_masked_sample(x, mask, data):
    """
    Generates a corrupted version of a sample (or a batch of samples) using a mask

    Positions where `mask` is non-zero are replaced with values drawn at random
    (with replacement) from `data`; positions where `mask` is 0 keep the value
    of `x`.

    How the replacement values are drawn depends on the shape of `data`:

    * `data` is 2D and has as many columns as `x` has features: every replaced
      entry is drawn from the *same column* of `data`, so a feature is only
      ever replaced by values observed for that feature.
    * otherwise: `data` is flattened and used as a single pool of values.

    Parameters:
    ----------
    x : array-like
        A 1D sample of shape (n_features,) or a 2D batch of shape
        (n_samples, n_features). Anything `np.asarray` can convert is accepted.

    mask : array-like
        Binary array, non-zero where `x` must be corrupted. Must be
        broadcastable to the shape of `x` (e.g. one mask of length n_features
        shared by a whole batch, or one mask per sample).

    data : array-like
        Collection the replacement values are drawn from (list, 1D or 2D array).

    Returns:
    -------
    numpy.ndarray
        Array with the shape of `x` broadcast against `mask`, holding the
        corrupted sample(s).

    Raises:
    ------
    ValueError
        If `data` is empty (raised by the NumPy random draw).

    Examples:
    --------
    The replaced values are random, only the untouched positions are fixed:

    >>> x = np.array([1, 2, 3, 4, 5])
    >>> mask = np.array([0, 1, 0, 1, 0])
    >>> data = np.array([10, 11, 12, 13, 14, 15])
    >>> out = generate_masked_sample(x, mask, data)
    >>> out[[0, 2, 4]]
    array([1, 3, 5])
    >>> bool(np.isin(out[[1, 3]], data).all())
    True
    """
    x_arr = np.asarray(x)
    mask_arr = np.asarray(mask).astype(bool)
    pool = np.asarray(data)  # also accepts plain lists, unlike `data.flatten()`

    per_feature = (
        pool.ndim == 2 and x_arr.ndim >= 1 and pool.shape[1] == x_arr.shape[-1]
    )
    if per_feature:
        # One random donor row per entry of x; the column index is the entry's
        # own feature position (broadcast along the last axis).
        donor_rows = np.random.randint(0, pool.shape[0], size=x_arr.shape)
        replacements = pool[donor_rows, np.arange(pool.shape[1])]
    else:
        # One draw per entry of x (not per row), so batches broadcast correctly.
        replacements = np.random.choice(pool.ravel(), size=x_arr.shape)

    return np.where(mask_arr, replacements, x_arr)

def from_df2DataLoader(df, is_labelled=False, label_name=None, custom_batch_size = 32, custom_shuffle = True):
    """
    Wrap a pandas DataFrame into a PyTorch DataLoader.

    Parameters:
    df (pd.DataFrame): data to iterate over, one sample per row
    is_labelled (bool): forwarded to `CustomDataset`
    label_name (str or None): forwarded to `CustomDataset`
    custom_batch_size (int): batch size of the returned DataLoader
    custom_shuffle (bool): whether the DataLoader shuffles the rows

    Returns:
    DataLoader built on a `CustomDataset` of `df`

    Raises:
    AssertionError: if `df` is not a pandas DataFrame
    """
    assert isinstance(df, pd.DataFrame), 'Input is not a dataframe, please transform it into a pd.DataFrame'

    return DataLoader(
        CustomDataset(df, is_labelled=is_labelled, label_name=label_name),
        batch_size=custom_batch_size,
        shuffle=custom_shuffle,
    )

In [98]:
class CustomDataset(Dataset):
    """
    PyTorch Dataset serving the rows of a DataFrame as float32 feature tensors.

    When `is_labelled` is True the column called `label_name` is left out of
    the returned features; the label itself is not returned.
    """

    def __init__(self, dataframe, is_labelled=False, label_name=None):
        self.dataframe = dataframe
        self.is_labelled = is_labelled
        self.label_name = label_name

    def __len__(self):
        # One sample per DataFrame row
        return len(self.dataframe)

    def __getitem__(self, index):
        # `index` is positional (iloc), not an index label
        record = self.dataframe.iloc[index]

        if not self.is_labelled:
            return torch.tensor(record.values, dtype=torch.float32)

        # Labelled data: keep every column except the label column
        is_feature_column = self.dataframe.columns != self.label_name
        return torch.tensor(record[is_feature_column].values, dtype=torch.float32)

In [64]:
# Define the three elements of the architecture:
# 1. Encoder (bringing corrupted samples to a latent space Z)
# 2. Mask Estimator (determines which features of x have been replaced)
# 3. Feature Estimator (predicts original x from corrupted ^x^)

class Encoder(nn.Module):
    """
    Two-layer MLP mapping an input vector to a latent vector:
    Linear(input_dim, 128) -> ReLU -> Linear(128, latent_dim).
    """

    def __init__(self, input_dim, latent_dim):
        # nn.Module must be initialised before layers can be registered
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, latent_dim)

    def forward(self, x):
        hidden = torch.relu(self.fc1(x))
        # No activation on the latent representation
        return self.fc2(hidden)

class MaskEstimator(nn.Module):
    """
    Two-layer MLP head producing one value in (0, 1) per output unit:
    Linear(latent_dim, 128) -> ReLU -> Linear(128, output_dim) -> Sigmoid.
    """

    def __init__(self, latent_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(latent_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        hidden = torch.relu(self.fc1(x))
        logits = self.fc2(hidden)
        # Squash to (0, 1) so the output can be read as per-unit probabilities
        return self.sigmoid(logits)


class FeatureEstimator(nn.Module):
    """
    Two-layer MLP head with an unbounded (linear) output:
    Linear(latent_dim, 128) -> ReLU -> Linear(128, output_dim).
    """

    def __init__(self, latent_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(latent_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)

    def forward(self, x):
        hidden = torch.relu(self.fc1(x))
        # Linear output layer: no activation is applied to the estimate
        return self.fc2(hidden)

In [122]:
# 0. Prepare data
# Remember, for this stage we will only work with the UNLABELLED DATA

## 0.1 Train / validation / test split
ssl_data = df_phase2_unlabelled.copy()

# 70% training; the remaining 30% is halved into 15% validation and 15% test
train_data, aux_data = train_test_split(ssl_data, test_size=0.3)
valid_data, test_data = train_test_split(aux_data, test_size=0.5)

## 0.2 Preprocess the data
# The preprocessor is fitted on the training split only and then applied,
# unchanged, to the test and validation splits.
preprocessor = build_preprocessor_phase2(train_data)

X_train_processed = preprocessor.fit_transform(train_data)
X_test_processed = preprocessor.transform(test_data)
X_valid_processed = preprocessor.transform(valid_data)


## 0.3 Adapt the datasets to the Pytorch DataLoader format
def _processed_to_dataloader(processed_matrix):
    """Wrap a preprocessed matrix into a DataFrame with named columns, then a DataLoader."""
    column_names = get_feature_names_OneHotEncoder_preprocessor(preprocessor)
    return from_df2DataLoader(pd.DataFrame(processed_matrix, columns = column_names))


## 0.4 For consistency with posterior training loop
def _stack_batches(dataloader):
    """Iterate once over `dataloader` and stack all of its batches into one array."""
    return np.vstack([batch.numpy() for batch in dataloader])


train_dataloader = _processed_to_dataloader(X_train_processed)
test_dataloader = _processed_to_dataloader(X_test_processed)
valid_dataloader = _processed_to_dataloader(X_valid_processed)

# The loaders shuffle by default, so each array holds the rows of its split in
# the order of that single pass rather than in the original row order.
train_array = _stack_batches(train_dataloader)
test_array = _stack_batches(test_dataloader)
valid_array = _stack_batches(valid_dataloader)

In [None]:
# Self-supervised pretext training: the encoder and its two heads learn to
# (i) detect which entries of a corrupted sample were replaced and
# (ii) reconstruct the original sample from the corrupted one.
input_dim = X_train_processed.shape[1]
latent_dim = 5

encoder = Encoder(input_dim, latent_dim)
mask_estimator = MaskEstimator(latent_dim, input_dim)
feature_estimator = FeatureEstimator(latent_dim, input_dim)

# A single optimizer updates the encoder and both estimators jointly
optimizer = optim.Adam(
    list(encoder.parameters()) + list(mask_estimator.parameters()) + list(feature_estimator.parameters()),
    lr=0.001,
)
loss_fn_mask = nn.BCELoss()
loss_fn_feature = nn.MSELoss()

num_epochs = 100
alpha = 0.5             # weight of the reconstruction loss in the total loss
mask_probability = 0.5  # probability that an individual entry gets corrupted

# Reference values used to corrupt the batches, as float32 tensors
train_pool = torch.as_tensor(train_array, dtype=torch.float32)
test_pool = torch.as_tensor(test_array, dtype=torch.float32)


def corrupt_batch(batch, pool, pm):
    """
    Corrupt a batch of samples for the pretext tasks.

    Every entry of `batch` is replaced, with probability `pm`, by the value
    that a randomly chosen row of `pool` holds for the same feature.

    Parameters:
    batch (torch.Tensor): float tensor of shape (n_samples, n_features)
    pool (torch.Tensor): float tensor of shape (n_rows, n_features)
    pm (float): per-entry corruption probability

    Returns:
    x_corrupted (torch.Tensor): corrupted batch, same shape and dtype as `batch`
    mask (torch.Tensor): float tensor with the shape of `batch`, 1.0 where the
        corrupted value differs from the original one and 0.0 elsewhere
    """
    drawn_mask = torch.bernoulli(torch.full_like(batch, pm)).bool()
    donor_rows = torch.randint(0, pool.shape[0], batch.shape)
    # The column index broadcasts over the batch: feature j is only ever
    # replaced by values observed for feature j.
    replacements = pool[donor_rows, torch.arange(batch.shape[1])]
    x_corrupted = torch.where(drawn_mask, replacements, batch)
    # A replacement can coincide with the original value (frequent for binary
    # columns); such entries are indistinguishable from untouched ones, so the
    # mask target only marks the entries that actually changed.
    mask = (x_corrupted != batch).float()
    return x_corrupted, mask


for epoch in range(num_epochs):
    # Training loop
    for batch_data in train_dataloader:
        # Mask and corrupt data (tensors, one mask row per sample)
        x_corrupted, mask = corrupt_batch(batch_data, train_pool, mask_probability)

        # Forward pass
        z = encoder(x_corrupted)
        mask_pred = mask_estimator(z)
        feature_pred = feature_estimator(z)

        # Calculate loss: BCELoss needs a float target with the shape of its input
        loss_mask = loss_fn_mask(mask_pred, mask)
        loss_feature = loss_fn_feature(feature_pred, batch_data)
        loss = loss_mask + alpha * loss_feature

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Testing loop:
    # torch.no_grad() disables gradient tracking; no weights are updated here
    with torch.no_grad():
        total_test_loss = 0
        for batch_data in test_dataloader:
            # Mask and corrupt data
            x_corrupted, mask = corrupt_batch(batch_data, test_pool, mask_probability)

            # Forward pass
            z = encoder(x_corrupted)
            mask_pred = mask_estimator(z)
            feature_pred = feature_estimator(z)

            # Calculate loss
            loss_mask = loss_fn_mask(mask_pred, mask)
            loss_feature = loss_fn_feature(feature_pred, batch_data)
            loss = loss_mask + alpha * loss_feature

            total_test_loss += loss.item()

        # Each accumulated term is already a per-batch mean, so the epoch
        # average divides by the number of batches, not the number of samples.
        avg_test_loss = total_test_loss / len(test_dataloader)
        print(f"Epoch {epoch+1}, Test Loss: {avg_test_loss}")
