In [None]:
""" 
Ordered response + PCA
"""

Motivation

* Background: Cancer has two stages: early and late. TODO: this needs a better interpretation — "early" and "late" are defined in the embedding space, not in the feature space.

* Data: Source data is dense in early stage, but sparse in late stage. Target data is sparse in early stage, but dense in late stage.

* Question: Does transporting source representations onto target representations help with late stage cancer prediction?

* Note: This question is different from previous simulation questions. We only benchmark a subset of the source data; that is, data in the late stage.

Simulation

* Suppose the embedding dimension is 1.

* $r$ for indexing domains

* Let $[0, S]$ be early stage, and $(S, E]$ be late stage

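* Let $N$ be the total number of data points in each domain, $N_{se}$ be the number of early-stage data points in the source data, and $N_{tl}$ be the number of late-stage data points in the target data. Note that $N_{se}$ and $N_{tl}$ are both supposed to be $> \frac{N}{2}$, so that the source is dense in the early stage and the target is dense in the late stage.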

* Simulate $X_{se} \sim \operatorname{Unif}\left( 0, S \right)$ where $|X_{se}| = N_{se}$

* Generate $Y_{se} = k_1 X_{se} + \epsilon$, where $\epsilon$ is zero-mean Gaussian noise (the same noise model is used for every $Y$ below)

* Simulate $X_{sl} \sim \operatorname{Unif}\left(S, E \right)$ where $|X_{sl}| = N-N_{se}$

* Generate $Y_{sl} = k_2 X_{sl} + \epsilon$

* Simulate $X_{te} \sim \operatorname{Unif}\left(0, S \right)$ where $|X_{te}| = N-N_{tl}$

* Generate $Y_{te} = k_1 X_{te} + \epsilon$

* Simulate $X_{tl} \sim \operatorname{Unif}\left(S, E \right)$ where $|X_{tl}| = N_{tl}$

* Generate $Y_{tl} = k_2 X_{tl} + \epsilon$

* The source representations are: $[X_{se}, X_{sl}]$. Source labels are $[Y_{se}, Y_{sl}]$. 

* The target representations are: $[X_{te}, X_{tl}]$. Target labels are $[Y_{te}, Y_{tl}]$. 




In [None]:
from sklearn.utils import check_random_state
from math import floor
from random import randint
from numpy.random import uniform
import numpy as np


In [None]:
def make_func(k):
    """
    Build the linear map x -> k*x (slope k, zero intercept).

    :param k: slope of the line

    returns
        - a one-argument callable that multiplies its input by k
    """
    return lambda x: k*x

In [None]:
def sample_X(x_num, low, high):
    """
    Draw x_num values uniformly at random from the half-open interval [low, high).

    :param int x_num: how many values to draw
    :param float low: lower bound of the interval (inclusive)
    :param float high: upper bound of the interval (exclusive)

    returns
        - 1-D ndarray of length x_num
    """
    draws = uniform(low, high, x_num)
    return draws

def sample_y(X, k, random_state=None, nz=0.5):
    """
    Sample labels from the noisy linear model y = k*x + noise.

    The noise is Gaussian, scaled by 1.5 * nz, with one independent draw per
    element of X.

    :param X: array-like of inputs; converted to a float ndarray
    :param float k: slope of the linear relationship between x and y
    :param random_state: seed or random-state object, passed unchanged to
        sklearn.utils.check_random_state
    :param float nz: noise level

    returns
        - ndarray with the same shape as X holding the noisy labels
    """

    generator = check_random_state(random_state)

    X = np.asarray(X, dtype=float)
    f = make_func(k)
    y = f(X)  # vectorised: k * X element-wise

    # The noise must have exactly one entry per sample, so it is sized by the
    # shape of X (not by any function of the label values).
    noise = generator.standard_normal(size=X.shape)
    return y + 1.5 * nz * noise

In [None]:
def simulate_emb_labels(num_patient, early_prop, k_1, k_2):
    """ 
    Simulate desired early and late stage embeddings and labels (assume the embedding dimension is 1)

    Early-stage embeddings are sampled from [0, 1) and late-stage embeddings
    from [1, 2); labels are generated by sample_y with slope k_1 (early) or
    k_2 (late).

    :param int num_patient: number of patients
    :param float early_prop: proportion of patients in the early stage
    :param float k_1: the desired coefficient for modeling the relationship between x and y in early stage
    :param float k_2: the desired coefficient for modeling the relationship between x and y in late stage

    returns
        - 1-D ndarray of embeddings, early-stage patients first, then late-stage
        - 1-D ndarray of labels in the same order
    """
    early_patient_num = floor(early_prop*num_patient)
    late_patient_num = num_patient-early_patient_num

    early_X = sample_X(early_patient_num, 0, 1)
    early_y = sample_y(early_X, k_1)
    late_X = sample_X(late_patient_num, 1, 2)
    late_y = sample_y(late_X, k_2)

    # ndarrays have no .extend (and list.extend returns None), so the two
    # stages are joined with np.concatenate. np.ravel flattens each part so the
    # result is 1-D whether the samplers return lists, flat arrays or columns.
    embs = np.concatenate([np.ravel(early_X), np.ravel(late_X)])
    labels = np.concatenate([np.ravel(early_y), np.ravel(late_y)])
    return embs, labels


In [None]:
""" 
Simulation scheme
"""

def simulate_pca_train(D, d_1, d_2, num_patient):
    """ 
    Simulate features and labels for domain 1 and domain 2, for PCA training

    Each domain draws feature probabilities from a Dirichlet distribution,
    samples a multinomial count vector per patient, binarises it into
    presence indicators, and labels the patient 1 when the standardised
    weighted sum of its indicators is non-negative (sigmoid >= 0.5).

    :param int D:  total number of features
    :param d_1: number of features with higher frequency in domain 1;
        pass None to draw it uniformly from [0, floor(0.25*D)]
    :param d_2: number of features with higher frequency in domain 2;
        pass None to draw it uniformly from [0, floor(0.25*D)]
    :param int num_patient: number of patients in each domain

    Variables in the implementation are consistent with the variables in the scheme

    TODO: reconsider the choice of alpha_1 and alpha_2

    :raises ValueError: if D < 1, num_patient < 1, d_1 or d_2 is negative,
        or d_1 + d_2 > D

    :return
        ndarray domain 1 features (one row per patient: indices of the active
            features, right-padded with -1 to the longest row in the domain)
        list[int] domain 1 labels
        ndarray domain 2 features (same layout as domain 1)
        list[int] domain 2 labels
    """

    if D < 1:
        raise ValueError("D must be a positive integer")
    if num_patient < 1:
        raise ValueError("num_patient must be a positive integer")

    # Honour the caller's d_1 / d_2; None selects a random draw.
    max_random_d = floor(0.25*D)
    if d_1 is None:
        d_1 = randint(0, max_random_d)
    if d_2 is None:
        d_2 = randint(0, max_random_d)
    if d_1 < 0 or d_2 < 0 or d_1 + d_2 > D:
        raise ValueError("d_1 and d_2 must be non-negative with d_1 + d_2 <= D")

    # Disjoint index sets of the scheme. NOTE: they are not consumed below --
    # alpha_1 / alpha_2 place the high-frequency features at the leading
    # positions instead.
    delta_1 = np.random.choice(size=d_1, a=range(1, D+1), replace=False)
    remaining_set = list(set(range(1, D+1)) - set(delta_1))
    delta_2 = np.random.choice(size=d_2, a=remaining_set, replace=False)

    # Concentration parameters, normalised so that each alpha sums to 1:
    # weight 5 for a domain's own high-frequency features, 1 for the other
    # domain's, 3 for the rest.
    num_shared = D - d_1 - d_2
    unit_1 = 1/(2*d_1-2*d_2+3*D)
    alpha_1 = [5*unit_1]*d_1 + [unit_1]*d_2 + [3*unit_1]*num_shared

    unit_2 = 1/(-2*d_1+2*d_2+3*D)
    alpha_2 = [unit_2]*d_1 + [5*unit_2]*d_2 + [3*unit_2]*num_shared

    # Only positive weights; the same W is shared by both domains.
    W = np.abs(np.random.normal(size=D))

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def gen_feature_vector_label(alpha):
        """ 
        Generate feature vectors and labels
        :param list[float] alpha: concentration parameters for the dirichlet distribution

        Returns
            - list of 0/1 indicator vectors (one per patient)
            - list of int labels (one per patient)
        """
        rho = np.random.dirichlet(alpha=alpha, size=1)[0]

        X = []
        all_sum = []
        for _ in range(num_patient):
            counts = np.random.multinomial(len(rho), rho)
            X_i = (counts > 0).astype(int)  # dominant effect: presence only
            X.append(X_i)
            all_sum.append(np.sum(np.multiply(W, X_i)))

        # Standardise to zero mean / unit variance; a constant vector is only
        # centred (divisor 1) to avoid dividing by zero.
        all_sum = np.asarray(all_sum, dtype=float)
        spread = all_sum.std()
        if spread == 0:
            spread = 1.0
        all_sum = (all_sum - all_sum.mean()) / spread

        # TODO: mimic exact logistic regression, change to np.random.binomial later
        # (binomial sampling was too noisy: domain 1 data cannot learn well)
        Y = [int(p_i >= 0.5) for p_i in sigmoid(all_sum)]
        return X, Y

    def feature_vector_to_feature(feature_vectors):
        """ 
        Convert feature vectors to features
        :param list[list[int]]: feature vectors consisting of indicators

        Returns
            - features consisting of actual codes (0-based indices of the non-zero indicators)
        """
        return [
            [i for i, e in enumerate(feature_vector) if e != 0]
            for feature_vector in feature_vectors
        ]

    def pad_features(features_list):
        """ 
        Pad features to the same length (maximum length of the original features)
        in each domain by -1
        """
        max_len = max((len(features) for features in features_list), default=0)
        return [
            features + [-1] * (max_len - len(features))
            for features in features_list
        ]

    feature_vector_1, label_1 = gen_feature_vector_label(alpha_1)
    feature_1 = pad_features(feature_vector_to_feature(feature_vector_1))
    feature_vector_2, label_2 = gen_feature_vector_label(alpha_2)
    feature_2 = pad_features(feature_vector_to_feature(feature_vector_2))
    return np.array(feature_1), label_1, np.array(feature_2), label_2

