In [2]:
import pandas          as pd
import numpy           as np
from   multiprocessing import Pool

In [3]:
def getData(cancerType):
    """Load the per-cancer-type CSV table into a DataFrame.

    Parameters
    ----------
    cancerType : str
        Suffix inserted into the file name, e.g. ``'LUAD'`` reads
        ``../data/data_phy_top100_LUAD.csv`` (relative to the working
        directory).

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from that file.
    """
    # Path pieces are joined as plain strings, so a non-string cancerType fails loudly.
    csv_path = ''.join(('../data/data_phy_top100_', cancerType, '.csv'))
    return pd.read_csv(csv_path)

def getWeight(data):
    """Compute Kaplan-Meier (Stute) weights for right-censored survival data.

    The frame is sorted by ``OS_MONTHS`` and ``OS_STATUS`` is recoded to an
    event indicator (1 for ``'DECEASED'``, 0 for anything else).  With
    ``d_(k)`` the indicator of the k-th smallest survival time (k = 1..n),
    the weights are::

        w_1 = d_(1) / n
        w_k = d_(k) / (n - k + 1) * prod_{j < k} ((n - j) / (n - j + 1)) ** d_(j)

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``OS_MONTHS`` and ``OS_STATUS``.  It is
        modified IN PLACE: rows are re-ordered and ``OS_STATUS`` is overwritten
        with 0/1.  Calling this twice on the same frame therefore turns every
        status into 0, because 1 is not equal to ``'DECEASED'``.

    Returns
    -------
    (list of float, pandas.DataFrame)
        The weights, in the sorted row order, and the same (mutated) frame.
        An empty frame yields an empty weight list.
    """
    data.sort_values(by=['OS_MONTHS'], inplace=True)
    data['OS_STATUS'] = data['OS_STATUS'].apply(lambda x: 1 if x == 'DECEASED' else 0)

    n = data.shape[0]
    if n == 0:
        return [], data

    # Read the indicator by POSITION.  sort_values keeps the original index
    # labels, so data['OS_STATUS'][k] would fetch the row *labelled* k rather
    # than the k-th row of the sorted frame.
    status = data['OS_STATUS'].to_numpy(dtype=float)

    rank = np.arange(1, n + 1)                       # 1-based rank k
    factors = np.power((n - rank) / (n - rank + 1), status)
    # Product over strictly smaller ranks (empty product = 1 for the first row).
    prior = np.concatenate(([1.0], np.cumprod(factors[:-1])))

    w = status / (n - rank + 1) * prior
    return w.tolist(), data

In [4]:
class TransformColumn(object):
    """Weighted centring and scaling of a positional range of DataFrame columns.

    Each selected column ``x`` is replaced by ``(x - mean_w(x)) * sqrt(w)``
    where ``mean_w(x) = sum(x * w) / sum(w)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to transform.
    w : sequence of float
        One weight per row, in the row order of ``data``.
    start_column, end_column : int
        Inclusive positional (``iloc``) bounds of the columns to transform.
    processes : int, optional
        Number of worker processes used by ``multipleColumnsTransform``
        (default 50, the value that used to be hard-coded).
    """

    def __init__(self, data, w, start_column, end_column, processes=50):

        self.data         = data
        self.w            = w
        self.start_column = start_column
        self.end_column   = end_column
        self.processes    = processes

    def singleColumnTransform(self, column_no):
        """Transform the column at position ``column_no`` of ``self.data`` in place.

        Returns ``self.data`` (the whole frame, not just the column).
        """
        # Always use the instance's weights; relying on a module-level `w`
        # only works by accident inside the notebook session.
        weights = np.asarray(self.w, dtype=float)
        column  = self.data.iloc[:, column_no]

        mean_x = np.sum(column.to_numpy() * weights) / np.sum(weights)
        self.data.iloc[:, column_no] = (column - mean_x) * np.sqrt(weights)

        return self.data

    def multipleColumnsTransform(self):
        """Run ``singleColumnTransform`` for every column in the configured range.

        Returns a list with one DataFrame per column, in column order.  The
        bound method (and with it this instance) is pickled for the worker
        processes, so the workers modify copies: only the requested column of
        each returned frame should be relied on, and ``self.data`` in the
        calling process is not changed.
        """
        tasks = np.arange(self.start_column, self.end_column + 1)
        # The context manager shuts the pool down once map() has returned,
        # instead of leaving the worker processes alive.
        with Pool(self.processes) as mp_pool:
            result = mp_pool.map(self.singleColumnTransform, tasks)

        return result
    
def transform(data,w):
    """Centre ``OS_MONTHS`` on its weighted mean and scale each row by sqrt(w).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``OS_MONTHS`` column; the column is overwritten in place.
    w : sequence of float
        One weight per row, matched to the rows by position.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    months         = data['OS_MONTHS']
    weighted_total = sum(months*w)
    mean_y         = weighted_total/sum(w)

    root_w            = np.power(w,0.5)
    data['OS_MONTHS'] = (months-mean_y)*root_w

    return data

In [5]:
# Inclusive positional (iloc) bounds of the columns handed to TransformColumn
# below; with these values that is the 100 columns at positions 2..101.
start_column  = 2
end_column    = 101

In [6]:
# LUAD: load the table, compute the row weights, transform the selected
# columns in worker processes, then transform OS_MONTHS itself.
data_luad       = getData('LUAD')
w,data_luad     = getWeight(data_luad)
Transformcolumn = TransformColumn(data_luad,w,start_column,end_column)
result          = Transformcolumn.multipleColumnsTransform()
no_column       = len(result)
data_luad_trans = data_luad.copy()
# result[k] is the frame returned for column start_column + k; copy only that
# column back.  Using start_column (not a literal 2) keeps this loop aligned
# with the range that was actually transformed.
for offset, transformed in enumerate(result):
    i = start_column + offset
    data_luad_trans.iloc[:,i] = transformed.iloc[:,i]
data_luad_trans = transform(data_luad_trans,w)

In [14]:
# Save the transformed LUAD frame; to_csv also writes the DataFrame index by default.
data_luad_trans.to_csv("../data/data_luad_trans.csv")

In [11]:
# LUSC: load the table, compute the row weights, transform the selected
# columns in worker processes, then transform OS_MONTHS itself.
data_lusc       = getData('LUSC')
w,data_lusc     = getWeight(data_lusc)
Transformcolumn = TransformColumn(data_lusc,w,start_column,end_column)
result          = Transformcolumn.multipleColumnsTransform()
no_column       = len(result)
data_lusc_trans = data_lusc.copy()
# result[k] is the frame returned for column start_column + k; copy only that
# column back.  Using start_column (not a literal 2) keeps this loop aligned
# with the range that was actually transformed.
for offset, transformed in enumerate(result):
    i = start_column + offset
    data_lusc_trans.iloc[:,i] = transformed.iloc[:,i]
data_lusc_trans = transform(data_lusc_trans,w)

In [15]:
# Save the transformed LUSC frame; to_csv also writes the DataFrame index by default.
data_lusc_trans.to_csv("../data/data_lusc_trans.csv")