In [1]:
'''
preprocess feature seletion data
'''

'\npreprocess feature seletion data\n'

In [1]:
import numpy as np
import pandas as pd 
import os 

Read Data

In [4]:
path_dir = './data/feature_selection_test.csv'
df = pd.read_csv(path_dir)
df.head(5)

Unnamed: 0,Username,Mobility,ArmSwelling,BreastSwelling,Skin,PAS,FHT,DISCOMFORT,SYM_COUNT,TIME_LAPSE,...,chemo_numbered,Radiation,Age,SLNB_Removed_LN,ALND_Removed_LN,SLNB_ALND_Removed,Mastectomy,Lumpectomy,Hormonal,BMI
0,ML380,2,0,2,0,1,0,1,5,0.8,...,1,1,70.0,1,0,0,0,1,1.0,39.8
1,ML381,1,0,0,1,0,0,1,4,2.2,...,1,1,54.0,2,0,0,0,1,1.0,25.2
2,ML382,0,0,0,0,0,0,0,0,6.3,...,1,1,70.0,0,0,7,0,1,1.0,33.8
3,ML383,0,0,0,0,1,3,3,10,4.3,...,1,1,40.0,0,0,15,0,1,0.0,21.1
4,ML384,0,0,0,0,0,0,0,0,3.1,...,0,1,76.0,1,0,0,0,1,1.0,27.2


In [5]:
print('columns:')
print(df.columns)

columns:
Index(['Username', 'Mobility', 'ArmSwelling', 'BreastSwelling', 'Skin', 'PAS',
       'FHT', 'DISCOMFORT', 'SYM_COUNT', 'TIME_LAPSE', 'LVC',
       'ChestWallSwelling', 'fluid_total', 'chemo_numbered', 'Radiation',
       'Age', 'SLNB_Removed_LN', 'ALND_Removed_LN', 'SLNB_ALND_Removed',
       'Mastectomy', 'Lumpectomy', 'Hormonal', 'BMI'],
      dtype='object')


Select Features

In [6]:
data_df = df.rename({'chemo_numbered':'Chemotherapy'}, axis=1)
print('columns:')
print(data_df.columns)
print('shape:')
print(data_df.shape)

columns:
Index(['Username', 'Mobility', 'ArmSwelling', 'BreastSwelling', 'Skin', 'PAS',
       'FHT', 'DISCOMFORT', 'SYM_COUNT', 'TIME_LAPSE', 'LVC',
       'ChestWallSwelling', 'fluid_total', 'Chemotherapy', 'Radiation', 'Age',
       'SLNB_Removed_LN', 'ALND_Removed_LN', 'SLNB_ALND_Removed', 'Mastectomy',
       'Lumpectomy', 'Hormonal', 'BMI'],
      dtype='object')
shape:
(191, 23)


Convert feature value

Data Filtering

In [7]:
# remove specfic data point 
data_df = data_df[data_df.Username != 'ML509']

In [8]:
# fill 0 for missing data
for column in ['Mobility', 'ArmSwelling', 'BreastSwelling', 'Skin', 'ChestWallSwelling', 'Chemotherapy', \
               'Radiation', 'SLNB_Removed_LN', 'ALND_Removed_LN', 'SLNB_ALND_Removed']:
    data_df[column].fillna(0, inplace=True)

In [9]:
data_df = data_df.dropna(subset=['TIME_LAPSE', 'BMI', 'Age', 'LVC'])
print('shape:')
print(data_df.shape)

shape:
(190, 23)


In [10]:
data_df = data_df.apply(lambda x: pd.to_numeric(x, errors='coerce'))
# drop rows containing nan
data_df = data_df.dropna()

In [11]:
data_df[['Mobility', 'ArmSwelling', 'BreastSwelling', 'Skin', 'ChestWallSwelling', 'Chemotherapy', \
               'Radiation', 'SLNB_Removed_LN', 'ALND_Removed_LN', 'SLNB_ALND_Removed']].isnull().values.any()

False

<b> Combine all three dataset and divide it to train/test<b>

In [14]:
import numpy as np
import pandas as pd 
import os 
from expert_tree import get_expert_tree_results, Expert_Tree

In [15]:
csvs = [os.path.join('./data/', file) for file in os.listdir('./data/') if '.csv' in file]
pds = [pd.read_csv(csv) for csv in csvs]
print('data included:', csvs)

data included: ['./data/feature_selection_preprocessed_data.csv', './data/feature_selection_test.csv', './data/feature_selection_kinect_test.csv']


process feature_selection_preprocessed_data.csv

In [16]:
delete_rows = ['A001', 'A003', 'A004', 'A005', 'A015', 'A016',
               'A018', 'A025', 'A026', 'A029', 'A031', 'A032',
               'A035', 'A036', 'A038', 'A042', 'A046', 'A055',
               'T010', 'T013', 'T014', 'T016', 'T018', 'T019',
               'T021', 'T026', 'T029', 'T030', 'T036', 'T038',
               'T040', 'T043', 'T044', 'T054', 'T055', 'T056',
               'T057'] 

data = pds[0]
# drop list of subjects
data = data[~data.Username.isin(delete_rows)]
# drop data within a 6 months
data = data[data.TIME_LAPSE >=0.5]
data = data.drop(columns=['Unnamed: 0'])

# add log of time elapsed
data['TIME_LAPSE_LOG'] = np.log(data['TIME_LAPSE'])

# the data_pre contain '#DEV/0!' or "" to make the coloumn string instead of float
# and these dirty element would cause error in .astype operation
# here convert "" or  '#DEV/0!' to nan
data.iloc[:,1:] = data.iloc[:,1:].apply(lambda x: pd.to_numeric(x, errors='coerce'))
# drop rows containing nan
data = data.dropna()

# generate three class label 
labels_3, _, flags1 = get_expert_tree_results(data, is_Kinect=False, class_number=3)
print(np.unique(labels_3, return_counts=True))


data["3class_label"] = labels_3

# drop LVC and time time elapse
data = data.drop(columns=['LVC', 'TIME_LAPSE', 'fluid_total'])
data = data.drop(columns=['Number_nodes'])
print('shape:')
print(data.shape)

pds[0] = data


(array([0, 1, 2]), array([443, 150, 266]))
shape:
(859, 22)


In [17]:
[flag for flag in flags1 if 'u' in flag]

['u']

process feature_selection_test.csv

In [18]:
pds[1]

Unnamed: 0,Username,Mobility,ArmSwelling,BreastSwelling,Skin,PAS,FHT,DISCOMFORT,SYM_COUNT,TIME_LAPSE,...,chemo_numbered,Radiation,Age,SLNB_Removed_LN,ALND_Removed_LN,SLNB_ALND_Removed,Mastectomy,Lumpectomy,Hormonal,BMI
0,ML380,2,0,2,0,1,0,1,5,0.8,...,1,1,70.0,1,0,0,0,1,1.0,39.8
1,ML381,1,0,0,1,0,0,1,4,2.2,...,1,1,54.0,2,0,0,0,1,1.0,25.2
2,ML382,0,0,0,0,0,0,0,0,6.3,...,1,1,70.0,0,0,7,0,1,1.0,33.8
3,ML383,0,0,0,0,1,3,3,10,4.3,...,1,1,40.0,0,0,15,0,1,0.0,21.1
4,ML384,0,0,0,0,0,0,0,0,3.1,...,0,1,76.0,1,0,0,0,1,1.0,27.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,ML566,0,0,0,0,0,0,3,1,3.3,...,1,0,43.0,0,0,11,1,0,1.0,35.4
187,ML567,3,0,0,0,1,0,2,6,1.5,...,0,1,65.0,2,0,0,0,1,1.0,22.3
188,ML568,0,0,0,0,0,0,0,0,12,...,1,1,61.0,0,14,0,1,1,1.0,20.7
189,ML569,1,1,3,1,1,2,1,14,0.3,...,1,0,53.0,0,0,3,1,0,0.0,35.2


In [19]:
data = pds[1]

data = data.rename({'chemo_numbered':'Chemotherapy'}, axis=1)
# remove specfic data point 
data = data[data.Username != 'ML509']
# fill 0 for missing data
for column in ['Mobility', 'ArmSwelling', 'BreastSwelling', 'Skin', 'ChestWallSwelling', 'Chemotherapy', \
               'Radiation', 'SLNB_Removed_LN', 'ALND_Removed_LN', 'SLNB_ALND_Removed']:
    data[column].fillna(0, inplace=True)
# drop data missing important feature
data = data.dropna(subset=['TIME_LAPSE', 'BMI', 'Age', 'LVC'])
# delete Username
#data = data.drop(columns='Username')
# convert string to float
#data_test = data_test.apply(lambda x: pd.to_numeric(x, errors='coerce'))
data.iloc[:,1:] = data.iloc[:,1:].astype(np.float64)
# drop rows containing nan
data = data.dropna()
# drop data within a 6 months
data = data[data.TIME_LAPSE >=0.5]
# add log of time elapsed
data['TIME_LAPSE_LOG'] = np.log(data['TIME_LAPSE'])

#print('shape:')
#print(data.shape)

# generate three class label 
labels_3, _, flags2 = get_expert_tree_results(data, is_Kinect=False, class_number=3)
print(np.unique(labels_3, return_counts=True))

data["3class_label"] = labels_3

# drop LVC and time time elapse
data = data.drop(columns=['LVC', 'TIME_LAPSE', 'fluid_total'])
print('shape:')
print(data.shape)

pds[1] = data


(array([0, 1, 2]), array([134,  15,  27]))
shape:
(176, 22)


In [20]:
[flag for flag in flags2 if 'u' in flag]

[]

process feature_selection_kinect_test

In [21]:
data = pds[2]
# drop data within a 6 months
data = data[data.TIME_LAPSE >=0.5]
# data = data.drop(columns=['Username'])

# add log of time elapsed
data['TIME_LAPSE_LOG'] = np.log(data['TIME_LAPSE'])

# the data_pre contain '#DEV/0!' or "" to make the coloumn string instead of float
# and these dirty element would cause error in .astype operation
# here convert "" or  '#DEV/0!' to nan
#data.iloc[:,1:] = data.iloc[:,1:].apply(lambda x: pd.to_numeric(x, errors='coerce'))
data.iloc[:,1:] = data.iloc[:,1:].astype(np.float64)
# drop rows containing nan
data = data.dropna()

# generate three class label 
labels_3, _, flags3 = get_expert_tree_results(data, is_Kinect=True, class_number=3)
print(np.unique(labels_3, return_counts=True))

data["3class_label"] = labels_3

# drop LVC and time time elapse
data = data.drop(columns=['LVC', 'TIME_LAPSE', 'fluid_total'])
data = data.drop(columns=['Number_nodes'])
print('shape:')
print(data.shape)

pds[2] = data

(array([0, 1, 2]), array([ 6,  6, 18]))
shape:
(30, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['TIME_LAPSE_LOG'] = np.log(data['TIME_LAPSE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [22]:
[flag for flag in flags3 if 'u' in flag]

[]

In [21]:
# comapre 
trainset = pd.read_csv('./data/result_data/split_train_Oct21_stratified.csv')
valset = pd.read_csv('./data/result_data/split_test_Oct21_stratified.csv')

In [26]:
merged_data_2 = merged_data.set_index('Username')

Unnamed: 0_level_0,Mobility,ArmSwelling,BreastSwelling,Skin,PAS,FHT,DISCOMFORT,SYM_COUNT,ChestWallSwelling,Chemotherapy,...,Age,SLNB_Removed_LN,ALND_Removed_LN,SLNB_ALND_Removed,Mastectomy,Lumpectomy,Hormonal,BMI,TIME_LAPSE_LOG,3class_label
Username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B001_year,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,...,40.0,1.0,0.0,0.0,1.0,0.0,0.0,22.1,0.000000,0
B002_year,0.0,2.0,3.0,0.0,0.0,1.0,1.0,7.0,0.0,1.0,...,63.0,0.0,14.0,0.0,0.0,1.0,0.0,38.7,0.000000,2
B003_year,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,42.0,2.0,0.0,0.0,1.0,0.0,1.0,32.6,0.000000,0
B004_year,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0,0.0,1.0,...,47.0,5.0,0.0,0.0,1.0,0.0,1.0,42.6,0.000000,0
B005_year,2.0,1.0,0.0,0.0,2.0,0.0,2.0,11.0,1.0,1.0,...,33.0,0.0,16.0,0.0,1.0,0.0,0.0,17.4,0.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K026,0.0,1.0,0.0,0.0,1.0,2.0,2.0,9.0,0.0,1.0,...,72.0,0.0,0.0,23.0,1.0,0.0,0.0,31.0,2.197225,2
K027,1.0,0.0,0.0,0.0,2.0,1.0,2.0,8.0,0.0,1.0,...,66.0,0.0,0.0,15.0,1.0,0.0,1.0,25.4,1.609438,0
K028,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,57.0,0.0,0.0,17.0,1.0,1.0,1.0,21.6,2.302585,0
K029,2.0,3.0,2.0,0.0,2.0,2.0,2.0,13.0,2.0,0.0,...,49.0,3.0,0.0,0.0,1.0,0.0,1.0,37.3,1.791759,2


In [30]:
generated_set = pd.concat([trainset, valset], axis=0)
generated_set = generated_set.set_index('Username')
generated_set

Unnamed: 0_level_0,Unnamed: 0,Mobility,ArmSwelling,BreastSwelling,Skin,PAS,FHT,DISCOMFORT,SYM_COUNT,ChestWallSwelling,...,Age,SLNB_Removed_LN,ALND_Removed_LN,SLNB_ALND_Removed,Mastectomy,Lumpectomy,Hormonal,BMI,TIME_LAPSE_LOG,3class_label
Username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B001_year,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,...,40.0,1.0,0.0,0.0,1.0,0.0,0.0,22.1,0.000000,0
B002_year,1,0.0,2.0,3.0,0.0,0.0,1.0,1.0,7.0,0.0,...,63.0,0.0,14.0,0.0,0.0,1.0,0.0,38.7,0.000000,2
B003_year,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,42.0,2.0,0.0,0.0,1.0,0.0,1.0,32.6,0.000000,0
B004_year,3,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0,0.0,...,47.0,5.0,0.0,0.0,1.0,0.0,1.0,42.6,0.000000,0
B005_year,4,2.0,1.0,0.0,0.0,2.0,0.0,2.0,11.0,1.0,...,33.0,0.0,16.0,0.0,1.0,0.0,0.0,17.4,0.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K024,1058,3.0,4.0,0.0,2.0,2.0,4.0,4.0,16.0,0.0,...,53.0,0.0,16.0,0.0,1.0,0.0,1.0,30.5,2.197225,2
K025,1059,3.0,1.0,0.0,0.0,2.0,1.0,3.0,14.0,1.0,...,72.0,0.0,5.0,0.0,0.0,1.0,1.0,22.5,0.693147,2
K026,1060,0.0,1.0,0.0,0.0,1.0,2.0,2.0,9.0,0.0,...,72.0,0.0,0.0,23.0,1.0,0.0,0.0,31.0,2.197225,2
K027,1061,1.0,0.0,0.0,0.0,2.0,1.0,2.0,8.0,0.0,...,66.0,0.0,0.0,15.0,1.0,0.0,1.0,25.4,1.609438,0


In [74]:
merged_data_3 = merged_data_2.drop_duplicates()
euqals = []
for i, row in generated_set.iterrows():
    label1 = row['3class_label']
    label2 = merged_data_3.loc[i]['3class_label']
    euqals.append((label1,label2))

In [75]:
euqals = [i==j for i,j in euqals]
np.all(euqals)

True

In [66]:
merged_data_3 = merged_data_2.drop_duplicates()

In [None]:
merged_data_3['3class_label'] == 

merge 3 dataset

In [22]:
merged_data = pd.concat(pds, axis=0)
print("shape")
print(merged_data.shape)

shape
(1065, 22)


In [None]:
labels = merged_data['3class_label'].values

In [None]:
values, counts = np.unique(labels, return_counts=True)
print(values)
print(counts)
ratios = counts / len(labels)
print(ratios)

Final Process:  if a patient has both Lumpectomy=1 and Mastectomy=1, we consider it Mastecotomy=1 and Lumpectomy=0

In [None]:
merged_data.loc[(merged_data['Lumpectomy']==1) & (merged_data['Mastectomy']==1), 'Lumpectomy'] = 0

In [None]:
merged_data[(merged_data['Lumpectomy']==1) & (merged_data['Mastectomy']==1)].shape

In [None]:
merged_data.to_csv('./data/result_data/whole_dataset_Oct21.csv')

Divide Train and Validation

In [2]:
data = pd.read_csv('./data/result_data/whole_dataset_Oct21.csv')
data = data.drop(columns=['Unnamed: 0'])
data = data.drop_duplicates(subset=['Username'])
print("shape: ")
print(data.shape)

shape: 
(1064, 22)


In [8]:
_, counts = np.unique(data['3class_label'].values, return_counts=True)
counts

array([583, 170, 311])

In [10]:
ratio = counts / np.sum(counts)
ratio

array([0.54793233, 0.15977444, 0.29229323])

In [11]:
subset = data[data['Username'].str.contains('K|ML')]

In [15]:
test_names = []
for label in range(3):
    ids = subset[subset['3class_label']==label]['Username'].values
    ids = ids[np.random.choice(range(len(ids)),size=int(180*ratio[label]),replace=False)]
    test_names += ids.tolist()

In [16]:
testset = data.loc[data['Username'].isin(test_names)]
print(testset.shape)

(178, 22)


In [21]:
_, a = np.unique(testset['3class_label'].values, return_counts=True)
a

array([98, 28, 52])

In [22]:
a / np.sum(a)

array([0.5505618 , 0.15730337, 0.29213483])

In [23]:
testset.to_csv('./data/result_data/split_test_Oct21_stratified.csv')

In [24]:
trainset = data[~data['Username'].isin(test_names)]
print(trainset.shape)
trainset.to_csv('./data/result_data/split_train_Oct21_stratified.csv')

(886, 22)


In [25]:
_, a = np.unique(trainset['3class_label'].values, return_counts=True)
a

array([485, 142, 259])

In [26]:
a / np.sum(a)

array([0.54740406, 0.16027088, 0.29232506])

In [None]:
list(set(testset['Username']) & set(trainset['Username']))

In [None]:
DATA_PATH = './data/result_data/split_train_Oct21.csv'
data = pd.read_csv(DATA_PATH)
data = data.drop(columns=['Unnamed: 0', 'Username'])

In [None]:
data.iloc[:,:-1]

In [None]:
data = pd.read_csv('./data/result_data/split_train_Oct21.csv')

In [None]:
data

In [None]:
data = pd.read_csv('./data/feature_selection_preprocessed_data.csv')

delete_rows = ['A001', 'A003', 'A004', 'A005', 'A015', 'A016',
               'A018', 'A025', 'A026', 'A029', 'A031', 'A032',
               'A035', 'A036', 'A038', 'A042', 'A046', 'A055',
               'T010', 'T013', 'T014', 'T016', 'T018', 'T019',
               'T021', 'T026', 'T029', 'T030', 'T036', 'T038',
               'T040', 'T043', 'T044', 'T054', 'T055', 'T056',
               'T057'] 

# drop list of subjects
data = data[~data.Username.isin(delete_rows)]
# drop data within a 6 months
data = data[data.TIME_LAPSE >=0.5]
data = data.drop(columns=['Username', 'Unnamed: 0'])

# add log of time elapsed
data['TIME_LAPSE_Log'] = np.log(data['TIME_LAPSE'])

# the data_pre contain '#DEV/0!' or "" to make the coloumn string instead of float
# and these dirty element would cause error in .astype operation
# here convert "" or  '#DEV/0!' to nan
data = data.apply(lambda x: pd.to_numeric(x, errors='coerce'))
# drop rows containing nan
data = data.dropna()


# generate three class label 
labels_3, _, _ = get_expert_tree_results(data, 3)

data["3class_label"] = labels_3

# drop LVC and time time elapse
data = data.drop(columns=['LVC', 'TIME_LAPSE', 'fluid_total'])
data = data.drop(columns=['Number_nodes'])
print('shape:')
print(data.shape)

In [None]:
values, counts = np.unique(labels_3, return_counts=True)

In [None]:
counts

In [None]:
label 0: 385 label 1: 219, label 2: 254