**Train on whole dataset and test on test set**

In [30]:
import os
import pandas as pd 
import numpy as np

from expert_tree import get_expert_tree_results, Expert_Tree
from expert_tree_kinect import get_expert_tree_results as get_expert_tree_results2
from expert_tree_kinect import Expert_Tree as Expert_Tree_2

from wrapper import best_first_search_mg

from sklearn.metrics import confusion_matrix

Test define feature subset

In [31]:
feats1 = ['ArmSwelling', 'FHT', 'BreastSwelling', 'Skin', 'DISCOMFORT', 
         'TIME_LAPSE_LOG', 'Mobility', 'PAS', 'BMI', 
         'ChestWallSwelling', 'Chemotherapy', 'Mastectomy']

feats2 = ['ArmSwelling', 'SYM_COUNT', 'FHT', 'BreastSwelling', 'Skin', 'DISCOMFORT', 
         'TIME_LAPSE_LOG', 'Mobility', 'PAS']

feats3 = ['ArmSwelling', 'FHT', 'BreastSwelling', 'Skin', 'DISCOMFORT', 
         'TIME_LAPSE_LOG', 'PAS', 'BMI', 
         'SLNB_Removed_LN', 'Radiation', 'Lumpectomy', 'Chemotherapy']

In [32]:
feats = feats1

Generate Test set

In [33]:
#test_set_dir = './data/feature_selection_test_full_features.csv'
test_set_dir = 'feature_selection_test.csv'
data_test = pd.read_csv(test_set_dir)
#data_test.head(5)
print(data_test.shape)

(191, 23)


In [34]:
# change one column name 
data_test = data_test.rename({'chemo_numbered':'Chemotherapy'}, axis=1)
# remove specfic data point 
data_test = data_test[data_test.Username != 'ML509']
# fill 0 for missing data
for column in ['Mobility', 'ArmSwelling', 'BreastSwelling', 'Skin', 'ChestWallSwelling', 'Chemotherapy', \
               'Radiation', 'SLNB_Removed_LN', 'ALND_Removed_LN', 'SLNB_ALND_Removed']:
    data_test[column].fillna(0, inplace=True)
# drop data missing important feature
data_test = data_test.dropna(subset=['TIME_LAPSE', 'BMI', 'Age', 'LVC'])
# delete Username
data_test = data_test.drop(columns='Username')
# convert string to float
#data_test = data_test.apply(lambda x: pd.to_numeric(x, errors='coerce'))
data_test = data_test.astype(np.float64)
# drop rows containing nan
data_test = data_test.dropna()
# drop data within a 6 months
data_test = data_test[data_test.TIME_LAPSE >=0.5]
# add log of time elapsed
data_test['TIME_LAPSE_LOG'] = np.log(data_test['TIME_LAPSE'])

print('shape:')
print(data_test.shape)

shape:
(176, 23)


In [35]:
# generate 3 labels
Y_test, _, _ = get_expert_tree_results(data_test, class_number=3)

In [36]:
# drop LVC and time time elapse
data_test = data_test.drop(columns=['LVC', 'TIME_LAPSE', 'fluid_total'])
# prent shape and columns
print(f"data shape: {data_test.shape}")
print("columns:")
print(data_test.columns.values)

data shape: (176, 20)
columns:
['Mobility' 'ArmSwelling' 'BreastSwelling' 'Skin' 'PAS' 'FHT' 'DISCOMFORT'
 'SYM_COUNT' 'ChestWallSwelling' 'Chemotherapy' 'Radiation' 'Age'
 'SLNB_Removed_LN' 'ALND_Removed_LN' 'SLNB_ALND_Removed' 'Mastectomy'
 'Lumpectomy' 'Hormonal' 'BMI' 'TIME_LAPSE_LOG']


In [37]:
# get features for model  
X_test = data_test[feats].values
print(f"X shape: {X_test.shape}")
print(f"y length: {len(Y_test)}")

X shape: (176, 12)
y length: 176


 Do the same process for Kinect data

In [38]:
#test_set_dir = './data/feature_selection_test_full_features.csv'
test_set_dir = 'feature_selection_kinect_test.csv'
data_test2 = pd.read_csv(test_set_dir)
#data_test.head(5)
print(data_test2.shape)

(35, 24)


In [39]:
# change one column name 
data_test2 = data_test2.rename({'chemo_numbered':'Chemotherapy'}, axis=1)

# fill 0 for missing data
for column in ['Mobility', 'ArmSwelling', 'BreastSwelling', 'Skin', 'ChestWallSwelling', 'Chemotherapy', \
               'Radiation', 'SLNB_Removed_LN', 'ALND_Removed_LN', 'SLNB_ALND_Removed']:
    data_test2[column].fillna(0, inplace=True)
# drop data missing important feature
data_test2 = data_test2.dropna(subset=['TIME_LAPSE', 'BMI', 'Age', 'LVC'])
# delete Username
data_test2 = data_test2.drop(columns='Username')
# convert string to float
#data_test = data_test.apply(lambda x: pd.to_numeric(x, errors='coerce'))
data_test2 = data_test2.astype(np.float64)
# drop rows containing nan
data_test2 = data_test2.dropna()
# drop data within a 6 months
data_test2 = data_test2[data_test2.TIME_LAPSE >=0.5]
# add log of time elapsed
data_test2['TIME_LAPSE_LOG'] = np.log(data_test2['TIME_LAPSE'])

print('shape:')
print(data_test2.shape)

shape:
(30, 24)


In [40]:
data_test2 = data_test2.drop(columns='Number_nodes')

In [43]:
# generate 3 labels
Y_test2, _, _ = get_expert_tree_results2(data_test2, 3)

In [44]:
# drop LVC and time time elapse
data_test2 = data_test2.drop(columns=['LVC', 'TIME_LAPSE', 'fluid_total'])
# prent shape and columns
print(f"data shape: {data_test.shape}")
print("columns:")
print(data_test2.columns.values)

data shape: (176, 20)
columns:
['Mobility' 'ArmSwelling' 'BreastSwelling' 'Skin' 'PAS' 'FHT' 'DISCOMFORT'
 'SYM_COUNT' 'ChestWallSwelling' 'Chemotherapy' 'Radiation' 'Age'
 'SLNB_Removed_LN' 'ALND_Removed_LN' 'SLNB_ALND_Removed' 'Mastectomy'
 'Lumpectomy' 'Hormonal' 'BMI' 'TIME_LAPSE_LOG']


In [45]:
# get features for model  
X_test2 = data_test2[feats].values
print(f"X shape: {X_test2.shape}")
print(f"y length: {len(Y_test2)}")

X shape: (30, 12)
y length: 30


In [46]:
print( X_test.shape, len(Y_test) , X_test2.shape,len(Y_test2)) #verify the lengtth

(176, 12) 176 (30, 12) 30


In [47]:
#stick X_test2 under X_test, adn Y_test2 under Y_test


In [48]:
np.concatenate((X_test, X_test2), axis=0)[0] ==X_test[0]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [49]:
 np.concatenate((X_test, X_test2), axis=0)[-1] ==X_test2[-1] #sucesfully X_Test, X_test2 COncatenated

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [50]:
X_test= np.concatenate((X_test, X_test2), axis=0)

In [51]:
Y_test.extend(Y_test2)

Training set

In [52]:
# define which dataset to use 
IF_USE_PRE_DATA = 1

delete_rows = ['A001', 'A003', 'A004', 'A005', 'A015', 'A016',
               'A018', 'A025', 'A026', 'A029', 'A031', 'A032',
               'A035', 'A036', 'A038', 'A042', 'A046', 'A055',
               'T010', 'T013', 'T014', 'T016', 'T018', 'T019',
               'T021', 'T026', 'T029', 'T030', 'T036', 'T038',
               'T040', 'T043', 'T044', 'T054', 'T055', 'T056',
               'T057'] 

# read in dataset 
DATA_PATH = ('./data/META_data_correct_v5.csv', './data/feature_selection_preprocessed_data.csv')[IF_USE_PRE_DATA > 0]
data = pd.read_csv(DATA_PATH)
print(f'data shape: {data.shape}')
data = data[data.Username != 'ML380']

# drop list of subjects
data = data[~data.Username.isin(delete_rows)]
# drop data within a 6 months
data = data[data.TIME_LAPSE >=0.5]
data = data.drop(columns=['Username', 'Unnamed: 0'] if IF_USE_PRE_DATA else 'Username')

# the data_pre contain '#DEV/0!' or "" to make the coloumn string instead of float
# and these dirty element would cause error in .astype operation
# here convert "" or  '#DEV/0!' to nan
#data1 = data
data = data.apply(lambda x: pd.to_numeric(x, errors='coerce'))
# drop rows containing nan
data = data.dropna()

# add log of time elapsed
data['TIME_LAPSE_LOG'] = np.log(data['TIME_LAPSE'])

# prent shape and columns
print(f"data shape: {data.shape}")
print("columns:")
print(data.columns.values)

data shape: (906, 25)
data shape: (858, 24)
columns:
['Mobility' 'ArmSwelling' 'BreastSwelling' 'Skin' 'PAS' 'FHT' 'DISCOMFORT'
 'SYM_COUNT' 'TIME_LAPSE' 'LVC' 'ChestWallSwelling' 'fluid_total'
 'Chemotherapy' 'Radiation' 'Age' 'SLNB_Removed_LN' 'ALND_Removed_LN'
 'SLNB_ALND_Removed' 'Number_nodes' 'Mastectomy' 'Lumpectomy' 'Hormonal'
 'BMI' 'TIME_LAPSE_LOG']


In [53]:
# generate 3 labels
Y_train, _, _ = get_expert_tree_results(data, class_number=3)

In [54]:
# drop LVC and time time elapse
data = data.drop(columns=['LVC', 'TIME_LAPSE', 'fluid_total', 'Number_nodes'])
# prent shape and columns
print(f"data shape: {data.shape}")
print("columns:")
print(data.columns.values)
# get features for model  
X_train = data[feats].values
print(f"X shape: {X_train.shape}")
print(f"y length: {len(Y_train)}")

data shape: (858, 20)
columns:
['Mobility' 'ArmSwelling' 'BreastSwelling' 'Skin' 'PAS' 'FHT' 'DISCOMFORT'
 'SYM_COUNT' 'ChestWallSwelling' 'Chemotherapy' 'Radiation' 'Age'
 'SLNB_Removed_LN' 'ALND_Removed_LN' 'SLNB_ALND_Removed' 'Mastectomy'
 'Lumpectomy' 'Hormonal' 'BMI' 'TIME_LAPSE_LOG']
X shape: (858, 12)
y length: 858


Classifier

In [55]:
# gradient boosting tree
from sklearn.ensemble import GradientBoostingClassifier
params = {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 70}
gbt = GradientBoostingClassifier(**params)
gbt

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=2,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=70,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

**Train and test using feature subset**

In [56]:
print(f'X train: {X_train.shape}')
print(f'Y train: {len(Y_train)}')
print(f'X test: {X_test.shape}')
print(f'Y test: {len(Y_test)}')

X train: (858, 12)
Y train: 858
X test: (206, 12)
Y test: 206


In [57]:
gbt= gbt.fit(X_train, Y_train)
feature_weight_pair = sorted(zip(feats, gbt.feature_importances_), key=lambda pair : pair[1], reverse=True)
print('learnt importance:')
feature_weight_pair

learnt importance:


[('ArmSwelling', 0.6258789879114158),
 ('FHT', 0.11936156606848035),
 ('BreastSwelling', 0.07109668201966737),
 ('Skin', 0.05518804564892675),
 ('DISCOMFORT', 0.045725080605642454),
 ('TIME_LAPSE_LOG', 0.027881689237236773),
 ('Mobility', 0.023433218436228597),
 ('PAS', 0.018299294972359846),
 ('BMI', 0.009224283243041006),
 ('ChestWallSwelling', 0.003911151857001204),
 ('Chemotherapy', 0.0),
 ('Mastectomy', 0.0)]

In [58]:
accu = gbt.score(X_test, Y_test)
print(f'accuracy: {accu}')
CM = confusion_matrix(Y_test, gbt.predict(X_test))
print('confusion matrix:')
print(CM)

accuracy: 0.9320388349514563
confusion matrix:
[[127   1   0]
 [  5  30   0]
 [  0   8  35]]


**Train and test using all feature**

In [59]:
data_test2.columns 

Index(['Mobility', 'ArmSwelling', 'BreastSwelling', 'Skin', 'PAS', 'FHT',
       'DISCOMFORT', 'SYM_COUNT', 'ChestWallSwelling', 'Chemotherapy',
       'Radiation', 'Age', 'SLNB_Removed_LN', 'ALND_Removed_LN',
       'SLNB_ALND_Removed', 'Mastectomy', 'Lumpectomy', 'Hormonal', 'BMI',
       'TIME_LAPSE_LOG'],
      dtype='object')

In [60]:
X_test = data_test.values
X_test2 = data_test2.values

In [61]:
print(X_test.shape , X_test2.shape)

(176, 20) (30, 20)


In [62]:
np.concatenate((X_test, X_test2), axis=0).shape

(206, 20)

In [63]:
X_train = data.values
X_test = np.concatenate((X_test, X_test2), axis=0)
print(f'X train: {X_train.shape}')
print(f'Y train: {len(Y_train)}')
print(f'X test: {X_test.shape}')
print(f'Y test: {len(Y_test)}')

X train: (858, 20)
Y train: 858
X test: (206, 20)
Y test: 206


In [64]:
gbt= gbt.fit(X_train, Y_train)
feature_weight_pair = sorted(zip(data.columns.values, gbt.feature_importances_), key=lambda pair : pair[1], reverse=True)
print('learnt importance:')
feature_weight_pair

learnt importance:


[('ArmSwelling', 0.5992472470765142),
 ('SYM_COUNT', 0.15317670646953313),
 ('FHT', 0.06027047071576672),
 ('BreastSwelling', 0.057588135189392674),
 ('Skin', 0.04214504429255917),
 ('DISCOMFORT', 0.02860573235263586),
 ('TIME_LAPSE_LOG', 0.025565996176799183),
 ('Mobility', 0.010626149575466185),
 ('BMI', 0.006766147049681877),
 ('PAS', 0.003111608707692861),
 ('ChestWallSwelling', 0.0030876563832920433),
 ('Age', 0.0029442990095847627),
 ('SLNB_ALND_Removed', 0.0025788748207408473),
 ('ALND_Removed_LN', 0.0025617524720380975),
 ('SLNB_Removed_LN', 0.0011608676028077424),
 ('Radiation', 0.000352369989567843),
 ('Lumpectomy', 0.00021094211592681013),
 ('Chemotherapy', 0.0),
 ('Mastectomy', 0.0),
 ('Hormonal', 0.0)]

In [65]:
accu = gbt.score(X_test, Y_test)
print(f'accuracy: {accu}')
CM = confusion_matrix(Y_test, gbt.predict(X_test))
print('confusion matrix:')
print(CM)

accuracy: 0.9223300970873787
confusion matrix:
[[127   1   0]
 [  6  28   1]
 [  0   8  35]]


Check if there are sample overlap between the train and validation:

In [66]:
dir1 = 'feature_selection_test.csv'
dataTest = pd.read_csv(dir1)
dataTrain = pd.read_csv(DATA_PATH)
s1 = pd.merge(dataTest, dataTrain, how='inner', on=['Username'])
s1

Unnamed: 0,Username,Mobility_x,ArmSwelling_x,BreastSwelling_x,Skin_x,PAS_x,FHT_x,DISCOMFORT_x,SYM_COUNT_x,TIME_LAPSE_x,...,Radiation_y,Age_y,SLNB_Removed_LN_y,ALND_Removed_LN_y,SLNB_ALND_Removed_y,Number_nodes,Mastectomy_y,Lumpectomy_y,Hormonal_y,BMI_y
0,ML380,2,0,2,0,1,0,1,5,0.8,...,1,70.0,1.0,0.0,0.0,1.0,0,1,1,39.8


In [67]:
dir2 =  'feature_selection_kinect_test.csv'
dataTest = pd.read_csv(dir2)
dataTrain = pd.read_csv(DATA_PATH)
s1 = pd.merge(dataTest, dataTrain, how='inner', on=['Username'])
s1

Unnamed: 0,Username,Mobility_x,ArmSwelling_x,BreastSwelling_x,Skin_x,PAS_x,FHT_x,DISCOMFORT_x,SYM_COUNT_x,TIME_LAPSE_x,...,Radiation_y,Age_y,SLNB_Removed_LN_y,ALND_Removed_LN_y,SLNB_ALND_Removed_y,Number_nodes_y,Mastectomy_y,Lumpectomy_y,Hormonal_y,BMI_y


In [None]:
# print(sorted(dataTest['Username'].values))

In [None]:
# print(sorted(dataTrain['Username'].values))

Check label distribution

In [69]:
{label:count for label, count in zip(*np.unique(Y_test, return_counts=True))}

{0: 128, 1: 35, 2: 43}

In [70]:
{label:count for label, count in zip(*np.unique(Y_train, return_counts=True))}

{0: 385, 1: 219, 2: 254}

In [None]:
Y_test2, _, _ = get_expert_tree_results2(data_test2, 3)

In [71]:
Y_train, _, _ = get_expert_tree_results(data,4)

KeyError: 'TIME_LAPSE'