In [1]:
import os 
import pandas as pd 
import numpy as np 
import re
import collections

In [2]:
# Load the raw export and keep only the columns used by the later cells:
# the 25 BCLSEI items followed by 'Dxlength', 'Age' and 'Lymphedema'.
keep_columns = [f'BCLSEI{n}' for n in range(1, 26)]
keep_columns += ['Dxlength', 'Age', 'Lymphedema']
raw_data = pd.read_csv('./data/newdataset_Fugmail_jc2.csv').loc[:, keep_columns]
print('raw data shape:', raw_data.shape)

raw data shape: (355, 28)


In [3]:
# Keep only the rows that have a recorded 'Dxlength' (time lapse) value.
has_time_lapse = raw_data['Dxlength'].notna()
raw_data = raw_data.loc[has_time_lapse]
print('drop no time lapse', raw_data.shape)

drop no time lapse (329, 28)


In [4]:
# Convert 'Dxlength' to float, then keep only rows where it is at least 0.5.
# assign() builds a new frame instead of writing a column into the result of
# the earlier row filter, which can trigger pandas' SettingWithCopyWarning;
# astype(float) replaces the per-element float() lambda.
raw_data = raw_data.assign(Dxlength=raw_data['Dxlength'].astype(float))
raw_data = raw_data[raw_data['Dxlength'] >= 0.5]
print('drop time elapse < 0.5', raw_data.shape)

drop time elapse < 0.5 (316, 28)


In [5]:
# Replace every remaining missing value with 0.
# NOTE(review): the fill covers all columns, so a missing 'Age' or
# 'Lymphedema' value also becomes 0 — confirm that this is intended.
raw_data = raw_data.fillna(value=0)

In [6]:
# Human-readable label for each questionnaire column 'BCLSEI1' .. 'BCLSEI25'.
# The labels are listed in item order; the key for the k-th label is f'BCLSEI{k}'.
_symptom_labels = (
    'ShoulderMovement',
    'ElbowMovement',
    'WristMovement',
    'FingersMovement',
    'ArmMovement',
    'ArmSwelling',
    'BreastSwelling',
    'Chestswelling',
    'Firmness',
    'Tightness',
    'Heaviness',
    'Toughness or thickness of the skin',
    'Stiffness',
    'Tenderness',
    'Hotness or Increased temperature',
    'Redness',
    'Blister',
    'Arm Pain/Aching/Soreness',
    'Numbness',
    'Burning',
    'Stabbing',
    'Tingling',
    'Hand or arm fatigue',
    'Hand or arm weakness',
    'Pocket of fluid develop (Seroma Formation)',
)
sympmap = {
    f'BCLSEI{item_no}': label
    for item_no, label in enumerate(_symptom_labels, start=1)
}

In [7]:
# Lookup list indexed by the integer code in the 'Age' column: entry 0 is 23,
# entries 1..12 run from 27 to 82 in steps of 5.
# NOTE(review): presumably representative ages for coded age bands — confirm.
agemap = [23] + list(range(27, 83, 5))

In [8]:
# Build the per-patient feature table and write it to 'web_data_processed.csv'.
#
# Reads `raw_data` (the cleaned questionnaire frame) and `agemap` (age code ->
# age lookup list). Features are computed column-wise rather than with a
# per-row iterrows() loop; the result has one row per input row, in the same
# order, under a fresh 0..n-1 index.
src = raw_data.reset_index(drop=True)

# The row-wise loop this replaces received each row from iterrows(), which
# upcasts a mixed int/float row to float; casting explicitly keeps the written
# values formatted the same way.
# NOTE(review): assumes every BCLSEI column and 'Lymphedema' is numeric — confirm.
symptoms = src[[f'BCLSEI{n}' for n in range(1, 26)]].astype(float)

# 'Age' holds an index into `agemap`. Plain list indexing would silently wrap
# negative codes around to the end of the list, so validate the range first.
age_codes = src['Age'].astype(int)
bad_age = (age_codes < 0) | (age_codes >= len(agemap))
if bad_age.any():
    raise ValueError(
        f"'Age' codes outside 0..{len(agemap) - 1}: "
        f"{sorted(age_codes[bad_age].unique().tolist())}"
    )

time_lapse = src['Dxlength']

# Column name -> feature values, in the order the columns are written out.
pdholder = {
    # Worst (maximum) score over the five movement items.
    'Mobility': symptoms[['BCLSEI1', 'BCLSEI2', 'BCLSEI3', 'BCLSEI4', 'BCLSEI5']].max(axis=1),
    'ArmSwelling': symptoms['BCLSEI6'],
    'BreastSwelling': symptoms['BCLSEI7'],
    'Skin': symptoms['BCLSEI12'],
    'PAS': symptoms['BCLSEI18'],
    # Maximum over items 9-11 (firmness, tightness, heaviness in `sympmap`).
    'FHT': symptoms[['BCLSEI9', 'BCLSEI10', 'BCLSEI11']].max(axis=1),
    'DISCOMFORT': symptoms[['BCLSEI13', 'BCLSEI15', 'BCLSEI16', 'BCLSEI19', 'BCLSEI20',
                            'BCLSEI21', 'BCLSEI22', 'BCLSEI23', 'BCLSEI24']].max(axis=1),
    # Number of items with a score above 0.
    # NOTE(review): the range stops at BCLSEI24, so BCLSEI25 is not counted —
    # confirm whether that exclusion is intentional.
    'SYM_COUNT': (symptoms[[f'BCLSEI{n}' for n in range(1, 25)]] > 0).sum(axis=1),
    'TIME_LAPSE': time_lapse,
    'TIME_LAPSE_LOG': np.log(time_lapse),
    'Age': np.asarray(agemap)[age_codes.to_numpy()],
    'ChestWallSwelling': symptoms['BCLSEI8'],
    'label': src['Lymphedema'].astype(float),
}

df = pd.DataFrame(data=pdholder, columns=list(pdholder))
df.to_csv('web_data_processed.csv')