In [1]:
import pandas as pd
import numpy as np

In [4]:
BASE_DIR = '/srv/app/data'

DATA_DIR = BASE_DIR + '/data'

TRAIN_LABELS = DATA_DIR + '/stage_2_train.csv'

In [5]:
traindf = pd.read_csv(TRAIN_LABELS)
traindf.head()

Unnamed: 0,ID,Label
0,ID_12cadc6af_epidural,0
1,ID_12cadc6af_intraparenchymal,0
2,ID_12cadc6af_intraventricular,0
3,ID_12cadc6af_subarachnoid,0
4,ID_12cadc6af_subdural,0


In [6]:
# Apart label
label = traindf.Label.values

# Split Colum
traindf = traindf.ID.str.rsplit("_", n=1, expand=True)
traindf.loc[:, "label"] = label

# Rename Columns
traindf = traindf.rename({0: "id", 1: "subtype"}, axis=1)
traindf.head()

Unnamed: 0,id,subtype,label
0,ID_12cadc6af,epidural,0
1,ID_12cadc6af,intraparenchymal,0
2,ID_12cadc6af,intraventricular,0
3,ID_12cadc6af,subarachnoid,0
4,ID_12cadc6af,subdural,0


In [7]:
# Pivot Dataframe
traindf = pd.pivot_table(traindf, index="id", columns="subtype", values="label")
traindf = traindf.reset_index()
traindf.head()

subtype,id,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0


In [8]:
traindf.loc[:, 'normal'] = abs(traindf.loc[:, 'any'] - 1)

In [9]:
traindf.head()

subtype,id,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,normal
0,ID_000012eaf,0,0,0,0,0,0,1
1,ID_000039fa0,0,0,0,0,0,0,1
2,ID_00005679d,0,0,0,0,0,0,1
3,ID_00008ce3c,0,0,0,0,0,0,1
4,ID_0000950d7,0,0,0,0,0,0,1


In [10]:
traindf.to_csv(DATA_DIR + '/stage_2_train_pivoted.csv', index=False)

# Add position to labels for multitask training

In [11]:
metadata = pd.read_csv(DATA_DIR + '/stage_2_train_ordered.csv')[['SOP Instance UID', 'Study Instance UID','Image Position z']]
metadata.columns = ['id', 'study', 'z']
agg = metadata.groupby('study').agg({'z': {'min_z': np.min, 'max_z': np.max}})
metadata = pd.merge(metadata, agg, on = 'study')
metadata = metadata.rename(columns = {('z', 'min_z') : 'min_z', ('z', 'max_z') : 'max_z'})
metadata['z_min_max'] = (metadata['z'] - metadata['min_z']) / (metadata['max_z'] - metadata['min_z'])

in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


In [12]:
traindf_z = pd.merge(traindf, metadata[['id', 'z_min_max']], on = 'id')

In [13]:
traindf_z.head()

Unnamed: 0,id,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,normal,z_min_max
0,ID_000012eaf,0,0,0,0,0,0,1,0.258064
1,ID_000039fa0,0,0,0,0,0,0,1,0.111102
2,ID_00005679d,0,0,0,0,0,0,1,0.054054
3,ID_00008ce3c,0,0,0,0,0,0,1,0.290449
4,ID_0000950d7,0,0,0,0,0,0,1,0.914286


In [14]:
traindf_z.to_csv(DATA_DIR + '/stage_2_train_pivoted_z.csv', index=False)