## Reorganizing metadata

### str_metadata

In [1]:
# Setup
import IPython

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Base data directory, relative to this notebook. The value ends with '/', and
# the cells below join it as f'{data_dir}/metadata/...', so the resulting paths
# contain a doubled slash.
data_dir = '../data/'

In [3]:
# Load the original tab-separated sample metadata into a pandas DataFrame,
# then preview its first five rows.
metadata = pd.read_csv(
    f'{data_dir}/metadata/sample_metadata.tsv',
    sep='\t',
)
metadata.head()

Unnamed: 0,sampleid,stool_consistency,hct_source,disease,categorical_time_relative_to_engraftment,week_relative_to_hct,timepoint_of_transplant,day_relative_to_nearest_hct,alleged_abduction
0,N4VICF,formed,cord,Myelodysplastic Syndromes,pre,-1.0,6.0,-6.0,0
1,8A0F9A,formed,cord,Leukemia,pre,-2.0,7.0,-7.0,0
2,5Y49IM,semi-formed,cord,Leukemia,peri,-1.0,7.0,0.0,1
3,ZKJI45,semi-formed,cord,Leukemia,post,1.0,7.0,8.0,0
4,2I7SIQ,liquid,cord,Leukemia,peri,-1.0,0.0,0.0,1


In [4]:
# For clearer understanding and better downstream analysis, convert the
# "alleged_abduction" and "week_relative_to_hct" columns to descriptive strings.
#
# Whole values are mapped through lookup tables rather than chained substring
# replacements (Series.str.replace): a substring rule for '1.0' would also fire
# inside '11.0', and a rule for '0' inside '10', producing garbled labels.
# Values that are not in a table are left as their plain string form.
abduction_labels = {'0': 'non_abducted', '1': 'abducted'}
week_labels = {
    '-2.0': 'two weeks before HCT',
    '-1.0': 'one week before HCT',
    '0.0': 'HCT week',
    '1.0': 'one week after HCT',
    '2.0': 'two weeks after HCT',
    'nan': '',  # a missing week becomes an empty string
}

# .values.astype(str) stringifies through NumPy, so a missing value becomes the
# text 'nan' and a float such as -1.0 becomes '-1.0'.
metadata['alleged_abduction'] = metadata['alleged_abduction'].values.astype(str)
metadata['alleged_abduction'] = metadata['alleged_abduction'].replace(abduction_labels)
metadata['week_relative_to_hct'] = metadata['week_relative_to_hct'].values.astype(str)
metadata['week_relative_to_hct'] = metadata['week_relative_to_hct'].replace(week_labels)

In [5]:
# Save the recoded metadata as "str_metadata.tsv", then read the file back
# and preview its first five rows to check what was written.
metadata.to_csv(
    f'{data_dir}/metadata/str_metadata.tsv',
    sep='\t',
    encoding='utf-8',
    escapechar='\n',
    index=False,
)
str_meta = pd.read_csv(
    f'{data_dir}/metadata/str_metadata.tsv',
    sep='\t',
)
str_meta.head()

Unnamed: 0,sampleid,stool_consistency,hct_source,disease,categorical_time_relative_to_engraftment,week_relative_to_hct,timepoint_of_transplant,day_relative_to_nearest_hct,alleged_abduction
0,N4VICF,formed,cord,Myelodysplastic Syndromes,pre,one week before HCT,6.0,-6.0,non_abducted
1,8A0F9A,formed,cord,Leukemia,pre,two weeks before HCT,7.0,-7.0,non_abducted
2,5Y49IM,semi-formed,cord,Leukemia,peri,one week before HCT,7.0,0.0,abducted
3,ZKJI45,semi-formed,cord,Leukemia,post,one week after HCT,7.0,8.0,non_abducted
4,2I7SIQ,liquid,cord,Leukemia,peri,one week before HCT,0.0,0.0,abducted


### str_nan_metadata

In [6]:
# Replace missing values in str_metadata with the literal string 'nan' so that
# there is no error message about missing values.
metadata = pd.read_csv(f'{data_dir}/metadata/str_metadata.tsv', sep='\t')

columns_to_fill = [
    'hct_source',
    'disease',
    'categorical_time_relative_to_engraftment',
    'timepoint_of_transplant',
    'day_relative_to_nearest_hct',
    'week_relative_to_hct',
]
for column in columns_to_fill:
    # Assign the filled column back instead of calling
    # metadata[column].fillna(..., inplace=True): the in-place form operates on
    # a column selection, which warns in recent pandas and leaves the DataFrame
    # unchanged once Copy-on-Write is enabled.
    metadata[column] = metadata[column].fillna('nan')

# Remove the apostrophes from the disease column to avoid error messages.
# NOTE(review): the second replacement also renames "Lymphoma" to "Disease";
# kept as-is because later steps may rely on that label -- confirm it is intended.
metadata['disease'] = metadata['disease'].str.replace("Hodgkin's Disease", 'Hodgkin Disease', regex=False)
metadata['disease'] = metadata['disease'].str.replace("Non-Hodgkin's Lymphoma", 'non-Hodgkin Disease', regex=False)

In [7]:
# Save the NaN-filled metadata as "str_nan_metadata.tsv", then read it back and
# preview the first five rows. One path, built from data_dir, is used for both
# the write and the read so the two can never point at different files.
str_nan_path = f'{data_dir}/metadata/str_nan_metadata.tsv'
metadata.to_csv(str_nan_path, sep='\t', encoding='utf-8', escapechar='\n', index=False)
str_nan_meta = pd.read_csv(str_nan_path, sep='\t')
str_nan_meta.head()

Unnamed: 0,sampleid,stool_consistency,hct_source,disease,categorical_time_relative_to_engraftment,week_relative_to_hct,timepoint_of_transplant,day_relative_to_nearest_hct,alleged_abduction
0,N4VICF,formed,cord,Myelodysplastic Syndromes,pre,one week before HCT,6.0,-6.0,non_abducted
1,8A0F9A,formed,cord,Leukemia,pre,two weeks before HCT,7.0,-7.0,non_abducted
2,5Y49IM,semi-formed,cord,Leukemia,peri,one week before HCT,7.0,0.0,abducted
3,ZKJI45,semi-formed,cord,Leukemia,post,one week after HCT,7.0,8.0,non_abducted
4,2I7SIQ,liquid,cord,Leukemia,peri,one week before HCT,0.0,0.0,abducted
