# Notebook to edit metadata for Qiita: Cancer dataset from Liao
Guide: https://qiita.ucsd.edu/static/doc/html/checklist-for-ebi-ena-submission.html#checklist

# Questions:
- Do we want to add the following tables from the paper's dataset (https://www.nature.com/articles/s41597-021-00860-8) to the metadata, or should we just link the paper?
    - tblbc.csv: Daily measurements of white blood cells, platelets and red blood cells for 1,278 patients
    - tbltemperature.csv: temperatures for 1,249 patients
    - tblInfectionsCidPapers.csv: The day of positive blood cultures for 426 patients and microbes (genera Enterococcus, Escherichia, Klebsiella, Enterobacter, Pseudomonas, Stenotrophomonas, and Citrobacter) analysed in previous publications from our team
    - tbldrug.csv: Timing and route of drug administration for 1,278 patients
    - tblhctmeta.csv: The day and source of HCT for 1,278 patients

In [2]:
import os
import random

import pandas as pd

### Import file paths

In [3]:
# Input files, given relative to the notebook's working directory.
# Sample metadata table (loaded with pandas.read_csv further down)
meta_path = 'tblASVsamples_06222022.csv'
# FMT patient table (loaded with pandas.read_csv further down)
fmt_path = 'tblFMT.csv'
# Plain-text list of sequencing file names, one per line
fn_path = '22_07_13_QiitaCancer_Liao_filenames.txt'

### Convert into list/df

In [4]:
# Filename list: one entry per non-empty line of fn_path.
# Iterate over the whole file and skip blank lines. The previous
# `while (line := file.readline().rstrip())` loop stopped at the first blank
# line and silently dropped every file name that followed it.
with open(fn_path) as file:
    fn = [name for name in (line.rstrip() for line in file) if name]
print(fn[:3])

# Metadata df
meta = pd.read_csv(meta_path)
# Number of rows per shotgun accession (value_counts ignores missing values)
print(meta['AccessionShotgun'].value_counts())
meta[:3]

['1000A..pool483_R1.fastq.gz', '1000A..pool483_R2.fastq.gz', '1000B..pool483_R1.fastq.gz']
SRR14092348    1
SRR13697224    1
SRR14092179    1
SRR14092180    1
SRR14092181    1
              ..
SRR14092211    1
SRR14092212    1
SRR14092213    1
SRR13697146    1
SRR14092273    1
Name: AccessionShotgun, Length: 393, dtype: int64


Unnamed: 0,SampleID,PatientID,Timepoint,Consistency,Accession,BioProject,DayRelativeToNearestHCT,AccessionShotgun,Pool
0,1000A,1000,0,formed,SRR11414397,PRJNA545312,-9.0,,483
1,1000B,1000,5,liquid,SRR11414992,PRJNA545312,-4.0,,483
2,1000C,1000,15,liquid,SRR11414991,PRJNA545312,6.0,,483


In [5]:
# FMT (fecal microbiota transplantation) patient table.
# PatientID is renamed to host_subject_id so that it carries the same column
# name as the subject column of the sample metadata it is merged with later.
fmt_meta = pd.read_csv(fmt_path).rename(columns={'PatientID': 'host_subject_id'})
fmt_meta.head(3)

Unnamed: 0,host_subject_id,autoFmtPatientId,RandomizationArm,RandomizationDayRelativeToNearestHCT,FMTDayRelativeToNearestHCT,InitialDonorSampleID,ThawedDonorSampleID,LastPreFMTSampleID,ImmediatePostFMTSampleID
0,FMT.0009,C1,Control,35,,FMT.0009A,,FMT.0009W,
1,FMT.0103,C10,Control,47,,FMT.0103A,,FMT.0103X,
2,FMT.0107,C11,Control,24,,FMT.0107A,,FMT.0107Q,


## Sample Information

In [6]:
# Build the sample information table `s` from the raw metadata.
#
# Columns that are identical for every sample, listed in the order in which
# they must be appended to the table.
sample_constants = {
    # sample_type
    "sample_type": 'stool',
    # taxon_id and scientific_name
    # NOTE(review): confirm that the hyphenated scientific_name is the exact
    # spelling registered for this taxon_id.
    "taxon_id": 408170,
    "scientific_name": 'human-gut-metagenome',
    # env
    "env_biome": 'urban biome',
    "env_feature": 'human-associated habitat',
    "env_material": 'feces',
    "env_package": 'human-gut',
    # elevation/latitude/longitude of sampling site
    # (elevation is stored as a number, latitude/longitude as strings)
    "elevation": 37,
    "latitude": '40.764',
    "longitude": '-73.957',
    # physical_specimen_location
    "physical_specimen_location": 'Molecular Microbiology Facility (MMF)',
    # empo
    "empo_1": 'Host-associated',
    "empo_2": 'Animal',
    "empo_3": 'Animal distal gut',
    # host_common_name
    "host_common_name": 'human',
}

# Renamed columns keep their original position in the table.
sample_renames = {
    'SampleID': 'sample_name',
    'PatientID': 'host_subject_id',
    # collection_timestamp
    'Timepoint': 'de_identified_collection_timestamp',
}

s = (
    meta.rename(columns=sample_renames)
    .assign(**sample_constants)
    # Add additional metadata for select FMT patients; the left join keeps
    # every sample and leaves the FMT columns empty for all other subjects.
    .merge(fmt_meta, how='left', left_on='host_subject_id', right_on='host_subject_id')
)
print(len(s))
s.head(3)

12535


Unnamed: 0,sample_name,host_subject_id,de_identified_collection_timestamp,Consistency,Accession,BioProject,DayRelativeToNearestHCT,AccessionShotgun,Pool,sample_type,...,empo_3,host_common_name,autoFmtPatientId,RandomizationArm,RandomizationDayRelativeToNearestHCT,FMTDayRelativeToNearestHCT,InitialDonorSampleID,ThawedDonorSampleID,LastPreFMTSampleID,ImmediatePostFMTSampleID
0,1000A,1000,0,formed,SRR11414397,PRJNA545312,-9.0,,483,stool,...,Animal distal gut,human,,,,,,,,
1,1000B,1000,5,liquid,SRR11414992,PRJNA545312,-4.0,,483,stool,...,Animal distal gut,human,,,,,,,,
2,1000C,1000,15,liquid,SRR11414991,PRJNA545312,6.0,,483,stool,...,Animal distal gut,human,,,,,,,,


## Prep Information

In [7]:
# Build the prep information table `p` from the raw metadata.
#
# Constant columns are split into two groups because run_prefix, which is
# derived per row, sits between them in the final column order.
prep_constants_before_run_prefix = {
    # primer
    'primer': 'CCGTCAATTYHTTTRAGT',
    # platform
    "platform": 'Illumina',
    # experiment_design_description
    "experiment_design_description": 'https://www.nature.com/articles/s41597-021-00860-8',
    # center_name
    "center_name": 'Molecular Microbiology Facility (MMF) at Memorial Sloan Kettering',
    # center_project_name
    "center_project_name": 'Liao_et_al_Data',
    # library_construction_protocol
    "library_construction_protocol": '16S, V4-V5',
    # instrument_model
    "instrument_model": 'Illumina MiSeq',
    # sequencing method
    # NOTE(review): confirm this column name against the Qiita prep template.
    "sequencing_method": 'Sequencing by synthesis',
    # pcr primers
    # NOTE(review): the column is named 'pcr_primer' (singular) — confirm
    # whether the prep template expects 'pcr_primers'.
    "pcr_primer": 'FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT',
}
prep_constants_after_run_prefix = {
    # run_center
    "run_center": 'Molecular Microbiology Facility (MMF)',
    # target_gene
    'target_gene': '16S rRNA',
    # target_subfragment
    'target_subfragment': 'V4-V5',
}

p = (
    meta.rename(columns={'SampleID': 'sample_name', 'PatientID': 'host_subject_id'})
    .assign(**prep_constants_before_run_prefix)
    # run_prefix: "<sample_name>..pool<Pool>", e.g. 1000A..pool483
    .assign(run_prefix=lambda df: df["sample_name"] + '..pool' + df["Pool"])
    .assign(**prep_constants_after_run_prefix)
    # These columns are dropped from the prep table
    .drop(columns=['Timepoint', 'Consistency', 'DayRelativeToNearestHCT'])
)
print(len(p))
p.head(3)

12535


Unnamed: 0,sample_name,host_subject_id,Accession,BioProject,AccessionShotgun,Pool,primer,platform,experiment_design_description,center_name,center_project_name,library_construction_protocol,instrument_model,sequencing_method,pcr_primer,run_prefix,run_center,target_gene,target_subfragment
0,1000A,1000,SRR11414397,PRJNA545312,,483,CCGTCAATTYHTTTRAGT,Illumina,https://www.nature.com/articles/s41597-021-008...,Molecular Microbiology Facility (MMF) at Memor...,Liao_et_al_Data,"16S, V4-V5",Illumina MiSeq,Sequencing by synthesis,"FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT",1000A..pool483,Molecular Microbiology Facility (MMF),16S rRNA,V4-V5
1,1000B,1000,SRR11414992,PRJNA545312,,483,CCGTCAATTYHTTTRAGT,Illumina,https://www.nature.com/articles/s41597-021-008...,Molecular Microbiology Facility (MMF) at Memor...,Liao_et_al_Data,"16S, V4-V5",Illumina MiSeq,Sequencing by synthesis,"FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT",1000B..pool483,Molecular Microbiology Facility (MMF),16S rRNA,V4-V5
2,1000C,1000,SRR11414991,PRJNA545312,,483,CCGTCAATTYHTTTRAGT,Illumina,https://www.nature.com/articles/s41597-021-008...,Molecular Microbiology Facility (MMF) at Memor...,Liao_et_al_Data,"16S, V4-V5",Illumina MiSeq,Sequencing by synthesis,"FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT",1000C..pool483,Molecular Microbiology Facility (MMF),16S rRNA,V4-V5


### Split Prep Information

#### FMT Split

In [8]:
# Split FMT cases out: every prep row whose host_subject_id contains 'FMT'.
# A vectorised substring test replaces the former row-wise DataFrame.apply.
fmt = p[p['host_subject_id'].str.contains('FMT', regex=False)]
fmt = fmt.reset_index(drop=True)

# Split FMT cases which have additional metadata from paper
# https://pubmed.ncbi.nlm.nih.gov/30257956/
#
# The membership mask is computed once and reused for both halves of the
# split; previously the list of study subjects was rebuilt for every row.
in_study = fmt['host_subject_id'].isin(fmt_meta['host_subject_id'])

fmt_study = fmt[in_study]
fmt_study = fmt_study.reset_index(drop=True)

print('fmt_study only', len(fmt_study))
display(fmt_study[:3])

# Update the fmt df by removing those subjects in the fmt_study
# (the index is intentionally not reset here, as before)
fmt = fmt[~in_study]

# Create the remaining FMT cases df
print('fmt without the study', len(fmt))
display(fmt[:3])
print('All FMT cases:', len(fmt) + len(fmt_study))

fmt_study only 553


Unnamed: 0,sample_name,host_subject_id,Accession,BioProject,AccessionShotgun,Pool,primer,platform,experiment_design_description,center_name,center_project_name,library_construction_protocol,instrument_model,sequencing_method,pcr_primer,run_prefix,run_center,target_gene,target_subfragment
0,992A,FMT.0001,SRR11420351,PRJNA545312,,473,CCGTCAATTYHTTTRAGT,Illumina,https://www.nature.com/articles/s41597-021-008...,Molecular Microbiology Facility (MMF) at Memor...,Liao_et_al_Data,"16S, V4-V5",Illumina MiSeq,Sequencing by synthesis,"FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT",992A..pool473,Molecular Microbiology Facility (MMF),16S rRNA,V4-V5
1,992B,FMT.0001,SRR11420350,PRJNA545312,,473,CCGTCAATTYHTTTRAGT,Illumina,https://www.nature.com/articles/s41597-021-008...,Molecular Microbiology Facility (MMF) at Memor...,Liao_et_al_Data,"16S, V4-V5",Illumina MiSeq,Sequencing by synthesis,"FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT",992B..pool473,Molecular Microbiology Facility (MMF),16S rRNA,V4-V5
2,992C,FMT.0001,SRR11420349,PRJNA545312,,473,CCGTCAATTYHTTTRAGT,Illumina,https://www.nature.com/articles/s41597-021-008...,Molecular Microbiology Facility (MMF) at Memor...,Liao_et_al_Data,"16S, V4-V5",Illumina MiSeq,Sequencing by synthesis,"FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT",992C..pool473,Molecular Microbiology Facility (MMF),16S rRNA,V4-V5


fmt without the study 2855


Unnamed: 0,sample_name,host_subject_id,Accession,BioProject,AccessionShotgun,Pool,primer,platform,experiment_design_description,center_name,center_project_name,library_construction_protocol,instrument_model,sequencing_method,pcr_primer,run_prefix,run_center,target_gene,target_subfragment
0,1004A,FMT.0002,SRR11414975,PRJNA545312,,483,CCGTCAATTYHTTTRAGT,Illumina,https://www.nature.com/articles/s41597-021-008...,Molecular Microbiology Facility (MMF) at Memor...,Liao_et_al_Data,"16S, V4-V5",Illumina MiSeq,Sequencing by synthesis,"FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT",1004A..pool483,Molecular Microbiology Facility (MMF),16S rRNA,V4-V5
1,1004B,FMT.0002,SRR11414972,PRJNA545312,,483,CCGTCAATTYHTTTRAGT,Illumina,https://www.nature.com/articles/s41597-021-008...,Molecular Microbiology Facility (MMF) at Memor...,Liao_et_al_Data,"16S, V4-V5",Illumina MiSeq,Sequencing by synthesis,"FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT",1004B..pool483,Molecular Microbiology Facility (MMF),16S rRNA,V4-V5
2,1083A,FMT.0023,SRR11414929,PRJNA545312,,550,CCGTCAATTYHTTTRAGT,Illumina,https://www.nature.com/articles/s41597-021-008...,Molecular Microbiology Facility (MMF) at Memor...,Liao_et_al_Data,"16S, V4-V5",Illumina MiSeq,Sequencing by synthesis,"FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT",1083A..pool550,Molecular Microbiology Facility (MMF),16S rRNA,V4-V5


All FMT cases: 3408


In [73]:
# Randomly split remaining FMT cases keeping pool numbers aligned:
# whole pools are drawn at random and added to a group until the group holds
# at least `aprx_size_lists` samples, so a pool is never split across groups.
# NOTE: `random` is not seeded in this cell, so the grouping differs per run.

# Change this variable depending on the size per group
aprx_size_lists = 250

# Number of samples per pool, indexed by pool label
pool_df = fmt['Pool'].value_counts()
all_pools = list(pool_df.index)
pool_groups = []

# Stop building full groups once only a handful of pools is left; those
# leftovers become one final (possibly smaller) group below.
while len(all_pools) > int(aprx_size_lists/50):
    current_group = []
    sum_current_group = 0
    while sum_current_group < aprx_size_lists and all_pools:
        selected_pool = random.choice(all_pools)
        # Look the count up by label. The former `pool_df[[pool]][0]` relied on
        # the deprecated positional fallback of Series.__getitem__.
        count_in_pool = pool_df.loc[selected_pool]
        sum_current_group += count_in_pool
        current_group.append(selected_pool)
        all_pools.remove(selected_pool)
    pool_groups.append(current_group)
if all_pools:
    pool_groups.append(all_pools)
print('Total number of groups:', len(pool_groups), 'of ~', aprx_size_lists, 'samples')
# Keep the FMT grouping under its own name; `pool_groups` is reused later.
fmt_pool_groups = pool_groups

Total number of groups: 11 of ~ 250 samples


In [9]:
# Split pt cases out (these are older and of low quality).
# A row belongs here when its host_subject_id contains the substring 'pt';
# the vectorised string test replaces the former row-wise DataFrame.apply.
pt = p[p['host_subject_id'].str.contains('pt', regex=False)]
pt = pt.reset_index(drop=True)
print(len(pt))
display(pt[:1])

186


Unnamed: 0,sample_name,host_subject_id,Accession,BioProject,AccessionShotgun,Pool,primer,platform,experiment_design_description,center_name,center_project_name,library_construction_protocol,instrument_model,sequencing_method,pcr_primer,run_prefix,run_center,target_gene,target_subfragment
0,1001,pt_with_samples_1001_1002_1003_1004_1005_1006_...,SRR11414988,PRJNA545312,,535,CCGTCAATTYHTTTRAGT,Illumina,https://www.nature.com/articles/s41597-021-008...,Molecular Microbiology Facility (MMF) at Memor...,Liao_et_al_Data,"16S, V4-V5",Illumina MiSeq,Sequencing by synthesis,"FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT",1001..pool535,Molecular Microbiology Facility (MMF),16S rRNA,V4-V5


In [28]:
# Remaining cases (w/o FMT and pt): rows whose host_subject_id contains
# neither 'pt' nor 'FMT'. Vectorised masks replace two row-wise applies.
is_pt = p['host_subject_id'].str.contains('pt', regex=False)
is_fmt = p['host_subject_id'].str.contains('FMT', regex=False)
p0 = p[~is_pt & ~is_fmt]
p0 = p0.reset_index(drop=True)
print(len(p0))
# Sanity check: remaining + pt + FMT rows, computed from the data instead of
# hard-coded counts. This should equal len(p); a larger value means some
# subject id matched both 'pt' and 'FMT'.
print(len(p0) + int(is_pt.sum()) + int(is_fmt.sum()))
display(p0[:3])

8941
12535


Unnamed: 0,sample_name,host_subject_id,Accession,BioProject,AccessionShotgun,Pool,primer,platform,experiment_design_description,center_name,center_project_name,library_construction_protocol,instrument_model,sequencing_method,pcr_primer,run_prefix,run_center,target_gene,target_subfragment
0,1000A,1000,SRR11414397,PRJNA545312,,483,CCGTCAATTYHTTTRAGT,Illumina,https://www.nature.com/articles/s41597-021-008...,Molecular Microbiology Facility (MMF) at Memor...,Liao_et_al_Data,"16S, V4-V5",Illumina MiSeq,Sequencing by synthesis,"FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT",1000A..pool483,Molecular Microbiology Facility (MMF),16S rRNA,V4-V5
1,1000B,1000,SRR11414992,PRJNA545312,,483,CCGTCAATTYHTTTRAGT,Illumina,https://www.nature.com/articles/s41597-021-008...,Molecular Microbiology Facility (MMF) at Memor...,Liao_et_al_Data,"16S, V4-V5",Illumina MiSeq,Sequencing by synthesis,"FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT",1000B..pool483,Molecular Microbiology Facility (MMF),16S rRNA,V4-V5
2,1000C,1000,SRR11414991,PRJNA545312,,483,CCGTCAATTYHTTTRAGT,Illumina,https://www.nature.com/articles/s41597-021-008...,Molecular Microbiology Facility (MMF) at Memor...,Liao_et_al_Data,"16S, V4-V5",Illumina MiSeq,Sequencing by synthesis,"FWD:AYTGGGYDTAAAGNG, REV:CCGTCAATTYHTTTRAGT",1000C..pool483,Molecular Microbiology Facility (MMF),16S rRNA,V4-V5


In [69]:
# Randomly split remaining cases keeping pool numbers aligned:
# whole pools are drawn at random and added to a group until the group holds
# at least `aprx_size_lists` samples, so a pool is never split across groups.
# NOTE: `random` is not seeded in this cell, so the grouping differs per run.

# Change this variable depending on the size per group
aprx_size_lists = 250

# Number of samples per pool, indexed by pool label
pool_df = p0['Pool'].value_counts()
all_pools = list(pool_df.index)
print(len(all_pools))
pool_groups = []

# Stop building full groups once only a handful of pools is left; those
# leftovers become one final (possibly smaller) group below.
while len(all_pools) > int(aprx_size_lists/50):
    current_group = []
    sum_current_group = 0
    while sum_current_group < aprx_size_lists and all_pools:
        selected_pool = random.choice(all_pools)
        # Look the count up by label. The former `pool_df[[pool]][0]` relied on
        # the deprecated positional fallback of Series.__getitem__.
        count_in_pool = pool_df.loc[selected_pool]
        sum_current_group += count_in_pool
        current_group.append(selected_pool)
        all_pools.remove(selected_pool)
    pool_groups.append(current_group)
if all_pools:
    pool_groups.append(all_pools)
print('Total number of groups:', len(pool_groups), 'of ~', aprx_size_lists, 'samples')

236
Total number of groups: 33 of ~ 250 samples


## Export df as csv

In [12]:
# Sample
s.to_csv('Liao_sample_info.tsv', sep='\t', index=False)

# Prep
# All prep files go under prep/. Create the directory up front because
# DataFrame.to_csv does not create missing parent directories.
os.makedirs('prep', exist_ok=True)

# FMT_study
fmt_study.to_csv('prep/Liao_prep_fmt_study.tsv', sep='\t', index=False)

# FMT: one file per pool group, numbered from 1
for group_number, pools in enumerate(fmt_pool_groups, start=1):
    px = fmt[fmt['Pool'].isin(pools)]
    name = 'prep/Liao_prep_fmt_' + str(group_number) + '.tsv'
    px.to_csv(name, sep='\t', index=False)

# Remaining cases: one file per pool group, numbered from 1
for group_number, pools in enumerate(pool_groups, start=1):
    px = p0[p0['Pool'].isin(pools)]
    name = 'prep/Liao_prep_' + str(group_number) + '.tsv'
    px.to_csv(name, sep='\t', index=False)

In [14]:
# pt_study -- these rows will NOT go into Qiita; the file is written only so
# that the row counts of all exports can be cross-checked.
pt.to_csv(
    'Liao_prep_pt_study.tsv',
    index=False,
    sep='\t',
)

In [78]:
# Catch-up export for an error found earlier: some pools were not put into
# any prep file. Their rows are collected into one extra prep file.
# NOTE: this file is written to the working directory, not to prep/.
original_missing_pools = ['701', '542', '546', '294.bmt']
name = 'Liao_prep_34.tsv'
in_missing_pool = p0['Pool'].isin(original_missing_pools)
px = p0[in_missing_pool]
px.to_csv(name, index=False, sep='\t')