# Moving data sets around
Using the following data sets, since all of them are MiSeq runs of stool 16S samples:
- Son 2015 &ensp; SRA study SRP057700 &ensp; autism vs neurotypical  &ensp; Library_Name_s
- Youngster 2014 &ensp; SRA study SRP040146 
- Baxter 2016 &ensp; SRA study SRP062005
- Zeller 2014 &ensp; ENA study PRJEB6070 &ensp; Subject ID
- Lozupone 2013 &ensp; ENA study PRJEB4335
- Noguera-Julian 2016 &ensp; SRA study SRP068240
- Gevers 2014, IBD &ensp; SRA study SRP040765
- Goodrich 2014 &ensp; ENA studies PRJEB6702 and PRJEB6705
- Alkanani 2015 &ensp; N/A (email authors)

### Processing strategy:
- data stored as OTU tables, with a separate FASTA file
- format otu table as rows:Samples, columns:OTU id, cells: abundance
- create mapping table with ID, SEQ 

In [1]:
import pandas as pd
from Bio import SeqIO, SeqRecord, Seq

In [2]:
#study_name is the directory.
#sample name is always the first column of the metadata file
#label is always read from the "DiseaseState" metadata column
def join_tables(study_name):
    """Build one table of OTU abundances, sequences and sample labels.

    Reads three files that share the prefix
    ``data/raw_otu_tables/<study_name>_results/<study_name>``:

    * ``.metadata.txt``          -- sample metadata; the first column holds the
      sample name and the ``DiseaseState`` column holds the label.
    * ``.otu_table.100.denovo``  -- OTU table with an ``OTU_ID`` column.
    * ``.otu_seqs.100.fasta``    -- one FASTA record per OTU.

    The path of the metadata file is printed as a progress indicator.

    Args:
        study_name: Name of the study; used both for the results directory
            and as the file-name prefix inside it.

    Returns:
        pandas.DataFrame: the OTU table with an added ``Sequence`` column,
        followed by one extra final row whose ``OTU_ID`` is
        ``'disease_state'``, whose ``Sequence`` is ``'N/A'`` and whose
        remaining cells hold the ``DiseaseState`` label of each sample.
    """
    prefix = 'data/raw_otu_tables/{0}_results/{0}'.format(study_name)
    metadata_path = prefix + ".metadata.txt"
    print(metadata_path)

    metadata = pd.read_table(metadata_path, encoding="iso-8859-1")
    sample_col = metadata.columns[0]
    otus = pd.read_table(prefix + ".otu_table.100.denovo")

    # Make a lookup table of OTU ID -> sequence from the FASTA records.
    records = [
        (record.name, str(record.seq))
        for record in SeqIO.parse(prefix + ".otu_seqs.100.fasta", "fasta")
    ]
    seq_df = pd.DataFrame(records, columns=["OTU_ID", "Sequence"]).set_index("OTU_ID")

    # Attach the sequence of every OTU to its abundance row.
    otus_joined = otus.join(seq_df, on='OTU_ID')

    # Build the label row: one label per sample column, plus fixed values for
    # the two non-sample columns.
    labels = {'OTU_ID': 'disease_state'}
    labels.update(zip(metadata[sample_col], metadata['DiseaseState']))
    labels['Sequence'] = "N/A"

    # DataFrame.append was removed in pandas 2.0; concatenating a one-row
    # frame gives the same result and works on older versions as well.
    label_row = pd.DataFrame([labels])
    return pd.concat([otus_joined, label_row], ignore_index=True)
    
                                         

In [37]:
studies = ['asd_son', 'cdi_youngster', 'crc_baxter', 'crc_zeller', 'hiv_lozupone', 'hiv_noguerajulian',
          'ibd_gevers_2014', 'ob_goodrich', 't1d_alkanani']

# Write one joined table per study.
for study in studies:
    out = "../selected_datasets_joined/{}.tsv".format(study)
    # The files carry a .tsv extension, so write them tab-separated rather
    # than with to_csv's default comma separator.
    join_tables(study).to_csv(out, sep="\t", index=False)

data/raw_otu_tables/asd_son_results/asd_son.metadata.txt
data/raw_otu_tables/cdi_youngster_results/cdi_youngster.metadata.txt
data/raw_otu_tables/crc_baxter_results/crc_baxter.metadata.txt
data/raw_otu_tables/crc_zeller_results/crc_zeller.metadata.txt
data/raw_otu_tables/hiv_lozupone_results/hiv_lozupone.metadata.txt
data/raw_otu_tables/hiv_noguerajulian_results/hiv_noguerajulian.metadata.txt
data/raw_otu_tables/ibd_gevers_2014_results/ibd_gevers_2014.metadata.txt
data/raw_otu_tables/ob_goodrich_results/ob_goodrich.metadata.txt
data/raw_otu_tables/t1d_alkanani_results/t1d_alkanani.metadata.txt
