# Obesity

Downloaded the raw obesity data from QIITA. Downloaded the FASTA file of sequences, and converted the BIOM file to an ASV table, which is processed using the code below.

In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO

In [18]:
# Build sample_vs_asv.tsv for the obesity dataset: load the ASV table, rank the
# ASVs by their total across all columns, rescale every column by its own sum,
# and write the table with rows in ranked order.
asv_table = pd.read_table(
    '/home/groups/dpwall/briannac/sequence_based_biomarkers/data/obesity/asv_table.tsv',
    skiprows=1,
    index_col=0,
)
# Replace the row labels with 0..n-1 so that labels and positions coincide.
asv_table.index = list(range(len(asv_table)))
asv_table = asv_table.drop('taxonomy', axis=1)

# Row totals are taken before the table is rescaled.
asv_totals = asv_table.sum(axis=1)
# Row positions sorted by ascending total ...
asv_order = np.argsort(asv_totals)
# ... keeping only the rows whose total is above zero.
asv_order = asv_order[(asv_totals.iloc[asv_order] > 0).values]

# Divide each column by its own sum.
asv_table = asv_table.apply(lambda column: column / sum(column))
asv_table.iloc[asv_order].to_csv(
    '/home/groups/dpwall/briannac/sequence_based_biomarkers/data/obesity/sample_vs_asv.tsv',
    sep='\t',
)

In [24]:
# Build sample_metadata.tsv for the obesity dataset.
sample_metadata = pd.read_table(
    '/home/groups/dpwall/briannac/sequence_based_biomarkers/data/obesity/sample_data.tsv',
    index_col=0,
)
# phenotype: True where obesitycat equals 'Obese'.
# subclass:  copy of the family column.
# subject:   copy of the host_subject_id column.
sample_metadata = sample_metadata.assign(
    phenotype=sample_metadata['obesitycat'].eq('Obese'),
    subclass=sample_metadata['family'],
    subject=sample_metadata['host_subject_id'],
)
# Rows are selected and ordered by the column labels of asv_table, which must
# already be defined when this cell runs.
sample_metadata.loc[
    asv_table.columns, ['phenotype', 'family', 'subclass', 'subject']
].to_csv(
    '/home/groups/dpwall/briannac/sequence_based_biomarkers/data/obesity/sample_metadata.tsv',
    sep='\t',
)


In [31]:
# Build seqs.fa for the obesity dataset: relabel every FASTA record as
# '<position>;size=<size>;', clear its description, and write the records in
# asv_order.
seqs = list(SeqIO.parse(
    '/home/groups/dpwall/briannac/sequence_based_biomarkers/data/obesity/all.61224.seqs.fa',
    'fasta'))
# NOTE(review): asv_table has already been rescaled column by column when this
# cell runs, so these are sums of fractions rather than counts, and '%i'
# truncates them to integers. Confirm whether raw counts were intended.
sizes = asv_table.sum(axis=1)
# Assumes the FASTA records are in the same order as the rows of asv_table;
# TODO confirm.
for position, record in enumerate(seqs):
    record.id = '%i;size=%i;' % (position, sizes[position])
    record.description = ''
seqs = [seqs[position] for position in asv_order]
SeqIO.write(seqs, '/home/groups/dpwall/briannac/sequence_based_biomarkers/data/obesity/seqs.fa', "fasta")



12363

# Autism

In [24]:
# Build sample_metadata.tsv for the autism dataset. Only subclasses (family ID
# plus sampling date) that contain both phenotype values are kept.
sample_metadata = pd.read_table(
    '/home/groups/dpwall/briannac/sequence_based_biomarkers/data/autism/sample_data.tsv',
    index_col=0,
    sep=' ',
)
family_ids = sample_metadata['Family.group.ID..Biospecimen.']
sampling_dates = sample_metadata['Within.study.sampling.date..Biospecimen.']

sample_metadata['family'] = family_ids
# subclass is '<family id>,<sampling date>'.
sample_metadata['subclass'] = [
    str(family_id) + ',' + sampling_date
    for family_id, sampling_date in zip(family_ids, sampling_dates)
]
# phenotype becomes True where the original value is 'A'.
sample_metadata['phenotype'] = sample_metadata['phenotype'] == 'A'
sample_metadata['subject'] = sample_metadata['Host.ID']

# Number of distinct values per column within each subclass; a phenotype count
# of 2 means the subclass holds both True and False samples.
grouped = sample_metadata.groupby(['subclass']).agg(lambda values: len(set(values)))
subclasses = grouped.loc[grouped['phenotype'] == 2].index

sample_metadata = sample_metadata[sample_metadata['subclass'].isin(subclasses)]
sample_metadata = sample_metadata.sort_values(['subclass', 'phenotype'])
sample_metadata[['phenotype', 'family', 'subclass', 'subject']].to_csv(
    '/home/groups/dpwall/briannac/sequence_based_biomarkers/data/autism/sample_metadata.tsv',
    sep='\t',
)

In [26]:
# Build sample_vs_asv.tsv for the autism dataset from otu_table.tsv: rank the
# rows by their total, rescale every column by its own sum, then keep the
# ranked rows and the columns listed in sample_metadata.index.
asv_table = pd.read_table(
    '/home/groups/dpwall/briannac/sequence_based_biomarkers/data/autism/otu_table.tsv',
    sep=' ',
)
# Replace the row labels with 0..n-1 so that labels and positions coincide.
asv_table.index = list(range(len(asv_table)))

# Row totals are taken before the table is rescaled.
asv_totals = asv_table.sum(axis=1)
# Row positions sorted by ascending total ...
asv_order = np.argsort(asv_totals)
# ... keeping only the rows whose total is above zero.
asv_order = asv_order[(asv_totals.iloc[asv_order] > 0).values]

# Divide each column by its own sum.
asv_table = asv_table.apply(lambda column: column / sum(column))
# sample_metadata must already hold the filtered autism metadata here.
asv_table = asv_table.iloc[asv_order][sample_metadata.index]
asv_table.to_csv(
    '/home/groups/dpwall/briannac/sequence_based_biomarkers/data/autism/sample_vs_asv.tsv',
    sep='\t',
)

In [27]:
# Build seqs.fa for the autism dataset: one '>index;size=N;' header line and
# one sequence line per row of ASVid_to_seq.tsv, taken in asv_order.
asvid_to_seq = pd.read_csv(
    '/home/groups/dpwall/briannac/sequence_based_biomarkers/data/autism/ASVid_to_seq.tsv',
    sep=' ',
)
# Replace the row labels with 0..n-1, then reorder the rows by asv_order.
asvid_to_seq.index = list(range(len(asvid_to_seq)))
asvid_to_seq = asvid_to_seq.iloc[asv_order]
with open('/home/groups/dpwall/briannac/sequence_based_biomarkers/data/autism/seqs.fa','w') as f:
    # NOTE(review): asv_table has already been rescaled column by column when
    # this cell runs, so these are sums of fractions rather than counts, and
    # '%i' truncates them to integers. Confirm whether raw counts were intended.
    asv_sizes = asv_table.sum(axis=1)[asv_order]
    records = zip(asvid_to_seq.index, asvid_to_seq.Sequence, asv_sizes)
    for idx, seq, size in records:
        f.write('>%i;size=%i;\n' % (idx, size))
        f.write(seq + '\n')

# Align Sequences
Aligned both ```data/<dataset>/seqs.fa``` files against the RDP database using the aligner at https://pyro.cme.msu.edu/aligner/ and saved the results in ```data/<dataset>/seqs_aligned.fa```.


# Taxonomic annotation
Annotated both ```data/<dataset>/seqs.fa``` files against the RDP database using the classifier at https://rdp.cme.msu.edu/classifier/ and saved the results in ```data/<dataset>/seqs_annotated.txt``` (the file read by the code below).



In [22]:
# Convert the classifier output of each dataset (seqs_annotated.txt) into a
# table with one row per sequence and one column per taxonomic rank, written
# next to the input as asv_vs_taxa_annotation.tsv.
for dataset in ['autism', 'obesity']:
    file_name = '/home/groups/dpwall/briannac/sequence_based_biomarkers/data/%s/seqs_annotated.txt' % dataset
    with open(file_name) as f:
        # The first 7 lines are skipped (presumably a file header; confirm).
        lines = f.readlines()[7:]
    rows = []
    for line in lines:
        # A blank line would otherwise become a junk row labelled '\n'.
        if not line.strip():
            continue
        # Split once. Field 0 is the sequence label; from field 4 onwards the
        # fields alternate between a taxon name and its confidence ('NN%').
        fields = line.rstrip('\n').split(';')
        tax_names = fields[4::2]
        percentages = [int(field.replace('%', '')) for field in fields[5::2]]
        # Assignments with a confidence of 80 or less become 'unclassified'.
        rows.append(
            [fields[0]]
            + [tax_name if percentage > 80 else 'unclassified'
               for tax_name, percentage in zip(tax_names, percentages)]
        )
    df = pd.DataFrame(rows)
    df.set_index(0, inplace=True)
    # Requires exactly six taxon columns per row.
    df.columns = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus']
    df.to_csv(file_name.replace('seqs_annotated.txt', 'asv_vs_taxa_annotation.tsv'), sep='\t')

In [23]:
# Re-parse `lines` (left over from the last dataset read by the annotation
# loop) into mylist: one [label, taxon, taxon, ...] list per line. Nothing in
# this cell writes the result anywhere.
mylist = []
for l in lines:
    # A blank line would otherwise become a junk row labelled '\n'.
    if not l.strip():
        continue
    # Split once. Field 0 is the sequence label; from field 4 onwards the
    # fields alternate between a taxon name and its confidence ('NN%').
    fields = l.rstrip('\n').split(';')
    tax_names = fields[4::2]
    percentages = [int(field.replace('%', '')) for field in fields[5::2]]
    # Assignments with a confidence of 80 or less become 'unclassified'.
    mylist.append(
        [fields[0]]
        + [tax_name if percentage > 80 else 'unclassified'
           for tax_name, percentage in zip(tax_names, percentages)]
    )