In [1]:
from queue import Queue
import os
import glob
import biom
import pandas as pd

In [2]:
def resolve_sample_name(qiime_df):
    """Rename a ``#sampleid`` column to ``sample_name``.

    The frame is modified in place and the same object is returned, so
    callers may use either the return value or the original reference.
    As part of the rename the index labels are converted with ``str``.

    Parameters
    ----------
    qiime_df : pandas.DataFrame
        Frame whose column labels have already been lower-cased (the
        lookup is for the exact label ``#sampleid``).

    Returns
    -------
    pandas.DataFrame
        ``qiime_df`` itself, after the in-place rename.
    """
    # DataFrame.rename ignores labels that are absent, so a try/except around
    # it can never detect a missing column; check for the label explicitly.
    if '#sampleid' not in qiime_df.columns:
        print('#sampleid not found')
    # Run the rename unconditionally so the index is still cast to str even
    # when there is no '#sampleid' column to rename.
    qiime_df.rename(index=str, columns={"#sampleid": "sample_name"}, inplace=True)
    return qiime_df

def clean_panda(file):
    """Load a tab-separated metadata file into a frame indexed by sample name.

    Steps, in order:
      1. Read ``file`` with tab separators, ``,`` as the decimal mark, a set
         of yes/no spellings mapped to booleans and a list of "missing"
         placeholders mapped to NaN.
      2. Lower-case every column label.
      3. Rename ``#sampleid`` to ``sample_name`` via ``resolve_sample_name``.
      4. Print the first rows, then make ``sample_name`` the index.
      5. Drop the ``#q2:types`` row if the file has one.

    Parameters
    ----------
    file : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, indexed by ``sample_name``.

    Raises
    ------
    KeyError
        If no ``sample_name`` column exists after the rename step.
    """
    cp_df = pd.read_csv(
        file,
        header=0,
        sep='\t',
        decimal=',',
        # Keep sample identifiers as text so numeric-looking ids are not coerced.
        dtype={'sample_name': str, '#SampleID': str},
        true_values=['true', 'yes', 'y', 'Yes', 'Y', 'YES'],
        false_values=['false', 'no', 'n', 'No', 'N', 'NO'],
        na_values=['Unknown', 'Unspecified', 'no_data', 'not applicable',
                   'Missing: not collected', 'Missing: not provided',
                   'not collected', 'not provided', 'restricted_access'],
        low_memory=False,
    )

    cp_df.columns = cp_df.columns.str.lower()

    #align dataframes on 'sample_name'
    cp_df = resolve_sample_name(cp_df)
    print(cp_df.head())
    cp_df.set_index('sample_name', inplace=True)
    # Not every metadata file carries a '#q2:types' row; errors='ignore' keeps
    # such files loadable instead of raising KeyError on the drop.
    cp_df.drop('#q2:types', axis=0, inplace=True, errors='ignore')
    #print(cp_df.head())
    return cp_df

In [3]:
count = 0
for biom_file in glob.iglob('./*.biom'):
    print(biom_file)
    biom_table=biom.load_table(biom_file)
    df = pd.DataFrame(biom_table.ids(axis='observation'))
    #print(df.head())
    fasta_file = biom_file.replace('.biom','_rep_seqs.fa')
    if not os.path.isfile(fasta_file):
        with open(fasta_file,"a") as output:
            for row in df[0]:
                output.write('>' + row+'\n')
                output.write(row+'\n')
    #import features
    fasta_qza = fasta_file.replace('.fa','.qza')
    if not os.path.isfile(fasta_qza):
        !qiime tools import \
            --input-path $fasta_file \
            --output-path $fasta_qza \
            --type 'FeatureData[Sequence]'    

./20181120_seed_11851.biom
./20181120_seed_11712.biom
./20181120_seed_10798.biom
./GFoP_ref_hits_prep1.biom
./20181120_all_soil.biom
./20181120_seed_10911.biom
./20181120_seed_10442.biom
./20181101_emp_release1.biom
./20181120_seed_11149.biom
./20181120_seed_10812.biom
./20181120_seed_10541.biom
./20181120_seed_10764.biom
./20181120_all_mouse.biom
./20181120_seed_11713.biom
./20181120_seed_10781.biom
./20181120_seed_10895.biom
./20181120_seed_11043.biom
./20181120_seed_10918.biom
./20181120_seed_11129.biom
./20181120_seed_10689.biom
./20181120_seed_11546.biom
./20181120_seed_10464.biom
./20181120_seed_10955.biom
[32mImported ./20181120_seed_10955_rep_seqs.fa as DNASequencesDirectoryFormat to ./20181120_seed_10955_rep_seqs.qza[0m
./20181120_seed_11110.biom
./20181120_seed_10933.biom
./20181120_all_public_human.biom
./20181120_seed_10557.biom
./GFoP_ref_hits_prep2.biom
./20181120_seed_11479.biom
./20181120_seed_10801.biom
./20181120_seed_10724.biom
./20181120_seed_11261.biom
./20181120

In [4]:
# Concatenate every FASTA file in the working directory into one merged file.
# The output is rebuilt from scratch on each run so re-executing the cell does
# not append a second copy of every input.
with open('20181120_merged_fasta.csv', 'wb') as merged_out:
    # sorted() makes the concatenation order reproducible across runs.
    for fasta in sorted(glob.iglob('./*.fa')):
        # The de-duplicated file written later in this notebook also matches
        # './*.fa'; skip it so a re-run does not feed the output back in.
        if os.path.basename(fasta) == '20181120_merged_fasta_cleaned.fa':
            continue
        with open(fasta, 'rb') as fasta_in:
            # Copy in 1 MiB chunks to avoid loading a large file into memory.
            for chunk in iter(lambda: fasta_in.read(1 << 20), b''):
                merged_out.write(chunk)

In [5]:
# Read the merged FASTA as a one-column table: one row per line, no header row.
merged_fa = pd.read_csv('20181120_merged_fasta.csv', header=None)

# Keep only the first occurrence of every distinct line.
first_column = merged_fa[0]
is_repeat = first_column.duplicated(keep='first')
de_duplicated = first_column[~is_repeat]

# Write the surviving lines back out, one per row, without header or index.
de_duplicated.to_csv(
    '20181120_merged_fasta_cleaned.fa',
    header=False,
    index=False,
)


In [6]:
# Import the cleaned merged FASTA file as a QIIME 2 artifact of semantic type
# FeatureData[Sequence]; the resulting .qza is written to the working directory.
!qiime tools import \
    --input-path 20181120_merged_fasta_cleaned.fa \
    --output-path 20181120_deblurred_seqs_megatree_merged.qza \
    --type 'FeatureData[Sequence]'

[32mImported 20181120_merged_fasta_cleaned.fa as DNASequencesDirectoryFormat to 20181120_deblurred_seqs_megatree_merged.qza[0m


In [11]:
#generate a tree for phylogenetic diversity analyses 
!qiime fragment-insertion sepp \
  --i-representative-sequences /home/adswafford/Projects/MegaTree/20181120_deblurred_seqs_megatree_merged.qza \
  --p-threads 20 \
  --o-tree /home/adswafford/Projects/MegaTree/20181120_megatree_insertion-tree.qza \
  --o-placements /home/adswafford/Projects/MegaTree/20181120_insertion-placements.qza
  
echo 'SEPP job done'

#get taxonomy
!qiime fragment-insertion classify-otus-experimental \
  --i-representative-sequences /home/adswafford/Projects/MegaTree/20181120_deblurred_seqs_megatree_merged.qza \
  --i-tree /home/adswafford/Projects/MegaTree/20181120_megatree_insertion-tree.qza \
  --i-reference-taxonomy /home/adswafford/Projects/MegaTree/taxonomy_gg99.qza \
  --o-classification /home/adswafford/Projects/MegaTree/20181120_megatree_taxonomy.qza

>TACGAAGGGGGCGAGCGTTATTCGGAATCACTGGGCGTAAAGCGTGCGTAGGCGGTTTTGTAAGTTGGAAGTGAAAGCCCAGGGCTCAACCTTGGAATTGCTTTCAAAACTACAAGACTTGAATTCGGGAGAGGATAGCGGAATTATCAG
TACGAAGGGGGCGAGCGTTATTCGGAATCACTGGGCGTAAAGCGTGCGTAGGCGGTTTTGTAAGTTGGAAGTGAAAGCCCAGGGCTCAACCTTGGAATTGCTTTCAAAACTACAAGACTTGAATTCGGGAGAGGATAGCGGAATTATCAG
>TACGTAGGGGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGAGCGTGTAGGCGGCCAGACAGGTCCGTTGTGAAAACTCGAGGCTCAACCTCGAGACGTCGATGGAAACCGTCTGGCTAGAGTCCGGAAGAGGAGAGGGGAATTCCTGG
TACGTAGGGGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGAGCGTGTAGGCGGCCAGACAGGTCCGTTGTGAAAACTCGAGGCTCAACCTCGAGACGTCGATGGAAACCGTCTGGCTAGAGTCCGGAAGAGGAGAGGGGAATTCCTGG
>TACGAAGGGTGCAAGCGTTGCTCGGAATTATTGGGCGTAAAGGGTTGGTAGGTGGTTACGTATGTCTGGGGTGAAATCCCTGAGCTCAACTCAGGACGTGCCTTGGAAACGGCGTAACTAGAGTACTAGAGAGGATCGTGGAATTCCTGG
TACGAAGGGTGCAAGCGTTGCTCGGAATTATTGGGCGTAAAGGGTTGGTAGGTGGTTACGTATGTCTGGGGTGAAATCCCTGAGCTCAACTCAGGACGTGCCTTGGAAACGGCGTAACTAGAGTACTAGAGAGGATCGTGGAATTCCTGG
>TACGTAGGGTGCAGGCGTTAATCGGAATTACTGGGCGTAAAGCGCGCGTAGGCGGATTGGAAAGTTGGGGGTGAAATCCCGGGG

In [18]:
#DO NOT USE, need to filter, rarefy, etc. first
!qiime diversity beta-phylogenetic \
    --i-table /home/adswafford/Projects/MegaTree/20181120_deblurred_biom_merged.qza \
    --i-phylogeny /home/adswafford/Projects/MegaTree/20181120_insertion-tree.qza \
    --p-metric unweighted_unifrac \
    --o-distance-matrix /home/adswafford/Projects/MegaTree/20181120_megatree_unweightedUniFrac_dm.qza

!qiime diversity beta-phylogenetic \
    --i-table /home/adswafford/Projects/MegaTree/20181120_deblurred_biom_merged.qza \
    --i-phylogeny /home/adswafford/Projects/MegaTree/20181120_insertion-tree.qza \
    --p-metric weighted_normalized_unifrac \
    --o-distance-matrix /home/adswafford/Projects/MegaTree/20181120_megatree_WeightedNormUniFrac_dm.qza

[test, test2]
