In [1]:
import pandas as pd
import numpy as np
import glob
import sys 
import os

In [4]:
def read_matches(matches_file):
    '''
    This function reads .matches files from ustacks step and return the loci and sample ID. 
    This allow to match the sequences located in the catalog file with the sample for further analysis 
    '''
    
    matches = pd.read_csv(matches_file,sep='\t',
                          on_bad_lines='skip', 
                          skiprows=1 ,header=None, 
                          names = ['loci_id','sample_id','locus_id','h','depth','cigar'],
                          usecols=['loci_id','sample_id'])
    matches = matches[:-1]
    matches = matches.astype({'loci_id':'str'})  # Return loci_id as string
    
    return matches

def read_catalog(catalog_file):
    '''
    This function allow to get the loci id and the consensus sequence. Even if the file contains a sample column, 
    this is full of zeros, thats why we need to extract the id from matches file.
    '''
    
    catalog = pd.read_csv(catalog_file,sep='\t', header = None, 
                      names = ['sample_id','loci_id','consensus','st_comp','seq_id','seq','fl1','fl2','fl3'],
                      usecols=['loci_id','seq'],
                      dtype={'loci_id':'str'})
    catalog = catalog[1:-1]
    
    return catalog

def write_fasta(matches_file, catalog):
    '''
    This information create a fasta file from catalog and matches file. 
    First, the tables are merge into a single dataframe on the loci id column. Then, a pandas series 
    is created with the fasta header that contain the catalog and sample ID.
    Finally, a fasta file is created using .to_csv function replacing the ',' separator by a '\n'
    '''
    
    matches = read_matches(matches_file)
    merged = pd.merge(matches,catalog,on=['loci_id'])
    
    seq_name = '>Catalog: ' + merged.loci_id.astype('str') + ' Sample:'+ merged.sample_id.astype('str')
    merged = pd.concat([seq_name,merged['seq']],axis = 1)
    merged = merged.drop_duplicates()
    
    fst_name = os.path.basename(matches_file).split('.')[0]
    fst_name += '.fasta'
    merged.to_csv(fst_name, sep = '\n',index = False,header=False)
    print('{0} was imported succesfully'.format(os.path.basename(matches_file).split('.')[0]))


In [53]:

# Path to stacks results
stacks_results_path = '/home/alexsanyum/meta_project/server_results/tepui_stacks'

# Read catalog file
catalog = glob.glob(stacks_results_path + '/catalog.tags*')
catalog = read_catalog(catalog[0])

# Iterate the function over all
matches_list = glob.glob(stacks_results_path + '/*matches*')    
for file in matches_list:
    write_fasta(file,catalog)
    

Ab_403_GAGTC-ATCACG was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Er_462_GTCGA-CGATGT was imported succesfully
Er_R-16_AGCTA-CGATGT was imported succesfully
Ab_370_CGGTA-CGATGT was imported succesfully
Er_466_TCTGC-ATCACG was imported succesfully
Ab_394_CTGTC-ATCACG was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Ch_362_CATAT-CGATGT was imported succesfully
Au_05830_TCAGT-CGATGT was imported succesfully
Ch_331_ACGGT-CGATGT was imported succesfully
Ch_337_ACTGG-ATCACG was imported succesfully
Er_446_GGATA-ATCACG was imported succesfully
Au_05824_TACCG-CGATGT was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Ch_358_ATGAG-CGATGT was imported succesfully
Ch_363_CATAT-ATCACG was imported succesfully
Er_460_TAGTA-ATCACG was imported succesfully
Ab_402_GCCGT-CGATGT was imported succesfully
Er_R-13_GGTTG-ATCACG was imported succesfully
Ch_335_AATTA-ATCACG was imported succesfully
Er_R-12_CAACC-ATCACG was imported succesfully
Ab_387_CGTCG-CGATGT was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Er_R-10_TGCAT-CGATGT was imported succesfully
Er_R-03_AACCA-ATCACG was imported succesfully
Ab_397_GACAC-ATCACG was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Er_457_GTAGT-CGATGT was imported succesfully
Au_05832_TCTGC-CGATGT was imported succesfully
Ab_396_CTTGG-CGATGT was imported succesfully
Ch_345_ATACG-ATCACG was imported succesfully
Au_05826_TAGTA-CGATGT was imported succesfully
Ab_365_CGAAT-CGATGT was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Ab_399_GAGAT-CGATGT was imported succesfully
Au_05829_TCACG-CGATGT was imported succesfully
Ch_334_ACACA-ATCACG was imported succesfully
Er_465_TCCGG-ATCACG was imported succesfully
Er_R-08_CGATC-CGATGT was imported succesfully
Ab_390_CTGAT-ATCACG was imported succesfully
Au_05831_TCCGG-CGATGT was imported succesfully
Ab_367_CGGTA-ATCACG was imported succesfully
Er_451_GTAGT-ATCACG was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Ab_392_CTGTC-CGATGT was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Er_449_GGATA-CGATGT was imported succesfully
Au_05836_TTACC-ATCACG was imported succesfully
Er_R-17_ACACA-CGATGT was imported succesfully
Er_464_TCAGT-ATCACG was imported succesfully
Er_R-05_TCGAT-ATCACG was imported succesfully
Ab_368_CGGCT-CGATGT was imported succesfully
Er_R-09_TCGAT-CGATGT was imported succesfully
Er_411_GCTGA-ATCACG was imported succesfully
Ab_393_CTGCG-ATCACG was imported succesfully
Er_R-07_TGCAT-ATCACG was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Ch_339_ACTGG-CGATGT was imported succesfully
Ab_400_GAGAT-ATCACG was imported succesfully
Ch_360_ATTAC-CGATGT was imported succesfully
Er_R-11_CAACC-CGATGT was imported succesfully
Ab_398_GACAC-CGATGT was imported succesfully
Er_463_TCACG-ATCACG was imported succesfully
Ch_359_ATGAG-ATCACG was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Ab_388_CTGAT-CGATGT was imported succesfully
Au_05825_TACGT-CGATGT was imported succesfully
Ch_346_ACTTC-CGATGT was imported succesfully
Ab_369_CGTAC-ATCACG was imported succesfully
Au_05827_TGGAA-ATCACG was imported succesfully
Er_455_TACCG-ATCACG was imported succesfully
Ab_371_CGTAC-CGATGT was imported succesfully
Ch_364_CGAAT-ATCACG was imported succesfully
Ab_391_CTGCG-CGATGT was imported succesfully
Ab_395_CTTGG-ATCACG was imported succesfully
Er_453_GTCGA-ATCACG was imported succesfully
Er_461_TATAC-ATCACG was imported succesfully
Er_R-04_CGATC-ATCACG was imported succesfully
Ab_389_CGTCG-ATCACG was imported succesfully
Er_R-01_GCATG-ATCACG was imported succesfully
Er_456_GGCTC-CGATGT was imported succesfully
Ab_404_GCCGT-ATCACG was imported succesfully
Ch_361_ATTAC-ATCACG was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Er_R-14_GGTTG-CGATGT was imported succesfully
Au_05835_TTACC-CGATGT was imported succesfully
Au_05834_TGGAA-CGATGT was imported succesfully
Er_447_GGCCA-ATCACG was imported succesfully
Er_448_GGCTC-ATCACG was imported succesfully
Er_452_GTCCG-ATCACG was imported succesfully
Ab_401_GAGTC-CGATGT was imported succesfully
Er_458_TACGT-ATCACG was imported succesfully
Ch_338_ACTTC-ATCACG was imported succesfully
Ch_357_ATACG-CGATGT was imported succesfully
Ch_333_AGCTA-ATCACG was imported succesfully
Er_R-02_GCATG-CGATGT was imported succesfully
Au_05833_CTGCG-CGATGT was imported succesfully
Ch_336_ACGGT-ATCACG was imported succesfully
Au_05828_TATAC-CGATGT was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Ch_330_AATTA-CGATGT was imported succesfully
Er_459_GTCCG-CGATGT was imported succesfully
Ab_366_CGGCT-ATCACG was imported succesfully
Er_450_GGCCA-CGATGT was imported succesfully
Ch_332_AAGGA-ATCACG was imported succesfully
Er_410_GCTGA-CGATGT was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Er_R-15_AAGGA-CGATGT was imported succesfully


  matches = pd.read_csv(matches_file,sep='\t',


Er_R-06_AACCA-CGATGT was imported succesfully
