In [51]:
from Bio import SeqIO
import os
def generate_dict_of_cluster(cluster_file):
    '''Parse a Cd-Hit cluster file and split its clusters by size.

    Every line starting with '>' opens a new cluster; each following
    non-blank line is stored as a member of that cluster.

    cluster_file -- path of the cluster file to read.

    Returns a two-element list ``[no_clst, clst]``:
      * no_clst maps a cluster header line to its single member line
        (clusters holding exactly one sequence);
      * clst maps a cluster header line to the list of its member lines
        (clusters holding two or more sequences).
    Header and member lines are kept exactly as read, trailing newline
    included. Headers that have no member line are left out of both
    dictionaries.

    Raises ValueError if a member line appears before any cluster header.
    '''
    clust_dict = {}
    head = None
    # The context manager closes the handle even if parsing fails midway.
    with open(cluster_file, 'r') as f:
        for line in f:
            if not line.strip():
                # A blank line is not a sequence; counting it would make a
                # singleton look like a real cluster.
                continue
            if line[0] == '>':
                head = line
                clust_dict[head] = []
            elif head is None:
                raise ValueError(
                    'Member line found before any cluster header in '
                    + str(cluster_file))
            else:
                clust_dict[head].append(line)
    # no_clst: clusters with one sequence; clst: clusters with several.
    no_clst = {}
    clst = {}
    for key, members in clust_dict.items():
        if len(members) > 1:
            clst[key] = members
        elif members:
            no_clst[key] = members[0]
        # An empty cluster has no sequence to report, so it is skipped.
    return [no_clst, clst]
def make_fasta_dict(dataset, path='../../EST_data'):
    '''Load a FASTA file into a dictionary of sequence records.

    dataset -- FASTA file name without the ``.fasta`` extension.
    path    -- directory that holds the file; defaults to the EST data
               directory.

    Returns the dictionary built by ``SeqIO.to_dict`` from the parsed file.
    '''
    fasta_file = '/'.join((path, dataset)) + '.fasta'
    records = SeqIO.parse(fasta_file, "fasta")
    return SeqIO.to_dict(records)
def make_no_clustered_fasta(no_clustered_dict,sequences_dict,dataset, path):
    '''Write all unclustered sequences of a dataset to one FASTA file.

    no_clustered_dict -- maps a cluster header to its single member line;
                         the sequence ID is the text between '>' and '...'
                         in that line.
    sequences_dict    -- maps a sequence ID to a record exposing ``.seq``.
    dataset           -- dataset name, used as the output file prefix.
    path              -- existing directory that receives the file.

    The output is ``<path>/<dataset>_noclustered.fasta`` with one
    ``>ID`` line followed by one sequence line per entry.
    Raises KeyError if an extracted ID is missing from sequences_dict.
    '''
    out_name = dataset + '_noclustered.fasta'
    # The context manager flushes and closes the file on every exit path,
    # including a failed sequence lookup.
    with open(os.path.join(path, out_name), 'w') as f_out:
        for member_line in no_clustered_dict.values():
            seqID = member_line[member_line.find('>') + 1:member_line.find('...')]
            sequence = str(sequences_dict[seqID].seq)
            f_out.write('>' + seqID + '\n')
            f_out.write(sequence + '\n')
def make_clustered_fasta(clustered_dict, sequence_dict, dataset,path):
    '''Write one FASTA file per cluster into ``<path>/<dataset>/``.

    clustered_dict -- maps a cluster header to the list of its member
                      lines; the sequence ID is the text between '>' and
                      '...' in each line.
    sequence_dict  -- maps a sequence ID to a record exposing ``.seq``.
    dataset        -- dataset name; used as the sub-directory name and in
                      every output file name.
    path           -- parent directory, with or without a trailing
                      separator.

    Files are named ``<i>_<dataset>_clstrd.fasta`` where i counts the
    clusters from 1 in dictionary order. The dataset sub-directory is
    created when missing and reused when present.
    Raises KeyError if an extracted ID is missing from sequence_dict.
    '''
    # Build the directory once with os.path.join so the folder that is
    # created is the same folder the files are written to, whether or not
    # `path` ends with a separator.
    out_dir = os.path.join(path, dataset)
    os.makedirs(out_dir, exist_ok=True)
    for i, members in enumerate(clustered_dict.values(), start=1):
        outname = str(i) + '_' + dataset + '_clstrd.fasta'
        # Closes the file even when a sequence lookup fails.
        with open(os.path.join(out_dir, outname), 'w') as f_out:
            for seq in members:
                seqID = seq[seq.find('>') + 1:seq.find('...')]
                sequence = str(sequence_dict[seqID].seq)
                f_out.write('>' + seqID + '\n')
                f_out.write(sequence + '\n')
def separe_clusted_sequences(percentageID, datasets,outputdir, clusters_root='../../clusters'):
    '''Split every cluster file of one identity level into FASTA files.

    percentageID  -- identity level; cluster files are read from
                     ``<clusters_root>/cluster_<percentageID>/``.
    datasets      -- dataset names; a cluster file belongs to the first
                     name for which ``<name>_c`` occurs in its file name.
    outputdir     -- folder created (if missing) under clusters_root to
                     receive the generated FASTA files.
    clusters_root -- directory holding the cluster folders; defaults to
                     the location that was previously hard-coded.

    For each matched cluster file the unclustered sequences go to one
    FASTA file and every multi-sequence cluster gets its own FASTA file.
    Directory entries that match no dataset are reported and skipped.
    '''
    cluster_dir = os.path.join(clusters_root, 'cluster_' + str(percentageID))
    # Sorted so the processing order does not depend on the file system.
    cluster_names = sorted(os.listdir(cluster_dir))
    output_root = os.path.join(clusters_root, outputdir)
    os.makedirs(output_root, exist_ok=True)
    # The trailing separator is kept because the writer functions build
    # their paths by plain string concatenation.
    outputpath = output_root + '/'
    for name in cluster_names:
        # Match on the bare file name so directory names in the path can
        # never be mistaken for a dataset.
        dataset = next((specie for specie in datasets if specie + '_c' in name), None)
        if dataset is None:
            # Stray entries (e.g. notebook checkpoint folders) used to
            # abort the whole run with an IndexError.
            print('Skipping ' + name + ': no matching dataset')
            continue
        file = os.path.join(cluster_dir, name)
        clust_dict = generate_dict_of_cluster(file)
        sequences = make_fasta_dict(dataset)
        make_no_clustered_fasta(clust_dict[0], sequences, dataset, outputpath)
        make_clustered_fasta(clust_dict[1], sequences, dataset, outputpath)
    print('All files were generated succesfully')

In [52]:
# Datasets to process. Each name is used twice: as `<name>.fasta` when the
# sequences are loaded, and as the `<name>_c` fragment that identifies the
# dataset's cluster file.
species = [
    'Parasteatodatepidariorum',
    'Nephilaantipodiana',
    'Leucaugevenusta_2',
    'Leucaugevenusta',
    'Steatodagrossa',
    'Latrodectushesperus',
]
separe_clusted_sequences(
    percentageID=75,
    datasets=species,
    outputdir='separed_clusters',
)

All files were generated succesfully


In [53]:
# Inspection only: load one dataset from the default EST data directory and
# show the resulting dictionary as the cell output (the whole dictionary is
# displayed, so the output is long).
make_fasta_dict('Parasteatodatepidariorum')

{'et|JZ979961.1_PAF56_T7': SeqRecord(seq=Seq('TTTTTTTTTTTTTTTGCTTAGAAAAAATTCTTTATTATCATTTAAAACATTTAC...ATT'), id='et|JZ979961.1_PAF56_T7', name='et|JZ979961.1_PAF56_T7', description='et|JZ979961.1_PAF56_T7', dbxrefs=[]),
 'et|JZ979960.1_PAF54_T7': SeqRecord(seq=Seq('ATAAAATATGTCTCAGTATTTATTATCATGCATTAATTTTGTAGCACAGATTTA...CGA'), id='et|JZ979960.1_PAF54_T7', name='et|JZ979960.1_PAF54_T7', description='et|JZ979960.1_PAF54_T7', dbxrefs=[]),
 'et|JZ979959.1_PAF53_T7': SeqRecord(seq=Seq('CTCCAACGTTTTTTGCTTGTTCTTGTGGTTTTGGCTGTTTTAGCTGTTTCAGCG...AAA'), id='et|JZ979959.1_PAF53_T7', name='et|JZ979959.1_PAF53_T7', description='et|JZ979959.1_PAF53_T7', dbxrefs=[]),
 'et|JZ979958.1_PAF32_': SeqRecord(seq=Seq('TAAAGTTTTTTTGAGCAAGAAAAAAAAATGAAAAAAAAGAAAGCGAGAGATAGA...GAC'), id='et|JZ979958.1_PAF32_', name='et|JZ979958.1_PAF32_', description='et|JZ979958.1_PAF32_', dbxrefs=[]),
 'et|JZ979957.1_PAF32_Sp6': SeqRecord(seq=Seq('GTCCAAAATATTTATTACACACTTTTAGCTACAAATGCTTTATTCTGTATCAAT...TTA'), id='et|JZ9799