# Small cookbook of FASTA-parsing snippets that I've needed to use more than once.
- just a quick reference to share

In [4]:

### This parses a fasta file, searches for a particular set of ids (names_to_search), and prints the id and sequence of each match

from Bio import SeqIO

# Substrings to look for in each record id. Matching is substring-based, so
# 'let-7b-5p' also matches species-prefixed ids such as 'mmu-let-7b-5p'.
names_to_search = ['hsa-let-7g-5p', 'let-7b-5p']
mature_fasta = "/projects/ps-yeolab/genomes/mirbase/release_21/mature.fa"

# The context manager closes the file even if parsing raises part-way through.
# Plain text mode is used because the "U" flag is rejected by Python 3.11+.
with open(mature_fasta) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # any() reports a record once even when several search names match it.
        if any(name in record.id for name in names_to_search):
            print(record.id)
            print(record.seq)

hsa-let-7b-5p
UGAGGUAGUAGGUUGUGUGGUU
hsa-let-7g-5p
UGAGGUAGUAGUUUGUACAGUU
mmu-let-7b-5p
UGAGGUAGUAGGUUGUGUGGUU
rno-let-7b-5p
UGAGGUAGUAGGUUGUGUGGUU
sme-let-7b-5p
UGAGGUAGAUUGUUGGAUGACU
oan-let-7b-5p
UGAGGUAGUAGGUUGUGUGGUU
cin-let-7b-5p
UGAGGUAGUAGGUUAUGUUGUU
mml-let-7b-5p
UGAGGUAGUAGGUUGUGUGGUU
tgu-let-7b-5p
UGAGGUAGUAGGUUGUGUGGUU
pma-let-7b-5p
UGAGGUAGUAGGUUUUGUAGUU
aca-let-7b-5p
UGAGGUAGUAGGUUGUGUGGU
pol-let-7b-5p
UGAGGUAGUAGGUUGUGUGGUU
ssa-let-7b-5p
UGAGGUAGUAGGUUGUGUGGUU
chi-let-7b-5p
UGAGGUAGUAGGUUGUGUGGUU
oha-let-7b-5p
UGAGGUAGUAGGUUGUGUGGUU


In [8]:
### This parses a fasta file, searches for a particular set of ids (names_to_search), and writes the matching records to a separate file

import sys

from Bio import SeqIO

# Substrings to look for in each record id (case-sensitive substring match).
names_to_search = ['rna', 'RNA', 'alu']
repbase_fasta = "/projects/ps-yeolab/genomes/RepBase18.05.fasta/all.ref"
outfile = "/home/bay001/projects/encode/analysis/tests/eclip_tests/small_repelements/small_repelements.fa"

records = []
# The context manager closes the input even if parsing raises part-way through.
# Plain text mode is used because the "U" flag is rejected by Python 3.11+.
with open(repbase_fasta) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # any() keeps a record once even when its id contains more than one
        # of the search names, so the output file has no duplicate entries.
        if any(name in record.id for name in names_to_search):
            records.append(record)
            # Running counter on a single line; sys.stdout.write behaves the
            # same on Python 2 and 3, unlike print with a trailing comma.
            sys.stdout.write('[{}] '.format(len(records)))
sys.stdout.write('\n')

SeqIO.write(records, outfile, "fasta")
            

[1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15] [16] [17] [18] [19] [20] [21] [22] [23] [24] [25] [26] [27] [28] [29] [30] [31] [32] [33] [34] [35] [36] [37] [38] [39] [40] [41] [42] [43] [44] [45] [46] [47] [48] [49] [50] [51] [52] [53] [54] [55] [56] [57] [58] [59] [60] [61] [62] [63] [64] [65] [66] [67] [68] [69] [70] [71] [72] [73] [74] [75] [76] [77] [78] [79] [80] [81] [82] [83] [84] [85] [86] [87] [88] [89] [90] [91] [92] [93] [94] [95] [96] [97] [98] [99] [100] [101] [102] [103] [104] [105] [106] [107] [108] [109] [110] [111] [112] [113]


In [13]:
### This parses a fasta file and builds a table of sequence lengths, one row per record id
from Bio import SeqIO
import pandas as pd

mature_fasta = "/projects/ps-yeolab/genomes/mirbase/release_21/mature.fa"

# Map each record id to the length of its sequence. The context manager closes
# the file even if parsing raises; plain text mode is used because the "U"
# flag is rejected by Python 3.11+.
with open(mature_fasta) as handle:
    lengths = {
        record.id: len(record.seq)
        for record in SeqIO.parse(handle, "fasta")
    }

# The dict becomes a single 'len' row; transposing gives one row per id.
pd.DataFrame(lengths,index=['len']).T

Unnamed: 0,len
aae-bantam-3p,22
aae-bantam-5p,23
aae-let-7,21
aae-miR-1,22
aae-miR-10,22
aae-miR-100,22
aae-miR-1000,21
aae-miR-11-3p,22
aae-miR-11-5p,22
aae-miR-1174,21


In [6]:
### This parses a fasta file, renames each record ID, and writes the renamed records to a new file

from Bio import SeqIO
genes_fasta = "/projects/ps-yeolab/genomes/ce11/ws245_genes.ucsctable.fa"
outfile = "/projects/ps-yeolab/genomes/ce11/ws245_genes.ucsctable.fix_genenames.fa"

records = []
# The context manager closes the input even if parsing raises part-way through.
# Plain text mode is used because the "U" flag is rejected by Python 3.11+.
with open(genes_fasta) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # Keep only the third '_'-separated field of the original id.
        # An id with fewer than three fields raises IndexError here.
        # NOTE(review): record.description is left untouched, so the written
        # header may still carry the original id after the new one - confirm
        # whether that is wanted.
        record.id = record.id.split('_')[2]
        records.append(record)

SeqIO.write(records, outfile, "fasta")
            

In [3]:
### This gets len of each sequence

from Bio import SeqIO
genome_fasta = "/projects/ps-yeolab/genomes/ce10/ce10.fa"

# The context manager closes the file even if parsing raises part-way through.
# Plain text mode is used because the "U" flag is rejected by Python 3.11+.
with open(genome_fasta) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # Formatting the line first gives the same "<length> <name>" output on
        # Python 2 and 3; the bare print statement is a SyntaxError on 3.
        print("{} {}".format(len(record.seq), record.name))
            

15072423 chrI
15279345 chrII
13783700 chrIII
17493793 chrIV
13794 chrM
20924149 chrV
17718866 chrX


In [4]:
### This gets len of each sequence

from Bio import SeqIO
genome_fasta = "/projects/ps-yeolab/genomes/ce10/chromosomes/all.fa"

# The context manager closes the file even if parsing raises part-way through.
# Plain text mode is used because the "U" flag is rejected by Python 3.11+.
with open(genome_fasta) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # Formatting the line first gives the same "<length> <name>" output on
        # Python 2 and 3; the bare print statement is a SyntaxError on 3.
        print("{} {}".format(len(record.seq), record.name))
            

15072423 chrI
15279345 chrII
13783700 chrIII
17493793 chrIV
13794 chrM
20924149 chrV
17718866 chrX


In [1]:
from Bio import SeqIO
def get_seq_dict_from_file(f, seq_ids=None, file_type='fasta', equal=True):
    """
    Returns dictionary of {name : sequence}

    Parameters
    ----------

    f : basestring
        file location of a fasta file (passed straight to SeqIO.parse)
    seq_ids : list, optional
        list of sequence ids to search. None or an empty list returns
        all sequences
    file_type : basestring
        default "fasta" type file
    equal : bool
        True if seq_id needs to be identical
        False if we just have partial identifier (substring of the record id)

    Returns
    -------
    records : dict
        {name:sequence}. If several records share an id, the last one
        parsed wins.
    """
    # No filter requested: keep every record.
    if not seq_ids:
        return {record.id: record.seq for record in SeqIO.parse(f, file_type)}

    if equal:
        # Set lookup avoids rescanning the whole id list for every record.
        wanted = set(seq_ids)

        def is_match(record_id):
            return record_id in wanted
    else:
        def is_match(record_id):
            return any(name in record_id for name in seq_ids)

    records = {}
    for record in SeqIO.parse(f, file_type):
        if is_match(record.id):
            records[record.id] = record.seq
    return records

def get_seq_sizes(f, seq_ids=None, file_type='fasta', equal=True):
    """
    Returns dictionary of {name : seqsize}

    Parameters
    ----------
    f : basestring
        file location of the sequence file
    seq_ids : list, optional
        list of sequence ids to report. None or an empty list reports
        every sequence in the file
    file_type : basestring
        default "fasta" type file
    equal : bool
        True if seq_id needs to be identical
        False if we just have partial identifier

    Returns
    -------
    lengths : dict
        {name:length of the sequence}
    """
    # Normalise None to an empty list so the lookup helper always receives a
    # sized container.
    records = get_seq_dict_from_file(f, seq_ids or [], file_type, equal)

    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    return {seq_id: len(sequence) for seq_id, sequence in records.items()}

f = "/projects/ps-yeolab/genomes/hg38/gencode/v24/GRCh38.primary_assembly.genome.fa"
# An empty id list means "no filtering": sizes come back for every sequence.
seq_ids = []

# Pass seq_ids explicitly so that editing the list above actually takes effect.
d = get_seq_sizes(f, seq_ids)

{'chr1': 249250621,
 'chr10': 135534747,
 'chr11': 135006516,
 'chr12': 133851895,
 'chr13': 115169878,
 'chr14': 107349540,
 'chr15': 102531392,
 'chr16': 90354753,
 'chr17': 81195210,
 'chr18': 78077248,
 'chr19': 59128983,
 'chr2': 243199373,
 'chr20': 63025520,
 'chr21': 48129895,
 'chr22': 51304566,
 'chr3': 198022430,
 'chr4': 191154276,
 'chr5': 180915260,
 'chr6': 171115067,
 'chr7': 159138663,
 'chr8': 146364022,
 'chr9': 141213431,
 'chrM': 16571,
 'chrX': 155270560,
 'chrY': 59373566}

In [2]:
### this subsets the fasta file so each record keeps only its first N bases (1,000,000 here)

from Bio import SeqIO
# Number of leading bases to keep from every record.
max_bases = 1000000

chr19_fasta = "/projects/ps-yeolab3/cellrangerdatasets/chr19.fa"
outfile = "/projects/ps-yeolab3/cellrangerdatasets/hg19chr19kbp550_CELLRANGER_REFERENCE/chr19.1M.fa"

records = []
# The context manager closes the input even if parsing raises part-way through.
# Plain text mode is used because the "U" flag is rejected by Python 3.11+.
with open(chr19_fasta) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # Slicing past the end is safe: shorter records are kept whole.
        record.seq = record.seq[:max_bases]
        records.append(record)
        print(len(record.seq))

SeqIO.write(records, outfile, "fasta")

1000000
