# Small cookbook for parsing FASTA/FASTQ files: recipes that I've needed to use more than once.
- just a quick reference to share

### common imports that I like

In [2]:
import os
from tqdm import tnrange, tqdm_notebook

In [3]:
# Reference inputs are read from input_dir; output_dir is a temporary
# scratch location that holds the demo output.
input_dir, output_dir = (
    '/projects/ps-yeolab3/bay001/codebase/bfx/data/',
    '/projects/ps-yeolab3/bay001/codebase/bfx/scratch/',
)

### This parses a fasta file, searches for a particular set of ids (names_to_search), and prints the matching records

In [4]:
from Bio import SeqIO

# Substrings to look for in each record id (containment, not equality).
names_to_search = ['hsa-let-7g-5p', 'let-7b-5p']

# Plain "r" is used instead of "rU": the "U" flag is rejected by Python 3.11+.
# The with-block closes the file even if parsing raises.
with open(os.path.join(input_dir, "mature.fa"), "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # any() reports each record once, even when several names match its id.
        if any(name in record.id for name in names_to_search):
            print("record: {}, record_obj: {}, record_string: {}".format(record.id, record.seq, str(record.seq)))

record: hsa-let-7b-5p, record_obj: UGAGGUAGUAGGUUGUGUGGUU, record_string: UGAGGUAGUAGGUUGUGUGGUU
record: hsa-let-7g-5p, record_obj: UGAGGUAGUAGUUUGUACAGUU, record_string: UGAGGUAGUAGUUUGUACAGUU
record: mmu-let-7b-5p, record_obj: UGAGGUAGUAGGUUGUGUGGUU, record_string: UGAGGUAGUAGGUUGUGUGGUU
record: rno-let-7b-5p, record_obj: UGAGGUAGUAGGUUGUGUGGUU, record_string: UGAGGUAGUAGGUUGUGUGGUU
record: sme-let-7b-5p, record_obj: UGAGGUAGAUUGUUGGAUGACU, record_string: UGAGGUAGAUUGUUGGAUGACU
record: oan-let-7b-5p, record_obj: UGAGGUAGUAGGUUGUGUGGUU, record_string: UGAGGUAGUAGGUUGUGUGGUU
record: cin-let-7b-5p, record_obj: UGAGGUAGUAGGUUAUGUUGUU, record_string: UGAGGUAGUAGGUUAUGUUGUU
record: mml-let-7b-5p, record_obj: UGAGGUAGUAGGUUGUGUGGUU, record_string: UGAGGUAGUAGGUUGUGUGGUU
record: tgu-let-7b-5p, record_obj: UGAGGUAGUAGGUUGUGUGGUU, record_string: UGAGGUAGUAGGUUGUGUGGUU
record: pma-let-7b-5p, record_obj: UGAGGUAGUAGGUUUUGUAGUU, record_string: UGAGGUAGUAGGUUUUGUAGUU
record: aca-let-7b-5p, record_

### This parses a fasta file, searches for a particular set of ids (names_to_search), and writes the matching records to a separate file

In [4]:
from Bio import SeqIO

# Substrings to look for in each record id (containment, not equality).
names_to_search = ['rna', 'RNA', 'alu']
outfile = os.path.join(output_dir, "small_repelements.fa")

# Plain "r" is used instead of "rU": the "U" flag is rejected by Python 3.11+.
# The with-block closes the input even if parsing or writing raises.
with open(os.path.join(input_dir, "all.ref"), "r") as handle:
    records = []
    for record in SeqIO.parse(handle, "fasta"):
        # any() keeps each record once; an id containing both 'rna' and 'RNA'
        # would otherwise be appended (and written) twice.
        if any(name in record.id for name in names_to_search):
            records.append(record)
    SeqIO.write(records, outfile, "fasta")

### This parses a fasta file and tabulates the length of each sequence in a pandas DataFrame

In [5]:
from Bio import SeqIO
import pandas as pd

# Map each record id to the length of its sequence; a repeated id keeps the
# length of the last record seen.
lengths = {}

# Plain "r" is used instead of "rU": the "U" flag is rejected by Python 3.11+.
# The with-block closes the file even if parsing raises.
with open(os.path.join(input_dir, "mature.fa"), "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        lengths[record.id] = len(record.seq)

# One row per sequence id with a single 'len' column; preview the first rows.
pd.DataFrame(lengths,index=['len']).T.head()

Unnamed: 0,len
aae-bantam-3p,22
aae-bantam-5p,23
aae-let-7,21
aae-miR-1,22
aae-miR-10,22


### This parses a fasta file and renames the ID of each record

In [6]:
from Bio import SeqIO

outfile = os.path.join(output_dir, "ws245_genes.ucsctable.fix_genenames.fa")

# Plain "r" is used instead of "rU": the "U" flag is rejected by Python 3.11+.
# The with-block closes the input even if parsing or writing raises.
with open(os.path.join(input_dir, "ws245_genes.ucsctable.fa"), "r") as handle:
    records = []
    for record in SeqIO.parse(handle, "fasta"):
        # Keep only the third underscore-delimited field of the id; an id with
        # fewer than three fields raises IndexError.
        # NOTE(review): record.description is left untouched, so the written
        # header probably still carries the original title after the new id -
        # confirm that is wanted.
        record.id = record.id.split('_')[2]
        records.append(record)
    SeqIO.write(records, outfile, "fasta")

### This gets the length of each sequence

In [7]:
from Bio import SeqIO

# Plain "r" is used instead of "rU": the "U" flag is rejected by Python 3.11+.
# The with-block closes the file even if parsing raises.
with open(os.path.join(input_dir, "ce10.fa"), "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # Formatting the line explicitly gives the same "<length> <name>"
        # output on Python 2 and 3 (the print statement is Python 2 only).
        print("{} {}".format(len(record.seq), record.name))

15072423 chrI
15279345 chrII
13783700 chrIII
17493793 chrIV
13794 chrM
20924149 chrV
17718866 chrX


### Useful functions using biopython

In [8]:
from Bio import SeqIO
def get_seq_dict_from_file(fn, seq_ids=[], file_type='fasta', equal=True):
    """
    Returns dictionary of {name : sequence}

    Parameters
    ----------

    fn : basestring
        file location of a fasta file
    seq_ids : list or basestring
        list of sequence ids to search. An empty list (the default) or None
        returns all sequences. A single string is treated as one id.
    file_type : basestring
        default "fasta" type file
    equal : bool
        True if seq_id needs to be identical
        False if we just have partial identifier

    Returns
    -------
    records : dict
        {name:sequence}, where name is record.id and sequence is record.seq
        as yielded by SeqIO.parse. If an id occurs more than once in the file,
        the last matching record wins.
    """
    # The default list is shared between calls but is only read, never mutated.
    if seq_ids is None:
        seq_ids = []
    elif isinstance(seq_ids, str):
        # A bare string would otherwise be iterated character by character.
        seq_ids = [seq_ids]

    select_all = len(seq_ids) == 0
    # Exact matching needs only a membership test; build the set once instead
    # of comparing every record against every name.
    exact_ids = set(seq_ids) if equal else None

    records = {}
    for record in SeqIO.parse(fn, file_type):
        if select_all:
            keep = True
        elif equal:
            keep = record.id in exact_ids
        else:
            keep = any(name in record.id for name in seq_ids)
        if keep:
            records[record.id] = record.seq
    return records

def get_seq_sizes(fn, seq_ids=[], file_type='fasta', equal=True):
    """
    Returns dictionary of {name : seqsize}

    Parameters
    ----------

    fn : basestring
        file location of a fasta file
    seq_ids : list
        list of sequence ids to search. Empty field returns all sequences
    equal : bool
        True if seq_id needs to be identical
        False if we just have partial identifier
    file_type : basestring
        default "fasta" type file

    Returns
    -------
    lengths : dict
        {name:length}, one entry per record returned by
        get_seq_dict_from_file for the same arguments
    """
    records = get_seq_dict_from_file(fn, seq_ids, file_type, equal)

    # .items() exists on both Python 2 and Python 3 dicts; .iteritems() is
    # Python 2 only and raises AttributeError on Python 3.
    return {seq_id: len(sequence) for seq_id, sequence in records.items()}

# Just return the chr1 size for now; leave the list empty if you want all of them.
seq_ids = ['chr1']
fn = os.path.join(input_dir, "GRCh38.primary_assembly.genome.fa")

d = get_seq_sizes(fn, seq_ids)
d

{'chr1': 248956422}

### This subsets the fasta file so that each record contains only its first N bases (here N = 1,000,000)

In [9]:
from Bio import SeqIO

# Number of leading nucleotides to keep from each record.
n_bases = 1000000

# NOTE(review): this path is relative, so the file lands in the current
# working directory rather than in output_dir - confirm that is intended.
outfile = "chr19.1M.fa"

# Plain "r" is used instead of "rU": the "U" flag is rejected by Python 3.11+.
# The with-block closes the input even if parsing or writing raises.
with open(os.path.join(input_dir, "chr19.fa"), "r") as handle:
    records = []
    for record in SeqIO.parse(handle, "fasta"):
        record.seq = record.seq[:n_bases]  # take the first n_bases nucleotides
        records.append(record)
        print(len(record.seq))
    SeqIO.write(records, outfile, "fasta")

1000000


### This performs very basic trimming on a fastq file and previews the first few trimmed reads ###

In [11]:
import gzip
from Bio import SeqIO

# preview just 10 records, so we don't print all of the lines in our fastq file
counter = 0
max_count = 10
trim_len = 60  # number of leading bases to keep from each read

# NOTE(review): outfile is defined but nothing is written to it in this cell.
outfile = os.path.join(output_dir, "Barnayard_Original_S3_R2_001.short.fastq.gz")

# use gzip open and 'rt' to open gzipped files; the with-block closes the
# handle even if parsing raises.
with gzip.open(os.path.join(input_dir, "Barnayard_Original_S3_R2_001.fastq.gz"), "rt") as handle:
    # either fastq-solexa, fastq-illumina or 'fastq' which defaults to fastq-sanger
    for record in SeqIO.parse(handle, "fastq"):
        # Check the limit before printing so exactly max_count reads are shown
        # (checking "counter > max_count" after the print showed 12).
        if counter >= max_count:
            break
        # Slice the whole record rather than assigning record.seq, so the
        # per-letter qualities are trimmed together with the sequence.
        record = record[:trim_len]
        print(record.seq, len(record.seq))
        counter += 1

(Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN', SingleLetterAlphabet()), 60)
(Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN', SingleLetterAlphabet()), 60)
(Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN', SingleLetterAlphabet()), 60)
(Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN', SingleLetterAlphabet()), 60)
(Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN', SingleLetterAlphabet()), 60)
(Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN', SingleLetterAlphabet()), 60)
(Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN', SingleLetterAlphabet()), 60)
(Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN', SingleLetterAlphabet()), 60)
(Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN', SingleLetterAlphabet()), 60)
(Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN', SingleLetterAlphabet()), 60)
(Seq('NANNNNNNNNNNNN