# Small cookbook for parsing fastq files that I've needed to use more than once.
- just a quick reference to share

### common imports that I like

In [1]:
import os
from tqdm import tnrange, tqdm_notebook

In [2]:
# temporary output directory for demo purposes
input_dir = '/projects/ps-yeolab3/bay001/codebase/bfx/data/'
output_dir = '/projects/ps-yeolab3/bay001/codebase/bfx/scratch/'

### This parses a fasta file and searches for a particular set of ids (names_to_search), and printing

In [3]:
from Bio.Blast import NCBIWWW

In [4]:
# Parsing a fastq file:

In [5]:
from Bio import SeqIO
import gzip

count = 0 # counter, 
handle = gzip.open('/projects/ps-yeolab3/seqdata/20181204_emily_miseq_3/all_fastqs/SG1_S1_L001_R2_001.fastq.gz', 'rt')
for record in SeqIO.parse(handle, "fastq"):
    print(record)
    
    # just print the first 10 or so records.
    count+=1
    if count > 10:
        break
handle.close()

ID: M01356:185:000000000-D577T:1:1101:15863:1331
Name: M01356:185:000000000-D577T:1:1101:15863:1331
Description: M01356:185:000000000-D577T:1:1101:15863:1331 2:N:0:CGCTCATT+CCTATCCT
Number of features: 0
Per letter annotation for: phred_quality
Seq('CCACTTTTTCAAGTTGATAACGGACTAGCCTTATTTTAACTTGCTATTTCTAGC...TGT', SingleLetterAlphabet())
ID: M01356:185:000000000-D577T:1:1101:15349:1339
Name: M01356:185:000000000-D577T:1:1101:15349:1339
Description: M01356:185:000000000-D577T:1:1101:15349:1339 2:N:0:AGCTCATT+CCTATCCT
Number of features: 0
Per letter annotation for: phred_quality
Seq('CCACTTTTTCAAGTTGATAACGGACTAGCCTTATTTTAACTTGCTATTTCTAGC...CCA', SingleLetterAlphabet())
ID: M01356:185:000000000-D577T:1:1101:16275:1344
Name: M01356:185:000000000-D577T:1:1101:16275:1344
Description: M01356:185:000000000-D577T:1:1101:16275:1344 2:N:0:CGCTCATT+CCTATCCT
Number of features: 0
Per letter annotation for: phred_quality
Seq('CCACTTTTTCAAGTTGATAACGGACTAGCGTCGTGTCGGGGTGTTTCGTCCTTT...TGT', SingleLetterA

### This parses a fasta file and searches for a particular set of ids (names_to_search), and writing to a separate file

In [6]:
from Bio import SeqIO
names_to_search = ['rna', 'RNA', 'alu']
handle = open(os.path.join(input_dir, "all.ref"), "rU")
outfile = os.path.join(output_dir, "small_repelements.fa")
records = []
for record in SeqIO.parse(handle, "fasta"):
    for name in names_to_search:
        if name in record.id:
            records.append(record)
SeqIO.write(records,outfile,"fasta")
handle.close()

  This is separate from the ipykernel package so we can avoid doing imports until


FileNotFoundError: [Errno 2] No such file or directory: '/projects/ps-yeolab3/bay001/codebase/bfx/scratch/small_repelements.fa'

### This parses a fasta file and searches for a particular set of ids (names_to_search), and printing

In [None]:
from Bio import SeqIO
import pandas as pd

lengths = {}
handle = open(os.path.join(input_dir, "mature.fa"), "rU")
for record in SeqIO.parse(handle, "fasta"):
    lengths[record.id] = len(record.seq)
handle.close()
pd.DataFrame(lengths,index=['len']).T.head()

### This parses a fasta file renames the ID

In [None]:
from Bio import SeqIO
handle = open(os.path.join(input_dir, "ws245_genes.ucsctable.fa"), "rU")
outfile = os.path.join(output_dir, "ws245_genes.ucsctable.fix_genenames.fa")
records = []
x = 0
for record in SeqIO.parse(handle, "fasta"):
    record.id = record.id.split('_')[2]
    records.append(record)
SeqIO.write(records,outfile,"fasta")
handle.close()    

### This gets len of each sequence

In [None]:
from Bio import SeqIO
handle = open(os.path.join(input_dir, "ce10.fa"), "rU")

for record in SeqIO.parse(handle, "fasta"):
    print len(record.seq), record.name
handle.close()

### Useful functions using biopython

In [None]:
from Bio import SeqIO
def get_seq_dict_from_file(fn, seq_ids=[], file_type='fasta', equal=True):
    """
    Returns dictionary of {name : sequence}
     
    Parameters
    ----------
    
    fn : basestring
        file location of a fasta file
    seq_ids : list
        list of sequence ids to search. Empty field returns all sequences
    equal : bool
        True if seq_id needs to be identical
        False if we just have partial identifier
    file_type : basestring
        default "fasta" type file
    Returns
    -------
    records : dict
        {name:sequence}
    """
    records = {}
    for record in SeqIO.parse(fn, file_type):
        if len(seq_ids) > 0:
            for name in seq_ids:
                if equal:
                    if name == record.id:
                        records[record.id] = record.seq
                else:
                    if name in record.id:
                        records[record.id] = record.seq
        else:
            records[record.id] = record.seq
    return records

def get_seq_sizes(fn, seq_ids=[], file_type='fasta', equal=True):
    """
    Returns dictionary of {name : seqsize}
    
    Parameters
    ----------
    
    fn : basestring
        file location of a fasta file
    seq_ids : list
        list of sequence ids to search. Empty field returns all sequences
    equal : bool
        True if seq_id needs to be identical
        False if we just have partial identifier
    file_type : basestring
        default "fasta" type file

    Returns
    -------

    """
    lengths = {}
    records = get_seq_dict_from_file(fn, seq_ids, file_type, equal)
    
    for seq_id, sequence in records.iteritems():
        lengths[seq_id] = len(sequence)
    return lengths

fn = os.path.join(input_dir, "GRCh38.primary_assembly.genome.fa")
seq_ids = ['chr1']  # just return the chr1 size for now, leave list empty of you want all of them.

d = get_seq_sizes(fn=fn, seq_ids=seq_ids)
d

### this subsets the fasta file to just contain XYZ number of bases

In [None]:
from Bio import SeqIO
handle = open(os.path.join(input_dir, "chr19.fa"), "rU")
outfile = "chr19.1M.fa"
records = []
for record in SeqIO.parse(handle, "fasta"):
    record.seq = record.seq[:1000000]  # take first 1,000,000 nucleotides
    records.append(record)
    print(len(record.seq))
SeqIO.write(records,outfile,"fasta")
handle.close()

### this performs very basic trimming on a fastq file ###

In [None]:
import gzip
from Bio import SeqIO

# preview just 10 records, so we don't print all of the lines in our fastq file
counter = 0
max_count = 10

handle = gzip.open(os.path.join(input_dir, "Barnayard_Original_S3_R2_001.fastq.gz"), "rt")  # use gzip open and 'rt' to open gzipped files
outfile = os.path.join(output_dir, "Barnayard_Original_S3_R2_001.short.fastq.gz")
for record in SeqIO.parse(handle, "fastq"):  # either fastq-solexa, fastq-illumina or 'fastq' which defaults to fastq-sanger
    record.seq = record.seq[:60]  # subset and print first 60 bases
    print(record.seq, len(record.seq))
    if counter > max_count:
        break
    else:
        counter+=1

handle.close()