In [None]:
import os
import pysam
import pandas as pd
from Bio import SeqIO

# Bioinformatics File Formats Notebook

## Introduction
In this notebook, we will explore some of the most common bioinformatics file formats: FASTA, BAM, BED, and VCF. We will see how these files are structured and how to read them using Python.


## Requirements
To follow along with this notebook, you need to install the following libraries:
```bash
pip install biopython pysam pandas
```

### FASTA Format
#### Description
The FASTA format is used to represent nucleotide or protein sequences. Each sequence begins with a description line starting with '>', followed by lines of sequence data.

##### Reading FASTA Files in Python
We will use the Biopython library to read FASTA files.

In [None]:
from Bio import SeqIO

# Walk through every record in the FASTA file and show its identifier
# followed by its sequence, one per line.
fasta_file = "example.fasta"
for record in SeqIO.parse(fasta_file, format="fasta"):
    # A single print with a newline separator emits the same two lines
    # per record as two separate print calls would.
    print(f"ID: {record.id}", f"Sequence: {record.seq}", sep="\n")

### BAM Format

#### Description
The BAM format is the binary version of the SAM format, used to store sequence alignments. It is space-efficient and allows for fast access.

BAM files cannot be displayed directly as text because BAM is a binary format.

##### Reading BAM Files in Python
We will use the pysam library to read BAM files.

In [None]:
import pysam

# Read BAM file
bam_file = "example.bam"

# Open the alignment file in a `with` block so the underlying file handle is
# closed as soon as iteration finishes, even if an exception is raised while
# reading. Mode "rb" means "read, binary (BAM)".
with pysam.AlignmentFile(bam_file, "rb") as bam:
    # NOTE: per the pysam documentation, fetch() relies on an index
    # (e.g. example.bam.bai created with `samtools index`) and yields the
    # mapped reads; without an index it raises ValueError.
    for read in bam.fetch():
        print(f"Read ID: {read.query_name}")
        print(f"Sequence: {read.query_sequence}")
        # reference_start is the 0-based leftmost aligned reference position.
        print(f"Alignment start: {read.reference_start}")

### BED Format

#### Description
The BED format is used to describe genomic features such as regions of interest using coordinates. The first three columns are: chromosome name, feature start, and feature end.

#### Example BED File

In [None]:
import pandas as pd

# Read BED file
bed_file = "example.bed"

# BED files are tab-separated and carry no header row, so the column names
# for the first four fields are supplied explicitly.
#
# index_col=False matters: by default, when a line has MORE fields than
# `names` (e.g. a BED6 or BED12 file), pandas silently promotes the leading
# columns to the index, which would attach "chrom"/"start"/"end"/"name" to
# the wrong fields. With index_col=False the first four fields are always the
# ones that get labelled.
bed_df = pd.read_csv(
    bed_file,
    sep="\t",
    header=None,
    names=["chrom", "start", "end", "name"],
    index_col=False,
)

# Preview only the first rows; real BED files can hold millions of intervals.
bed_df.head()

### VCF Format

#### Description
The VCF (Variant Call Format) is used to store genomic variants such as SNPs, indels, and structural variants.

In [None]:
import pysam

# Read VCF file
vcf_file = "example.vcf"

# Open the variant file in a `with` block so the file handle is released when
# the loop ends, even if an exception is raised while reading.
with pysam.VariantFile(vcf_file) as vcf:
    # fetch() with no region iterates over every record in file order.
    for record in vcf.fetch():
        print(f"Chromosome: {record.chrom}")
        # record.pos is the 1-based position, as written in the VCF file.
        print(f"Position: {record.pos}")
        print(f"ID: {record.id}")
        print(f"Reference: {record.ref}")
        print(f"Alternate: {record.alts}")
        print(f"Quality: {record.qual}")
        print(f"Filter: {record.filter.keys()}")
        # record.info is a mapping-like pysam object; convert it to a plain
        # dict so the INFO key/value pairs are shown rather than an opaque
        # object representation.
        print(f"Info: {dict(record.info)}")