Getting started with Biopython
[Getting started with Biopython — original Kaggle notebook](https://www.kaggle.com/code/mylesoneill/getting-started-with-biopython)

In [None]:
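# Uncomment to install Biopython into this kernel's environment
# (the outputs shown in this notebook were produced with v1.84):
# %pip install -q biopython==1.84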

In [2]:
import numpy as np
import pandas as pd
import Bio
# Report the installed Biopython version so the outputs below can be reproduced.
print(f"Biopython v{Bio.__version__}")

Biopython v1.84


In [3]:
from Bio.Seq import Seq

# Wrap a short DNA string in a Seq object; printing it shows the raw letters.
my_seq = Seq("AGTACACTGGT")
print(str(my_seq))

AGTACACTGGT


### The FASTA file read further below (`gene.fna`) holds the *Homo sapiens* IGHG1 gene sequences — see NCBI Gene: https://www.ncbi.nlm.nih.gov/gene/3500

In [4]:
from Bio.Seq import Seq

# NOTE(review): this cell repeats the Seq example from the previous cell verbatim.
# Build a Seq from a literal DNA string and print its letters.
my_seq = Seq("AGTACACTGGT")
print(str(my_seq))

AGTACACTGGT


In [5]:
from Bio import SeqIO 

# Cap on how many FASTA records are kept and summarised.
MAX_RECORDS = 6

sequences = []

# Open the file explicitly so the handle is closed as soon as the block exits,
# even when the loop stops before the end of the file.
with open("gene.fna") as fasta_handle:
  for seq_record in SeqIO.parse(fasta_handle, "fasta"):
    sequences.append(seq_record)
    # One summary line per record: its id and its length with thousands separators.
    print(f"Id: {seq_record.id} \t Length: {len(seq_record):,d}")
    print(f"{seq_record.seq!r}\n")
    if len(sequences) == MAX_RECORDS:
      # Cap reached: stop rather than parse records that would be discarded.
      break

# Number of records actually loaded (at most MAX_RECORDS).
count = len(sequences)

Id: NC_000014.9:c105743070-105741473 	 Length: 1,598
Seq('CCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCACCCTCCTCCAAGAGCACCT...TGA')

Id: NT_187600.1:c210839-209242 	 Length: 1,598
Seq('CCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCACCCTCCTCCAAGAGCACCT...TGA')

Id: NG_001019.6:1137275-1138978 	 Length: 1,704
Seq('CCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCACCCTCCTCCAAGAGCACCT...AAA')

Id: NC_060938.1:c100014268-100012671 	 Length: 1,598
Seq('CCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCACCCTCCTCCAAGAGCACCT...TGA')



In [6]:
# Let's give the loaded sequences short names for easy access later.
# "pa" stands for primary assembly.
# The names map, in order, onto the first four records held in `sequences`.
chr14_pa, chr14_ref, genomic_region, chr14_alternate = (
    sequences[0].seq,
    sequences[1].seq,
    sequences[2].seq,
    sequences[3].seq,
)

In [7]:
# Number of letters in the primary-assembly sequence.
print(f"{len(chr14_pa):d}")

1598


In [8]:
# A Seq supports integer indexing like a string, including negative indices
# that count from the end.
print(f"First Letter: {chr14_pa[0]}")
print(f"Third Letter: {chr14_pa[2]}")
print(f"Last Letter: {chr14_pa[-1]}")

First Letter: C
Third Letter: T
Last Letter: A


In [9]:
# Total sequence length and the number of "G" letters it contains.
print(f"Length:\t{len(chr14_pa)}")
print(f"G Count:\t{chr14_pa.count('G')}")

Length:	1598
G Count:	425


In [10]:
# GC% computed by hand: the share of "G" and "C" letters in the sequence,
# scaled to a percentage.
print(f"GC%:\t{100 * ((chr14_pa.count('G') + chr14_pa.count('C')) / len(chr14_pa))}")

GC%:	61.95244055068836


In [11]:
# biopython package to calculate GC%

from Bio.SeqUtils import gc_fraction

# Multiply the value returned by gc_fraction by 100 to show it as a percentage.
print(f"GC%:\t{100 * gc_fraction(chr14_pa)}")

GC%:	61.95244055068836
