# Bioinformatics Database

## Import library

In [1]:
# Import library
from Bio import SeqIO

## Import and parse dataset

In [2]:
# Load the GenBank record from the local input file
genbank_file = "sequence.gb"
record = SeqIO.read(genbank_file, format="genbank")

## Extract metadata and sequence from dataset

In [3]:
# Show the record's identifying metadata and a preview of its sequence.
# A single print call emits one field per line via sep="\n".
print(
    f"Record ID: {record.id}",
    f"Record description: {record.description}",
    f"Organism: {record.annotations['organism']}",
    f"Sequence length: {len(record.seq)}",
    f"Sequence (first 100 bases): {record.seq[:100]}",
    sep="\n",
)

Record ID: LT934502.1
Record description: Homo sapiens mRNA for KIR3DL2 protein (KIR3DL2 gene), isolate human, allele KIR3DL2-011:01
Organism: Homo sapiens
Sequence length: 1368
Sequence (first 100 bases): ATGTCGCTCACGGTCGTCAGCATGGCGTGCGTTGGGTTCTTCTTGCTGCAGGGGGCCTGGCCACTCATGGGTGGTCAGGACAAACCCTTCCTGTCTGCCC


## Save sequence to a GenBank file

In [4]:
# Save the record in GenBank format.
# The output path lives in its own variable so that (a) the confirmation
# message always names the file actually written, and (b) `genbank_file`
# (the input path defined earlier) is not rebound to a closed file handle.
# NOTE(review): the file name says "BRCA1" while the record description
# printed earlier refers to KIR3DL2 — confirm the intended file name.
output_path = "BRCA1.genbank"
with open(output_path, "w") as output_handle:
    SeqIO.write(record, output_handle, "genbank")

print(f"Sequence saved in genbank format as {output_path}")

Sequence saved in genbank format as BRCA1.fasta


# Web Scraping in Python

## Import library

In [1]:
# Import library
import requests

## Setup requests

In [12]:
# Set request parameters
#
# NCBI E-utilities API (efetch.fcgi).
#
# The E-utilities URL needs parameters to specify what to fetch.
# At minimum it needs:
#   - db      (database)
#   - id      (the sequence identifier)
#   - rettype (return type)
#   - retmode (return mode)
#
# Example: the record shown in FASTA format by the nuccore web interface at
#   https://ncbi.nlm.nih.gov/nuccore/LT934502.1?report=fasta&log$=seqview&format=text
# corresponds to this E-utilities request:
#   https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=LT934502.1&rettype=fasta&retmode=text
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

params = {
    "db": "nuccore",  # Database
    "id": "LT934502",  # Accession number (unversioned; the example URL above uses LT934502.1)
    "rettype": "fasta",  # Return type
    "retmode": "text",  # Return mode
}

## Fetch the sequence

In [None]:
# Fetch sequence
# Reset first so a failed request cannot leave a value from an earlier run
# of this cell lingering in the kernel.
fasta_data = None

# requests has no default timeout; without one an unresponsive server
# would block this cell indefinitely.
response = requests.get(url, params=params, timeout=30)

if response.status_code == 200:
    fasta_data = response.text
    print(f"FASTA Data:\n{fasta_data[:500]}...")  # print the first 500 characters
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")



FASTA Data:
>LT934502.1 Homo sapiens mRNA for KIR3DL2 protein (KIR3DL2 gene), isolate human, allele KIR3DL2-011:01
ATGTCGCTCACGGTCGTCAGCATGGCGTGCGTTGGGTTCTTCTTGCTGCAGGGGGCCTGGCCACTCATGG
GTGGTCAGGACAAACCCTTCCTGTCTGCCCGGCCCAGCACTGTGGTGCCTCGAGGAGGACACGTGGCTCT
TCAGTGTCACTATCGTCGTGGGTTTAACAATTTCATGCTGTACAAAGAAGACAGAAGCCACGTTCCCATC
TTCCACGGCAGAATATTCCAGGAGAGCTTCATCATGGGCCCTGTGACCCCAGCACATGCAGGGACCTACA
GATGTCGGGGTTCACGCCCACACTCCCTCACTGGGTGGTCGGCACCCAGCAACCCCGTGGTGATCATGGT
CACAGGAAACCACAGAAAACCTTCCCTCCTGGCCCACCCAGG...
