## Análise dos genes

In [2]:
# Importação dos módulos utilizados
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqFeature

### 01: Gene TGF-β1 (transforming growth factor beta 1 )

In [2]:
# Leitura do documento em formato genbank:
record = SeqIO.read("TGFB1.gb", "genbank") 
record

SeqRecord(seq=Seq('ATGATCCACCCCCCCCTAAGCCTCCTAAAGTGCTGGGATTAGAGGTGTGAGCCA...CTC'), id='NG_013364.1', name='NG_013364', description='Homo sapiens transforming growth factor beta 1 (TGFB1), RefSeqGene on chromosome 19', dbxrefs=[])

In [7]:
# Visualização de características das sequências
print(len(record.seq))
print(record.id)
print(record.description)
print(record.name)

30020
NG_013364.1
Homo sapiens transforming growth factor beta 1 (TGFB1), RefSeqGene on chromosome 19
NG_013364


In [8]:
# Visualização das anotações do ficheiro
print(record.annotations)

{'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '19-JAN-2024', 'accessions': ['NG_013364'], 'sequence_version': 1, 'keywords': ['RefSeq', 'RefSeqGene'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Processing of transforming growth factor beta 1 precursor by human furin convertase', ...), Reference(title='Camurati-Engelmann Disease', ...), Reference(title='Characterization of the promoter region of the human transforming growth factor-beta 1 gene', ...), Reference(title='Latent high molecular weight complex of transforming growth factor beta 1. Purification from human platelets and structural characterization', ...), Reference(title='Intron-exon structure of the human transforming growth factor-beta precurso

In [9]:
# Visualização da txonomia do organismos da sequência anlisada e a base de dados
print(record.dbxrefs)
print(record.annotations["source"] )
print(record.annotations["taxonomy"])

[]
Homo sapiens (human)
['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']


In [10]:
# Visualização das features e sua quantidade
print("Temos", len(record.features), "features")
record.features

Temos 25 features


[SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(30020), strand=1), type='source', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(0), ExactPosition(4510), strand=1), type='gene', qualifiers=...),
 SeqFeature(CompoundLocation([SimpleLocation(BeforePosition(904), ExactPosition(1030), strand=1), SimpleLocation(ExactPosition(3913), ExactPosition(4510), strand=1)], 'join'), type='mRNA', location_operator='join', qualifiers=...),
 SeqFeature(CompoundLocation([SimpleLocation(BeforePosition(904), ExactPosition(1030), strand=1), SimpleLocation(ExactPosition(3913), ExactPosition(4227), strand=1)], 'join'), type='CDS', location_operator='join', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(5004), ExactPosition(28604), strand=1), type='gene', qualifiers=...),
 SeqFeature(CompoundLocation([SimpleLocation(ExactPosition(5004), ExactPosition(6237), strand=1), SimpleLocation(ExactPosition(10471), ExactPosition(10632), strand=1), SimpleLocation(ExactPosition(14062), Exa

In [11]:
# Determinação dos tipos de feature e suas localizações
for t in record.features:
    print(t.type, t.location)

source [0:30020](+)
gene [<0:4510](+)
mRNA join{[<904:1030](+), [3913:4510](+)}
CDS join{[<904:1030](+), [3913:4227](+)}
gene [5004:28604](+)
mRNA join{[5004:6237](+), [10471:10632](+), [14062:14180](+), [16679:16757](+), [16896:17044](+), [26645:26799](+), [27716:28604](+)}
exon [5004:6237](+)
CDS join{[5882:6237](+), [10471:10632](+), [14062:14180](+), [16679:16757](+), [16896:17044](+), [26645:26799](+), [27716:27875](+)}
sig_peptide [5882:5969](+)
mat_peptide join{[5969:6237](+), [10471:10632](+), [14062:14180](+), [16679:16757](+), [16896:17018](+)}
misc_feature [5969:6104](+)
misc_feature join{[6104:6237](+), [10471:10632](+), [14062:14180](+), [16679:16757](+), [16896:16997](+)}
misc_feature [6125:6128](+)
misc_feature [10521:10524](+)
misc_feature [14071:14074](+)
misc_feature join{[16720:16757](+), [16896:16940](+)}
misc_feature [16913:16922](+)
misc_feature [17015:17021](+)
mat_peptide join{[17018:17044](+), [26645:26799](+), [27716:27872](+)}
exon [10471:10632](+)
exon [1406

In [12]:
# Saber quais features que são do tipo "CDS"
feat_cds= []
for i in range(len(record.features)):
    if record.features[i].type == "CDS":
        feat_cds.append(i)
        print(record.features[i])
feat_cds

type: CDS
location: join{[<904:1030](+), [3913:4227](+)}
qualifiers:
    Key: codon_start, Value: ['3']
    Key: db_xref, Value: ['CCDS:CCDS12579.1', 'GeneID:80776', 'HGNC:HGNC:28636', 'MIM:611951']
    Key: exception, Value: ['annotated by transcript or proteomic data']
    Key: gene, Value: ['B9D2']
    Key: gene_synonym, Value: ['ICIS-1; JBTS34; MKS10; MKSR-2; MKSR2']
    Key: inference, Value: ['similar to AA sequence (same species):RefSeq:NP_085055.2']
    Key: note, Value: ['MKS1-related protein 2; involved in cIlia stability-1; B9 protein domain 2']
    Key: product, Value: ['B9 domain-containing protein 2']
    Key: protein_id, Value: ['NP_085055.2']
    Key: translation, Value: ['MAEVHVIGQIIGASGFSESSLFCKWGIHTGAAWKLLSGVREGQTQVDTPQIGDMAYWSHPIDLHFATKGLQGWPRLHFQVWSQDSFGRCQLAGYGFCHVPSSPGTHQLACPTWRPLGSWREQLARAFVGGGPQLLHGDTIYSGADRYRLHTAAGGTVHLEIGLLLRNFDRYGVEC']

type: CDS
location: join{[5882:6237](+), [10471:10632](+), [14062:14180](+), [16679:16757](+), [16896:17044](+), [26645:267

[3, 7]

In [16]:
print(record.features[4].qualifiers)
print(record.features[5].qualifiers)

{'gene': ['TGFB1'], 'gene_synonym': ['CED; DPD1; IBDIMDE; LAP; TGF-beta1; TGFB; TGFbeta'], 'note': ['Derived by automated computational analysis using gene prediction method: BestRefSeq.'], 'codon_start': ['1'], 'product': ['transforming growth factor beta-1 proprotein preproprotein'], 'protein_id': ['NP_000651.3'], 'db_xref': ['CCDS:CCDS33031.1', 'Ensembl:ENSP00000221930.4', 'GeneID:7040', 'HGNC:HGNC:11766', 'MIM:190180'], 'translation': ['MPPSGLRLLPLLLPLLWLLVLTPGRPAAGLSTCKTIDMELVKRKRIEAIRGQILSKLRLASPPSQGEVPPGPLPEAVLALYNSTRDRVAGESAEPEPEPEADYYAKEVTRVLMVETHNEIYDKFKQSTHSIYMFFNTSELREAVPEPVLLSRAELRLLRLKLKVEQHVELYQKYSNNSWRYLSNRLLAPSDSPEWLSFDVTGVVRQWLSRGGEIEGFRLSAHCSCDSRDNTLQVDINGFTTGRRGDLATIHGMNRPFLLLMATPLERAQHLQSSRHRRALDTNYCFSSTEKNCCVRQLYIDFRKDLGWKWIHEPKGYHANFCLGPCPYIWSLDTQYSKVLALYNQHNPGASAAPCCVPQALEPLPIVYYVGRKPKVEQLSNMIVRSCKCS']}
{'gene': ['TGFB1'], 'gene_synonym': ['CED; DPD1; IBDIMDE; LAP; TGF-beta1; TGFB; TGFbeta'], 'note': ['Derived by automated computational analysis using gene predi

In [13]:
# Saber a proteína codificada nos "CDS"
for i in feat_cds:
    coding_dna = record.features[i].extract(record.seq)
    print("DNA: ", coding_dna)
    print("Proteína: ", coding_dna.translate())

DNA:  GGGCGGCATGGAAGCTCCTGTCAGGCGTGCGGGAGGGCCAAACGCAAGTGGACACCCCGCAGATAGGGGACATGGCTTACTGGTCCCACCCCATCGACCTGCACTTCGCCACCAAAGGTCTTCAAGGCTGGCCCCGGCTCCATTTCCAGGTGTGGTCCCAGGACAGCTTTGGCCGCTGCCAGCTTGCAGGCTATGGATTTTGCCATGTGCCCAGTAGCCCGGGCACCCACCAGCTGGCCTGCCCCACGTGGCGGCCCCTGGGCAGTTGGCGAGAACAGTTGGCACGGGCTTTCGTGGGTGGTGGGCCGCAGCTGCTGCATGGGGACACCATCTACAGTGGGGCCGACCGCTATCGCCTGCACACAGCTGCTGGTGGCACCGTGCACCTGGAGATCGGCCTGCTGCTCCGCAACTTCGACCGCTACGGCGTGGAGTGCTGA
Proteína:  GRHGSSCQACGRAKRKWTPRR*GTWLTGPTPSTCTSPPKVFKAGPGSISRCGPRTALAAASLQAMDFAMCPVARAPTSWPAPRGGPWAVGENSWHGLSWVVGRSCCMGTPSTVGPTAIACTQLLVAPCTWRSACCSATSTATAWSA
DNA:  ATGCCGCCCTCCGGGCTGCGGCTGCTGCCGCTGCTGCTACCGCTGCTGTGGCTACTGGTGCTGACGCCTGGCCGGCCGGCCGCGGGACTATCCACCTGCAAGACTATCGACATGGAGCTGGTGAAGCGGAAGCGCATCGAGGCCATCCGCGGCCAGATCCTGTCCAAGCTGCGGCTCGCCAGCCCCCCGAGCCAGGGGGAGGTGCCGCCCGGCCCGCTGCCCGAGGCCGTGCTCGCCCTGTACAACAGCACCCGCGACCGGGTGGCCGGGGAGAGTGCAGAACCGGAGCCCGAGCCTGAGGCCGACTACTACGCCAAGGAGGTCACCCGCGTGCTAATGGTGGAAACCCACAACGAAATCTATGACAAGTTCAAGCAGAGTACACACAG



In [15]:
# Saber o número de features do tipo "gene"
feat_gene = []
for j in range(len(record.features)):
    if record.features[j].type == 'gene':
        feat_gene.append(j)
print('Número de features tipo gene: ', len(feat_gene))

Número de features tipo gene:  2


<span style='color: red;'>O código se repete para os demais genes, seguindo a mesma sequência e ideologia dos códigos descritos para o gene: TGFB1.</span>

### 02: Gene IL-10 (Interleukin 10)

In [3]:
record2 = SeqIO.read("IL10.gb", "genbank") 
record2

SeqRecord(seq=Seq('AGGCATGGGGTAACTACACCAAGAAGGCTGCCATTTTGCTTAGCATGAGAGTAG...TTA'), id='NG_012088.1', name='NG_012088', description='Homo sapiens interleukin 10 (IL10), RefSeqGene (LRG_1230) on chromosome 1', dbxrefs=[])

In [4]:
print(len(record2.seq))
print(record2.id)
print(record2.description)
print(record2.name)

11892
NG_012088.1
Homo sapiens interleukin 10 (IL10), RefSeqGene (LRG_1230) on chromosome 1
NG_012088


In [5]:
print(record2.annotations)

{'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '05-OCT-2020', 'accessions': ['NG_012088'], 'sequence_version': 1, 'keywords': ['RefSeq', 'RefSeqGene'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Identification of functional domains on human interleukin 10', ...), Reference(title="Mapping of the human IL10 gene and further characterization of the 5' flanking sequence", ...), Reference(title='Isolation and expression of human cytokine synthesis inhibitory factor cDNA clones: homology to Epstein-Barr virus open reading frame BCRFI', ...)], 'comment': 'REVIEWED REFSEQ: This record has been curated by NCBI staff in\ncollaboration with Karyn Megy. The reference sequence was derived\nfrom AL513315.15 and AL59184

In [6]:
print(record2.dbxrefs)
print(record2.annotations["source"] )
print(record2.annotations["taxonomy"])

[]
Homo sapiens (human)
['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']


In [7]:
print("Temos", len(record2.features), "features")
record2.features

Temos 13 features


[SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(11892), strand=1), type='source', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(0), ExactPosition(6722), strand=-1), type='gene', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(5000), ExactPosition(9893), strand=1), type='gene', qualifiers=...),
 SeqFeature(CompoundLocation([SimpleLocation(ExactPosition(5000), ExactPosition(5224), strand=1), SimpleLocation(ExactPosition(6079), ExactPosition(6139), strand=1), SimpleLocation(ExactPosition(6435), ExactPosition(6588), strand=1), SimpleLocation(ExactPosition(7600), ExactPosition(7666), strand=1), SimpleLocation(ExactPosition(8766), ExactPosition(9893), strand=1)], 'join'), type='mRNA', location_operator='join', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(5000), ExactPosition(5224), strand=1), type='exon', qualifiers=...),
 SeqFeature(CompoundLocation([SimpleLocation(ExactPosition(5059), ExactPosition(5224), strand=1), SimpleLocation(ExactPositio

In [8]:
for t in record2.features:
    print(t.type, t.location)

source [0:11892](+)
gene [<0:6722](-)
gene [5000:9893](+)
mRNA join{[5000:5224](+), [6079:6139](+), [6435:6588](+), [7600:7666](+), [8766:9893](+)}
exon [5000:5224](+)
CDS join{[5059:5224](+), [6079:6139](+), [6435:6588](+), [7600:7666](+), [8766:8859](+)}
sig_peptide [5059:5113](+)
mat_peptide join{[5113:5224](+), [6079:6139](+), [6435:6588](+), [7600:7666](+), [8766:8856](+)}
misc_feature [7621:7624](+)
exon [6079:6139](+)
exon [6435:6588](+)
exon [7600:7666](+)
exon [8766:9893](+)


In [22]:
feat_cds= []
for i in range(len(record2.features)):
    if record2.features[i].type == "CDS":
        feat_cds.append(i)
        print(record2.features[i])
feat_cds

type: CDS
location: join{[5059:5224](+), [6079:6139](+), [6435:6588](+), [7600:7666](+), [8766:8859](+)}
qualifiers:
    Key: codon_start, Value: ['1']
    Key: db_xref, Value: ['CCDS:CCDS1467.1', 'GeneID:3586', 'LRG:p1', 'HGNC:HGNC:5962', 'MIM:124092']
    Key: gene, Value: ['IL10']
    Key: gene_synonym, Value: ['CSIF; GVHDS; IL-10; IL10A; TGIF']
    Key: note, Value: ['T-cell growth inhibitory factor; cytokine synthesis inhibitory factor']
    Key: product, Value: ['interleukin-10 isoform 1 precursor']
    Key: protein_id, Value: ['NP_000563.1']
    Key: translation, Value: ['MHSSALLCCLVLLTGVRASPGQGTQSENSCTHFPGNLPNMLRDLRDAFSRVKTFFQMKDQLDNLLLKESLLEDFKGYLGCQALSEMIQFYLEEVMPQAENQDPDIKAHVNSLGENLKTLRLRLRRCHRFLPCENKSKAVEQVKNAFNKLQEKGIYKAMSEFDIFINYIEAYMTMKIRN']



[5]

In [24]:
for i in feat_cds:
    coding_dna = record2.features[i].extract(record2.seq)
    print("DNA: ", coding_dna)
    print("Proteína: ", coding_dna.translate())

DNA:  ATGCACAGCTCAGCACTGCTCTGTTGCCTGGTCCTCCTGACTGGGGTGAGGGCCAGCCCAGGCCAGGGCACCCAGTCTGAGAACAGCTGCACCCACTTCCCAGGCAACCTGCCTAACATGCTTCGAGATCTCCGAGATGCCTTCAGCAGAGTGAAGACTTTCTTTCAAATGAAGGATCAGCTGGACAACTTGTTGTTAAAGGAGTCCTTGCTGGAGGACTTTAAGGGTTACCTGGGTTGCCAAGCCTTGTCTGAGATGATCCAGTTTTACCTGGAGGAGGTGATGCCCCAAGCTGAGAACCAAGACCCAGACATCAAGGCGCATGTGAACTCCCTGGGGGAGAACCTGAAGACCCTCAGGCTGAGGCTACGGCGCTGTCATCGATTTCTTCCCTGTGAAAACAAGAGCAAGGCCGTGGAGCAGGTGAAGAATGCCTTTAATAAGCTCCAAGAGAAAGGCATCTACAAAGCCATGAGTGAGTTTGACATCTTCATCAACTACATAGAAGCCTACATGACAATGAAGATACGAAACTGA
Proteína:  MHSSALLCCLVLLTGVRASPGQGTQSENSCTHFPGNLPNMLRDLRDAFSRVKTFFQMKDQLDNLLLKESLLEDFKGYLGCQALSEMIQFYLEEVMPQAENQDPDIKAHVNSLGENLKTLRLRLRRCHRFLPCENKSKAVEQVKNAFNKLQEKGIYKAMSEFDIFINYIEAYMTMKIRN*


In [25]:
feat_gene = []
for j in range(len(record2.features)):
    if record2.features[j].type == 'gene':
        feat_gene.append(j)
print('Número de features tipo gene: ', len(feat_gene), feat_gene)

Número de features tipo gene:  2 [1, 2]


In [26]:
print(record2.features[1].qualifiers)
print(record2.features[2].qualifiers)

{'gene': ['IL19'], 'gene_synonym': ['IL-10C; MDA1; NG.1; ZMDA1'], 'note': ['interleukin 19'], 'db_xref': ['GeneID:29949', 'HGNC:HGNC:5990', 'MIM:605687']}
{'gene': ['IL10'], 'gene_synonym': ['CSIF; GVHDS; IL-10; IL10A; TGIF'], 'note': ['interleukin 10'], 'db_xref': ['GeneID:3586', 'HGNC:HGNC:5962', 'MIM:124092']}


### 03: Gene IL-13 (Interleukin 13)

In [28]:
record3 = SeqIO.read("IL13.gb", "genbank") 
record3

SeqRecord(seq=Seq('TTTTGCATATCTGTACTTTACACAATAACATTTTGTGTAATCTTTTTTTAAATG...CAC'), id='NG_012090.1', name='NG_012090', description='Homo sapiens interleukin 13 (IL13), RefSeqGene on chromosome 5', dbxrefs=[])

In [29]:
print(len(record3.seq))
print(record3.id)
print(record3.description)
print(record3.name)

9937
NG_012090.1
Homo sapiens interleukin 13 (IL13), RefSeqGene on chromosome 5
NG_012090


In [30]:
print(record3.annotations)

{'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '26-DEC-2023', 'accessions': ['NG_012090'], 'sequence_version': 1, 'keywords': ['RefSeq', 'RefSeqGene'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Interleukin-13 is a new human lymphokine regulating inflammatory and immune responses', ...)], 'comment': 'REVIEWED REFSEQ: This record has been curated by NCBI staff. The\nreference sequence was derived from AC004041.1 and AC004039.1.\nThis sequence is a reference standard in the RefSeqGene project.\nSummary: This gene encodes an immunoregulatory cytokine produced\nprimarily by activated Th2 cells. This cytokine is involved in\nseveral stages of B-cell maturation and differentiation. It\nup-regulates CD23 and MHC

In [31]:
print(record3.dbxrefs)
print(record3.annotations["source"] )
print(record3.annotations["taxonomy"])

[]
Homo sapiens (human)
['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']


In [32]:
print("Temos", len(record3.features), "features")
record3.features

Temos 13 features


[SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(9937), strand=1), type='source', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(0), ExactPosition(2982), strand=-1), type='gene', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(2546), ExactPosition(2720), strand=-1), type='ncRNA', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(2878), ExactPosition(2982), strand=-1), type='ncRNA', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(5000), ExactPosition(7938), strand=1), type='gene', qualifiers=...),
 SeqFeature(CompoundLocation([SimpleLocation(ExactPosition(5000), ExactPosition(5188), strand=1), SimpleLocation(ExactPosition(6245), ExactPosition(6299), strand=1), SimpleLocation(ExactPosition(6551), ExactPosition(6656), strand=1), SimpleLocation(ExactPosition(7002), ExactPosition(7938), strand=1)], 'join'), type='mRNA', location_operator='join', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(5000), ExactPosition(5188), strand=1)

In [33]:
for t in record3.features:
    print(t.type, t.location)

source [0:9937](+)
gene [<0:2982](-)
ncRNA [<2546:2720](-)
ncRNA [<2878:2982](-)
gene [5000:7938](+)
mRNA join{[5000:5188](+), [6245:6299](+), [6551:6656](+), [7002:7938](+)}
exon [5000:5188](+)
CDS join{[5014:5188](+), [6245:6299](+), [6551:6656](+), [7002:7110](+)}
sig_peptide [5014:5074](+)
mat_peptide join{[5074:5188](+), [6245:6299](+), [6551:6656](+), [7002:7065](+)}
exon [6245:6299](+)
exon [6551:6656](+)
exon [7002:7938](+)


In [34]:
feat_cds= []
for i in range(len(record3.features)):
    if record3.features[i].type == "CDS":
        feat_cds.append(i)
feat_cds

[7]

In [38]:
for i in feat_cds:
    coding_dna = record3.features[i].extract(record3.seq)
    print("DNA: ", coding_dna)
    print("Proteína: ", coding_dna.translate(), i)

DNA:  ATGCATCCGCTCCTCAATCCTCTCCTGTTGGCACTGGGCCTCATGGCGCTTTTGTTGACCACGGTCATTGCTCTCACTTGCCTTGGCGGCTTTGCCTCCCCAGGCCCTGTGCCTCCCTCTACAGCCCTCAGGGAGCTCATTGAGGAGCTGGTCAACATCACCCAGAACCAGAAGGCTCCGCTCTGCAATGGCAGCATGGTATGGAGCATCAACCTGACAGCTGGCATGTACTGTGCAGCCCTGGAATCCCTGATCAACGTGTCAGGCTGCAGTGCCATCGAGAAGACCCAGAGGATGCTGAGCGGATTCTGCCCGCACAAGGTCTCAGCTGGGCAGTTTTCCAGCTTGCATGTCCGAGACACCAAAATCGAGGTGGCCCAGTTTGTAAAGGACCTGCTCTTACATTTAAAGAAACTTTTTCGCGAGGGACAGTTCAACTGA
Proteína:  MHPLLNPLLLALGLMALLLTTVIALTCLGGFASPGPVPPSTALRELIEELVNITQNQKAPLCNGSMVWSINLTAGMYCAALESLINVSGCSAIEKTQRMLSGFCPHKVSAGQFSSLHVRDTKIEVAQFVKDLLLHLKKLFREGQFN* 7


In [36]:
feat_gene = []
for j in range(len(record3.features)):
    if record3.features[j].type == 'gene':
        feat_gene.append(j)
print('Número de features tipo gene: ', len(feat_gene), feat_gene)

Número de features tipo gene:  2 [1, 4]


In [39]:
print(record3.features[1].qualifiers)
print(record3.features[4].qualifiers)
print(record3.features[7].qualifiers)

{'gene': ['TH2LCRR'], 'gene_synonym': ['TH2-LCR'], 'note': ['T helper type 2 locus control region associated RNA'], 'db_xref': ['GeneID:101927761', 'HGNC:HGNC:40495']}
{'gene': ['IL13'], 'gene_synonym': ['IL-13; P600'], 'note': ['interleukin 13'], 'db_xref': ['GeneID:3596', 'HGNC:HGNC:5973', 'MIM:147683']}
{'gene': ['IL13'], 'gene_synonym': ['IL-13; P600'], 'note': ['isoform 1 precursor is encoded by transcript variant 1'], 'codon_start': ['1'], 'product': ['interleukin-13 isoform 1 precursor'], 'protein_id': ['NP_002179.2'], 'db_xref': ['CCDS:CCDS4157.1', 'GeneID:3596', 'HGNC:HGNC:5973', 'MIM:147683'], 'translation': ['MHPLLNPLLLALGLMALLLTTVIALTCLGGFASPGPVPPSTALRELIEELVNITQNQKAPLCNGSMVWSINLTAGMYCAALESLINVSGCSAIEKTQRMLSGFCPHKVSAGQFSSLHVRDTKIEVAQFVKDLLLHLKKLFREGQFN']}


### 04: Gene STAT6 (Signal transducer and activator of transcription 6)

In [40]:
record4 = SeqIO.read("STAT6.gb", "genbank") 
record4

SeqRecord(seq=Seq('CCTCCAGAGTAGCTGGGATTACAGGCATGCGCCACCACGCATGGCTAATTTTGT...CTG'), id='NG_021272.2', name='NG_021272', description='Homo sapiens signal transducer and activator of transcription 6 (STAT6), RefSeqGene (LRG_1369) on chromosome 12', dbxrefs=[])

In [41]:
print(len(record4.seq))
print(record4.id)
print(record4.description)
print(record4.name)

43736
NG_021272.2
Homo sapiens signal transducer and activator of transcription 6 (STAT6), RefSeqGene (LRG_1369) on chromosome 12
NG_021272


In [39]:
print(record4.annotations)

{'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '09-OCT-2023', 'accessions': ['NG_021272'], 'sequence_version': 2, 'keywords': ['RefSeq', 'RefSeqGene'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'comment': 'REVIEWED REFSEQ: This record has been curated by NCBI staff in\ncollaboration with Shruti Srivastava, Liz Starks. The reference\nsequence was derived from AC023237.34.\nThis sequence is a reference standard in the RefSeqGene project.\nOn Feb 27, 2019 this sequence version replaced NG_021272.1.\nSummary: The protein encoded by this gene is a member of the STAT\nfamily of transcription factors. In response to cytokines and\ngrowth factors, STAT family members are phosphorylated by the\nreceptor associated kinases, and then form homo- o

In [42]:
print(record4.dbxrefs)
print(record4.annotations["source"] )
print(record4.annotations["taxonomy"])

[]
Homo sapiens (human)
['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']


In [43]:
print("Temos", len(record4.features), "features")
record4.features

Temos 43 features


[SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(43736), strand=1), type='source', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(0), ExactPosition(8657), strand=-1), type='gene', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(8108), ExactPosition(8657), strand=-1), type='mRNA', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(8108), ExactPosition(8175), strand=-1), type='CDS', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(8118), ExactPosition(8175), strand=-1), type='sig_peptide', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(8108), ExactPosition(8118), strand=-1), type='mat_peptide', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(25777), ExactPosition(41732), strand=1), type='gene', qualifiers=...),
 SeqFeature(CompoundLocation([SimpleLocation(ExactPosition(25777), ExactPosition(26011), strand=1), SimpleLocation(ExactPosition(28840), ExactPosition(28977), strand=1), SimpleLocation(ExactPosition(29396), Exa

In [44]:
for t in record4.features:
    print(t.type, t.location)

source [0:43736](+)
gene [<0:8657](-)
mRNA [<8108:8657](-)
CDS [<8108:8175](-)
sig_peptide [8118:8175](-)
mat_peptide [8108:8118](-)
gene [25777:41732](+)
mRNA join{[25777:26011](+), [28840:28977](+), [29396:29535](+), [29825:29909](+), [30308:30447](+), [30559:30612](+), [30800:30949](+), [31540:31672](+), [31800:31989](+), [32326:32414](+), [32553:32676](+), [34218:34311](+), [34643:34850](+), [37049:37144](+), [37236:37373](+), [37699:37846](+), [38061:38125](+), [38237:38348](+), [38542:38635](+), [40006:40072](+), [40161:40290](+), [40378:41732](+)}
exon [25777:26011](+)
exon [28840:28977](+)
CDS join{[28861:28977](+), [29396:29535](+), [29825:29909](+), [30308:30447](+), [30559:30612](+), [30800:30949](+), [31540:31672](+), [31800:31989](+), [32326:32414](+), [32553:32676](+), [34218:34311](+), [34643:34850](+), [37049:37144](+), [37236:37373](+), [37699:37846](+), [38061:38125](+), [38237:38348](+), [38542:38635](+), [40006:40072](+), [40161:40290](+), [40378:40568](+)}
misc_fea

In [45]:
feat_cds= []
for i in range(len(record4.features)):
    if record4.features[i].type == "CDS":
        feat_cds.append(i)
feat_cds

[3, 10, 38, 42]

In [46]:
print(record4.features[14].qualifiers)
print(record4.features[15].qualifiers)
print(record4.features[16].qualifiers)
print(record4.features[17].qualifiers)
print(record4.features[18].qualifiers)
print(record4.features[19].qualifiers)
print(record4.features[20].qualifiers)
print(record4.features[21].qualifiers)
print(record4.features[22].qualifiers)
print(record4.features[24].qualifiers)

{'gene': ['STAT6'], 'gene_synonym': ['D12S1644; HIES6; IL-4-STAT; STAT6B; STAT6C'], 'note': ['propagated from UniProtKB/Swiss-Prot (P42226.1); Region: Disordered. /evidence=ECO:0000256|SAM:MobiDB-lite']}
{'gene': ['STAT6'], 'gene_synonym': ['D12S1644; HIES6; IL-4-STAT; STAT6B; STAT6C'], 'inference': ['alignment:Splign:2.1.0'], 'number': ['3']}
{'gene': ['STAT6'], 'gene_synonym': ['D12S1644; HIES6; IL-4-STAT; STAT6B; STAT6C'], 'inference': ['alignment:Splign:2.1.0'], 'number': ['4']}
{'gene': ['STAT6'], 'gene_synonym': ['D12S1644; HIES6; IL-4-STAT; STAT6B; STAT6C'], 'inference': ['alignment:Splign:2.1.0'], 'number': ['5']}
{'gene': ['STAT6'], 'gene_synonym': ['D12S1644; HIES6; IL-4-STAT; STAT6B; STAT6C'], 'inference': ['alignment:Splign:2.1.0'], 'number': ['6']}
{'gene': ['STAT6'], 'gene_synonym': ['D12S1644; HIES6; IL-4-STAT; STAT6B; STAT6C'], 'inference': ['alignment:Splign:2.1.0'], 'number': ['7']}
{'gene': ['STAT6'], 'gene_synonym': ['D12S1644; HIES6; IL-4-STAT; STAT6B; STAT6C'], 'i

In [47]:
for i in feat_cds:
    coding_dna = record4.features[i].extract(record4.seq)
    print("DNA: ", coding_dna)
    print("Proteína: ", coding_dna.translate())

DNA:  ATGCTGACCCCGCCGTTGCTCCTGCTGCTGCCCCTGCTCTCAGCTCTGGTCGCGGCGGCTATCGACG
Proteína:  MLTPPLLLLLPLLSALVAAAID
DNA:  ATGTCTCTGTGGGGTCTGGTCTCCAAGATGCCCCCAGAAAAAGTGCAGCGGCTCTATGTCGACTTTCCCCAACACCTGCGGCATCTTCTGGGTGACTGGCTGGAGAGCCAGCCCTGGGAGTTCCTGGTCGGCTCCGACGCCTTCTGCTGCAACTTGGCTAGTGCCCTACTTTCAGACACTGTCCAGCACCTTCAGGCCTCGGTGGGAGAGCAGGGGGAGGGGAGCACCATCTTGCAACACATCAGCACCCTTGAGAGCATATATCAGAGGGACCCCCTGAAGCTGGTGGCCACTTTCAGACAAATACTTCAAGGAGAGAAAAAAGCTGTTATGGAACAGTTCCGCCACTTGCCAATGCCTTTCCACTGGAAGCAGGAAGAACTCAAGTTTAAGACAGGCTTGCGGAGGCTGCAGCACCGAGTAGGGGAGATCCACCTTCTCCGAGAAGCCCTGCAGAAGGGGGCTGAGGCTGGCCAAGTGTCTCTGCACAGCTTGATAGAAACTCCTGCTAATGGGACTGGGCCAAGTGAGGCCCTGGCCATGCTACTGCAGGAGACCACTGGAGAGCTAGAGGCAGCCAAAGCCCTAGTGCTGAAGAGGATCCAGATTTGGAAACGGCAGCAGCAGCTGGCAGGGAATGGCGCACCGTTTGAGGAGAGCCTGGCCCCACTCCAGGAGAGGTGTGAAAGCCTGGTGGACATTTATTCCCAGCTACAGCAGGAGGTAGGGGCGGCTGGTGGGGAGCTTGAGCCCAAGACCCGGGCATCGCTGACTGGCCGGCTGGATGAAGTCCTGAGAACCCTCGTCACCAGTTGCTTCCTGGTGGAGAAGCAGCCCCCCCAGGTACTGAAGACTCAGACCAAGTTCCAGGCTGGAGTTCGATTCC

In [49]:
feat_gene = []
for j in range(len(record4.features)):
    if record4.features[j].type == 'gene':
        feat_gene.append(j)
print('Número de features tipo gene: ', len(feat_gene), feat_gene)

Número de features tipo gene:  3 [1, 6, 35]


In [51]:
print(record4.features[1].qualifiers)
print(record4.features[6].qualifiers)
print(record4.features[35].qualifiers)

{'gene': ['LRP1'], 'gene_synonym': ['A2MR; APOER; APR; CD91; IGFBP-3R; IGFBP3R; IGFBP3R1; KPA; LRP; LRP1A; TGFBR5'], 'note': ['LDL receptor related protein 1'], 'db_xref': ['GeneID:4035', 'HGNC:HGNC:6692', 'MIM:107770']}
{'gene': ['STAT6'], 'gene_synonym': ['D12S1644; HIES6; IL-4-STAT; STAT6B; STAT6C'], 'note': ['signal transducer and activator of transcription 6'], 'db_xref': ['GeneID:6778', 'HGNC:HGNC:11368', 'MIM:601512']}
{'gene': ['NAB2'], 'gene_synonym': ['MADER'], 'note': ['NGFI-A binding protein 2'], 'db_xref': ['GeneID:4665', 'HGNC:HGNC:7627', 'MIM:602381']}


In [53]:
print(record4.features[3].qualifiers)
print(record4.features[10].qualifiers)
print(record4.features[38].qualifiers)
print(record4.features[42].qualifiers)

{'gene': ['LRP1'], 'gene_synonym': ['A2MR; APOER; APR; CD91; IGFBP-3R; IGFBP3R; IGFBP3R1; KPA; LRP; LRP1A; TGFBR5'], 'inference': ['similar to AA sequence (same species):RefSeq:NP_002323.2'], 'exception': ['annotated by transcript or proteomic data'], 'note': ['type V tgf-beta receptor; prolow-density lipoprotein receptor-related protein 1; TbetaR-V/LRP-1/IGFBP-3 receptor; alpha-2-macroglobulin receptor; apolipoprotein E receptor; low density lipoprotein receptor-related protein 1'], 'codon_start': ['1'], 'product': ['prolow-density lipoprotein receptor-related protein 1 preproprotein'], 'protein_id': ['NP_002323.2'], 'db_xref': ['CCDS:CCDS8932.1', 'GeneID:4035', 'HGNC:HGNC:6692', 'MIM:107770'], 'translation': ['MLTPPLLLLLPLLSALVAAAIDAPKTCSPKQFACRDQITCISKGWRCDGERDCPDGSDEAPEICPQSKAQRCQPNEHNCLGTELCVPMSRLCNGVQDCMDGSDEGPHCRELQGNCSRLGCQHHCVPTLDGPTCYCNSSFQLQADGKTCKDFDECSVYGTCSQLCTNTDGSFICGCVEGYLLQPDNRSCKAKNEPVDRPPVLLIANSQNILATYLSGAQVSTITPTSTRQTTAMDFSYANETVCWVHVGDSAAQTQLKCARMPGLKGFVDEHTINISLS