In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqFeature

## Gene TGF-β1 (transforming growth factor beta 1)

In [2]:
# Parse the GenBank file for TGFB1 (SeqIO.read expects exactly one record in the file);
# the bare name on the last line displays the record's repr.
record = SeqIO.read(handle="TGFB1.gb", format="genbank")
record

SeqRecord(seq=Seq('GCCGCCGCCGCCCTTCGCGCCCTGGGCCATCTCCCTCCCACCTCCCTCCGCGGA...CAG'), id='NC_000019.10', name='NC_000019', description='Homo sapiens chromosome 19, GRCh38.p14 Primary Assembly', dbxrefs=['BioProject:PRJNA168', 'Assembly:GCF_000001405.40'])

In [8]:
# Sequence length, id, description and name of the record, one per line.
print(
    len(record.seq),
    record.id,
    record.description,
    record.name,
    sep="\n",
)

23600
NC_000019.10
Homo sapiens chromosome 19, GRCh38.p14 Primary Assembly
NC_000019


In [9]:
# Dump the whole annotations dictionary of the record (molecule type, accessions,
# organism, taxonomy, references, comment, ...).
print(record.annotations)

{'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'CON', 'date': '07-OCT-2023', 'accessions': ['NC_000019', 'REGION:', 'complement(41330323..41353922)'], 'sequence_version': 10, 'keywords': ['RefSeq'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Finishing the euchromatic sequence of the human genome', ...), Reference(title='The DNA sequence and biology of human chromosome 19', ...), Reference(title='Initial sequencing and analysis of the human genome', ...)], 'comment': 'REFSEQ INFORMATION: The reference sequence is identical to\nCM000681.2.\nOn Feb 3, 2014 this sequence version replaced NC_000019.9.\nAssembly Name: GRCh38.p14 Primary Assembly\nThe DNA sequence is composed of genomic sequence, primarily\nfinished clones tha

In [4]:
# Database cross-references, source organism and taxonomic lineage, one per line.
print(
    record.dbxrefs,
    record.annotations["source"],
    record.annotations["taxonomy"],
    sep="\n",
)

['BioProject:PRJNA168', 'Assembly:GCF_000001405.40']
Homo sapiens (human)
['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']


In [6]:
# Report how many features the record carries, then display the full feature list.
print(f"Temos {len(record.features)} features")
record.features

Temos 6 features


[SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(23600), strand=1), type='source', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(23600), strand=1), type='gene', qualifiers=...),
 SeqFeature(CompoundLocation([SimpleLocation(ExactPosition(0), ExactPosition(1233), strand=1), SimpleLocation(ExactPosition(5467), ExactPosition(5628), strand=1), SimpleLocation(ExactPosition(9058), ExactPosition(9176), strand=1), SimpleLocation(ExactPosition(11675), ExactPosition(11753), strand=1), SimpleLocation(ExactPosition(11892), ExactPosition(12040), strand=1), SimpleLocation(ExactPosition(21641), ExactPosition(21795), strand=1), SimpleLocation(ExactPosition(22712), ExactPosition(23600), strand=1)], 'join'), type='mRNA', location_operator='join', qualifiers=...),
 SeqFeature(CompoundLocation([SimpleLocation(ExactPosition(0), ExactPosition(1233), strand=1), SimpleLocation(ExactPosition(5467), ExactPosition(5628), strand=1), SimpleLocation(ExactPosition(9058), ExactP

In [10]:
# One line per feature: its type followed by its location on the record.
for feature in record.features:
    print(f"{feature.type} {feature.location}")

source [0:23600](+)
gene [0:23600](+)
mRNA join{[0:1233](+), [5467:5628](+), [9058:9176](+), [11675:11753](+), [11892:12040](+), [21641:21795](+), [22712:23600](+)}
mRNA join{[0:1233](+), [5467:5628](+), [9058:9176](+), [11675:11753](+), [11889:12040](+), [21641:21795](+), [22712:23600](+)}
CDS join{[878:1233](+), [5467:5628](+), [9058:9176](+), [11675:11753](+), [11892:12040](+), [21641:21795](+), [22712:22871](+)}
CDS join{[878:1233](+), [5467:5628](+), [9058:9176](+), [11675:11753](+), [11889:12040](+), [21641:21795](+), [22712:22871](+)}


In [11]:
# Positions (within record.features) of every feature typed "CDS".
# enumerate() pairs each feature with its index, replacing the manual range(len(...)) loop.
feat_cds = [
    index
    for index, feature in enumerate(record.features)
    if feature.type == "CDS"
]
feat_cds

[4, 5]

In [16]:
# Print the qualifiers (gene, product, protein_id, translation, ...) of every CDS feature.
# Selecting by feature type instead of hard-coded positions keeps this cell correct
# if the GenBank file is replaced by one with a different feature layout.
for feature in record.features:
    if feature.type == "CDS":
        print(feature.qualifiers)

{'gene': ['TGFB1'], 'gene_synonym': ['CED; DPD1; IBDIMDE; LAP; TGF-beta1; TGFB; TGFbeta'], 'note': ['Derived by automated computational analysis using gene prediction method: BestRefSeq.'], 'codon_start': ['1'], 'product': ['transforming growth factor beta-1 proprotein preproprotein'], 'protein_id': ['NP_000651.3'], 'db_xref': ['CCDS:CCDS33031.1', 'Ensembl:ENSP00000221930.4', 'GeneID:7040', 'HGNC:HGNC:11766', 'MIM:190180'], 'translation': ['MPPSGLRLLPLLLPLLWLLVLTPGRPAAGLSTCKTIDMELVKRKRIEAIRGQILSKLRLASPPSQGEVPPGPLPEAVLALYNSTRDRVAGESAEPEPEPEADYYAKEVTRVLMVETHNEIYDKFKQSTHSIYMFFNTSELREAVPEPVLLSRAELRLLRLKLKVEQHVELYQKYSNNSWRYLSNRLLAPSDSPEWLSFDVTGVVRQWLSRGGEIEGFRLSAHCSCDSRDNTLQVDINGFTTGRRGDLATIHGMNRPFLLLMATPLERAQHLQSSRHRRALDTNYCFSSTEKNCCVRQLYIDFRKDLGWKWIHEPKGYHANFCLGPCPYIWSLDTQYSKVLALYNQHNPGASAAPCCVPQALEPLPIVYYVGRKPKVEQLSNMIVRSCKCS']}
{'gene': ['TGFB1'], 'gene_synonym': ['CED; DPD1; IBDIMDE; LAP; TGF-beta1; TGFB; TGFbeta'], 'note': ['Derived by automated computational analysis using gene predi

In [17]:
# Splice each CDS out of the record sequence and translate it.
# The CDS features are selected from this record directly rather than through the
# notebook-wide `feat_cds` index list, which is reassigned for every gene and would
# point at the wrong features if cells were executed out of order.
for feature in record.features:
    if feature.type != "CDS":
        continue
    coding_dna = feature.extract(record.seq)  # joins the exons listed in the location
    print("DNA: ", coding_dna)
    print("Proteína: ", coding_dna.translate())

DNA:  ATGCCGCCCTCCGGGCTGCGGCTGCTGCCGCTGCTGCTACCGCTGCTGTGGCTACTGGTGCTGACGCCTGGCCGGCCGGCCGCGGGACTATCCACCTGCAAGACTATCGACATGGAGCTGGTGAAGCGGAAGCGCATCGAGGCCATCCGCGGCCAGATCCTGTCCAAGCTGCGGCTCGCCAGCCCCCCGAGCCAGGGGGAGGTGCCGCCCGGCCCGCTGCCCGAGGCCGTGCTCGCCCTGTACAACAGCACCCGCGACCGGGTGGCCGGGGAGAGTGCAGAACCGGAGCCCGAGCCTGAGGCCGACTACTACGCCAAGGAGGTCACCCGCGTGCTAATGGTGGAAACCCACAACGAAATCTATGACAAGTTCAAGCAGAGTACACACAGCATATATATGTTCTTCAACACATCAGAGCTCCGAGAAGCGGTACCTGAACCCGTGTTGCTCTCCCGGGCAGAGCTGCGTCTGCTGAGGCTCAAGTTAAAAGTGGAGCAGCACGTGGAGCTGTACCAGAAATACAGCAACAATTCCTGGCGATACCTCAGCAACCGGCTGCTGGCACCCAGCGACTCGCCAGAGTGGTTATCTTTTGATGTCACCGGAGTTGTGCGGCAGTGGTTGAGCCGTGGAGGGGAAATTGAGGGCTTTCGCCTTAGCGCCCACTGCTCCTGTGACAGCAGGGATAACACACTGCAAGTGGACATCAACGGGTTCACTACCGGCCGCCGAGGTGACCTGGCCACCATTCATGGCATGAACCGGCCTTTCCTGCTTCTCATGGCCACCCCGCTGGAGAGGGCCCAGCATCTGCAAAGCTCCCGGCACCGCCGAGCCCTGGACACCAACTATTGCTTCAGCTCCACGGAGAAGAACTGCTGCGTGCGGCAGCTGTACATTGACTTCCGCAAGGACCTCGGCTGGAAGTGGATCCACGAGCCCAAGGGCTACCATGCCAACTTCTGCCTCGGGCCCTGCCCCTACATTTGGAGCC

In [22]:
# Positions of the features typed 'gene' in record.features, and how many there are.
feat_gene = [
    index
    for index, feature in enumerate(record.features)
    if feature.type == 'gene'
]
print('Número de features tipo gene: ', len(feat_gene))

Número de features tipo gene:  1


## Gene IL-10 (Interleukin 10)

In [23]:
# Parse the GenBank file for IL10 (SeqIO.read expects exactly one record in the file);
# the bare name on the last line displays the record's repr.
record2 = SeqIO.read(handle="IL10.gb", format="genbank")
record2

SeqRecord(seq=Seq('ACACATCAGGGGCTTGCTCTTGCAAAACCAAACCACAAGACAGACTTGCAAAAG...TCA'), id='NC_000001.11', name='NC_000001', description='Homo sapiens chromosome 1, GRCh38.p14 Primary Assembly', dbxrefs=['BioProject:PRJNA168', 'Assembly:GCF_000001405.40'])

In [24]:
# Sequence length, id, description and name of the record, one per line.
print(
    len(record2.seq),
    record2.id,
    record2.description,
    record2.name,
    sep="\n",
)

4893
NC_000001.11
Homo sapiens chromosome 1, GRCh38.p14 Primary Assembly
NC_000001


In [26]:
# Dump the whole annotations dictionary of the record (molecule type, accessions,
# organism, taxonomy, references, comment, ...).
print(record2.annotations)

{'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'CON', 'date': '07-OCT-2023', 'accessions': ['NC_000001', 'REGION:', 'complement(206767602..206772494)'], 'sequence_version': 11, 'keywords': ['RefSeq'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='The DNA sequence and biological annotation of human chromosome 1', ...), Reference(title='Finishing the euchromatic sequence of the human genome', ...), Reference(title='Initial sequencing and analysis of the human genome', ...)], 'comment': 'REFSEQ INFORMATION: The reference sequence is identical to\nCM000663.2.\nOn Feb 3, 2014 this sequence version replaced NC_000001.10.\nAssembly Name: GRCh38.p14 Primary Assembly\nThe DNA sequence is composed of genomic sequence, primarily\nfin

In [27]:
# Database cross-references, source organism and taxonomic lineage, one per line.
print(
    record2.dbxrefs,
    record2.annotations["source"],
    record2.annotations["taxonomy"],
    sep="\n",
)

['BioProject:PRJNA168', 'Assembly:GCF_000001405.40']
Homo sapiens (human)
['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']


In [28]:
# Report how many features the record carries, then display the full feature list.
print(f"Temos {len(record2.features)} features")
record2.features

Temos 11 features


[SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(4893), strand=1), type='source', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(0), ExactPosition(1722), strand=-1), type='gene', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(1416), ExactPosition(1722), strand=-1), type='mRNA', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(1664), ExactPosition(1722), strand=-1), type='mRNA', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(4893), strand=1), type='gene', qualifiers=...),
 SeqFeature(CompoundLocation([SimpleLocation(ExactPosition(0), ExactPosition(224), strand=1), SimpleLocation(ExactPosition(1079), ExactPosition(1139), strand=1), SimpleLocation(ExactPosition(1435), ExactPosition(1588), strand=1), SimpleLocation(ExactPosition(2600), ExactPosition(2666), strand=1), SimpleLocation(ExactPosition(3766), ExactPosition(4893), strand=1)], 'join'), type='mRNA', location_operator='join', qualifiers=...),
 SeqFeature(Compoun

In [29]:
# One line per feature: its type followed by its location on the record.
for feature in record2.features:
    print(f"{feature.type} {feature.location}")

source [0:4893](+)
gene [<0:1722](-)
mRNA [<1416:1722](-)
mRNA [<1664:1722](-)
gene [0:4893](+)
mRNA join{[0:224](+), [1079:1139](+), [1435:1588](+), [2600:2666](+), [3766:4893](+)}
misc_RNA join{[0:224](+), [1079:1139](+), [1435:1588](+), [2175:2413](+), [2600:2666](+), [3766:4893](+)}
CDS join{[59:224](+), [1079:1139](+), [1435:1588](+), [2600:2666](+), [3766:3859](+)}
mRNA join{[1305:1588](+), [2600:2666](+), [3766:4893](+)}
CDS join{[1465:1588](+), [2600:2666](+), [3766:3859](+)}
misc_RNA join{[1891:2096](+), [2600:2666](+), [3766:4893](+)}


In [30]:
# Positions (within record2.features) of every feature typed "CDS".
# enumerate() pairs each feature with its index, replacing the manual range(len(...)) loop.
feat_cds = [
    index
    for index, feature in enumerate(record2.features)
    if feature.type == "CDS"
]
feat_cds

[7, 9]

In [31]:
# Print the qualifiers (gene, product, protein_id, translation, ...) of every CDS feature.
# Selecting by feature type instead of hard-coded positions keeps this cell correct
# if the GenBank file is replaced by one with a different feature layout.
for feature in record2.features:
    if feature.type == "CDS":
        print(feature.qualifiers)

{'gene': ['IL10'], 'gene_synonym': ['CSIF; GVHDS; IL-10; IL10A; TGIF'], 'note': ['isoform 1 precursor is encoded by transcript variant 1; Derived by automated computational analysis using gene prediction method: BestRefSeq.'], 'codon_start': ['1'], 'product': ['interleukin-10 isoform 1 precursor'], 'protein_id': ['NP_000563.1'], 'db_xref': ['CCDS:CCDS1467.1', 'Ensembl:ENSP00000412237.1', 'GeneID:3586', 'HGNC:HGNC:5962', 'MIM:124092'], 'translation': ['MHSSALLCCLVLLTGVRASPGQGTQSENSCTHFPGNLPNMLRDLRDAFSRVKTFFQMKDQLDNLLLKESLLEDFKGYLGCQALSEMIQFYLEEVMPQAENQDPDIKAHVNSLGENLKTLRLRLRRCHRFLPCENKSKAVEQVKNAFNKLQEKGIYKAMSEFDIFINYIEAYMTMKIRN']}
{'gene': ['IL10'], 'gene_synonym': ['CSIF; GVHDS; IL-10; IL10A; TGIF'], 'note': ['isoform 2 is encoded by transcript variant 2; Derived by automated computational analysis using gene prediction method: BestRefSeq.'], 'codon_start': ['1'], 'product': ['interleukin-10 isoform 2'], 'protein_id': ['NP_001369553.1'], 'db_xref': ['CCDS:CCDS91154.1', 'GeneID:3586', '

In [32]:
# Splice each CDS out of the record sequence and translate it.
# The CDS features are selected from this record directly rather than through the
# notebook-wide `feat_cds` index list, which is reassigned for every gene and would
# point at the wrong features if cells were executed out of order.
for feature in record2.features:
    if feature.type != "CDS":
        continue
    coding_dna = feature.extract(record2.seq)  # joins the exons listed in the location
    print("DNA: ", coding_dna)
    print("Proteína: ", coding_dna.translate())

DNA:  ATGCACAGCTCAGCACTGCTCTGTTGCCTGGTCCTCCTGACTGGGGTGAGGGCCAGCCCAGGCCAGGGCACCCAGTCTGAGAACAGCTGCACCCACTTCCCAGGCAACCTGCCTAACATGCTTCGAGATCTCCGAGATGCCTTCAGCAGAGTGAAGACTTTCTTTCAAATGAAGGATCAGCTGGACAACTTGTTGTTAAAGGAGTCCTTGCTGGAGGACTTTAAGGGTTACCTGGGTTGCCAAGCCTTGTCTGAGATGATCCAGTTTTACCTGGAGGAGGTGATGCCCCAAGCTGAGAACCAAGACCCAGACATCAAGGCGCATGTGAACTCCCTGGGGGAGAACCTGAAGACCCTCAGGCTGAGGCTACGGCGCTGTCATCGATTTCTTCCCTGTGAAAACAAGAGCAAGGCCGTGGAGCAGGTGAAGAATGCCTTTAATAAGCTCCAAGAGAAAGGCATCTACAAAGCCATGAGTGAGTTTGACATCTTCATCAACTACATAGAAGCCTACATGACAATGAAGATACGAAACTGA
Proteína:  MHSSALLCCLVLLTGVRASPGQGTQSENSCTHFPGNLPNMLRDLRDAFSRVKTFFQMKDQLDNLLLKESLLEDFKGYLGCQALSEMIQFYLEEVMPQAENQDPDIKAHVNSLGENLKTLRLRLRRCHRFLPCENKSKAVEQVKNAFNKLQEKGIYKAMSEFDIFINYIEAYMTMKIRN*
DNA:  ATGATCCAGTTTTACCTGGAGGAGGTGATGCCCCAAGCTGAGAACCAAGACCCAGACATCAAGGCGCATGTGAACTCCCTGGGGGAGAACCTGAAGACCCTCAGGCTGAGGCTACGGCGCTGTCATCGATTTCTTCCCTGTGAAAACAAGAGCAAGGCCGTGGAGCAGGTGAAGAATGCCTTTAATAAGCTCCAAGAGAAAGGCATCTACAAAGCCATGAGTGAGTTTGACATCTTCATCAACTACATAGAAGCCTACA

In [33]:
# Positions of the features typed 'gene' in record2.features, and how many there are.
feat_gene = [
    index
    for index, feature in enumerate(record2.features)
    if feature.type == 'gene'
]
print('Número de features tipo gene: ', len(feat_gene))

Número de features tipo gene:  2


## Gene IL-13 (Interleukin 13)

In [36]:
# Parse the GenBank file for IL13 (SeqIO.read expects exactly one record in the file);
# the bare name on the last line displays the record's repr.
record3 = SeqIO.read(handle="IL13.gb", format="genbank")
record3

SeqRecord(seq=Seq('ACGCGCGGGGGCGCCCCTGCCCACCGCTCCCGGCAGGGCTTTTGGTGGCCATGG...ATA'), id='NC_000005.10', name='NC_000005', description='Homo sapiens chromosome 5, GRCh38.p14 Primary Assembly', dbxrefs=['BioProject:PRJNA168', 'Assembly:GCF_000001405.40'])

In [37]:
# Sequence length, id, description and name of the record, one per line.
print(
    len(record3.seq),
    record3.id,
    record3.description,
    record3.name,
    sep="\n",
)

4589
NC_000005.10
Homo sapiens chromosome 5, GRCh38.p14 Primary Assembly
NC_000005


In [38]:
# Dump the whole annotations dictionary of the record (molecule type, accessions,
# organism, taxonomy, references, comment, ...).
print(record3.annotations)

{'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'CON', 'date': '07-OCT-2023', 'accessions': ['NC_000005', 'REGION:', '132656522..132661110'], 'sequence_version': 10, 'keywords': ['RefSeq'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Finishing the euchromatic sequence of the human genome', ...), Reference(title='The DNA sequence and comparative analysis of human chromosome 5', ...), Reference(title='Initial sequencing and analysis of the human genome', ...)], 'comment': 'REFSEQ INFORMATION: The reference sequence is identical to\nCM000667.2.\nOn Feb 3, 2014 this sequence version replaced NC_000005.9.\nAssembly Name: GRCh38.p14 Primary Assembly\nThe DNA sequence is composed of genomic sequence, primarily\nfinished clones t

In [39]:
# Database cross-references, source organism and taxonomic lineage, one per line.
print(
    record3.dbxrefs,
    record3.annotations["source"],
    record3.annotations["taxonomy"],
    sep="\n",
)

['BioProject:PRJNA168', 'Assembly:GCF_000001405.40']
Homo sapiens (human)
['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']


In [40]:
# Report how many features the record carries, then display the full feature list.
print(f"Temos {len(record3.features)} features")
record3.features

Temos 18 features


[SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(4589), strand=1), type='source', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(0), ExactPosition(146), strand=1), type='misc_feature', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(0), ExactPosition(146), strand=1), type='regulatory', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(4589), strand=1), type='gene', qualifiers=...),
 SeqFeature(CompoundLocation([SimpleLocation(ExactPosition(0), ExactPosition(109), strand=1), SimpleLocation(ExactPosition(544), ExactPosition(723), strand=1), SimpleLocation(ExactPosition(1768), ExactPosition(1839), strand=1), SimpleLocation(ExactPosition(2896), ExactPosition(2950), strand=1), SimpleLocation(ExactPosition(3202), ExactPosition(3307), strand=1), SimpleLocation(ExactPosition(3653), ExactPosition(4589), strand=1)], 'join'), type='mRNA', location_operator='join', qualifiers=...),
 SeqFeature(CompoundLocation([SimpleLocation(ExactPosition

In [41]:
# One line per feature: its type followed by its location on the record.
for feature in record3.features:
    print(f"{feature.type} {feature.location}")

source [0:4589](+)
misc_feature [<0:146](+)
regulatory [<0:146](+)
gene [0:4589](+)
mRNA join{[0:109](+), [544:723](+), [1768:1839](+), [2896:2950](+), [3202:3307](+), [3653:4589](+)}
mRNA join{[0:109](+), [544:723](+), [2896:2950](+), [3202:3307](+), [3653:4589](+)}
mRNA join{[0:109](+), [1768:1839](+), [2896:2950](+), [3202:3307](+), [3653:4589](+)}
misc_feature [206:296](+)
regulatory [206:296](+)
misc_feature [376:436](+)
regulatory [376:436](+)
mRNA join{[1651:1839](+), [2896:2950](+), [3202:3307](+), [3653:4589](+)}
CDS join{[1665:1839](+), [2896:2950](+), [3202:3307](+), [3653:3761](+)}
sig_peptide [1665:1725](+)
mat_peptide join{[1725:1839](+), [2896:2950](+), [3202:3307](+), [3653:3716](+)}
CDS join{[2917:2950](+), [3202:3307](+), [3653:3761](+)}
CDS join{[2917:2950](+), [3202:3307](+), [3653:3761](+)}
CDS join{[2917:2950](+), [3202:3307](+), [3653:3761](+)}


In [42]:
# Positions (within record3.features) of every feature typed "CDS".
# enumerate() pairs each feature with its index, replacing the manual range(len(...)) loop.
feat_cds = [
    index
    for index, feature in enumerate(record3.features)
    if feature.type == "CDS"
]
feat_cds

[12, 15, 16, 17]

In [43]:
# Print the qualifiers (gene, product, protein_id, translation, ...) of every CDS feature.
# Selecting by feature type instead of hard-coded positions keeps this cell correct
# if the GenBank file is replaced by one with a different feature layout.
for feature in record3.features:
    if feature.type == "CDS":
        print(feature.qualifiers)

{'gene': ['IL13'], 'gene_synonym': ['IL-13; P600'], 'note': ['isoform 1 precursor is encoded by transcript variant 1; Derived by automated computational analysis using gene prediction method: BestRefSeq.'], 'codon_start': ['1'], 'product': ['interleukin-13 isoform 1 precursor'], 'protein_id': ['NP_002179.2'], 'db_xref': ['CCDS:CCDS4157.1', 'Ensembl:ENSP00000304915.3', 'GeneID:3596', 'HGNC:HGNC:5973', 'MIM:147683'], 'translation': ['MHPLLNPLLLALGLMALLLTTVIALTCLGGFASPGPVPPSTALRELIEELVNITQNQKAPLCNGSMVWSINLTAGMYCAALESLINVSGCSAIEKTQRMLSGFCPHKVSAGQFSSLHVRDTKIEVAQFVKDLLLHLKKLFREGQFN']}
{'gene': ['IL13'], 'gene_synonym': ['IL-13; P600'], 'note': ['isoform b is encoded by transcript variant 4; Derived by automated computational analysis using gene prediction method: BestRefSeq.'], 'codon_start': ['1'], 'product': ['interleukin-13 isoform b'], 'protein_id': ['NP_001341922.1'], 'db_xref': ['GeneID:3596', 'HGNC:HGNC:5973', 'MIM:147683'], 'translation': ['MVWSINLTAGMYCAALESLINVSGCSAIEKTQRMLSGFCPHKV

In [44]:
# Splice each CDS out of the record sequence and translate it.
# The CDS features are selected from this record directly rather than through the
# notebook-wide `feat_cds` index list, which is reassigned for every gene and would
# point at the wrong features if cells were executed out of order.
for feature in record3.features:
    if feature.type != "CDS":
        continue
    coding_dna = feature.extract(record3.seq)  # joins the exons listed in the location
    print("DNA: ", coding_dna)
    print("Proteína: ", coding_dna.translate())

DNA:  ATGCATCCGCTCCTCAATCCTCTCCTGTTGGCACTGGGCCTCATGGCGCTTTTGTTGACCACGGTCATTGCTCTCACTTGCCTTGGCGGCTTTGCCTCCCCAGGCCCTGTGCCTCCCTCTACAGCCCTCAGGGAGCTCATTGAGGAGCTGGTCAACATCACCCAGAACCAGAAGGCTCCGCTCTGCAATGGCAGCATGGTATGGAGCATCAACCTGACAGCTGGCATGTACTGTGCAGCCCTGGAATCCCTGATCAACGTGTCAGGCTGCAGTGCCATCGAGAAGACCCAGAGGATGCTGAGCGGATTCTGCCCGCACAAGGTCTCAGCTGGGCAGTTTTCCAGCTTGCATGTCCGAGACACCAAAATCGAGGTGGCCCAGTTTGTAAAGGACCTGCTCTTACATTTAAAGAAACTTTTTCGCGAGGGACAGTTCAACTGA
Proteína:  MHPLLNPLLLALGLMALLLTTVIALTCLGGFASPGPVPPSTALRELIEELVNITQNQKAPLCNGSMVWSINLTAGMYCAALESLINVSGCSAIEKTQRMLSGFCPHKVSAGQFSSLHVRDTKIEVAQFVKDLLLHLKKLFREGQFN*
DNA:  ATGGTATGGAGCATCAACCTGACAGCTGGCATGTACTGTGCAGCCCTGGAATCCCTGATCAACGTGTCAGGCTGCAGTGCCATCGAGAAGACCCAGAGGATGCTGAGCGGATTCTGCCCGCACAAGGTCTCAGCTGGGCAGTTTTCCAGCTTGCATGTCCGAGACACCAAAATCGAGGTGGCCCAGTTTGTAAAGGACCTGCTCTTACATTTAAAGAAACTTTTTCGCGAGGGACAGTTCAACTGA
Proteína:  MVWSINLTAGMYCAALESLINVSGCSAIEKTQRMLSGFCPHKVSAGQFSSLHVRDTKIEVAQFVKDLLLHLKKLFREGQFN*
DNA:  ATGGTATGGAGCATCAACCTGACAGCTGGCATGTACTGTG

In [45]:
# Positions of the features typed 'gene' in record3.features, and how many there are.
feat_gene = [
    index
    for index, feature in enumerate(record3.features)
    if feature.type == 'gene'
]
print('Número de features tipo gene: ', len(feat_gene))

Número de features tipo gene:  1


## Gene STAT6 (Signal transducer and activator of transcription 6)

In [46]:
# Parse the GenBank file for STAT6 (SeqIO.read expects exactly one record in the file);
# the bare name on the last line displays the record's repr.
record4 = SeqIO.read(handle="STAT6.gb", format="genbank")
record4

SeqRecord(seq=Seq('GGGGCAGCCACTGCTTACACTGAAGAGGGAGGACGGGAGAGGAGTGTGTGTGTG...AAA'), id='NC_000012.12', name='NC_000012', description='Homo sapiens chromosome 12, GRCh38.p14 Primary Assembly', dbxrefs=['BioProject:PRJNA168', 'Assembly:GCF_000001405.40'])

In [47]:
# Sequence length, id, description and name of the record, one per line.
print(
    len(record4.seq),
    record4.id,
    record4.description,
    record4.name,
    sep="\n",
)

15955
NC_000012.12
Homo sapiens chromosome 12, GRCh38.p14 Primary Assembly
NC_000012


In [49]:
# Dump the whole annotations dictionary of the record (molecule type, accessions,
# organism, taxonomy, references, comment, ...).
print(record4.annotations)

{'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'CON', 'date': '07-OCT-2023', 'accessions': ['NC_000012', 'REGION:', 'complement(57095408..57111362)'], 'sequence_version': 12, 'keywords': ['RefSeq'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='The finished DNA sequence of human chromosome 12', ...), Reference(title='Finishing the euchromatic sequence of the human genome', ...), Reference(title='Initial sequencing and analysis of the human genome', ...)], 'comment': 'REFSEQ INFORMATION: The reference sequence is identical to\nCM000674.2.\nOn Feb 3, 2014 this sequence version replaced NC_000012.11.\nAssembly Name: GRCh38.p14 Primary Assembly\nThe DNA sequence is composed of genomic sequence, primarily\nfinished clones that 

In [50]:
# Database cross-references, source organism and taxonomic lineage, one per line.
print(
    record4.dbxrefs,
    record4.annotations["source"],
    record4.annotations["taxonomy"],
    sep="\n",
)

['BioProject:PRJNA168', 'Assembly:GCF_000001405.40']
Homo sapiens (human)
['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']


In [51]:
# Report how many features the record carries, then display the full feature list.
print(f"Temos {len(record4.features)} features")
record4.features

Temos 28 features


[SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(15955), strand=1), type='source', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(0), ExactPosition(641), strand=-1), type='gene', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(0), ExactPosition(641), strand=-1), type='ncRNA', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(15955), strand=1), type='gene', qualifiers=...),
 SeqFeature(CompoundLocation([SimpleLocation(ExactPosition(0), ExactPosition(234), strand=1), SimpleLocation(ExactPosition(4048), ExactPosition(4132), strand=1), SimpleLocation(ExactPosition(4531), ExactPosition(4670), strand=1), SimpleLocation(ExactPosition(4782), ExactPosition(4835), strand=1), SimpleLocation(ExactPosition(5023), ExactPosition(5172), strand=1), SimpleLocation(ExactPosition(5763), ExactPosition(5895), strand=1), SimpleLocation(ExactPosition(6023), ExactPosition(6212), strand=1), SimpleLocation(ExactPosition(6549), ExactPosition(6637), strand

In [52]:
# One line per feature: its type followed by its location on the record.
for feature in record4.features:
    print(f"{feature.type} {feature.location}")

source [0:15955](+)
gene [<0:641](-)
ncRNA [<0:641](-)
gene [0:15955](+)
mRNA join{[0:234](+), [4048:4132](+), [4531:4670](+), [4782:4835](+), [5023:5172](+), [5763:5895](+), [6023:6212](+), [6549:6637](+), [6776:6899](+), [8441:8534](+), [8866:9073](+), [11272:11367](+), [11459:11596](+), [11922:12069](+), [12284:12348](+), [12460:12571](+), [12765:12858](+), [14229:14295](+), [14384:14513](+), [14601:15955](+)}
mRNA join{[0:234](+), [3619:3758](+), [4048:4132](+), [4531:4670](+), [4782:4835](+), [5023:5172](+), [5763:5895](+), [6023:6212](+), [6549:6637](+), [6776:6899](+), [8441:8534](+), [8866:9073](+), [11272:11367](+), [11459:11596](+), [11922:12069](+), [12284:12348](+), [12460:12571](+), [12765:12858](+), [14229:14295](+), [14384:14513](+), [14601:15955](+)}
misc_RNA join{[0:234](+), [3063:3200](+), [4048:4132](+), [4531:4670](+), [4782:4835](+), [5023:5172](+), [5763:5895](+), [6023:6212](+), [6549:6637](+), [6776:6899](+), [8441:8534](+), [8866:9073](+), [11272:11367](+), [11

In [53]:
# Positions (within record4.features) of every feature typed "CDS".
# enumerate() pairs each feature with its index, replacing the manual range(len(...)) loop.
feat_cds = [
    index
    for index, feature in enumerate(record4.features)
    if feature.type == "CDS"
]
feat_cds

[14, 15, 16, 17, 18, 19, 20, 21, 22, 24]

In [56]:
# Print the qualifiers (gene, product, protein_id, translation, ...) of every CDS feature.
# Selecting by feature type replaces ten hard-coded positions copied from an earlier
# output, which would silently print the wrong features if the GenBank file changed.
for feature in record4.features:
    if feature.type == "CDS":
        print(feature.qualifiers)

{'gene': ['STAT6'], 'gene_synonym': ['D12S1644; IL-4-STAT; STAT6B; STAT6C'], 'note': ['Derived by automated computational analysis using gene prediction method: Gnomon.'], 'codon_start': ['1'], 'product': ['signal transducer and activator of transcription 6 isoform X1'], 'protein_id': ['XP_047285429.1'], 'db_xref': ['GeneID:6778', 'HGNC:HGNC:11368', 'MIM:601512'], 'translation': ['MSLWGLVSKMPPEKVQRLYVDFPQHLRHLLGDWLESQPWEFLVGSDAFCCNLASALLSDTVQHLQASVGEQGEGSTILQHISTLESIYQRDPLKLVATFRQILQGEKKAVMEQFRHLPMPFHWKQEELKFKTGLRRLQHRVGEIHLLREALQKGAEAGQVSLHSLIETPANGTGPSEALAMLLQETTGELEAAKALVLKRIQIWKRQQQLAGNGAPFEESLAPLQERCESLVDIYSQLQQEVGAAGGELEPKTRASLTGRLDEVLRTLVTSCFLVEKQPPQVLKTQTKFQAGVRFLLGLRFLGAPAKPPLVRADMVTEKQARELSVPQGPGAGAESTGEIINNTVPLENSIPGNCCSALFKNLLLKKIKRCERKGTESVTEEKCAVLFSASFTLGPGKLPIQLQALSLPLVVIVHGNQDNNAKATILWDNAFSEMDRVPFVVAERVPWEKMCETLNLKFMAEVGTNRGLLPEHFLFLAQKIFNDNSLSMEAFQHRSVSWSQFNKEILLGRGFTFWQWFDGVLDLTKRCLRSYWSDRLIIGFISKQYVTSLLLNEPDGTFLLRFSDSEIGGITIAHVIRGQDGSPQIENIQPFSAKDLSIRSLGDRIRDLAQLKNLY

In [57]:
# Splice each CDS out of the record sequence and translate it.
# The CDS features are selected from this record directly rather than through the
# notebook-wide `feat_cds` index list, which is reassigned for every gene and would
# point at the wrong features if cells were executed out of order.
for feature in record4.features:
    if feature.type != "CDS":
        continue
    coding_dna = feature.extract(record4.seq)  # joins the exons listed in the location
    print("DNA: ", coding_dna)
    print("Proteína: ", coding_dna.translate())

DNA:  ATGTCTCTGTGGGGTCTGGTCTCCAAGATGCCCCCAGAAAAAGTGCAGCGGCTCTATGTCGACTTTCCCCAACACCTGCGGCATCTTCTGGGTGACTGGCTGGAGAGCCAGCCCTGGGAGTTCCTGGTCGGCTCCGACGCCTTCTGCTGCAACTTGGCTAGTGCCCTACTTTCAGACACTGTCCAGCACCTTCAGGCCTCGGTGGGAGAGCAGGGGGAGGGGAGCACCATCTTGCAACACATCAGCACCCTTGAGAGCATATATCAGAGGGACCCCCTGAAGCTGGTGGCCACTTTCAGACAAATACTTCAAGGAGAGAAAAAAGCTGTTATGGAACAGTTCCGCCACTTGCCAATGCCTTTCCACTGGAAGCAGGAAGAACTCAAGTTTAAGACAGGCTTGCGGAGGCTGCAGCACCGAGTAGGGGAGATCCACCTTCTCCGAGAAGCCCTGCAGAAGGGGGCTGAGGCTGGCCAAGTGTCTCTGCACAGCTTGATAGAAACTCCTGCTAATGGGACTGGGCCAAGTGAGGCCCTGGCCATGCTACTGCAGGAGACCACTGGAGAGCTAGAGGCAGCCAAAGCCCTAGTGCTGAAGAGGATCCAGATTTGGAAACGGCAGCAGCAGCTGGCAGGGAATGGCGCACCGTTTGAGGAGAGCCTGGCCCCACTCCAGGAGAGGTGTGAAAGCCTGGTGGACATTTATTCCCAGCTACAGCAGGAGGTAGGGGCGGCTGGTGGGGAGCTTGAGCCCAAGACCCGGGCATCGCTGACTGGCCGGCTGGATGAAGTCCTGAGAACCCTCGTCACCAGTTGCTTCCTGGTGGAGAAGCAGCCCCCCCAGGTACTGAAGACTCAGACCAAGTTCCAGGCTGGAGTTCGATTCCTGTTGGGCTTGAGGTTCCTGGGGGCCCCAGCCAAGCCTCCGCTGGTCAGGGCCGACATGGTGACAGAGAAGCAGGCGCGGGAGCTGAGTGTGCCTCAGGGTCCTGGGG

In [58]:
# Positions of the features typed 'gene' in record4.features, and how many there are.
feat_gene = [
    index
    for index, feature in enumerate(record4.features)
    if feature.type == 'gene'
]
print('Número de features tipo gene: ', len(feat_gene))

Número de features tipo gene:  3
