# Rosalind Bioinformatics Stronghold Solutions

## Prerequisites

In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
def read_dataset_str(task_shortcut):
    '''Return the first line of a dataset file as a stripped string.

    task_shortcut is either a task shortcut such as 'dna', which is
    expanded to the file name 'rosalind_dna.txt', or a complete file
    name ending in '.txt' (e.g. 'rosalind_dna.txt'), which is used as is.
    An empty file yields an empty string.
    '''
    if task_shortcut.endswith('.txt'):
        # Already a file name: expanding it again would look for
        # 'rosalind_rosalind_dna.txt.txt'.
        filename = task_shortcut
    else:
        filename = 'rosalind_{}.txt'.format(task_shortcut)
    with open(filename) as inf:
        # Only the first line is wanted, so the rest is never read.
        return inf.readline().strip()

In [34]:
def read_dataset_list(task_shortcut):
    '''Return every line of a dataset file as a list of stripped strings.

    task_shortcut is either a task shortcut such as 'hamm', which is
    expanded to the file name 'rosalind_hamm.txt', or a complete file
    name ending in '.txt' (e.g. 'rosalind_hamm.txt'), which is used as is.
    An empty file yields an empty list.
    '''
    if task_shortcut.endswith('.txt'):
        # Already a file name: expanding it again would look for
        # 'rosalind_rosalind_hamm.txt.txt'.
        filename = task_shortcut
    else:
        filename = 'rosalind_{}.txt'.format(task_shortcut)
    with open(filename) as inf:
        return [line.strip() for line in inf]

## Bioinformatics Stronghold

### DNA: Counting DNA Nucleotides

In [30]:
def dna(str_in):
    '''Print how often A, C, G and T occur in str_in.

    The four counts are written on one line, separated by single
    spaces, in the order A C G T.
    '''
    print(' '.join(str(str_in.count(symbol)) for symbol in 'ACGT'))

dna('AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC')
dna(read_dataset_str('rosalind_dna.txt'))

20 12 17 21
246 248 231 264


### RNA: Transcribing DNA into RNA

In [31]:
def rna(str_in):
    '''Transcribe a DNA string into RNA: every 'T' becomes 'U'.'''
    # Cutting the string at each 'T' and gluing the pieces back
    # together with 'U' swaps every occurrence.
    pieces = str_in.split('T')
    return 'U'.join(pieces)

rna('GATGGAACTTGACTACGTAAATT')
rna(read_dataset_str('rosalind_rna.txt'))

'GAUGGAACUUGACUACGUAAAUU'

'GGUCACCCGACGAUCCUGGGGGGUCGGAACGGCUGGGACGCAGCCCAGUCCAUAAUACCUGGACGUGGCCCGCCGCUCAUGAGCCUGCGUAACGGGCAAUUCGAACUCAUUACUCAGCUCAGACGAAAAGUAAUUAGUGGUCCGGCUUAUAGCUUGAUAACCCGUAAGUGCUCUAGCCCCCAUACCUUCAAAGGCUAGCGUAGCAAGGGCAGAGAGUCCUGGGUUCAGGCCGUCGAUUGACGAUACAUACCUGAGCGGCUUACUAAAGCUUAGGUAUGUCGGGGGGCCAGACGUAUGCCCCCGCGGAUAAGGUCGUCUUAGGGGUAUCGUCAGUAACAGUUGCAUUAUGGCCCCACGCUGGCAUCGUUUGAAGAUCCGCUUAAAGGCGAAGAAUAAUAAGUUGGGAAUUCUAUCUUCAUCCGAUCCUAUAGUUCUAACCAUGUCAUAGUUCAUAGGUAUAUAGAGUACGCUCAGUGCGGCUAACCUAAAAGUAAAGUGCGUGAUAGGACUGCAGAUGUGAUUCAAAACAGGAGUGUCGAGCACCAUAAGUAUACAAGAACUUCGUGCGGGCCGGCACUAUGCUAUCAUUUAGGACCCGUGUGCUCGAGAAAGGUCUUUGCUGGCAGCUGUGUUGCUAACUACCGUUGAUAUUAAACUGAGCUCGUAAUAUAGAAAUAAGAUGACACAUCAUUCGCAAGCACGCUGUAUGGGCUAUGGUAAAGACCUUAAUGAAAAGGCCAGUAGGCCCGAGCGCAACGCUGCAAUGCGUGGACAACGACCGGCUACCAAUUUUUCCGAAAAGUCGUACUAAGGUAAGACAAGGGCCCCAGUUCGUCCUUCAUGAGGGUAGAAAAUACCCUCGUUGCGACAGUUGCGAUCCCAGAGGAAUUAGUAAUUGACGAAGUCCCCCUAGCCCUCGCCGCCCGUG'

### REVC: Complementing a Strand of DNA

In [39]:
def revc(str_in):
    '''Return the reverse complement of the DNA string str_in.

    Each of A, C, G, T is replaced by its complement (A<->T, C<->G)
    and the result is read backwards.

    Raises ValueError if str_in contains any other character.
    '''
    tr_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    # Reject unknown symbols up front instead of letting a None from
    # a failed lookup blow up inside ''.join with a confusing TypeError.
    unknown = sorted(set(str_in) - set(tr_dict))
    if unknown:
        raise ValueError('not a DNA nucleotide: {}'.format(
            ', '.join(repr(symbol) for symbol in unknown)))
    return ''.join(tr_dict[nucl] for nucl in reversed(str_in))

revc('AAAACCCGGT')
revc(read_dataset_str('rosalind_revc.txt'))

'ACCGGGTTTT'

'ACAGAGACGGCGGGTGTGCGTGCCCACAGTTCGAACCCTGGACAACCGCGCAGATGGATGATTTAAGATGCCTCACCTCTTGCATGAGTTGCCTTACACCCGCACTCTATCCGGACCGATCCAACAGACTAGCCCAAAGCTGAGGGAAGACGGGCATAGCGGCACTGCATTAGAAAAACTCCCGCAACACCGGGTAGTCCGAGACCCCCACCCTTAAACATGGTCTAGCGTGGTACAAAGATAGCCCTCGTAATACAACTACCCCTGCAACGTCATCGCAACTTATCGTTGACATACTTGGATAATGCCCTTTTAAGGTTGTGAGTTGGTAATAGTGCCCGGATTGCCGTTGCGGCGGGCAAGTTCTTCCACCAATATCAGGGGTGTTCACCACCTTCGTAGAATTTGACAGATGACCGACCAACAACGCTGTAGGAATCAAGTGAAGTAAACCTGCGGTTCTGAATTTTATGCGTCCTGCCTTGATGGTGGAGTGTAAGTGGACCGTTACGGAATACTCGTACAAAACTGACCCGGGGATCACCGCTCGAACATTCGGCTGCGCATAAACCCGAGGTCCGAATGAGGCCATGGTAAGAGTTAGAGTGGTGACCCTGCGATCGGAGAGAGATAACGGCCGTATGATTGCCTGGCGTCAAGACCCAGGTCCCAGCGTTGGCTTACGATTCCTCAATATCGGTGACGGGAAAGGTGACAAGAGTACAGGTGATCCGACGTCCGGTGTTACGCATACGATCGACAAGCGAAGACTCTACAGGATGCTTTTGATTAGGTATAGGCTTAGGTCACGAGCAGACTAGCCTATGTAGGCCAGCAAATAAATGTCAAGGAAGTTCAGCTCAACGATAAATAGCACGATAATTTGTGGTTCGGTGTACACAAATCATGTCGCCTGGATTGGTGGCGTCCGGACTACTCGCTGTGGCGC'

### GC: Computing GC Content

In [61]:
def fasta_read(fastafile):
    '''Parse a FASTA text file into a dictionary {record name: sequence}.

    A line starting with '>' opens a new record; the rest of that line
    is the record name. All following lines up to the next header are
    concatenated into the record's sequence. Blank lines are ignored.
    If a name occurs twice, the later record replaces the earlier one.

    Raises ValueError if sequence data appears before the first header.
    '''
    chunks = {}
    current = None
    with open(fastafile) as inf:
        for raw_line in inf:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                current = line[1:]
                chunks[current] = []
            elif current is None:
                raise ValueError(
                    'sequence data before the first FASTA header in {!r}'
                    .format(fastafile))
            else:
                chunks[current].append(line)
    # Join once per record instead of growing a string line by line.
    return {name: ''.join(parts) for name, parts in chunks.items()}

def gc(fastafile):
    '''Print the FASTA record with the highest GC content.

    Reads fastafile with fasta_read and prints two lines: the name of
    the record whose sequence has the largest share of 'G' and 'C'
    symbols, and that share as a percentage rounded to 6 decimals.
    On a tie the record that comes first in the file wins. A record
    with an empty sequence counts as 0 percent.

    Raises ValueError if the file contains no records.
    '''
    records_dict = fasta_read(fastafile)
    if not records_dict:
        raise ValueError('no FASTA records found in {!r}'.format(fastafile))

    def gc_percent(sequence):
        # An empty sequence has no GC content; avoid dividing by zero.
        if not sequence:
            return 0.0
        return (sequence.count('G') + sequence.count('C')) / len(sequence) * 100

    # max() returns the first maximal item, so the earliest record wins ties.
    record_max = max(records_dict,
                     key=lambda name: gc_percent(records_dict[name]))
    print(record_max, round(gc_percent(records_dict[record_max]), 6), sep='\n')

gc('input_gc.txt')
gc('rosalind_gc.txt')

Rosalind_0808
60.91954
Rosalind_2124
53.125


### HAMM: Counting Point Mutations

In [22]:
def hamm(lst_in):
    '''Return the Hamming distance between lst_in[0] and lst_in[1].

    Every position of the first string is compared with the same
    position of the second one; the number of mismatches is returned.
    '''
    return sum(
        1
        for position, symbol in enumerate(lst_in[0])
        if symbol != lst_in[1][position]
    )
hamm(read_dataset_list('input.txt'))
hamm(read_dataset_list('rosalind_hamm.txt'))

7

413

### PROT: Translating RNA into Protein

In [27]:
import re

def prot(rna):
    '''Translate an RNA string into a protein string.

    rna is read codon by codon (three symbols at a time; an incomplete
    trailing codon is ignored). Translation ends at the first stop
    codon, which is not part of the result; if there is no stop codon
    the whole string is translated.

    Raises KeyError for a codon that is not in the RNA codon table.
    '''
    codon_table = {
        'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V',
        'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V',
        'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V',
        'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V',
        'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A',
        'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A',
        'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A',
        'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A',
        'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D',
        'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D',
        'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E',
        'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E',
        'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G',
        'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G',
        'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G',
        'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G',
    }
    protein = []
    for codon in re.findall(r"\w{3}", rna):
        amino_acid = codon_table[codon]
        # Stop at the stop codon itself rather than assuming it is
        # always the very last codon of the input.
        if amino_acid == 'Stop':
            break
        protein.append(amino_acid)
    return ''.join(protein)

prot(read_dataset_str('input'))
prot(read_dataset_str('rosalind_prot.txt'))

'MAMAPRTEINSTRING'

'MERSADLAIKGSEKDVVAVRLITPKYKESVPTCGLRRRLPILERNASGHLGLTLIHPTCNDKNITPLPYDNHHDITDSMLADNMPISSMGHWIPQSGVVTLPATPINACSRGSSRAVNSHYYVITGIVGGVCMATRIEFANGTAVLQATKQLSTLEQPHHSRRPSHESSIPSASINTLSALSKGHQSARLCVYPTINLFNQYTGTALPTPITEYVQHRLSKRLMSRIGRPPPVGGHPKSASQTYYTVPDLTFTRTSSSPSSTELALNLSSFVPCNHWFSRAGCTHVSGLSRSVVCEVGCVARNTQVRLTWCVGVSLHLGAEKRATASDFDGYGRRPSFQCGVSPNIRLVNSTADCPFPRPLLGLTVISACPGWRRLVPLPTRASDPRRNVAKLGTVARPYRYFDSGRSTCESRCGNYALYGRLACLPYEHRAVEMRLDSSLVGSLICKGFPRSCRRAIGRVSQRAHITVVEHGLRVRGPEKGKSSGIDLVIHSPNSLGRRTIAVADLGHRSIRRAQLPSYHWTTNRVGILHRFDQGLKSNEEIGFPPVRTITVHVIAEAARLHEPWSRFRRCIQLLIAVMQPVLSKSTNTVDLLASLCAKGRTYTARIGRVTISASGPPASRRHGSWLDCQLGEGVEKSSASACFAILFILASEARPAEEAITSLPSSAKSMSGGGILANGRRDQGPHPLTIRVHAKCSGLQGATDTVSDHGSYDGEWTFPNGRDVHRTYSNAKLRSTPCTSIILSSAFVEYRRVTIREPWINSRWFDEVVRGSVFVSWPLHGAIHWHVISRPKKVICHCPPCERAFPPSGLSRCRALIDEPVRSFDPSQLRASDARDFFNPVAIALFNTQHIDTGLSYPLPCKFIVILEADRLHKPLRSPPFRLPGACNLIGCTVGPGCVLGRPLTSSLHIPRGIRVSIRPSPRGVYRKKIAHCFRGVVSLQSPVPRIRGETSSSNMHENEPQYHPHYVVAAVPWRLGELEVTGFDNAYSLLTYVRAL

In [35]:
# RNA codon table as plain text: codon / amino-acid pairs separated by whitespace.
string = """UUU F      CUU L      AUU I      GUU V
UUC F      CUC L      AUC I      GUC V
UUA L      CUA L      AUA I      GUA V
UUG L      CUG L      AUG M      GUG V
UCU S      CCU P      ACU T      GCU A
UCC S      CCC P      ACC T      GCC A
UCA S      CCA P      ACA T      GCA A
UCG S      CCG P      ACG T      GCG A
UAU Y      CAU H      AAU N      GAU D
UAC Y      CAC H      AAC N      GAC D
UAA Stop   CAA Q      AAA K      GAA E
UAG Stop   CAG Q      AAG K      GAG E
UGU C      CGU R      AGU S      GGU G
UGC C      CGC R      AGC S      GGC G
UGA Stop   CGA R      AGA R      GGA G
UGG W      CGG R      AGG R      GGG G"""

# Even-indexed tokens are codons, odd-indexed tokens are their amino acids.
traL = string.split()
traDict = {codon: amino for codon, amino in zip(traL[::2], traL[1::2])}

coded = "AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA"

# The range stops three symbols before the end, so the final codon
# (the stop codon of this sample) is not translated.
decoded = ''.join(
    traDict[coded[start:start + 3]]
    for start in range(0, len(coded) - 3, 3)
)
print(decoded)

MAMAPRTEINSTRING


### SUBS: Finding a Motif in DNA

In [84]:
def subs(lst_in):
    '''Print every position at which a motif occurs in a string.

    lst_in holds exactly two items: the string to search and the motif.
    All start positions of the motif, counted from 1 and including
    overlapping occurrences, are printed on one line separated by
    spaces. If the motif never occurs, an empty line is printed.
    '''
    string, sub_string = lst_in
    positions = []

    # Start at index 0 so that an occurrence at the very beginning of
    # the string is reported as well.
    found = string.find(sub_string)
    while found != -1:
        positions.append(str(found + 1))
        # Move on by one symbol only, so overlapping matches are kept.
        found = string.find(sub_string, found + 1)
    print(' '.join(positions))

subs(['GATATATGCATATACTT', 'ATAT'])
subs(read_dataset_list('rosalind_subs.txt'))

2 4 10
5 28 35 45 52 69 76 103 240 262 287 488 506 525 604 663 678 702 739


### FIB: Rabbits and Recurrence Relations

In [11]:
def fib(n, k):
    '''Return the n-th term of the recurrence F(i) = F(i-1) + k * F(i-2).

    The sequence starts with F(0) = 0 and F(1) = 1, so with k = 1 it is
    the Fibonacci sequence. In the rabbit model n is the number of
    months and k the number of offspring pairs per reproducing pair.

    Raises ValueError if n is negative.
    '''
    if n < 0:
        raise ValueError('n must not be negative, got {}'.format(n))
    # Only the last two terms are ever needed, so keep just those.
    previous, current = 0, 1
    for _ in range(n):
        previous, current = current, current + k * previous
    # After n steps `previous` holds F(n); this also gives F(0) = 0.
    return previous

fib(5, 3)
fib(31, 2)

19

715827883

### IPRB: Mendel's First Law

In [62]:
# comb used to be exposed as scipy.misc.comb; current SciPy releases
# only provide it in scipy.special.
from scipy.special import comb

def iprb(k, m, n):
    '''Return the probability that two randomly chosen mating organisms
    produce an offspring showing the dominant phenotype.

    k individuals are homozygous dominant for a factor,
    m are heterozygous, and n are homozygous recessive.

    Raises ValueError if the population has fewer than two individuals.
    '''
    summa = k + m + n
    if summa < 2:
        raise ValueError('at least two individuals are needed to form a pair')
    # Every unordered pair of parents has 4 equally likely allele combinations.
    total_comb = 4 * comb(summa, 2)
    # Recessive offspring: all 4 combinations for recessive x recessive,
    # 2 of 4 for recessive x heterozygous, 1 of 4 for two heterozygotes.
    total_comb_rec = 4 * comb(n, 2) + 2 * n * m + comb(m, 2)
    return 1 - total_comb_rec/total_comb

iprb(2,2,2)
iprb(18, 19, 15)

0.78333333333333333

0.78110859728506787

## Bioinformatics Armory

### INI: Introduction to the Bioinformatics Armory

In [14]:
from Bio.Seq import Seq

def ini(sequence):
    '''Print how often A, C, G and T occur in sequence.

    The string is wrapped in a Biopython Seq object and the four
    counts are written on one line, separated by single spaces,
    in the order A C G T.
    '''
    seq_obj = Seq(sequence)
    print(' '.join(str(seq_obj.count(symbol)) for symbol in 'ACGT'))

ini('AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC')
ini(read_dataset_str('rosalind_ini.txt'))

20 12 17 21
203 218 200 193


### DBPR: Introduction to Protein Databases	

In [44]:
from Bio import ExPASy
from Bio import SwissProt

def dbpr(up_id):
    '''Print the GO biological-process terms of a UniProt entry.

    up_id is the UniProt accession (e.g. 'Q5SLP9'). The raw Swiss-Prot
    record is downloaded through ExPASy and parsed. For every 'GO'
    cross-reference whose third field starts with 'P', that field is
    printed on its own line without its two-character prefix.
    '''
    handle = ExPASy.get_sprot_raw(up_id)
    try:
        record = SwissProt.read(handle)
    finally:
        # Release the network handle even if parsing fails.
        handle.close()
    for reference in record.cross_references:
        if reference[0] == 'GO' and reference[2].startswith('P'):
            print(reference[2][2:])

dbpr('Q5SLP9')
dbpr(read_dataset_str('dbpr'))

DNA recombination
DNA repair
DNA replication
'de novo' pyrimidine nucleobase biosynthetic process
'de novo' UMP biosynthetic process
