# Rosalind Solutions

## Prerequisites

In [18]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

def read_dataset_str(task_shortcut):
    '''Return the first line of a dataset file, stripped of whitespace.

    task_shortcut is either a bare task id such as 'dna' (expanded to
    'rosalind_dna.txt') or a complete file name ending in '.txt' such as
    'rosalind_dna.txt' or 'input.txt', which is opened exactly as given.
    An empty file yields an empty string.
    '''
    if task_shortcut.endswith('.txt'):
        # Cells in this notebook pass full file names as well as shortcuts;
        # expanding those again would look for 'rosalind_rosalind_x.txt.txt'.
        filename = task_shortcut
    else:
        filename = 'rosalind_{}.txt'.format(task_shortcut)
    with open(filename) as inf:
        # Only the first line is wanted, so the rest of the file is not read.
        return inf.readline().strip()

def read_dataset_list(task_shortcut):
    '''Return every line of a dataset file as a list of stripped strings.

    task_shortcut is either a bare task id such as 'hamm' (expanded to
    'rosalind_hamm.txt') or a complete file name ending in '.txt' such as
    'rosalind_hamm.txt' or 'input.txt', which is opened exactly as given.
    '''
    if task_shortcut.endswith('.txt'):
        # Cells in this notebook pass full file names as well as shortcuts;
        # expanding those again would look for 'rosalind_rosalind_x.txt.txt'.
        filename = task_shortcut
    else:
        filename = 'rosalind_{}.txt'.format(task_shortcut)
    with open(filename) as inf:
        return [line.strip() for line in inf]

## Bioinformatics Stronghold

### DNA: Counting DNA Nucleotides

In [30]:
def dna(str_in):
    '''Print how often 'A', 'C', 'G' and 'T' occur in str_in.

    The four counts are printed on one line, separated by spaces, in the
    order A C G T.
    '''
    print(*(str_in.count(base) for base in 'ACGT'))

dna('AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC')
dna(read_dataset_str('rosalind_dna.txt'))

20 12 17 21
246 248 231 264


### RNA: Transcribing DNA into RNA

In [31]:
def rna(str_in):
    '''Return str_in with every 'T' replaced by 'U' (DNA to RNA transcription).'''
    # Cutting the string at each 'T' and gluing the pieces back together with
    # 'U' substitutes every occurrence.
    return 'U'.join(str_in.split('T'))

rna('GATGGAACTTGACTACGTAAATT')
rna(read_dataset_str('rosalind_rna.txt'))

'GAUGGAACUUGACUACGUAAAUU'

'GGUCACCCGACGAUCCUGGGGGGUCGGAACGGCUGGGACGCAGCCCAGUCCAUAAUACCUGGACGUGGCCCGCCGCUCAUGAGCCUGCGUAACGGGCAAUUCGAACUCAUUACUCAGCUCAGACGAAAAGUAAUUAGUGGUCCGGCUUAUAGCUUGAUAACCCGUAAGUGCUCUAGCCCCCAUACCUUCAAAGGCUAGCGUAGCAAGGGCAGAGAGUCCUGGGUUCAGGCCGUCGAUUGACGAUACAUACCUGAGCGGCUUACUAAAGCUUAGGUAUGUCGGGGGGCCAGACGUAUGCCCCCGCGGAUAAGGUCGUCUUAGGGGUAUCGUCAGUAACAGUUGCAUUAUGGCCCCACGCUGGCAUCGUUUGAAGAUCCGCUUAAAGGCGAAGAAUAAUAAGUUGGGAAUUCUAUCUUCAUCCGAUCCUAUAGUUCUAACCAUGUCAUAGUUCAUAGGUAUAUAGAGUACGCUCAGUGCGGCUAACCUAAAAGUAAAGUGCGUGAUAGGACUGCAGAUGUGAUUCAAAACAGGAGUGUCGAGCACCAUAAGUAUACAAGAACUUCGUGCGGGCCGGCACUAUGCUAUCAUUUAGGACCCGUGUGCUCGAGAAAGGUCUUUGCUGGCAGCUGUGUUGCUAACUACCGUUGAUAUUAAACUGAGCUCGUAAUAUAGAAAUAAGAUGACACAUCAUUCGCAAGCACGCUGUAUGGGCUAUGGUAAAGACCUUAAUGAAAAGGCCAGUAGGCCCGAGCGCAACGCUGCAAUGCGUGGACAACGACCGGCUACCAAUUUUUCCGAAAAGUCGUACUAAGGUAAGACAAGGGCCCCAGUUCGUCCUUCAUGAGGGUAGAAAAUACCCUCGUUGCGACAGUUGCGAUCCCAGAGGAAUUAGUAAUUGACGAAGUCCCCCUAGCCCUCGCCGCCCGUG'

### REVC: Complementing a Strand of DNA

In [39]:
def revc(str_in):
    '''Return the reverse complement of the DNA string str_in.

    Only 'A', 'C', 'G' and 'T' are known; any other symbol looks up as None
    and makes the join fail with TypeError.
    '''
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    # Walk the strand from its end and complement each base along the way.
    return ''.join(complement.get(base) for base in reversed(str_in))

revc('AAAACCCGGT')
revc(read_dataset_str('rosalind_revc.txt'))

'ACCGGGTTTT'

'ACAGAGACGGCGGGTGTGCGTGCCCACAGTTCGAACCCTGGACAACCGCGCAGATGGATGATTTAAGATGCCTCACCTCTTGCATGAGTTGCCTTACACCCGCACTCTATCCGGACCGATCCAACAGACTAGCCCAAAGCTGAGGGAAGACGGGCATAGCGGCACTGCATTAGAAAAACTCCCGCAACACCGGGTAGTCCGAGACCCCCACCCTTAAACATGGTCTAGCGTGGTACAAAGATAGCCCTCGTAATACAACTACCCCTGCAACGTCATCGCAACTTATCGTTGACATACTTGGATAATGCCCTTTTAAGGTTGTGAGTTGGTAATAGTGCCCGGATTGCCGTTGCGGCGGGCAAGTTCTTCCACCAATATCAGGGGTGTTCACCACCTTCGTAGAATTTGACAGATGACCGACCAACAACGCTGTAGGAATCAAGTGAAGTAAACCTGCGGTTCTGAATTTTATGCGTCCTGCCTTGATGGTGGAGTGTAAGTGGACCGTTACGGAATACTCGTACAAAACTGACCCGGGGATCACCGCTCGAACATTCGGCTGCGCATAAACCCGAGGTCCGAATGAGGCCATGGTAAGAGTTAGAGTGGTGACCCTGCGATCGGAGAGAGATAACGGCCGTATGATTGCCTGGCGTCAAGACCCAGGTCCCAGCGTTGGCTTACGATTCCTCAATATCGGTGACGGGAAAGGTGACAAGAGTACAGGTGATCCGACGTCCGGTGTTACGCATACGATCGACAAGCGAAGACTCTACAGGATGCTTTTGATTAGGTATAGGCTTAGGTCACGAGCAGACTAGCCTATGTAGGCCAGCAAATAAATGTCAAGGAAGTTCAGCTCAACGATAAATAGCACGATAATTTGTGGTTCGGTGTACACAAATCATGTCGCCTGGATTGGTGGCGTCCGGACTACTCGCTGTGGCGC'

### GC: Computing GC Content

In [61]:
def read_fasta(fastafile):
    '''Parse a FASTA file into a dictionary {record name: sequence}.

    A line starting with '>' opens a new record whose name is the rest of
    that line; the following lines are concatenated into its sequence.
    Blank lines are ignored. A header that repeats an earlier name starts
    that record's sequence afresh.

    Raises ValueError when sequence data appears before the first header.
    '''
    pieces = {}
    current = None
    with open(fastafile) as inf:
        for raw_line in inf:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                current = line[1:]
                pieces[current] = []
            elif current is None:
                raise ValueError(
                    'sequence data before the first FASTA header in {!r}'.format(fastafile))
            else:
                # Collect the chunks and join once at the end rather than
                # growing a string line by line.
                pieces[current].append(line)
    return {name: ''.join(chunks) for name, chunks in pieces.items()}

def gc(fastafile):
    '''Print the FASTA record with the highest GC content.

    Two lines are printed: the record name, then its GC percentage rounded
    to 6 decimal places. When several records tie, the first one in file
    order is reported.

    Raises ValueError when the file holds no records.
    '''
    records_dict = read_fasta(fastafile)
    if not records_dict:
        raise ValueError('no FASTA records found in {!r}'.format(fastafile))

    def gc_percent(seq):
        # An empty sequence has no bases; report 0 % instead of dividing by zero.
        if not seq:
            return 0.0
        return (seq.count('G') + seq.count('C')) / len(seq) * 100

    # max() returns the first maximal element, and it also copes with the
    # case where every record has a GC content of exactly 0.
    record_max = max(records_dict, key=lambda name: gc_percent(records_dict[name]))
    GC_content_max = gc_percent(records_dict[record_max])
    print(record_max, round(GC_content_max, 6), sep='\n')

gc('input_gc.txt')
gc('rosalind_gc.txt')

Rosalind_0808
60.91954
Rosalind_2124
53.125


### HAMM: Counting Point Mutations

In [22]:
def hamm(lst_in):
    '''Return the Hamming distance between lst_in[0] and lst_in[1].

    The distance is the number of positions at which the two strings differ.
    Raises ValueError when the strings have different lengths, because the
    distance is only defined for strings of equal length.
    '''
    first, second = lst_in[0], lst_in[1]
    if len(first) != len(second):
        raise ValueError(
            'strings must have equal length, got {} and {}'.format(len(first), len(second)))
    # Each mismatching pair contributes True, which sums as 1.
    return sum(a != b for a, b in zip(first, second))
hamm(read_dataset_list('input.txt'))
hamm(read_dataset_list('rosalind_hamm.txt'))

7

413

### PROT: Translating RNA into Protein

In [27]:
import re

def prot(rna):
    '''Translate the RNA string rna into a protein string.

    The string is read codon by codon from its start. Translation ends at
    the first stop codon (which is not part of the result) or at the end of
    the string; a trailing fragment shorter than three symbols is ignored.

    Raises KeyError for a codon that is not in the codon table.
    '''
    codon_table = {
        'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V',
        'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V',
        'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V',
        'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V',
        'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A',
        'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A',
        'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A',
        'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A',
        'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D',
        'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D',
        'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E',
        'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E',
        'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G',
        'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G',
        'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G',
        'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G',
    }
    protein = []
    for start in range(0, len(rna) - 2, 3):
        amino = codon_table[rna[start:start + 3]]
        if amino == 'Stop':
            # Stop here instead of blindly dropping the last residue: this
            # keeps the literal text 'Stop' out of the protein and keeps the
            # final residue when the input has no terminal stop codon.
            break
        protein.append(amino)
    return ''.join(protein)

prot(read_dataset_str('input'))
prot(read_dataset_str('rosalind_prot.txt'))

'MAMAPRTEINSTRING'

'MERSADLAIKGSEKDVVAVRLITPKYKESVPTCGLRRRLPILERNASGHLGLTLIHPTCNDKNITPLPYDNHHDITDSMLADNMPISSMGHWIPQSGVVTLPATPINACSRGSSRAVNSHYYVITGIVGGVCMATRIEFANGTAVLQATKQLSTLEQPHHSRRPSHESSIPSASINTLSALSKGHQSARLCVYPTINLFNQYTGTALPTPITEYVQHRLSKRLMSRIGRPPPVGGHPKSASQTYYTVPDLTFTRTSSSPSSTELALNLSSFVPCNHWFSRAGCTHVSGLSRSVVCEVGCVARNTQVRLTWCVGVSLHLGAEKRATASDFDGYGRRPSFQCGVSPNIRLVNSTADCPFPRPLLGLTVISACPGWRRLVPLPTRASDPRRNVAKLGTVARPYRYFDSGRSTCESRCGNYALYGRLACLPYEHRAVEMRLDSSLVGSLICKGFPRSCRRAIGRVSQRAHITVVEHGLRVRGPEKGKSSGIDLVIHSPNSLGRRTIAVADLGHRSIRRAQLPSYHWTTNRVGILHRFDQGLKSNEEIGFPPVRTITVHVIAEAARLHEPWSRFRRCIQLLIAVMQPVLSKSTNTVDLLASLCAKGRTYTARIGRVTISASGPPASRRHGSWLDCQLGEGVEKSSASACFAILFILASEARPAEEAITSLPSSAKSMSGGGILANGRRDQGPHPLTIRVHAKCSGLQGATDTVSDHGSYDGEWTFPNGRDVHRTYSNAKLRSTPCTSIILSSAFVEYRRVTIREPWINSRWFDEVVRGSVFVSWPLHGAIHWHVISRPKKVICHCPPCERAFPPSGLSRCRALIDEPVRSFDPSQLRASDARDFFNPVAIALFNTQHIDTGLSYPLPCKFIVILEADRLHKPLRSPPFRLPGACNLIGCTVGPGCVLGRPLTSSLHIPRGIRVSIRPSPRGVYRKKIAHCFRGVVSLQSPVPRIRGETSSSNMHENEPQYHPHYVVAAVPWRLGELEVTGFDNAYSLLTYVRAL

In [35]:
# RNA codon table as whitespace-separated "codon translation" pairs.
string = """UUU F      CUU L      AUU I      GUU V
UUC F      CUC L      AUC I      GUC V
UUA L      CUA L      AUA I      GUA V
UUG L      CUG L      AUG M      GUG V
UCU S      CCU P      ACU T      GCU A
UCC S      CCC P      ACC T      GCC A
UCA S      CCA P      ACA T      GCA A
UCG S      CCG P      ACG T      GCG A
UAU Y      CAU H      AAU N      GAU D
UAC Y      CAC H      AAC N      GAC D
UAA Stop   CAA Q      AAA K      GAA E
UAG Stop   CAG Q      AAG K      GAG E
UGU C      CGU R      AGU S      GGU G
UGC C      CGC R      AGC S      GGC G
UGA Stop   CGA R      AGA R      GGA G
UGG W      CGG R      AGG R      GGG G"""

coded = "AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA"

# After splitting, even positions hold codons and odd positions hold their
# translations.
traL = string.split()
traDict = {codon: amino for codon, amino in zip(traL[::2], traL[1::2])}

# Translate codon by codon; the upper bound len(coded) - 3 leaves the final
# codon of the message untranslated.
decoded = ''.join(traDict[coded[i:i + 3]] for i in range(0, len(coded) - 3, 3))
print(decoded)

MAMAPRTEINSTRING


### SUBS: Finding a Motif in DNA

In [84]:
def subs(lst_in):
    '''Print every 1-based position at which a motif occurs in a string.

    lst_in holds exactly two items: the string to search and the motif.
    Overlapping occurrences are all reported. The positions are printed on
    one line separated by spaces; an empty line is printed when the motif
    does not occur.
    '''
    string, sub_string = lst_in
    positions = []
    # Begin at index 0 so that a motif at the very start of the string is
    # reported as well.
    index = string.find(sub_string)
    while index != -1:
        positions.append(str(index + 1))
        # Advance by one symbol only, so overlapping matches are still found.
        index = string.find(sub_string, index + 1)
    print(' '.join(positions))

subs(['GATATATGCATATACTT', 'ATAT'])
subs(read_dataset_list('rosalind_subs.txt'))

2 4 10
5 28 35 45 52 69 76 103 240 262 287 488 506 525 604 663 678 702 739


### FIB: Rabbits and Recurrence Relations

In [11]:
def fib(n, k):
    '''Return the n-th term of the sequence F(i) = F(i-1) + k * F(i-2).

    The sequence starts with F(0) = 0 and F(1) = 1. For n below 1 no step
    is taken and the result is 1.
    '''
    older, newer = 0, 1
    for _ in range(n - 1):
        # Slide the two-term window one step along the sequence.
        older, newer = newer, newer + k * older
    return newer

fib(5, 3)
fib(31, 2)

19

715827883

### IPRB: Mendel's First Law

In [62]:
# scipy.misc.comb is no longer provided by SciPy; scipy.special.comb is the
# same function under its current name.
from scipy.special import comb

def iprb(k, m, n):
    '''Return the probability that a random mating shows the dominant phenotype.

    k individuals are homozygous dominant for a factor,
    m are heterozygous, and n are homozygous recessive.
    Two distinct individuals are picked at random and mate; the result is
    the probability that their offspring carries a dominant allele.

    Raises ValueError when the population holds fewer than two individuals.
    '''
    summa = k + m + n
    if summa < 2:
        raise ValueError('at least two individuals are needed to form a pair')
    # Every pair of parents has 4 equally likely allele combinations.
    total_comb = 4 * comb(summa, 2)
    # Recessive offspring: all 4 combinations of a recessive x recessive
    # pair, 2 of 4 for recessive x heterozygous, and 1 of 4 for
    # heterozygous x heterozygous.
    total_comb_rec = 4 * comb(n, 2) + 2 * n * m + comb(m, 2)
    return 1 - total_comb_rec/total_comb

iprb(2,2,2)
iprb(18,19,15)

0.78333333333333333

0.78110859728506787

## Bioinformatics Armory

### INI: Introduction to the Bioinformatics Armory

In [14]:
from Bio.Seq import Seq

def ini(sequence):
    '''Print how often 'A', 'C', 'G' and 'T' occur in sequence.

    The text is wrapped in a Bio.Seq Seq object and counted with its
    count() method; the four numbers are printed on one line separated by
    spaces, in the order A C G T.
    '''
    seq_obj = Seq(sequence)
    print(*(seq_obj.count(base) for base in 'ACGT'))

ini('AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC')
ini(read_dataset_str('rosalind_ini.txt'))

20 12 17 21
203 218 200 193


### DBPR: Introduction to Protein Databases	

In [44]:
from Bio import ExPASy
from Bio import SwissProt

def dbpr(up_id):
    '''Print selected GO cross-references of a UniProt entry, one per line.

    The raw Swiss-Prot record for up_id is downloaded through ExPASy (this
    needs network access). Of its cross-references, those whose first field
    is 'GO' and whose third field starts with 'P' are kept, and the third
    field is printed without its two-character prefix.
    NOTE(review): the 'P' prefix presumably marks the biological-process
    aspect of GO; confirm against the UniProt format description.
    '''
    handle = ExPASy.get_sprot_raw(up_id)
    try:
        record = SwissProt.read(handle)
    finally:
        # Release the connection even when parsing the record fails.
        handle.close()
    go_functions = [ref[2][2:] for ref in record.cross_references
                    if ref[0] == 'GO' and ref[2][0] == 'P']
    for term in go_functions:
        print(term)

dbpr('Q5SLP9')
dbpr(read_dataset_str('dbpr'))

DNA recombination
DNA repair
DNA replication
'de novo' pyrimidine nucleobase biosynthetic process
'de novo' UMP biosynthetic process


### CONS: Consensus and Profile       

In [25]:
from read_dataset import read_fasta
from collections import Counter

def cons(str_in):
    '''Write a consensus string and profile matrix to 'output.txt'.

    The sequences are obtained from read_fasta(str_in). For every position
    of the first sequence the symbols of all sequences at that position are
    counted. The first output line is the consensus (the most frequent
    symbol per position); four more lines give the counts for A, C, G and T.
    '''
    sequences = list(read_fasta(str_in).values())

    # One Counter per column, fed with the symbol each sequence has there.
    nucl_counts = [
        Counter(seq[pos] for seq in sequences)
        for pos in range(len(sequences[0]))
    ]

    # On ties max() keeps the symbol that was seen first in the column.
    consensus = ''.join(max(column, key=column.get) for column in nucl_counts)

    report = ['{}\n'.format(consensus)]
    for nucl in ['A', 'C', 'G', 'T']:
        row = ' '.join(str(column.get(nucl, 0)) for column in nucl_counts)
        report.append('{}: '.format(nucl) + row + '\n')

    with open('output.txt', 'w') as ouf:
        ouf.writelines(report)
    
# cons('input')
cons('cons')

### FIBD: Mortal Fibonacci Rabbits

In [31]:
def fibd(n, m):
    '''Return the total held in an m-slot ring buffer after n - 1 updates.

    The buffer starts as a single 1 followed by zeros. At step i the slot
    i % m is overwritten with the sum of the m - 1 slots that precede slot
    i - 1 cyclically, which covers every slot except i - 1 itself.
    The result is the sum of all slots after the last step.
    '''
    cohorts = [1] + [0] * (m - 1)
    for month in range(1, n):
        # Slot (month - 1) % m is the only one left out of the sum.
        cohorts[month % m] = sum(
            cohorts[(month - back - 1) % m] for back in range(1, m)
        )

    # Total rabbits is the sum of the living rabbits.
    return sum(cohorts)
    

# fibd(6, 3)
fibd(87, 19)

677623885055661681

### GRPH: Overlap Graphs      

In [44]:
from read_dataset import read_fasta

def grph(str_in, k=3):
    '''Print the edges of the overlap graph of the given FASTA records.

    The records are obtained from read_fasta(str_in). An edge 'tail head' is
    printed whenever the last k symbols of record tail equal the first k
    symbols of a different record head. k must be positive and defaults
    to 3. Nothing is printed when no two records overlap.
    '''
    dct_fasta = read_fasta(str_in)
    lst_seq = [[key, value[:k], value[-k:]] for key, value in dct_fasta.items()]
    for start in lst_seq:
        for end in lst_seq:
            if start[1] == end[2] and start[0] != end[0]:
                # Printing edge by edge avoids indexing into an empty result
                # list, which fails when the graph has no edges.
                print(end[0], start[0])
    
# grph('input')
grph('grph')

Rosalind_0237 Rosalind_0144
Rosalind_6937 Rosalind_0144
Rosalind_1987 Rosalind_0144
Rosalind_2159 Rosalind_5714
Rosalind_2156 Rosalind_5714
Rosalind_7769 Rosalind_5714
Rosalind_5714 Rosalind_0550
Rosalind_7541 Rosalind_0550
Rosalind_3705 Rosalind_0550
Rosalind_1418 Rosalind_0550
Rosalind_2225 Rosalind_0550
Rosalind_4171 Rosalind_9234
Rosalind_6861 Rosalind_2533
Rosalind_6966 Rosalind_2533
Rosalind_9927 Rosalind_2533
Rosalind_3989 Rosalind_2533
Rosalind_7202 Rosalind_9963
Rosalind_6826 Rosalind_9963
Rosalind_3465 Rosalind_3976
Rosalind_7746 Rosalind_3976
Rosalind_8113 Rosalind_2304
Rosalind_3250 Rosalind_2922
Rosalind_0141 Rosalind_2922
Rosalind_2922 Rosalind_3450
Rosalind_3624 Rosalind_3450
Rosalind_5050 Rosalind_1155
Rosalind_4171 Rosalind_4513
Rosalind_2922 Rosalind_5246
Rosalind_3624 Rosalind_5246
Rosalind_0740 Rosalind_7080
Rosalind_7080 Rosalind_4020
Rosalind_2837 Rosalind_4020
Rosalind_4998 Rosalind_4020
Rosalind_4239 Rosalind_4020
Rosalind_6524 Rosalind_2837
Rosalind_9790 Rosali

### IEV: Calculating Expected Offspring

In [29]:
def iev(str_in):
    '''Return the weighted total 2 * count * weight over the given counts.

    str_in holds whitespace-separated integers. The i-th integer is
    multiplied by 2 and by the i-th of the weights 1, 1, 1, 0.75, 0.5, 0,
    and the products are added up. More than six integers raise IndexError.
    '''
    lst_prob_coup = [1, 1, 1, .75, .5, 0]
    expected = 0
    for position, token in enumerate(str_in.split()):
        expected = expected + 2 * int(token) * lst_prob_coup[position]
    return expected
    
# iev('1 0 0 1 0 1')
iev(read_dataset_str('iev'))

162339.5

         
### MPRT: Finding a Protein Motif     

In [20]:
# def mprt(str_in):
#     pass

from Bio import ExPASy
from Bio import SwissProt

# Download the raw Swiss-Prot entry for accession A2Z669 through ExPASy and
# parse it into `record`. This needs network access: the recorded run of this
# cell ended in a URLError.
handle = ExPASy.get_sprot_raw('A2Z669')
record = SwissProt.read(handle)

    
# mprt('input')

URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>

In [52]:
record.

(203, 20439, 'B8FDFF1EFFF554B7')

         
### MRNA: Inferring mRNA from Protein     

In [19]:
from collections import defaultdict

def mrna(protein, modulo=None):
    '''Return how many RNA strings translate into the given protein.

    Every amino acid contributes as many choices as it has codons, and the
    terminating stop codon contributes one choice per stop codon. A symbol
    that is not in the codon table has no codons, so the result is then 0.

    When modulo is given, the count is reduced modulo that number after each
    factor, which keeps the intermediate value small for long proteins.
    With modulo=None (the default) the exact count is returned.
    '''
    codon_table = {
        'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V',
        'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V',
        'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V',
        'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V',
        'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A',
        'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A',
        'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A',
        'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A',
        'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D',
        'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D',
        'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E',
        'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E',
        'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G',
        'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G',
        'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G',
        'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G',
    }
    no_of_codons = defaultdict(int)
    for value in codon_table.values():
        no_of_codons[value] += 1

    # Start from the number of stop codons, taken from the table rather than
    # hard-coded.
    mult = no_of_codons['Stop']
    for amino in protein:
        mult *= no_of_codons[amino]
        if modulo is not None:
            mult %= modulo
    if modulo is not None:
        # Covers the empty protein, where the loop body never runs.
        mult %= modulo
    return mult

mrna('MA')
mrna(read_dataset_str('mrna'))

12

FileNotFoundError: [Errno 2] No such file or directory: 'rosalind_mrna.txt'

         
### PERM: Enumerating Gene Orders     

In [46]:
from itertools import permutations

def perm(int_in):
    '''Write all permutations of 1..int_in to the file 'output'.

    int_in may be an int or a string holding one. The first line of the
    file is the number of permutations; each following line is one
    permutation, its numbers separated by spaces.
    '''
    size = int(int_in)
    rows = [
        ' '.join(map(str, ordering))
        for ordering in permutations(range(1, size + 1), size)
    ]
    with open('output', 'w') as ouf:
        ouf.write('{}\n'.format(str(len(rows))))
        ouf.writelines(row + '\n' for row in rows)

# perm(3)
perm(read_dataset_str('perm'))

         
### PRTM: Calculating Protein Mass

In [3]:
def prtm(protein):
    '''Return the summed residue mass of protein, rounded to 3 decimals.

    Each symbol of protein is looked up in the mass table below; a symbol
    that is not in the table adds 0.0.
    '''
    s = """A=71.03711;C=103.00919;D=115.02694;E=129.04259;F=147.06841;G=57.02146;H=137.05891;I=113.08406;K=128.09496;L=113.08406;M=131.04049;N=114.04293;P=97.05276;Q=128.05858;R=156.10111;S=87.03203;T=101.04768;V=99.06841;W=186.07931;Y=163.06333"""
    # Convert every mass to a number once, while the table is being built.
    mass_table = {}
    for entry in s.split(';'):
        symbol, weight = entry.split('=')
        mass_table[symbol] = float(weight)
    total = sum(mass_table.get(aa, 0.0) for aa in protein)
    return round(total, 3)

# prtm('SKADYEK')
prtm(read_dataset_str('prtm'))

102578.929

### LCSM: Finding a Shared Motif     

### LIA: Independent Alleles

         
### ORF: Open Reading Frames