## 011 - Consensus and Profile

In [1]:
from Bio import SeqIO
import numpy as np
import pandas as pd

In [3]:
with open('Data/011_rosalind_cons.fasta') as fasta_file:
    dna_strings = {}
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):
        dna_strings[seq_record.id] = [char for char in seq_record.seq]

In [112]:
# Data Frame of DNA Strings

df = pd.DataFrame(dna_strings).t

In [None]:
# Profile

df2 = pd.DataFrame()
for i in range(len(df.axes[1])):
    df2 = df2.append(df.iloc[:, i].value_counts())

print(df2.t.fillna(0).to_string(header = False).replace(".0", "").replace("  ", " "))

In [114]:
# Most Likely Common Ancestor:

print(df.describe().loc["top"].to_string(header = False, index = False).replace("\n", ""))

GGACACACGCATTTGCCAACTTGCCACTTATCGACCATATGCCTAGCCTAGTTTAGCCTATAGGAATCCGATTCCTGGGCCGGTCAGAAGCTAACTTGGCAAAGGGACGTCTTAGTGCGCTGCCTATAGATGCACTCAGCTCTCGTCAAACTCTCGATTTATTTATTTTTCGCAGAGTAAGAGTGCCAGTAATAGACCTAAATCTCGAAAGGCACTGCCGACTACATTCATGGGGACATGCCCAGGGAAAGTCTGAGATCTCCGGTGACAACACTATGCGATTGATTGAGCGCGTCGTGCGACGAGGGCGTGTAACACTTAGATATTGAGGCGCTAGCAAGGCATCCCTATATGGATAAACAGGCAATGCGAGATATATCTTACAACTGGCATTGCCATCGCCGAGTCATCGTAGTCACGCGGGACCACAGGCTTTCCAAGGCCTCCTGCGATACAACGGTTTCTCACATCAGTTAGTCTATAACACGGGGCCGGAAAGAAGTACCCTACGTTACGAAATGAATTTCTATATACGTTCTATTCACTGCCACTTATGATAGTGATGAAGTCGTTGCGTCGCGACGAGCAAATTCGGGGATTTGGATTAATAAAGAAGAGACCAGTGCTGGCGAACTGCGACGTGTGAGTTTCACTCTTAGCCTGTCCGTTGGACGCACGATTCGTTTAATCCATATACTACCACGCGGTTGGCTGGTGTCACTAGGTCCGTAAGGGGGAGTTATTAGTCAGCTGGGCATAATGGTGGGATAGGGTAGAGATTTAGAAACATGCTGGTGCGCACGCGATCAACGTTTACCGGAAACACAAGTGAGCCAGCAAATGAATAAGACTCCAATCTAAGCGGTACCTTTTTATAATTTGCTCTCTTCCAGTTAAGCGAGGCCACTGTACGGTATTTAAAGTAGTGCGGTTGACGAGCAACCTTGCGTCTCACGTCCCACCCGGAATCCAGAGGTGG


## 012 - Overlap Graphs

In [21]:
with open('Data/012_rosalind_grph.fasta') as fasta_file:
    dna_strings = {}
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):
        dna_strings[seq_record.id] = seq_record.seq

In [None]:
k = 3 # Length of the suffix and prefix overlap

for i in dna_strings.keys():
    for j in dna_strings.keys():
        if i == j:
            continue
        if dna_strings[i][-k:] == dna_strings[j][0:k]:
            print(i, j)

## 013 - Calculating Expected Offspring

In [33]:
# AA-AA 100%
# AA-Aa 100%
# AA-aa 100%
# Aa-Aa 75%
# Aa-aa 50%
# aa-aa 0%

p = [1, 1, 1, .75, .5, 0] # Probabilities

pop = "18523 18594 16885 18423 16503 17027".split(" ") # Population
pop = [eval(i) for i in pop]

exp = 0
for i in range(6):
    exp += p[i]*pop[i]

exp*2 # 2 offsprings per couple

152141.5

## 014 - Finding a Shared Motif

In [25]:
with open('Data/014_rosalind_lcsm.fasta') as fasta_file:
    dna_strings = []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):
        dna_strings.append(seq_record.seq)

In [26]:
# Function to check if a subset is common among a list of strings

def is_common(subset, string_list):
    for i in range(len(string_list)):
        if (subset in dna_strings[i]) == False:
            return False
    return True

# Function to get all substrings of a string

def get_all_substrings(str):
    result = []
    n = len(str)
    for i in range(n):
        temp = ""
        for j in range(i, n):
            temp += str[j]
            result.append(temp)
    
    return(result)

def longest_common_substring(string_list):

    # Get shortest string

    base_str = min(dna_strings, key = len)
    base_sub = get_all_substrings(base_str)

    # Check each substring, starting with the longest ones

    base_sub = sorted(base_sub, key = len, reverse = true)

    for i in range(len(base_sub)):
        if is_common(base_sub[i], string_list):
            return(base_sub[i])

In [27]:
longest_common_substring(dna_strings)

'TTACGATTTTCGTTGGGACATCAAGGCCGAGCAATTAGCCAATCTCCTGGACGGAAGGTAGTTCGGTCCCGACATGCCAGCTGCTCTGCGGACGTGACAGTACATCCGGGTACAATGCGCTCGAGGAGATGATAACTGTTCATGTTGATTAACACACCTTAGTCGAGCAGGCACCTTTACAGCATAACCCAATCCATATATTGCCAGGTTTTTTACGACAGGTCTTGCAGCGGGTATAC'

## 015 - Independent Alleles

In [2]:
import math

In [40]:
p = 1/4 # Fraction of Aa Bb in Punnett Squares
k = 5   # Generation
N = 9   # At least N both heterozygous

pop = 2**k # Population of the generation

prb = 0

for i in range(N, pop + 1):
    # Aa Bb * not Aa Bb * sequencing
    prb += (p**i)*((1-p)**(pop-i))*(math.factorial(pop)/(math.factorial(i)*math.factorial(pop-i)))

prb

0.40648834849733056

## 016 - Finding a Protein Motif

In [1]:
import requests as r
from io import StringIO

In [2]:
def get_key(val, dict):
    for key, value in dict.items():
        if val == value:
            return key
 
    return "key doesn't exist"

In [3]:
with open("Data/016_rosalind_mprt.txt", "r") as file:
    protein_id_list = file.read().splitlines()

protein_id_dict = {}

for i in range(len(protein_id_list)):
    protein_id_dict[protein_id_list[i]] = protein_id_list[i].split("_")[0]

print(protein_id_dict)

{'B4R8K2': 'B4R8K2', 'Q1LI56': 'Q1LI56', 'A9M5H3': 'A9M5H3', 'P07585_PGS2_HUMAN': 'P07585', 'P72173': 'P72173', 'P46096_SYT1_MOUSE': 'P46096', 'P04921_GLPC_HUMAN': 'P04921', 'P04233_HG2A_HUMAN': 'P04233', 'P13473_LMP2_HUMAN': 'P13473', 'P01046_KNL1_BOVIN': 'P01046', 'P02786_TRSR_HUMAN': 'P02786', 'Q9V730': 'Q9V730'}


In [6]:
protein_seq_dict = {}
base_url = "http://www.uniprot.org/uniprot/"

for i in protein_id_dict.keys():

    current_url = base_url + protein_id_dict[i] + ".fasta"
    response = r.post(current_url)
    data = ''.join(response.text)

    Seq = StringIO(data)
    for seq_record in SeqIO.parse(Seq, 'fasta'):
        protein_seq_dict[seq_record.id.split("|")[1]] = seq_record.seq

protein_seq_dict

{'B4R8K2': Seq('MAKQPKRMQKWTGDVAATHALDEAVKLVKANANAKFDETVEIAVNLGVDPRHAD...LGV'),
 'Q1LI56': Seq('MQLNNLKPADGSKHAKRRVGRGIGSGLGKTAGRGHKGQKSRSGGFHKVGFEGGQ...IEA'),
 'A9M5H3': Seq('MSISASLVKELRDLTGAGMMDCKAALAATEGKIEAAVDWLRAKGIAKADKKAGR...AKG'),
 'P07585': Seq('MKATIILLLLAQVSWAGPFQQRGLFDFMLEDEASGIGPEVPDDRDFEPSLGPVC...NYK'),
 'P72173': Seq('MSLFSAVEMAPRDPILGLNEAFNADTRPGKINLGVGVYYNEEGRIPLLRAVQAA...QVL'),
 'P46096': Seq('MVSASRPEALAAPVTTVATLVPHNATEPASPGEGKEDAFSKLKQKFMNELHKIP...VKK'),
 'P04921': Seq('MWSTRSPNSTAWPLSLEPDPGMASASTTMHTTTIAEPDPGMSGWPDGRMETSTP...YFI'),
 'P04233': Seq('MHRRRSRSCREDQKPVMDDQRDLISNNEQLPMLGRRPGAPESKCSRGALYTGFS...VPM'),
 'P13473': Seq('MVCFRLFPVPGSGLVLVCLVLGAVRSYALELNLTDSENATCLYAKWQMNFTVRY...EQF'),
 'P01044': Seq('MKLITILFLCSRLLPSLTQESSQEIDCNDQDVFKAVDAALTKYNSENKSGNQFV...ALL'),
 'P02786': Seq('MMDQARSAFSNLFGGEPLSYTRFSLARQVDGDNSHVEMKLAVDEEENADNNTKA...NEF'),
 'Q9V730': Seq('MQAKKRYILVFVSCAFLAYAYFGGYRLKVSPLRPRRAQHESAKDGGVQPHEQLP...VGS')}

In [7]:
for i in protein_seq_dict.keys():
    locations = []
    for j in range(len(protein_seq_dict[i])):
        subset = protein_seq_dict[i][j:j+4]
        if (subset[0] == "N") and (subset[1] != "P") and ((subset[2] == "S") or (subset[2] == "t")) and (subset[3] != "P"):
            locations.append(j + 1)
    if (len(locations) > 0):
        print(get_key(i, protein_id_dict))
        print(locations)

A9M5H3
[133]
P07585_PGS2_HUMAN
[211, 262, 303]
P72173
[87, 284, 383]
P46096_SYT1_MOUSE
[24, 340, 381]
P04921_GLPC_HUMAN
[8]
P04233_HG2A_HUMAN
[130, 136, 256, 270]
P13473_LMP2_HUMAN
[32, 38, 49, 58, 75, 101, 123, 179, 229, 242, 257, 275, 300, 307, 317, 356]
key doesn't exist
[47, 87, 168, 169, 197, 204]
P02786_TRSR_HUMAN
[50, 55, 251, 317, 727]
Q9V730
[71, 327, 476]


## 017 - Inferring mRNA from Protein

In [2]:
protein_string = open("Data/017_rosalind_mrna.txt", "r").read().splitlines()[0]

In [4]:
codon_count = {'G': 4,
               'A': 4,
               'V': 4,
               'L': 6,
               'I': 3,
               'P': 4,
               'F': 2,
               'Y': 2,
               'W': 1,
               'S': 6,
               'T': 4,
               'C': 2,
               'M': 1,
               'N': 2,
               'Q': 2,
               'K': 2,
               'R': 6,
               'H': 2,
               'D': 2,
               'E': 2,
               'B': 4,
               'Z': 4}

In [5]:
possible_comb_count = 3 # Due to the stop message

for amino_acid in protein_string:

    possible_comb_count *= codon_count[amino_acid]

possible_comb_count % (10**6)

164992

## 018 - Open Reading Frames

In [91]:
with open('Data/018_rosalind_orf.txt', "r") as file:
    dna_string = file.read().splitlines()[1:]
    dna_string = "".join(dna_string)

In [93]:
dna_codon_table = {"TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", "TGG": "W",             
                   "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", "TAT": "Y", "TAC": "Y",                  
                   "TAA": "Stop", "TAG": "Stop", "TGT": "C", "TGC": "C", "TGA": "Stop",                  
                   "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", "CCT": "P", "CCC": "P",
                   "CCA": "P", "CCG": "P", "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
                   "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", "ATT": "I", "ATC": "I",
                   "ATA": "I", "ATG": "M", "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
                   "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", "AGT": "S", "AGC": "S",
                   "AGA": "R", "AGG": "R", "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
                   "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", "GAT": "D", "GAC": "D",
                   "GAA": "E", "GAG": "E", "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G"}

In [94]:
# Ayrı dosyaya çek her çözümü

def find_reverse_complement(dna_string):
    s_c = dna_string.replace('A', '%temp%').replace('T', 'A').replace('%temp%', 'T')
    s_c = s_c.replace('G', '%temp%').replace('C', 'G').replace('%temp%', 'C')
    s_c = s_c[::-1]
    return s_c

In [95]:
def find_candidate_protein_strings(dna_string):

    sequences = []

    for i in range(len(dna_string) - 2):
        
        if dna_string[i:i+3] == "ATG":

            new_seq = ""

            for j in range(0, len(dna_string) - i - 2, 3):

                protein = dna_codon_table[dna_string[i+j:i+j+3]]

                if protein == "Stop":
                    sequences.append(new_seq)
                    break

                new_seq += protein

    return(sequences)


In [96]:
results = set(find_candidate_protein_strings(dna_string) + find_candidate_protein_strings(find_reverse_complement(dna_string)))

for result in results:
    print(result)

MPAITIR
MFV
MVAFS
MSTLC
MYQLYRTLMPAITIR
MICESLKPLGHIS
MAPTPTQPRGMLPSGYLINFQQRIRFLCTSFRKSMGR
MTTAGNYLFLFHWQVFQWGGT
MTVATDTLFCS
MLPSGYLINFQQRIRFLCTSFRKSMGR
ME
MLGSHYVPPHWNTCQWNRNR
MVIWWPLANMSEGFQRLTNHLYESMMTVATDTLFCS
MGR
MEVPATTLYRRRGMLGSHYVPPHWNTCQWNRNR
MSLARFCHTCGQPLRGRGCRRGSGRRGLTGGWRRKASAEWKRVRPTAM
MRPTAARQGLSERQR
MTSPTHTPATPSGPDVLKLVRPVLSTALCSSNGSARYNTL
MSEGFQRLTNHLYESMMTVATDTLFCS
MCGLRPVE
MMTVATDTLFCS
MNQ
MNTDVRAPTCRMTQREELNYGNMVAFS
MTQREELNYGNMVAFS
M


## 019 - Enumerating Gene Orders

In [76]:
def permute(seq):

    # Base conditions:

    if len(seq) == 0:
        return []
 
    if len(seq) == 1:
        return [seq]
 
    permutations = []

    for i in range(len(seq)):
       m = seq[i]
 
       # Remove seq[i] from seq

       subseq = seq[:i] + seq[i+1:]
 
       for p in permute(subseq):
           
           permutations.append([m] + p)

    return(permutations)

In [80]:
# Input
n = 7

# Output
seq = list(range(1, n+1))
print(math.factorial(n))
for permutation in permute(seq):
    print(*permutation)

5040
1 2 3 4 5 6 7
1 2 3 4 5 7 6
1 2 3 4 6 5 7
1 2 3 4 6 7 5
1 2 3 4 7 5 6
1 2 3 4 7 6 5
1 2 3 5 4 6 7
1 2 3 5 4 7 6
1 2 3 5 6 4 7
1 2 3 5 6 7 4
1 2 3 5 7 4 6
1 2 3 5 7 6 4
1 2 3 6 4 5 7
1 2 3 6 4 7 5
1 2 3 6 5 4 7
1 2 3 6 5 7 4
1 2 3 6 7 4 5
1 2 3 6 7 5 4
1 2 3 7 4 5 6
1 2 3 7 4 6 5
1 2 3 7 5 4 6
1 2 3 7 5 6 4
1 2 3 7 6 4 5
1 2 3 7 6 5 4
1 2 4 3 5 6 7
1 2 4 3 5 7 6
1 2 4 3 6 5 7
1 2 4 3 6 7 5
1 2 4 3 7 5 6
1 2 4 3 7 6 5
1 2 4 5 3 6 7
1 2 4 5 3 7 6
1 2 4 5 6 3 7
1 2 4 5 6 7 3
1 2 4 5 7 3 6
1 2 4 5 7 6 3
1 2 4 6 3 5 7
1 2 4 6 3 7 5
1 2 4 6 5 3 7
1 2 4 6 5 7 3
1 2 4 6 7 3 5
1 2 4 6 7 5 3
1 2 4 7 3 5 6
1 2 4 7 3 6 5
1 2 4 7 5 3 6
1 2 4 7 5 6 3
1 2 4 7 6 3 5
1 2 4 7 6 5 3
1 2 5 3 4 6 7
1 2 5 3 4 7 6
1 2 5 3 6 4 7
1 2 5 3 6 7 4
1 2 5 3 7 4 6
1 2 5 3 7 6 4
1 2 5 4 3 6 7
1 2 5 4 3 7 6
1 2 5 4 6 3 7
1 2 5 4 6 7 3
1 2 5 4 7 3 6
1 2 5 4 7 6 3
1 2 5 6 3 4 7
1 2 5 6 3 7 4
1 2 5 6 4 3 7
1 2 5 6 4 7 3
1 2 5 6 7 3 4
1 2 5 6 7 4 3
1 2 5 7 3 4 6
1 2 5 7 3 6 4
1 2 5 7 4 3 6
1 2 5 7 4 6 3
1 2 5 7 6 3 4
1

## 20 - Calculating Protein Mass

In [83]:
monoisotopic_mass_table = {"A": 71.03711,
                           "C": 103.00919,
                           "D": 115.02694,
                           "E": 129.04259,
                           "F": 147.06841,
                           "G": 57.02146,
                           "H": 137.05891,
                           "I": 113.08406,
                           "K": 128.09496,
                           "L": 113.08406,
                           "M": 131.04049,
                           "N": 114.04293,
                           "P": 97.05276,
                           "Q": 128.05858,
                           "R": 156.10111,
                           "S": 87.03203,                           
                           "T": 101.04768,
                           "V": 99.06841,
                           "W": 186.07931,
                           "Y": 163.06333}

monoisotopic_mass_of_water = 18.01056

# From rosalind.info/glossary/monoisotopic-mass-table/, in Da (Dalton)

In [87]:
def find_protein_mass(protein):

    mass = 0
    for amino_acid in protein:
        mass += monoisotopic_mass_table[amino_acid]

    return(round(mass, 3))

In [91]:
protein_string = open("Data/020_rosalind_prtm.txt", "r").read().splitlines()[0]
find_protein_mass(protein_string)

103558.01