In [6]:
# Example inputs for the checks below.
# validSeq is built only from the letters A, T, C and G.
validSeq = 'ATGAAATTATGAATGAGCCTCAGCTGAAGCATCGCGCATCAGACTACGCTCAGACTCAGACTCAGCATTATAGTGAATGTTAATAAATAAAATAA'
# invalidSeq ends with 'D', which is not one of the four accepted letters.
invalidSeq = 'ATGAAATTATGAATGAGCCTCAGCTGAAGCATCGCGCATCAGACTACGCTCAGACTCAGACTCAGCATTATAGTGAATGTTAATAAATAAAATAAD'

# The four upper-case nucleotide letters accepted by isValidDNA.
dic = {'A', 'T', 'C', 'G'}

def isValidDNA(seq):
    """Tell whether `seq` is made up exclusively of the letters in `dic`.

    The check is case-sensitive: lower-case letters are rejected.
    An empty sequence is considered valid.
    """
    for symbol in seq:
        # Bail out on the first symbol that is not an accepted nucleotide.
        if symbol not in dic:
            return False
    return True

In [7]:
# Check the valid example sequence (the builtin `str` type is not a sequence
# of nucleotides and would raise TypeError on a fresh kernel).
isValidDNA(validSeq)

True

In [8]:
# invalidSeq contains a 'D', so the validation is expected to return False.
isValidDNA(invalidSeq)

False

In [19]:
# Frequency of Symbols
def frequency(seq):
    """Count how many times each symbol occurs in `seq`.

    Returns a dict mapping every distinct symbol to its number of
    occurrences; keys appear in the order the symbols are first seen.
    """
    counts = {}
    for symbol in seq:
        # A symbol seen for the first time starts from an implicit count of 0.
        counts[symbol] = counts.get(symbol, 0) + 1
    return counts

In [20]:
# Works on any string, not only DNA: here 'D' occurs once and the other letters twice.
frequency('ABCDABC')

{'A': 2, 'B': 2, 'C': 2, 'D': 1}

In [29]:
# Sort dictionary
def sortDictionary(dic):
    """Return the (key, value) pairs of `dic` ordered by value, largest first.

    The result is a list of tuples, not a dict. Pairs with equal values
    keep the relative order they had in `dic`.
    """
    pairs = list(dic.items())
    # In-place stable sort on the value (second element of each pair).
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

# Every symbol occurs 3 times here, so the stable sort leaves them in first-seen order.
sortDictionary(frequency('ABCDABCDABCD'))

[('A', 3), ('B', 3), ('C', 3), ('D', 3)]

In [37]:
# Genes are typically found in GC-rich regions of the genome.
# This means that more than 60% of the sequence is formed by cytosine (C) or guanine (G) bases.

# Calculate the GC content (percentage of 'G' and 'C') of the sequence.

def symbol_content(seq, symbols):
    """Return the fraction of positions in `seq` whose symbol is in `symbols`.

    Parameters:
        seq: a sized sequence of symbols (e.g. a string).
        symbols: container of the symbols to count (e.g. ['G', 'C']).

    Returns:
        A float between 0.0 and 1.0. An empty `seq` yields 0.0 instead of
        raising ZeroDivisionError.
    """
    total = len(seq)
    if total == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0

    matches = sum(1 for symbol in seq if symbol in symbols)
    return matches / total

In [39]:
# 4 of the 9 symbols are 'G' or 'C', hence 4/9.
symbol_content('GCADSGFCD', ['G', 'C'])

0.4444444444444444

In [60]:
def symbol_non_overlapping(seq, k, symbols):
    """Compute the content of `symbols` in consecutive, non-overlapping windows.

    `seq` is cut into windows of exactly `k` symbols starting at offsets
    0, k, 2k, ...; a trailing remainder shorter than `k` is ignored.
    Returns a list with one symbol_content() value per window.
    """
    # Start offsets of every full-length window.
    window_starts = range(0, len(seq) - k + 1, k)

    return [
        symbol_content(seq[start: start + k], symbols)
        for start in window_starts
    ]

In [61]:
# GC content of validSeq in windows of 5 symbols (one value per window).
symbol_non_overlapping(validSeq, 5, ['G', 'C'])

[0.2,
 0.0,
 0.4,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.4,
 0.6,
 0.6,
 0.4,
 0.6,
 0.2,
 0.4,
 0.2,
 0.0,
 0.0,
 0.0]

In [42]:
# Length of the example sequence; uses the variable rather than a pasted copy
# of the literal so the two cannot drift apart.
len(validSeq)

95

In [68]:
# Transcription is the first step required to produce a protein

# The new RNA molecule will be created as a complement to one of the strands where the gene is located. 
# The resulting sequence will be similar to the one of the other DNA strand, but with 'U' instead of 'T' nucleotides.

# As a general practice convert sequences to upper case letters

def transcription(dna_seq):
    """Transcribe a DNA sequence into RNA by replacing every 'T' with 'U'.

    The input is converted to upper case first, so lower- or mixed-case
    DNA is accepted; the returned RNA string is always upper case.

    Raises:
        AssertionError: if the upper-cased sequence contains a symbol
            rejected by isValidDNA.
    """
    # Normalise case BEFORE validating: isValidDNA only accepts upper-case letters.
    dna_seq = dna_seq.upper()
    assert isValidDNA(dna_seq), 'ERROR: Invalid DNA sequence'

    return dna_seq.replace('T', 'U')

In [70]:
# Only the last expression of a cell is displayed, so this result is not shown.
transcription(validSeq)
# Expected to fail: invalidSeq does not pass isValidDNA, so the assert fires.
transcription(invalidSeq)

AssertionError: ERROR: Invalid DNA sequence

In [None]:
# Because the two strands of a DNA molecule are complementary, it is often
# necessary to compute the content of one strand given the sequence of the
# other strand. This is called the reverse complement.

# Complement -> replace 'A' <-> 'T', 'C' <-> 'G'
def reverse_complement(dna_seq):
    """Return the reverse complement of a DNA sequence.

    Each base is swapped with its partner ('A' <-> 'T', 'C' <-> 'G') and
    the resulting string is reversed. The input is upper-cased first, so
    the result is always upper case.

    Raises:
        AssertionError: if the upper-cased sequence contains a symbol
            rejected by isValidDNA.
    """
    # Normalise case before validating: isValidDNA only accepts upper-case letters.
    dna_seq = dna_seq.upper()
    assert isValidDNA(dna_seq), 'ERROR: Invalid DNA sequence'

    # Translation table implementing A<->T and C<->G in a single pass.
    complement_table = str.maketrans('ATCG', 'TAGC')

    return dna_seq.translate(complement_table)[::-1]
    
    

In [77]:
# Read from file 

def read_dic_aminoacids(filename='genetic_code.txt'):
    """Load the codon -> amino-acid table from a text file.

    Each line is read positionally: characters 1..3 are the triplet and
    character 7 is the one-letter amino-acid code.
    NOTE(review): this presumably matches lines shaped like `"GCT" "A"`;
    confirm against the actual file.

    Parameters:
        filename: path of the table file; defaults to 'genetic_code.txt'
            in the current working directory.

    Returns:
        dict mapping each triplet string to its amino-acid character.
    """
    codon_table = {}

    # The context manager closes the file even if parsing fails.
    with open(filename) as fd:
        for line in fd:
            # Skip blank or truncated lines that cannot hold index 7.
            if len(line) < 8:
                continue
            triplet = line[1:4]
            aminoacid = line[7]
            codon_table[triplet] = aminoacid

    return codon_table

In [78]:
# Display the whole table read from genetic_code.txt.
read_dic_aminoacids()

{'GCT': 'A',
 'GCC': 'A',
 'GCA': 'A',
 'GCG': 'A',
 'TGT': 'C',
 'TGC': 'C',
 'GAT': 'D',
 'GAC': 'D',
 'GAA': 'E',
 'GAG': 'E',
 'TTT': 'F',
 'TTC': 'F',
 'GGT': 'G',
 'GGC': 'G',
 'GGA': 'G',
 'GGG': 'G',
 'CAT': 'H',
 'CAC': 'H',
 'ATA': 'I',
 'ATT': 'I',
 'ATC': 'I',
 'AAA': 'K',
 'AAG': 'K',
 'TTA': 'L',
 'TTG': 'L',
 'CTT': 'L',
 'CTC': 'L',
 'CTA': 'L',
 'CTG': 'L',
 'ATG': 'M',
 'AAT': 'N',
 'AAC': 'N',
 'CCT': 'P',
 'CCC': 'P',
 'CCA': 'P',
 'CCG': 'P',
 'CAA': 'Q',
 'CAG': 'Q',
 'CGT': 'R',
 'CGC': 'R',
 'CGA': 'R',
 'CGG': 'R',
 'AGA': 'R',
 'AGG': 'R',
 'TCT': 'S',
 'TCC': 'S',
 'TCA': 'S',
 'TCG': 'S',
 'AGT': 'S',
 'AGC': 'S',
 'ACT': 'T',
 'ACC': 'T',
 'ACA': 'T',
 'ACG': 'T',
 'GTT': 'V',
 'GTC': 'V',
 'GTA': 'V',
 'GTG': 'V',
 'TGG': 'W',
 'TAT': 'Y',
 'TAC': 'Y',
 'TAA': '_',
 'TAG': '_',
 'TGA': '_'}