In [284]:
from __future__ import division
from collections import Counter
from copy import deepcopy
from functools import partial
import random

import string

import numpy as np
import nltk

ALPHABET = string.ascii_lowercase
N_LETTERS = len(ALPHABET)
N_LETTERS

CHARS = string.ascii_lowercase + u' '

PUNCTUATION_REMOVER = {ord(c): None for c in string.punctuation + string.digits}
PUNCTUATION_REMOVER.update({
    ord(u'\xe6'): None,
    ord(u'\xe8'): None,
    ord(u'\xe9'): None,
    ord(u'\xee'): None,
    ord(u'\x1a'): None
})

LETTER_TO_INDEX = {unicode(s): i for i, s in enumerate(string.ascii_lowercase + u' ')}

In [292]:
def clean(s):
    return " ".join(s.translate(PUNCTUATION_REMOVER).lower().split())

class Cipher(object):
    chars = unicode(string.ascii_lowercase + ' ')
    ords = tuple(ord(c) for c in chars)
    
    def __init__(self, encoder):
        self._encoder = {ord(k): ord(v) for k, v in encoder.iteritems()}
        self._decoder = {ord(v): ord(k) for k, v in encoder.iteritems()}
        
    @classmethod
    def get_random_cipher(cls):
        c = Cipher({c: c for c in cls.chars})
        return c.mutate_key(27)
    
    @property
    def encoder(self):
        return deepcopy(self._encoder)
    
    @property
    def decoder(self):
        return deepcopy(self._decoder)
    
    def encipher(self, msg):
        msg = clean(unicode(msg))
        return msg.translate(self._encoder)
    
    def decipher(self, msg):
        return msg.translate(self._decoder)
    
    def mutate_key(self, n_swaps):
        to_swap = random.sample(self.ords, n_swaps)
        candidates = [self.encoder[c] for c in to_swap]
        random.shuffle(candidates)
        self._encoder.update(dict(zip(to_swap, candidates)))
        self._decoder = {v: k for k, v in self._encoder.iteritems()}
        return self
        
c = Cipher.get_random_cipher()
print c.encipher('hello world')
print c.decipher(c.encipher('hello world'))

 toovhnvmok
hello world


In [294]:
corpus = " ".join(clean(nltk.corpus.gutenberg.raw()).lower().split())
with open('gutenberg.txt', 'w') as f:
    f.write(corpus)

In [295]:
letter_counts = Counter(corpus)
letters = sorted(letter_counts.keys())
n_letters = len(letters)
letter_counts

Counter({u' ': 2102545,
         u'a': 731203,
         u'b': 139846,
         u'c': 185849,
         u'd': 400494,
         u'e': 1119617,
         u'f': 209239,
         u'g': 172048,
         u'h': 650743,
         u'i': 577691,
         u'j': 15946,
         u'k': 66676,
         u'l': 375313,
         u'm': 230032,
         u'n': 615091,
         u'o': 678136,
         u'p': 136173,
         u'q': 7552,
         u'r': 502402,
         u's': 556863,
         u't': 827161,
         u'u': 252211,
         u'v': 83829,
         u'w': 201292,
         u'x': 9160,
         u'y': 176040,
         u'z': 5525})

In [293]:
init_letter_probs = np.array([letter_counts[l] for l in letters])
init_letter_probs = np.log(init_letter_probs / init_letter_probs.sum())

def get_trans(n_elements, element_to_index, corpus, order=1, smoothing_factor=1):
    """
    Returns a matrix of log transition probabilities, T, where T[ix] is the 
    conditional probability of seeing element ix[-1] after sequence ix[:-1]
    """
    trans = np.ones((n_elements,)*(1 + order)) * smoothing_factor 
    
    for i in xrange(len(corpus) - order):
        ix = tuple(element_to_index[e] for e in corpus[i: i + order + 1])
        trans[ix] += 1

    return np.log(trans / trans.sum(axis=1, keepdims=True))


In [197]:
def get_likelihood(message, init_probs, trans_probs, element_to_index):
    """
    Returns the likelihood of a given message according
    to probabilities init_probs and trans_probs, where element_to_index
    converts from characters (or words if message is a list of words) to 
    array indicies.
    """
    order = trans_probs.ndim - 1
    prob = 0
    for i in xrange(len(message)):
        if i < order:
            prob += init_probs[element_to_index[message[i]]] 
        else: 
            ix = tuple(element_to_index[e] for e in message[i - order: i + 1])
            prob += trans_probs[ix]
    return prob

In [208]:
get_likelihood('cat', init_letter_probs, letter_trans_probs, LETTER_TO_INDEX)

-8.4732328719145915