In [None]:
import pandas as pd
import multiprocessing as mp
from string import ascii_lowercase
import itertools
import functools
import numpy as np

## Probability Table Generation

### COCA

In [None]:
# Candidate corrections with their raw COCA counts (the last row, 'Deeft', has count 0).
COCA = pd.DataFrame(
    [
        ['defeat', 21947],
        ['decet', 6],
        ['defect', 3976],
        ['deft', 1240],
        ['defer', 2239],
        ['Deeft', 0],
    ],
    columns=['word', 'frequency'],
)
# Denominator that turns raw counts into P(w)
# (presumably the corpus token count — confirm against the COCA source).
COCA_pop = 1e9
COCA['P(w)'] = COCA['frequency'] / COCA_pop
# method='min' gives tied counts the same whole-number rank. The default
# 'average' method produces fractional ranks (e.g. 2.5) that astype(int)
# would silently truncate.
COCA['rank'] = COCA['frequency'].rank(ascending=False, method='min').astype(int)

In [None]:
# head() shows the first five rows by default, so the sixth row ('Deeft') is not displayed.
COCA.head()

Unnamed: 0,word,frequency,P(w),rank
0,defeat,21947,2.1947e-05,1
1,decet,6,6e-09,5
2,defect,3976,3.976e-06,2
3,deft,1240,1.24e-06,4
4,defer,2239,2.239e-06,3


### WIKI

In [None]:
# Candidate corrections with their raw WIKI counts (the last row, 'Deeft', has count 0).
WIKI = pd.DataFrame(
    [
        ['defeat', 121408],
        ['decet', 81],
        ['defect', 7793],
        ['deft', 814],
        ['defer', 1416],
        ['Deeft', 0],
    ],
    columns=['word', 'frequency'],
)
# Denominator that turns raw counts into P(w)
# (presumably the corpus token count — confirm against the source corpus).
WIKI_pop = 1.9e9
WIKI['P(w)'] = WIKI['frequency'] / WIKI_pop
# method='min' gives tied counts the same whole-number rank. The default
# 'average' method produces fractional ranks that astype(int) would
# silently truncate.
WIKI['rank'] = WIKI['frequency'].rank(ascending=False, method='min').astype(int)

In [None]:
# head() shows the first five rows by default, so the sixth row ('Deeft') is not displayed.
WIKI.head()

Unnamed: 0,word,frequency,P(w),rank
0,defeat,121408,6.389895e-05,1
1,decet,81,4.263158e-08,5
2,defect,7793,4.101579e-06,2
3,deft,814,4.284211e-07,4
4,defer,1416,7.452632e-07,3


### IULA

In [None]:
# Candidate corrections with their raw IULA counts; several words share a count
# (two at 11, three at 0), so tie handling in the rank matters here.
IULA = pd.DataFrame(
    [
        ['defeat', 11],
        ['decet', 0],
        ['defect', 180],
        ['deft', 0],
        ['defer', 11],
        ['Deeft', 0],
    ],
    columns=['word', 'frequency'],
)
# Denominator that turns raw counts into P(w)
# (presumably the corpus token count — confirm against the IULA source).
IULA_pop = 2.1e6
IULA['P(w)'] = IULA['frequency'] / IULA_pop
# method='min' gives tied counts the same whole-number rank (1, 2, 2, 4, 4, 4).
# The default 'average' method yields 2.5 for the two 11s and 5.0 for the three
# zeros; astype(int) then truncates 2.5 to 2, so the integer ranks were not a
# consistent ordering.
IULA['rank'] = IULA['frequency'].rank(ascending=False, method='min').astype(int)

In [None]:
# head() shows the first five rows by default, so the sixth row ('Deeft') is not displayed.
IULA.head()

Unnamed: 0,word,frequency,P(w),rank
0,defeat,11,5e-06,2
1,decet,0,0.0,5
2,defect,180,8.6e-05,1
3,deft,0,0.0,5
4,defer,11,5e-06,2


## NORVIG

In [None]:
# Load the single-edit count table: a tab-separated file with no header row,
# one edit key (e.g. 'e|i') and its count per line.
edit_counts_raw = pd.read_csv(
    'http://norvig.com/ngrams/count_1edit.txt',
    sep='\t',
    encoding='ISO-8859-1',
    header=None,
)
edit_counts_raw.columns = ['term', 'edit']
# Index by the edit key so counts can be looked up with norvig.loc['<key>'].
norvig = edit_counts_raw.set_index('term')
norvig.head()

Unnamed: 0_level_0,edit
term,Unnamed: 1_level_1
e|i,917
a|e,856
i|e,771
e|a,749
a|i,559


In [None]:
# Load the word-count table: a tab-separated file with no header row,
# one word and its count per line.
word_counts_raw = pd.read_csv(
    'http://norvig.com/ngrams/count_big.txt',
    sep='\t',
    encoding='ISO-8859-1',
    header=None,
)
# Discard rows with any missing value.
# NOTE(review): read_csv's default NA parsing turns tokens such as 'null' or
# 'nan' into NaN, so real words spelled that way would be dropped here —
# confirm whether that is intended.
norvig_orig = word_counts_raw.dropna()
# The column name 'tern' (sic) is kept as-is because get_count reads it by that name.
norvig_orig.columns = ['tern', 'freq']
norvig_orig.head()

Unnamed: 0,tern,freq
0,a,21160
1,aah,1
2,aaron,5
3,ab,2
4,aback,3


In [None]:
def get_count(c, norvig_orig):
    """Return the frequency-weighted number of occurrences of ``c`` in the word table.

    For every row, the number of non-overlapping occurrences of the substring
    ``c`` in the ``tern`` column is multiplied by that row's ``freq``; the
    products are summed.

    Parameters
    ----------
    c : str
        Substring to count (matched literally).
    norvig_orig : pandas.DataFrame
        Table with a string column ``tern`` and a numeric column ``freq``.

    Returns
    -------
    The sum over rows of ``occurrences * freq``.
    """
    # Local import keeps this helper self-contained when shipped to worker processes.
    import re

    # Vectorised string counting avoids building a Series per row, which is what
    # DataFrame.apply(axis=1) does. re.escape keeps the match literal because
    # Series.str.count interprets its argument as a regular expression.
    occurrences = norvig_orig['tern'].str.count(re.escape(c))
    return (occurrences * norvig_orig['freq']).sum()

In [None]:
# Every lowercase string of length 1 ('a'..'z') followed by every one of length 2 ('aa'..'zz').
character_set = [
    ''.join(letters)
    for length in (1, 2)
    for letters in itertools.product(ascii_lowercase, repeat=length)
]

# Bind the word table once so each worker call only needs the substring to count.
count_in_word_table = functools.partial(get_count, norvig_orig=norvig_orig)
with mp.Pool(processes=8) as pool:
    # pool.map keeps results in the same order as character_set.
    freq_list = pool.map(count_in_word_table, character_set)

In [None]:
# Build the table column-wise so 'freq' keeps a numeric dtype. Constructing it
# from a list of two rows and transposing mixes strings and numbers in each
# column and leaves 'freq' as object dtype.
freq_df = pd.DataFrame({'char': character_set, 'freq': freq_list}).set_index('char')

In [None]:
# Show the first five entries of the character-count table.
freq_df.head()

Unnamed: 0_level_0,freq
char,Unnamed: 1_level_1
a,407349
b,73161
c,144964
d,215698
e,632999


In [None]:
# One (edit key, normalising character context) pair per COCA row, in row order:
# defeat, decet, defect, deft, defer, Deeft.
edit_and_context = [
    ('e|ea', 'ea'),
    ('f|c', 'c'),
    ('e|ec', 'ec'),
    ('e| ', 'e'),
    ('t|r', 'r'),
    ('fe|ef', 'ef'),
]
# Channel probability: edit count divided by the count of its character context.
COCA['P(x|w)'] = [
    (norvig.loc[edit_key].values / freq_df.loc[context].values)[0]
    for edit_key, context in edit_and_context
]
# Scale the product by 1e9 so the scores are easier to read.
COCA['109 P(x|w)P(w)'] = 1e9 * COCA['P(w)'] * COCA['P(x|w)']
COCA.head()

Unnamed: 0,word,frequency,P(w),rank,P(x|w),109 P(x|w)P(w)
0,defeat,21947,2.1947e-05,1,0.012834,281.667621
1,decet,6,6e-09,5,2.8e-05,0.000166
2,defect,3976,3.976e-06,2,0.003167,12.591604
3,deft,1240,1.24e-06,4,3e-06,0.003918
4,defer,2239,2.239e-06,3,3.6e-05,0.079565


In [None]:
channel_col = 'P(x|w)'
score_col = '109 P(x|w)P(w)'

# Reuse the channel probabilities computed for COCA; assignment aligns on the row index.
IULA[channel_col] = COCA[channel_col]
# Same 1e9-scaled score as for COCA, but with IULA's own P(w).
IULA[score_col] = 1e9 * IULA['P(w)'] * IULA[channel_col]
IULA.head()

Unnamed: 0,word,frequency,P(w),rank,P(x|w),109 P(x|w)P(w)
0,defeat,11,5e-06,2,0.012834,67.225672
1,decet,0,0.0,5,2.8e-05,0.0
2,defect,180,8.6e-05,1,0.003167,271.448786
3,deft,0,0.0,5,3e-06,0.0
4,defer,11,5e-06,2,3.6e-05,0.186141


## Demonstration

In [None]:
# Counts of the two-word phrases around each candidate correction.
COCA_D = pd.DataFrame(
    [['a defeat', 607], ['a defect', 453], ['defeat free', 1], ['defect free', 5]],
    columns=['word', 'frequency'],
).set_index('word')

# Denominator used for the "<candidate> free" phrases.
# NOTE(review): presumably a COCA unigram count (e.g. of "free") — confirm its source.
FREE_DENOMINATOR = 256258

# Word -> count lookup, replacing the boolean-mask / .index / .values[0] chain.
unigram_freq = COCA.set_index('word')['frequency']

# Each phrase count is divided by its denominator: the candidate's own unigram
# count for "a <candidate>", and FREE_DENOMINATOR for "<candidate> free".
COCA_D['eval'] = [
    COCA_D.at['a defeat', 'frequency'] / unigram_freq['defeat'],
    COCA_D.at['a defect', 'frequency'] / unigram_freq['defect'],
    COCA_D.at['defeat free', 'frequency'] / FREE_DENOMINATOR,
    COCA_D.at['defect free', 'frequency'] / FREE_DENOMINATOR,
]
COCA_D.head()

Unnamed: 0_level_0,frequency,eval
word,Unnamed: 1_level_1,Unnamed: 2_level_1
a defeat,607,0.027658
a defect,453,0.113934
defeat free,1,4e-06
defect free,5,2e-05


In [None]:
# Score each three-word phrase as the product of its two phrase-level 'eval' values.
phrase_parts = {
    'a defeat free:': ('a defeat', 'defeat free'),
    'a defect free:': ('a defect', 'defect free'),
}
for label, (first_part, second_part) in phrase_parts.items():
    print(label, COCA_D['eval'][first_part] * COCA_D['eval'][second_part])

a defeat free: 1.0792848853794279e-07
a defect free: 2.2230252637899683e-06
