## Building the Cooc Table

In [3]:
import pprint as pp
from collections import defaultdict
import re

In [4]:
def build_cooc_table(filepath_fr, filepath_en, stopwords=(), encoding=None):
    """Count sentence-level co-occurrences between French and English words.

    The two files are read in parallel: line i of ``filepath_fr`` is paired
    with line i of ``filepath_en``. For each pair, every distinct French word
    is credited once with every distinct English word of the paired line.

    Per-line preprocessing: the characters ``. , : ;`` are deleted, the line
    is lower-cased, then split on whitespace. Any other punctuation is left
    untouched.

    Args:
        filepath_fr: path of the French text file.
        filepath_en: path of the English text file.
        stopwords: iterable of (lower-case) English words to leave out of the
            counts. Empty by default, i.e. every English word is counted.
        encoding: encoding passed to ``open`` for both files. ``None``
            (default) keeps Python's platform-dependent default; pass
            ``'utf-8'`` explicitly for UTF-8 corpora with accented characters.

    Returns:
        dict mapping each French word to a ``defaultdict(int)`` of
        ``{english_word: number of line pairs containing both words}``.
    """
    # Raw string and no backslash: '.' is already literal inside a character
    # class, and '\.' in a normal string is an invalid escape sequence.
    punctuation = re.compile(r'[.,:;]')
    ignored_en = frozenset(stopwords)

    # The nested defaultdict creates the per-French-word counter on first access.
    cooc_table = defaultdict(lambda: defaultdict(int))

    with open(filepath_fr, encoding=encoding) as f_fr, \
            open(filepath_en, encoding=encoding) as f_en:
        # Iterating over the handles streams the files line by line.
        # zip stops at the end of the shorter file; surplus lines are ignored.
        for line_fr, line_en in zip(f_fr, f_en):
            # Sets remove duplicates, so a pair is counted once per line.
            words_fr = set(punctuation.sub('', line_fr).lower().split())
            words_en = set(punctuation.sub('', line_en).lower().split()) - ignored_en

            for word_fr in words_fr:
                counts_en = cooc_table[word_fr]
                for word_en in words_en:
                    counts_en[word_en] += 1

    # Hand back a plain dict so that looking up an unseen French word raises
    # KeyError instead of silently inserting an empty entry.
    return dict(cooc_table)

In [5]:
# Build the co-occurrence table for the small example corpus and display it
# (a bare expression on the last code line of a cell is shown as its output).
# NOTE(review): assumes both example files sit in the working directory and
# are line-aligned translations of each other -- confirm.
d = build_cooc_table('french_example.txt', 'english_example.txt')
d
# its = d.items()
# dict(its)

{'la': defaultdict(int, {'cow': 1, 'calf': 1, 'and': 1, 'the': 1}),
 'veau': defaultdict(int, {'cow': 1, 'calf': 1, 'and': 1, 'the': 1}),
 'vache': defaultdict(int, {'cow': 1, 'calf': 1, 'and': 1, 'the': 1}),
 'le': defaultdict(int,
             {'cow': 1, 'calf': 1, 'and': 2, 'the': 2, 'dog': 1, 'cat': 1}),
 'et': defaultdict(int,
             {'cow': 1, 'calf': 1, 'and': 2, 'the': 2, 'dog': 1, 'cat': 1}),
 'chat': defaultdict(int, {'and': 1, 'dog': 1, 'cat': 1, 'the': 1}),
 'chien': defaultdict(int, {'and': 1, 'dog': 1, 'cat': 1, 'the': 1})}

In [4]:
# Build and display the co-occurrence table for the small example corpus.
cooc_table = build_cooc_table('french_example.txt', 'english_example.txt')
cooc_table

{'vache': defaultdict(int, {'calf': 1, 'cow': 1, 'and': 1, 'the': 1}),
 'la': defaultdict(int, {'calf': 1, 'cow': 1, 'and': 1, 'the': 1}),
 'et': defaultdict(int,
             {'calf': 1, 'cow': 1, 'and': 2, 'the': 2, 'dog': 1, 'cat': 1}),
 'veau': defaultdict(int, {'calf': 1, 'cow': 1, 'and': 1, 'the': 1}),
 'le': defaultdict(int,
             {'calf': 1, 'cow': 1, 'and': 2, 'the': 2, 'dog': 1, 'cat': 1}),
 'chien': defaultdict(int, {'and': 1, 'dog': 1, 'cat': 1, 'the': 1}),
 'chat': defaultdict(int, {'and': 1, 'dog': 1, 'cat': 1, 'the': 1})}

In [3]:
def build_cooc_table(filepath_fr, filepath_en, stopwords=(), encoding=None):
    """Build a French -> English co-occurrence table (step-by-step version).

    Line i of ``filepath_fr`` is paired with line i of ``filepath_en``. For
    each pair, every distinct French word is credited once with every
    distinct English word of the paired line.

    Per-line preprocessing: the characters ``. , : ;`` are deleted, the line
    is lower-cased, then split on whitespace.

    Args:
        filepath_fr: path of the French text file.
        filepath_en: path of the English text file.
        stopwords: iterable of (lower-case) English words to leave out of the
            counts. Empty by default, i.e. every English word is counted.
        encoding: encoding passed to ``open`` for both files. ``None``
            (default) keeps Python's platform-dependent default; pass
            ``'utf-8'`` explicitly for UTF-8 corpora with accented characters.

    Returns:
        dict mapping each French word to a ``defaultdict(int)`` of
        ``{english_word: number of line pairs containing both words}``.
    """
    # defaultdict provides a default value for the key that does not exist.

    # Raw string and no backslash: '.' is already literal inside a character
    # class, and '\.' in a normal string is an invalid escape sequence.
    punctuation = re.compile(r'[.,:;]')  # simple regular expression to get rid of basic punctuation
    ignored_en = set(stopwords)

    cooc_table = {}

    with open(filepath_fr, encoding=encoding) as f_fr, \
            open(filepath_en, encoding=encoding) as f_en:
        # zip pairs the lines up and stops at the end of the shorter file
        for line_fr, line_en in zip(f_fr, f_en):
            words_fr = punctuation.sub('', line_fr).lower().split()
            words_en = punctuation.sub('', line_en).lower().split()

            # use set to remove any duplicates
            for word_fr in set(words_fr):
                # fetch or create the count dict for this French word
                if word_fr in cooc_table:
                    # the French word has been seen before: reuse its existing
                    # dict (this is a reference, not a copy, so the increments
                    # below update the table directly)
                    counts_en = cooc_table[word_fr]
                else:
                    # otherwise initialize a defaultdict => "int" is the factory
                    # for missing keys, so a count can be incremented without
                    # initializing it first (a default of 0 is set)
                    counts_en = defaultdict(int)
                    cooc_table[word_fr] = counts_en

                for word_en in set(words_en):
                    if word_en not in ignored_en:
                        counts_en[word_en] += 1

    return cooc_table

In [4]:
# Build the co-occurrence table for the full corpus.
# NOTE(review): the bare `cooc_table` expression below displays the whole
# table, which is very long for this corpus -- consider showing a slice instead.
cooc_table = build_cooc_table('french.corpus', 'english.corpus')
cooc_table

{'lune': defaultdict(int,
             {'to': 76,
              'earth': 27,
              'the': 142,
              'moon': 131,
              'from': 21,
              'has': 11,
              'at': 30,
              '"': 41,
              'you': 12,
              'colleagues': 1,
              'one': 18,
              'is': 35,
              'heard': 1,
              'brave': 1,
              'or': 8,
              'of': 103,
              'my': 3,
              'no': 11,
              'seen': 2,
              'among': 2,
              'least': 4,
              'not': 30,
              'who': 4,
              'there': 11,
              'it': 43,
              'speak': 1,
              'roared': 1,
              'gun': 5,
              '!': 8,
              'cheers': 1,
              'voice': 1,
              'club': 5,
              'for': 19,
              'three': 2,
              'with': 25,
              'been': 9,
              'continued': 3,
              'gentlemen': 2,
    

## Sorting the cooc table and printing it to a file

In [2]:
def sorted_cooc(cooc_table, top_n=2):
    """
    Extract the top co-occurrences for each French word.

    Args:
        cooc_table: mapping {french_word: {english_word: frequency}}.
        top_n: how many of the most frequent English words to keep for each
            French word (default 2).

    Returns:
        dict {french_word: [(english_word, frequency), ...]} whose keys are in
        alphabetical order. Each list holds at most `top_n` tuples, themselves
        in alphabetical order of the English word.

    Raises:
        ValueError: if `top_n` is negative.
    """
    if top_n < 0:
        # a negative slice bound would silently drop entries from the end
        raise ValueError("top_n must be non-negative")

    top_coocs = {}
    for word_fr, counts_en in cooc_table.items():
        # .items() yields (word_en, freq) tuples. The `key` parameter of sorted
        # takes a function returning the value to compare on: here the
        # frequency, i.e. the element at index 1 of each tuple.
        # A lambda is a quick way of writing such a small function.
        # reverse=True orders highest to lowest; the sort is stable, so words
        # with equal frequency keep the order they have in counts_en.
        by_frequency = sorted(counts_en.items(), key=lambda cooc_tuple: cooc_tuple[1], reverse=True)

        # keep the top_n most frequent tuples, then put that short list in
        # alphabetical order (tuples compare on the English word first)
        top_coocs[word_fr] = sorted(by_frequency[:top_n])

    # alphabetical order of the French words (items compare on the key first)
    return dict(sorted(top_coocs.items()))

In [6]:
# Display the two most frequent English co-occurrences of each French word in `d`.
sorted_cooc(d)

{'chat': [('and', 1), ('dog', 1)],
 'chien': [('and', 1), ('dog', 1)],
 'et': [('and', 2), ('the', 2)],
 'la': [('calf', 1), ('cow', 1)],
 'le': [('and', 2), ('the', 2)],
 'vache': [('calf', 1), ('cow', 1)],
 'veau': [('calf', 1), ('cow', 1)]}

In [8]:
# We can finish by writing this lexicon to a JSON file.
import json

# Write UTF-8 and disable ASCII escaping so that accented French words stay
# readable in the file instead of being stored as \uXXXX escape sequences.
# (The (word, freq) tuples are serialized as JSON arrays.)
with open("./naive_lexicon.json", 'w', encoding='utf-8') as f:
    json.dump(sorted_cooc(d), f, ensure_ascii=False, indent=2)

### A quick primer on lambda functions
Lambda functions can be very practical: they are a shortcut for declaring small, single-expression anonymous functions.
They behave just like regular functions declared with the `def` keyword.
Lambdas are restricted to a single expression, so there is no `return` statement: the value of the expression is what the function returns.

In practice:
Most frequently used to write short and concise "key functions" for sorting iterables by an alternate key, like in the sorted_cooc function above.

In [2]:
# Some examples:
# A lambda bound to a name can be called like a function written with `def`.
add = lambda x, y: x + y 
print(add(5,3))

# A lambda is an expression, so it can also be defined and called inline;
# as the last expression of the cell, its value is displayed as the cell output.
(lambda x, y: x + y)(5,3)

8


8

In [6]:
# For sorting :
# `key` receives each element and returns the value the sort compares on.
tuples = [(1, 'd'), (2, 'b'), (3, 'a')]
print(sorted(tuples, key=lambda x : x[1]))  # order by the letter at index 1 of each tuple

print(sorted(range(-5, 6), key=lambda x: x * x))  # order by square; sorted() is stable, so -k stays before k

[(3, 'a'), (2, 'b'), (1, 'd')]
[0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5]


In [29]:
# Caveat :
# Although it can look "cool" to use lambdas whenever you can, it's not always the clearest way to write your code...
# Take a second to think about whether a lambda function is really the best way to go.
# If you find yourself doing something remotely complex with a lambda function, a classic "def" function is usually a better idea.

# When filtering a list, for example (all three versions print the same list of even numbers):
print(list(filter(lambda x: x % 2 == 0, range(16)))) # not necessarily as readable

# vs.
print([x  for x in range(16) if x % 2 == 0]) # usually a little clearer

# vs.

def filter_out_odd_numbers(nums_list):
    """Return a new list with the even values of nums_list, in their original order."""
    kept = []
    for number in nums_list:
        if number % 2 != 0:
            # odd value: skip it
            continue
        kept.append(number)
    return kept
# The explicit-loop version: the longest to write, but every step is spelled out.
print(filter_out_odd_numbers(range(16)))

[0, 2, 4, 6, 8, 10, 12, 14]
[0, 2, 4, 6, 8, 10, 12, 14]
[0, 2, 4, 6, 8, 10, 12, 14]


In [9]:
# The "Zen of Python" Easter Egg by Tim Peters
# Importing the `this` module prints 19 guiding aphorisms written by Tim Peters;
# revisit them as often as you like to become a better Pythonista!
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
