# Cross-lingual Token Co-occurrence

In [None]:
# Read the raw parallel corpus: one sentence pair per line, columns separated by tabs.
# The encoding is passed explicitly so accented Spanish characters decode the same
# way on every platform instead of depending on the locale default.
# NOTE(review): assumes the file is stored as UTF-8 — confirm against the data source.
with open("Data/mt-data/eng-spa.txt", encoding="utf-8") as f:
    english_spanish = f.read().split('\n')

# Let's take a look at the data

In [None]:
# Display the first raw line of the file to see how a sentence pair is laid out.
english_spanish[0]

# Now let's parse the data to make a list of Spanish sentences and a list of English sentences

In [None]:
from tqdm import tqdm

def parse_lines(lines):
    """Split tab-separated lines into two parallel lists of sentences.

    Args:
        lines: iterable of strings; each is expected to hold at least two
            tab-separated columns (first-language sentence, second-language
            sentence). Any further columns are ignored.

    Returns:
        A tuple ``(l1_sentences, l2_sentences)`` of equal-length lists, where
        index ``i`` of both lists comes from the same input line.

    Lines with fewer than two columns (for example the empty string left after
    a trailing newline) are skipped.
    """
    l1_sentences = []
    l2_sentences = []
    for line in tqdm(lines):
        parts = line.split('\t')
        # Check the column count explicitly instead of relying on a bare
        # `except:`, which would also hide unrelated errors and Ctrl-C.
        if len(parts) < 2:
            continue
        l1_sentences.append(parts[0])
        l2_sentences.append(parts[1])

    return l1_sentences, l2_sentences


# Split every raw line into its first (English) and second (Spanish) column;
# the two resulting lists are index-aligned.
english, spanish = parse_lines(english_spanish)
    

$$\mathrm{pmi}(a, b) = \log \frac{p(a, b)}{p(a)\,p(b)}$$

# We are going to implement a function that computes pointwise mutual information

$$\mathrm{pmi}(a, b) = \log \frac{p(a, b)}{p(a)\,p(b)}$$

[Pointwise Mutual Information](https://en.wikipedia.org/wiki/Pointwise_mutual_information)

Let's start by writing down the components we need in order to compute the PMI of `token_A` and `token_B`.

In [None]:
def pointwise_mutual_information(token_A, token_B, A_B_probabilities, A_probabilities, B_Probabilities):
    """Placeholder that only pins down the inputs a PMI computation needs.

    Args:
        token_A: token from the first vocabulary.
        token_B: token from the second vocabulary.
        A_B_probabilities: joint probabilities of (A, B) token pairs.
        A_probabilities: unigram probabilities for the first vocabulary.
        B_Probabilities: unigram probabilities for the second vocabulary.

    Returns:
        None; nothing is computed yet.
    """
    return None

We need to keep track of the probabilities of A and the probabilities of B; these are really just unigram probability dictionaries, one for each language.

# Let's prepare a preprocessing function to obtain tokens

In [None]:
import nltk
nltk.download('punkt')
# Recent NLTK releases load the sentence-tokenizer models used by word_tokenize
# from the 'punkt_tab' resource rather than 'punkt'; fetch both so the notebook
# runs from a fresh environment regardless of the installed NLTK version.
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

def preprocess(string, language):
    """Lowercase ``string`` and tokenize it with NLTK's ``word_tokenize``.

    Args:
        string: the sentence to tokenize.
        language: language name forwarded to ``word_tokenize`` (the notebook
            uses ``'english'`` and ``'spanish'``).

    Returns:
        Whatever ``word_tokenize`` returns for the lowercased text.
    """
    lowered = string.lower()
    return word_tokenize(lowered, language=language)

# Sanity-check the tokenizer on the first sentence of each language:
# print the raw sentence followed by its tokens.
for corpus, language in ((english, 'english'), (spanish, 'spanish')):
    test_string = corpus[0]
    tokens = preprocess(test_string, language)
    print(test_string)
    print(tokens)

In [None]:
# Display the third English sentence.
english[2]

# Count the English unigrams and Spanish unigrams

In [None]:
from collections import defaultdict

def _count_unigrams(sentences, language):
    """Return a defaultdict mapping each token to its total count over ``sentences``."""
    counts = defaultdict(int)
    for sentence in tqdm(sentences):
        for token in preprocess(sentence, language):
            counts[token] += 1
    return counts


# Token frequencies for each side of the parallel corpus.
english_unigrams = _count_unigrams(english, 'english')
spanish_unigrams = _count_unigrams(spanish, 'spanish')

# Compute English and Spanish unigram probabilities

In [None]:
# Turn raw counts into relative frequencies: p(token) = count(token) / total token count.
total_english_unigrams = sum(english_unigrams.values())
english_unigram_probabilities = {
    token: token_count / total_english_unigrams
    for token, token_count in tqdm(english_unigrams.items())
}

total_spanish_unigrams = sum(spanish_unigrams.values())
spanish_unigram_probabilities = {
    token: token_count / total_spanish_unigrams
    for token, token_count in tqdm(spanish_unigrams.items())
}
    

# To compute the probability of encountering an English unigram and a Spanish unigram in parallel sentences, first count the co-occurrences

In [None]:
# For every (English token, Spanish token) pair, count the number of parallel
# sentence pairs in which both tokens appear.
en_es_cooc_count = defaultdict(lambda:defaultdict(lambda:0))

# zip() has no length, so tqdm cannot show a percentage or ETA unless the total
# is passed explicitly; zip stops at the shorter list, hence min().
num_sentence_pairs = min(len(english), len(spanish))

for english_sentence, spanish_sentence in tqdm(zip(english, spanish), total=num_sentence_pairs):
    english_tokens = preprocess(english_sentence, 'english')
    spanish_tokens = preprocess(spanish_sentence, 'spanish')

    # Sets: a token contributes at most once per sentence, however often it repeats.
    english_token_set = set(english_tokens)
    spanish_token_set = set(spanish_tokens)

    for english_token in english_token_set:
        for spanish_token in spanish_token_set:
            en_es_cooc_count[english_token][spanish_token] += 1
            
    
    


# Now compute the probabilities

In [None]:
# Normalise the co-occurrence counts by the total number of counted
# (English token, Spanish token) pairs to obtain joint probabilities.
total_pairs = sum(
    sum(es_counts.values()) for es_counts in tqdm(en_es_cooc_count.values())
)

en_es_cooc_probabilities = defaultdict(lambda: defaultdict(lambda: 0))
for en_tok, es_counts in tqdm(en_es_cooc_count.items()):
    for es_tok, pair_count in es_counts.items():
        en_es_cooc_probabilities[en_tok][es_tok] = pair_count / total_pairs
    

# Now we have all of the components we need for the PMI function

In [None]:
def pointwise_mutual_information(token_A, token_B, A_B_probabilities, A_probabilities, B_Probabilities):
    """Return the pointwise mutual information of ``token_A`` and ``token_B``.

    Computes ``log(p(a, b) / (p(a) * p(b)))`` using the natural logarithm.

    Args:
        token_A: token whose marginal probability is looked up in ``A_probabilities``.
        token_B: token whose marginal probability is looked up in ``B_Probabilities``.
        A_B_probabilities: nested mapping indexed as ``[token_A][token_B]`` that
            holds the joint probability of the pair.
        A_probabilities: mapping from token to its marginal probability.
        B_Probabilities: mapping from token to its marginal probability.

    Returns:
        The PMI as a float; ``float('-inf')`` when the joint probability is 0.

    Raises:
        KeyError: if a token is missing from a mapping that does not supply defaults.
        ZeroDivisionError: if the product of the marginal probabilities is 0.
    """
    import math  # local import keeps this cell runnable on its own

    joint = A_B_probabilities[token_A][token_B]
    independent = A_probabilities[token_A] * B_Probabilities[token_B]

    ratio = joint / independent
    if ratio == 0:
        # log(0) is undefined; the limit for a pair that never co-occurs is -inf.
        return float('-inf')
    return math.log(ratio)

# Score every observed (English, Spanish) token pair; the dictionary key is the
# two tokens joined by a tab character.
en_es_pmis = {}

for en_tok, es_counts in tqdm(en_es_cooc_count.items()):
    for es_tok in es_counts:
        en_es_pmis[en_tok + '\t' + es_tok] = pointwise_mutual_information(
            en_tok,
            es_tok,
            en_es_cooc_probabilities,
            english_unigram_probabilities,
            spanish_unigram_probabilities,
        )
        

# Sort our dictionary of PMIs to see the pairs with high PMI scores

In [None]:
import operator

# Order the (pair, score) items from highest to lowest score.
sorted_pmis = list(en_es_pmis.items())
sorted_pmis.sort(key=operator.itemgetter(1), reverse=True)

# Print out the word pairs with the ten highest PMIs

In [None]:
# Show the ten best-scoring pairs; each key is "<english>\t<spanish>".
for pair, _score in sorted_pmis[:10]:
    fields = pair.split('\t')
    print(fields[0], '--->', fields[1])