In [1]:
def read_in_sentence(fn, start_token="<s>", end_token="</s>", encoding="utf-8"):
    """Read a one-token-per-line file and group its lines into sentences.

    Every line is stripped of surrounding whitespace and appended to the
    sentence being built; a line equal to ``end_token`` closes that sentence.
    Delimiter lines are kept as ordinary elements of the sentence.

    Args:
        fn: Path of the text file to read.
        start_token: Sentence-start marker. Accepted for interface
            compatibility only; it is not used to split the input.
        end_token: Stripped line content that terminates a sentence.
        encoding: Text encoding used to decode the file. Passed explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        A list of sentences, each a list of stripped lines whose last element
        is ``end_token``. Lines following the final ``end_token`` are dropped.
    """
    sents = []
    sent = []
    with open(fn, encoding=encoding) as text:
        # Iterate the file object directly so the corpus is streamed instead
        # of being loaded into memory in one go.
        for line in text:
            token = line.strip()
            sent.append(token)
            if token == end_token:
                sents.append(sent)
                sent = []
    return sents

In [2]:
# Load both sides of the parallel training corpus (one list of lines per sentence).
source = read_in_sentence("./train-05/train-source.txt")
target = read_in_sentence("./train-05/train-target.txt")
# Sanity check: report the sentence count of each side.
print(
    f"num sentences in source: {len(source)}",
    f"num sentences in target: {len(target)}",
    sep="\n",
)


num sentences in source: 45171
num sentences in target: 45171


- 47254 tokens aligned one-to-one out of 48209 tokens
- 371 two to one
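- 369 two to one (TODO confirm: this label repeats the previous bullet with a different count, so it presumably denotes another alignment type, e.g. one to two)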
- 186 two to two
- 10 three to two
- 38102 unchanged

In [10]:
from typing import List
from itertools import chain, permutations
from collections import defaultdict, Counter

# Given the alignment statistics, we need to generate candidate
# (source n-gram, target n-gram) pairs of the following shapes:
#   "up to bigram":           s1_t1, s1s2_t1
#   "up to trigram / bigram": s1s2_t1t2, s1s2s3_t1t2
class NGram:
    """Enumerate candidate (source n-gram, target n-gram) pairs of a bitext.

    No word alignment is given, so for every requested
    ``(source_len, target_len)`` shape each contiguous source window is paired
    with each contiguous target window of the same sentence pair.

    Attributes:
        source: Source-side sentences, one list of tokens per sentence.
        target: Target-side sentences, indexed in parallel with ``source``.
        N: Number of sentence pairs, taken from ``len(source)`` at construction.
        target_word_appearance: Occurrence count of every target n-gram
            (stored as a tuple) seen by the most recent ``generate_ngram`` call.
    """

    def __init__(self, source: List[List[str]], target: List[List[str]]):
        self.source = source
        self.target = target
        self.N = len(self.source)
        # Plain occurrence counts: unseen n-grams read as 0, no smoothing is applied.
        self.target_word_appearance = defaultdict(int)

    def generate_ngram(self, n_maps=((1, 1), (2, 1), (2, 2), (3, 2))):
        """Count every (source n-gram, target n-gram) pairing in the corpus.

        Args:
            n_maps: Iterable of ``(source_len, target_len)`` shapes to pair up.

        Returns:
            A ``Counter`` keyed by ``(source_ngram, target_ngram)``, both
            tuples of tokens, holding how often each pairing was produced.

        Side effects:
            Rebuilds ``self.target_word_appearance`` from scratch, so calling
            this method repeatedly does not inflate the target counts, and
            prints a ``"\\r"``-terminated progress counter per sentence.
        """
        # Start from a fresh mapping so this statistic, like the returned
        # Counter, describes exactly one pass over the corpus.
        self.target_word_appearance = defaultdict(int)
        pair_counts = Counter()
        for ind in range(self.N):
            ssent, tsent = self.source[ind], self.target[ind]
            # FIXME (original author): no alignment map is provided. The only
            # stated assumption is that the source is at least as long as the
            # target, so all windows of a sentence pair are crossed with each
            # other in the hope that true alignments dominate the counts.
            counted_target_lens = set()
            for nsource, ntarget in n_maps:
                target_grams = [tuple(tsent[i:i + ntarget])
                                for i in range(len(tsent) - ntarget + 1)]
                source_grams = [tuple(ssent[i:i + nsource])
                                for i in range(len(ssent) - nsource + 1)]
                # Several shapes can share a target length; count the target
                # n-grams of one length only once per sentence.
                if ntarget not in counted_target_lens:
                    for gram in target_grams:
                        self.target_word_appearance[gram] += 1
                    counted_target_lens.add(ntarget)

                # Update the counter per sentence instead of materialising
                # every pair of the whole corpus in one list first.
                pair_counts.update((src_gram, tgt_gram)
                                   for tgt_gram in target_grams
                                   for src_gram in source_grams)
            print(f"{ind}/{self.N}", end = "\r")
        return pair_counts  # keys: (source_ngram, target_ngram)
            

class BiTextWordAlignment:
    """Fluency and lexical-translation statistics for a parallel corpus.

    Implemented along the lines of IBM Model 1. A character-level table for
    unknown words is planned (see ``align_chars``) but not implemented.

    Attributes:
        source: Source-side sentences, one list of tokens per sentence.
        target: Target-side sentences, parallel to ``source``.
        fidelity_count: Externally computed pair statistics installed by
            ``load_fidelty_stats``; ``None`` until then.
        fidelity_target_count: Externally computed target-side statistics
            installed by ``load_fidelty_stats``; ``None`` until then.
        fluency: ``Counter`` of target unigrams (keyed by token) merged with
            target bigrams (keyed by 2-tuples of tokens).
        wordCountbyN: ``[unigram_counter, bigram_counter]`` for the target side.
        fluency_prob: Empty ``defaultdict(float)``; not populated by this class yet.
    """

    def __init__(self, source: List[List[str]], target: List[List[str]]):
        self.source = source
        self.target = target
        self.fidelity_count, self.fidelity_target_count = None, None
        self.fluency, self.wordCountbyN = self.calculate_fluency(self.target)
        # Total number of n-gram occurrences per order (index 0: unigrams,
        # index 1: bigrams); the denominators used by _get_fluency_prob.
        self._ngram_totals = [sum(counter.values()) for counter in self.wordCountbyN]
        self.fluency_prob = defaultdict(float)

    def load_fidelty_stats(self, fidel_count, overall_count):
        """Install precomputed fidelity statistics on the instance.

        Args:
            fidel_count: Stored as ``self.fidelity_count``.
            overall_count: Stored as ``self.fidelity_target_count``.
        """
        # Assign to the attribute created in __init__; a misspelled name here
        # would leave ``fidelity_count`` at None.
        self.fidelity_count, self.fidelity_target_count = fidel_count, overall_count

    def calculate_fluency(self, corpus):  # That is P(t)
        """Count the unigrams and adjacent-token bigrams of ``corpus``.

        Args:
            corpus: Sequence of sentences, each a sequence of tokens.

        Returns:
            A pair ``(merged, [unigrams, bigrams])`` where ``unigrams`` is a
            ``Counter`` keyed by token, ``bigrams`` a ``Counter`` keyed by
            ``(token, next_token)`` and ``merged`` a ``Counter`` holding both.
        """
        uniword = Counter(chain.from_iterable(corpus))  # |t| = 1
        biword = Counter(pair for sent in corpus
                         for pair in zip(sent, sent[1:]))  # |t| = 2
        return Counter({**uniword, **biword}), [uniword, biword]

    def _get_fluency_prob(self, tword):
        """Return the relative frequency of a target unigram or bigram.

        Args:
            tword: A token string, a 1-tuple holding a token, or a 2-tuple of
                tokens. 1-tuples are accepted because ``fluency`` keys
                unigrams by the bare token.

        Returns:
            ``count / total`` within the n-gram order of ``tword``; ``0.0``
            when no n-gram of that order was observed.

        Raises:
            ValueError: If ``tword`` is neither a unigram nor a bigram.
        """
        if isinstance(tword, str):
            order, key = 1, tword
        else:
            order = len(tword)
            key = tword[0] if order == 1 else tuple(tword)
        if not 1 <= order <= len(self._ngram_totals):
            raise ValueError(f"unsupported n-gram order {order} for {tword!r}")
        total = self._ngram_totals[order - 1]
        if not total:
            return 0.0
        return self.wordCountbyN[order - 1][key] / total

    def calculate_word_fertilities(self):
        """Placeholder; word fertilities are not implemented yet."""

    def train_lexical_prob(self, lexicalP, n_maps=((1, 1), (2, 1), (2, 2), (3, 2))):
        """Run one EM iteration of IBM Model 1, updating ``lexicalP`` in place.

        Modified from the instructor's code to align for different scenarios:
        https://colab.research.google.com/drive/1V6ZwTsBe2s7tAmHTkYqpENrzt4zOrnu3?usp=sharing#scrollTo=NLmGPFha8982

        Expected counts are collected in fresh dictionaries on every call, so
        repeated calls form successive EM iterations and the statistics
        installed by ``load_fidelty_stats`` are left untouched.

        Args:
            lexicalP: Mutable mapping from ``(source_word, target_word)`` to
                the current probability. It must provide a value for every
                pair co-occurring in a sentence pair (for example a
                ``defaultdict(float)`` initialised with positive values).
            n_maps: Accepted for interface compatibility; currently unused.

        Returns:
            None. After the call, every co-occurring pair holds its expected
            count divided by the total expected count of its target word.
            Pairs that never co-occur keep their previous value.
        """
        counts = defaultdict(float)  # keys are (source word, target word) pairs
        total = defaultdict(float)   # keys are target-side words

        for ssent, tsent in zip(self.source, self.target):
            # NOTE(review): a source word repeated in one sentence adds to its
            # normaliser once per occurrence, as in the original loop.
            sent_totals = defaultdict(float)
            for sword in ssent:
                for tword in tsent:
                    sent_totals[sword] += lexicalP[(sword, tword)]
            for s in ssent:
                norm = sent_totals[s]
                if not norm:
                    # No probability mass for this word in this sentence:
                    # nothing to distribute, and dividing would fail.
                    continue
                for t in tsent:
                    share = lexicalP[(s, t)] / norm
                    counts[(s, t)] += share
                    total[t] += share

        # Only pairs that co-occurred received expected counts, so updating
        # those avoids a full source-vocabulary x target-vocabulary sweep.
        for (s, t), expected in counts.items():
            denom = total[t]
            lexicalP[(s, t)] = expected / denom if denom else 0.0

    def align_chars(self):
        """Placeholder; character-level alignment is not implemented yet."""
class HMM:
    """Placeholder for a hidden-Markov-model component; nothing is implemented yet."""

    def __init__(self):
        pass

    def train(self, fluency, fidelity):
        """Train the model from fluency and fidelity statistics.

        Args:
            fluency: Fluency statistics (exact structure not defined yet).
            fidelity: Fidelity statistics (exact structure not defined yet).

        Raises:
            NotImplementedError: Always; training has not been written yet.
        """
        raise NotImplementedError("HMM.train is not implemented yet")




In [13]:
#All training data here...
#import pickle
#ngram_train = NGram(source, target)
#ngram_count = ngram_train.generate_ngram()
#with open('train-05/ngram_count.train', 'wb') as outputfile:
#    pickle.dump(ngram_count, outputfile)
#with open('train-05/ngram_count_target.train', 'wb') as out:
#    pickle.dump(ngram_train.target_word_appearance, out)


In [None]:
# Build the model here so the cells below can use `btwa` after a kernel restart.
btwa = BiTextWordAlignment(source, target)

In [None]:
# Merged unigram + bigram counts of the target side, followed by their grand total.
fluent = btwa.fluency
sum(count for count in fluent.values())

In [None]:
# Share of each entry in the overall count mass. The total is computed once
# rather than once per entry, which made the original expression quadratic.
fluent_total = sum(fluent.values())
{i: (v / fluent_total) for i, v in fluent.items()}
#%timeit
#dict(map(lambda x: (x[0], x[1]/sum(fluent.values())), dict(fluent).items()))


In [None]:
from itertools import combinations
# Toy example of the sliding windows used when pairing n-grams.
tsent, ssent = [1, 2, 3, 4], [1, 2, 3, 4]
ntarget, nsource = 2, 3
# Zipping progressively shifted copies of a sentence yields every contiguous
# window of the requested length, in order.
target_sent_inplace = list(zip(*(tsent[shift:] for shift in range(ntarget))))
source_sent_inplace = list(zip(*(ssent[shift:] for shift in range(nsource))))

# Pair every source window with every target window (target-major order).
[(src_gram, tgt_gram)
 for tgt_gram in target_sent_inplace
 for src_gram in source_sent_inplace]

