<a href="https://colab.research.google.com/github/bhardwaj1230/NMT/blob/master/number_mismatch_samples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import re
import os
import codecs
import numpy as np
import pandas as pd
import nltk
import string
from collections import Counter
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


## Finding mismatches between the corpora:

In [0]:
def read_mega_corpus(location, encoding='utf-8'):
    """Load every aligned English/French file pair found under ``location``.

    The directory tree is walked recursively. A file whose name contains
    ``tmx.fr`` is treated as French and one whose name contains ``tmx.en`` as
    English; an English file is paired with a French file when their full
    paths are equal once the last two characters (the language code) are
    dropped. Unpaired files are ignored.

    Each line is cleaned by removing every character that is neither a word
    character nor whitespace, stripping surrounding spaces/newlines and
    lower-casing.

    Parameters
    ----------
    location : str
        Root directory to search.
    encoding : str, optional
        Text encoding used to read the files. Defaults to UTF-8 so that the
        result does not depend on the machine's locale.

    Returns
    -------
    (flat_en_df, flat_fr_df) : tuple of pandas.DataFrame
        One row per line, with columns ``path`` (source file), ``id``
        (zero-based line number inside that file) and ``data`` (cleaned text).
        Both frames are empty, but still carry the three columns, when no
        aligned pair is found.
    """
    # Collect the location of every parallel-corpus file. Directories and
    # files are visited in sorted order so that row positions are
    # reproducible from one run (and one filesystem) to the next.
    fr_loc = []
    en_loc = []
    for path, subdirs, files in os.walk(location):
        subdirs.sort()
        for name in sorted(files):
            if 'tmx.fr' in name:
                fr_loc.append(os.path.join(path, name))
            elif 'tmx.en' in name:
                en_loc.append(os.path.join(path, name))

    # Pair files that share the same path minus the 2-letter language code.
    # Indexing the French side by that stem avoids comparing every English
    # path with every French path.
    fr_by_stem = {}
    for loc_fr in fr_loc:
        fr_by_stem.setdefault(loc_fr[:-2], []).append(loc_fr)
    pairs = [(loc, loc_fr)
             for loc in en_loc
             for loc_fr in fr_by_stem.get(loc[:-2], [])]

    def _read_rows(file_path):
        # One [path, line number, cleaned text] record per line of the file.
        with open(file_path, encoding=encoding) as handle:
            return [[file_path, idx, re.sub(r'[^\w\s]', '', line).strip(' \n').lower()]
                    for idx, line in enumerate(handle)]

    # Read both sides of every pair; passing a list gives tqdm a total.
    rows_en = []
    rows_fr = []
    for en, fr in tqdm(pairs):
        rows_fr.extend(_read_rows(fr))
        rows_en.extend(_read_rows(en))

    # Naming the columns in the constructor also works when there are no rows
    # (assigning ``.columns`` afterwards raises on an empty frame).
    columns = ['path', 'id', 'data']
    flat_en_df = pd.DataFrame(rows_en, columns=columns)
    flat_fr_df = pd.DataFrame(rows_fr, columns=columns)

    return flat_en_df, flat_fr_df
    

In [0]:
# Sub-folders of the corpus root whose aligned files are loaded.
list_of_folders_to_score = ['ALIGNMENT-QUALITY']

# NOTE(review): absolute path from the original environment; change it to
# wherever the corpus lives before running elsewhere.
corpus_root = '/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/'

# Collect one (English, French) frame pair per folder. Assigning the result
# directly inside the loop would keep only the last folder's data.
en_frames = []
fr_frames = []
for folder in list_of_folders_to_score:
    location = corpus_root + folder + '/'
    folder_en, folder_fr = read_mega_corpus(location)
    en_frames.append(folder_en)
    fr_frames.append(folder_fr)

# ignore_index=True keeps a 0..n-1 row index, which later cells use to look
# up individual lines.
test__en = pd.concat(en_frames, ignore_index=True)
test__fr = pd.concat(fr_frames, ignore_index=True)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [0]:
# Split every cleaned line into word tokens, one token list per corpus row.
eng = list(map(nltk.word_tokenize, tqdm(test__en['data'])))
frc = list(map(nltk.word_tokenize, tqdm(test__fr['data'])))


HBox(children=(IntProgress(value=0, max=96083), HTML(value='')))




HBox(children=(IntProgress(value=0, max=96083), HTML(value='')))




In [0]:
# Lookup tables used to spell out stand-alone single digits, keyed by the
# integer value of the digit (English and French respectively).
_single_digits = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]

en_num = dict(zip(
    _single_digits,
    ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'zero'],
))
fr_num = dict(zip(
    _single_digits,
    ['un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'zéro'],
))


def convert_digit_word(input_data,word,lang):
    """Replace stand-alone single-digit tokens by their spelled-out word.

    Parameters
    ----------
    input_data : iterable of list of str
        Tokenised lines.
    word : mapping
        Maps the integer value of a digit to the word that replaces it.
    lang : str
        Label used only in the summary message that is printed.

    Returns
    -------
    list of list of str
        A new list of token lists; tokens that are not a single digit, or
        that cannot be converted, are copied unchanged.
    """
    cnt = 0
    data = []

    for line in tqdm(input_data):
        value = []
        for token in line:
            if len(token)==1 and token.isdigit():
                try:
                    token = word[int(token)]
                except (ValueError, KeyError):
                    # ValueError: characters such as a superscript two pass
                    # isdigit() but are rejected by int().
                    # KeyError: the mapping has no entry for this digit.
                    # Either way the token is kept as it is.
                    pass
                else:
                    # Count only the tokens that were really replaced.
                    cnt +=1
            value.append(token)
        data.append(value)
    print('Number of digits converted in :',lang,': ',cnt)
    return data
    
# Spell out stand-alone single digits on both sides, each with its own
# language table; the token lists are replaced by the converted copies.
eng = convert_digit_word(eng,en_num,'English')
frc = convert_digit_word(frc,fr_num,'French')


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Number of digits converted in : English :  10996


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Number of digits converted in : French :  11823


In [0]:
# Find every line index whose tokens contain at least one digit:

def find_digit_idx(input_data,lang):
    """Collect, per line, the individual digit characters found in its tokens.

    Parameters
    ----------
    input_data : iterable of list of str
        Tokenised lines.
    lang : str
        Label used only in the summary message that is printed.

    Returns
    -------
    dict
        Maps the position of each line holding at least one digit to the flat
        list of its digit characters, in order of appearance. Lines without
        digits have no entry.
    """
    token_hits = 0          # number of tokens that contain at least one digit
    digits_by_line = {}

    for line_no, tokens in tqdm(enumerate(input_data)):
        line_digits = []
        for tok in tokens:
            only_digits = re.sub("[^0-9]", "", tok)
            if only_digits == '':
                continue
            token_hits += 1
            # Extending a list with a string appends its characters one by one.
            line_digits.extend(only_digits)
        if line_digits:
            digits_by_line[line_no] = line_digits

    print('Number of digits found in :',lang,': ',token_hits)
    return digits_by_line


# Per-language map: line position -> digit characters found on that line.
if_digit_en = find_digit_idx(eng,'English')
if_digit_fr = find_digit_idx(frc,'French')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Number of digits found in : English :  50149


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Number of digits found in : French :  52494


In [0]:
def diff(li1, li2):
    """Return the elements that appear in exactly one of the two lists.

    The result keeps the original order (elements of ``li1`` first, then
    those of ``li2``) and keeps duplicates.

    Membership is tested against sets, so the work grows linearly with the
    input size instead of quadratically. If the elements are not hashable the
    lists themselves are used for the membership tests.
    """
    try:
        in_first, in_second = set(li1), set(li2)
    except TypeError:
        # Unhashable elements: fall back to (slower) list membership.
        in_first, in_second = li1, li2
    only_first = [i for i in li1 if i not in in_second]
    only_second = [i for i in li2 if i not in in_first]
    return only_first + only_second

# Line ids that carry digits in only one of the two languages: these are
# treated as definite problems.
no_matches = diff(list(if_digit_en), list(if_digit_fr))

# Line ids that carry digits in both languages.
matches = list(set(list(if_digit_fr)).intersection(list(if_digit_en)))

# Naive comparison: among those, flag every line whose digit sequences are
# not exactly equal on both sides.
unmatched_values = [key for key in matches if if_digit_fr[key] != if_digit_en[key]]

all_unmatched = no_matches + unmatched_values

In [0]:
# Number of line ids flagged by the naive digit comparison.
len(all_unmatched)

17196

In [0]:
# Flagged line ids as a percentage of the larger of the two per-language
# counts of digit-bearing lines.
print('Total Unmached records: ',round(( len(all_unmatched))/max(len(if_digit_fr),len(if_digit_en))*100,2) ,'%')

Total Unmached records:  61.41 %


In [0]:
#Edit Distance calculation
def edit_distance(s1, s2):
    """Normalised Levenshtein distance between two sequences.

    Counts the minimum number of single-element insertions, deletions and
    substitutions that turn ``s1`` into ``s2`` and divides it by the length
    of the longer sequence, giving a value between 0.0 (identical) and 1.0.

    Parameters
    ----------
    s1, s2 : sequence
        Any indexable sequences with a length (lists of characters, strings).

    Returns
    -------
    float
        The normalised distance; 0.0 when both sequences are empty.
    """
    len1, len2 = len(s1), len(s2)
    longest = max(len1, len2)
    if longest == 0:
        # Two empty sequences are identical; avoids dividing by zero.
        return 0.0

    # Only the previous row of the dynamic-programming table is needed.
    # previous[j] is the distance between the first i-1 items of s1 and the
    # first j items of s2.
    previous = list(range(len2 + 1))
    for i in range(1, len1 + 1):
        current = [i]
        for j in range(1, len2 + 1):
            cost = 0 if s1[i-1] == s2[j-1] else 1
            current.append(min(current[j-1] + 1,      # insertion
                               previous[j] + 1,       # deletion
                               previous[j-1] + cost)) # match / substitution
        previous = current

    return previous[len2] / longest


In [0]:
# Normalised edit distance between the French and English digit sequences of
# every line that has digits on both sides.
ed = {
    line_id: edit_distance(if_digit_fr[line_id], if_digit_en[line_id])
    for line_id in matches
}


In [0]:
# Worked example: one substituted digit out of four gives a distance of 0.25.
edit_distance(['2','0','0','2'],['2','0','5','2'])

0.25

In [0]:
# Keep the lines whose digit sequences are almost entirely different
# (distance above 0.90) and where at least one side has more than two digits.
big_unmatched = [
    line_id
    for line_id, distance in ed.items()
    if distance > 0.90
    and (len(if_digit_fr[line_id]) > 2 or len(if_digit_en[line_id]) > 2)
]
len(big_unmatched)

1095

In [0]:
# Same percentage as before, but counting only the large-distance lines plus
# the lines that have digits on one side only.
print('Total Unmached records: ',round((len(big_unmatched)+len(no_matches))/max(len(if_digit_fr),len(if_digit_en))*100,2),'%')

Total Unmached records:  34.14 %


In [0]:
# Show the English and French digit sequences of every strongly mismatched line.
for line_id, distance in ed.items():
    if distance <= 0.90:
        continue
    digits_en, digits_fr = if_digit_en[line_id], if_digit_fr[line_id]
    if len(digits_fr) > 2 or len(digits_en) > 2:
        print(line_id,digits_en,' --> ', digits_fr)
       

2461 ['1', '3']  -->  ['0', '0', '0', '5']
2465 ['0', '0', '0', '5']  -->  ['1', '3']
2828 ['6', '0', '1', '4', '0']  -->  ['5', '5']
3639 ['1', '9', '9', '0', '1', '9', '9', '9']  -->  ['8', '2']
3831 ['1', '1']  -->  ['5', '4', '6']
3846 ['1', '2']  -->  ['2', '0', '0', '7']
4099 ['1', '7', '8', '0']  -->  ['2', '9', '9']
4121 ['2', '8']  -->  ['1', '0', '0']
4428 ['1', '3']  -->  ['3', '2', '4', '2', '6', '7', '5']
4430 ['3', '2', '4', '2', '6', '7', '5']  -->  ['1', '3', '1', '0']
4546 ['4', '5']  -->  ['3', '6', '2', '9']
5040 ['1', '5', '1', '9', '9', '4', '1', '9', '9', '4']  -->  ['2', '0', '0', '2']
5058 ['3', '5', '1', '6', '3', '1', '6', '3', '5', '1', '8', '1', '1', '8']  -->  ['4', '8']
5066 ['1', '8', '3', '5', '1', '1', '8', '8', '2', '1', '8', '1', '6']  -->  ['5', '5']
5082 ['3', '5', '1']  -->  ['2', '9']
5083 ['3', '5', '1']  -->  ['1', '1', '7', '9']
5101 ['1', '6']  -->  ['2', '0', '0', '2', '1', '2', '5', '2', '0', '0', '2', '3', '5', '8', '1', '9']
5106 ['8', '3'

41653 ['1', '7', '2', '8']  -->  ['0', '9']
41658 ['7', '8', '8', '9']  -->  ['2']
41671 ['9']  -->  ['5', '5', '1', '7']
41693 ['2', '6']  -->  ['0', '8', '8', '1', '5', '7']
41694 ['9', '0', '0']  -->  ['5', '5']
41704 ['0', '6', '2', '8']  -->  ['1']
41705 ['7']  -->  ['9', '0', '5']
41709 ['2', '5']  -->  ['0', '7', '8', '6', '8', '4']
41711 ['1']  -->  ['7', '9', '3']
41721 ['4', '6', '6']  -->  ['7', '5']
41722 ['7', '7', '5']  -->  ['0', '4']
41724 ['1', '3', '1', '5', '1', '1']  -->  ['0', '9', '4', '8', '0']
41729 ['9', '7', '2']  -->  ['6']
41732 ['7', '8', '3', '9', '5']  -->  ['0', '6', '1', '0', '0', '3']
41735 ['3', '0', '8']  -->  ['4']
41737 ['4', '4']  -->  ['3', '3', '8', '2', '6', '3']
41762 ['1', '5', '6', '9']  -->  ['8', '4', '8', '0']
41768 ['0', '5', '0', '7', '7']  -->  ['6', '2', '9', '2']
41770 ['9']  -->  ['7', '8', '1']
41779 ['1', '3', '5', '2']  -->  ['7', '9', '4']
41781 ['4', '5', '3', '0', '8', '7']  -->  ['7', '2', '2', '3']
41785 ['8', '0']  -->  ['9

In [0]:
# Show the full token lists (English, then French) of every strongly
# mismatched line.
for line_id, distance in ed.items():
    if distance <= 0.90:
        continue
    if len(if_digit_fr[line_id]) > 2 or len(if_digit_en[line_id]) > 2:
        print('\n\n',line_id,':\n',eng[line_id],' --> ','\n', frc[line_id],'\n\n')



 2461 :
 ['dichlorobenzene', '13', 'mdcb']  -->  
 ['0005', 'ddt', 'total'] 




 2465 :
 ['0005', 'ddt', 'total']  -->  
 ['13dichlorobenzène', 'mdcb'] 




 2828 :
 ['hwsb', '60', '140', 'yes', 'if', 'recovery', 'is', 'outside', 'of', 'specified', 'limits', 'repeat', 'if', 'possible']  -->  
 ['oui', 'si', 'la', 'récupération', 'se', 'situe', 'à', 'lextérieur', 'des', 'limites', 'indiquées', 'reprendre', 'au', 'besoin', 'voir', '55', 'matrice', 'enrichie'] 




 3639 :
 ['decision', '1990', 'r', 'v', 'marshall', 'decision', '1999', 'and', 'ongoing', 'legislative', 'considerations']  -->  
 ['lorsquon', 'les', 'combine', 'avec', 'les', 'différentes', 'couches', 'topographiques', 'individuelles', 'nous', 'concluons', 'quenviron', '82', 'de', 'toutes', 'les', 'donnéescadres', 'proviennent', 'des', 'cartes', 'de', 'base', 'topographiques'] 




 3831 :
 ['table', '11', 'data', 'confidentiality', 'information', 'sharing', 'requests', 'or', 'direct', 'purchase', 'here', 'there', 'is', 'a

 ['this', 'is', 'a', 'real', 'increase', 'of', '69', 'offenders']  -->  
 ['la', 'population', 'carcérale', 'autochtone', 'a', 'augmenté', 'de', '95', 'de', '20092010', 'à', '20102011', 'tandis', 'que', 'celle', 'des', 'nonautochtones', 'a', 'augmenté', 'de', 'quatre'] 




 39746 :
 ['nonaboriginal', 'gang', 'membership', 'increased', 'from', '76', 'to', '77', 'of', 'all', 'nonaboriginal', 'offenders']  -->  
 ['le', 'nombre', 'total', 'de', 'délinquants', 'autochtones', 'sous', 'surveillance', 'a', 'diminué', 'de', '14'] 




 39747 :
 ['gang', 'affiliated', 'aboriginal', 'women', 'offenders', 'represent', '137', 'of', 'the', 'total', 'incarcerated', 'aboriginal', 'women']  -->  
 ['le', 'nombre', 'total', 'de', 'délinquants', 'incarcérés', 'a', 'augmenté', 'de', '28', 'tandis', 'que', 'le', 'nombre', 'total', 'de', 'délinquants', 'sous', 'surveillance', 'a', 'diminué', 'de', '08'] 




 39771 :
 ['adjournments', 'for', 'aboriginal', 'offenders', 'increased', 'by', '244', 'and', 'non

 44656 :
 ['15664', 'kilometres', 'away', 'world', 'rabies', 'day']  -->  
 ['le', 'rapport', 'du', 'comité', 'permanent', 'httpwww2parlgccahousepublicationspublicationaspx'] 




 45389 :
 ['table', '11']  -->  
 ['5902', 'réseau', 'des', 'laboratoires', 'de', 'latlantique'] 




 49820 :
 ['boland', 'fj', 'burrill', 'r', 'duwyn', 'm', 'karp', 'j', '1998', 'fetal', 'alcohol', 'syndrome', 'implications', 'for', 'correctional', 'service', 'research', 'report', 'r71']  -->  
 ['boland', 'f', 'm', 'duwyn', 'et', 'r', 'serin', 'le', 'syndrome', 'de', 'lalcoolisme', 'fœtal', 'et', 'ses', 'conséquences', 'forum', 'recherche', 'sur', 'lactualité', 'correctionnelle', '2000'] 




 49826 :
 ['canada', 'report', 'of', 'the', 'royal', 'commission', 'on', 'aboriginal', 'peoples', 'bridging', 'the', 'cultural', 'divide', 'ottawa', 'supply', 'and', 'services', 'canada', '1996']  -->  
 ['bureau', 'de', 'lenquêteur', 'correctionnel', 'rapport', 'une', 'mort', 'évitable', 'consulté', 'le', '20', 'juin

In [0]:
# English side of line 5040, one of the ids printed in the mismatch listing.
test__en['data'][5040]

'on july 15 1994 the applicant crossed the border and travelled to the city of goma in zaire in december 1994 the applicant and his spouse then left zaire for nairobi in kenya'

In [0]:
# French side of the same line, for comparison with the English text.
test__fr['data'][5040]

'lhistorique m ebebe est arrivé au canada depuis le brésil comme passager clandestin sur un navire et est arrivé à montréal le 6 juillet 2002'

In [0]:
# Line ids where only one side of the pair contains digits.
no_matches

In [0]:
# Print both sides of a single line for manual inspection; change `line` to
# look at another row.
line = 110
print('EN: \n',test__en['data'][line],'\n\n','FR: \n',test__fr['data'][line])

EN: 
 dnd takes the lead for the air component of sar cases while coast guard assumes the lead for the maritime component a force of some 5000 pleasure boaters and other vessel operators make up the coast guard auxiliary 

 FR: 
 protecting fish habitat coast guard vessels make that work possible we also support organizations tasked with responsibility for canadas maritime security which includes dnd the rcmp transport canada and the canada border services agency


## Similarity between the two corpora:

In [0]:
def similarity(query, document):
    """Jaccard similarity between the distinct items of two token collections.

    Computed as the number of distinct items shared by both collections
    divided by the number of distinct items in either. Order and repetition
    are ignored.

    Returns
    -------
    float
        A value between 0.0 and 1.0. When both collections are empty there is
        nothing to compare and 0.0 is returned instead of dividing by zero.
    """
    query_items, document_items = set(query), set(document)
    union = query_items | document_items
    if not union:
        return 0.0
    return len(query_items & document_items) / len(union)

In [0]:

# Print every pair whose English and French token sets overlap by more than
# 0.75. The number printed is the 1-based position of the pair (row index + 1).
cnt = 0
for cnt, (en, fr) in enumerate(zip(eng, frc), start=1):
    score = similarity(en, fr)
    if score <= 0.75:
        continue
    print(cnt, score)
    print('English:',en,'\n French:',fr,'\n\n')
        

        
    
    

93 0.9090909090909091
English: ['in', 'the', 'time', 'we', 'have', 'together', 'id', 'like', 'to', 'explain'] 
 French: ['15', 'in', 'the', 'time', 'we', 'have', 'together', 'id', 'like', 'to', 'explain'] 


94 0.9375
English: ['much', 'of', 'the', 'work', 'done', 'by', 'the', 'coast', 'guard', 'has', 'been', 'going', 'on', 'since', 'before', 'confederation'] 
 French: ['16', 'much', 'of', 'the', 'work', 'done', 'by', 'the', 'coast', 'guard', 'has', 'been', 'going', 'on', 'since', 'before', 'confederation'] 


174 0.95
English: ['the', 'following', 'five', 'priorities', 'were', 'identified', 'in', 'our', 'business', 'plan', 'last', 'year', 'and', 'have', 'been', 'the', 'focus', 'of', 'our', 'work', 'in', '200809'] 
 French: ['27', 'the', 'following', 'five', 'priorities', 'were', 'identified', 'in', 'our', 'business', 'plan', 'last', 'year', 'and', 'have', 'been', 'the', 'focus', 'of', 'our', 'work', 'in', '200809'] 


891 0.8
English: ['ca', 'r', 'v', 'morrison', '1970', 'two', 'or', 

7908 0.875
English: ['kuptana', 'rosemarie', 'president', 'inuit', 'tapirisat', 'of', 'canada'] 
 French: ['kuptana', 'rosemarie', 'president', 'of', 'inuit', 'tapirisat', 'du', 'canada'] 


7909 0.9
English: ['paper', 'prepared', 'for', 'the', 'rights', 'and', 'humanity', 'roundtable', 'strengthening', 'commitment', 'to', 'the', 'universality', 'of', 'human', 'rights', 'amman', 'jordan', 'april', '57', '1993'] 
 French: ['paper', 'prepared', 'for', 'the', 'rights', 'and', 'humanity', 'roundtable', 'strengthening', 'commitment', 'to', 'the', 'universality', 'of', 'human', 'rights', 'amman', 'jordanie', 'april', '57', '1993'] 


7910 0.7777777777777778
English: ['morse', 'bradford', 'w', 'robert', 'groves', 'and', 'darcy', 'vermette'] 
 French: ['morse', 'bradford', 'w', 'robert', 'groves', 'et', 'darcy', 'vermette'] 


7912 0.9411764705882353
English: ['issue', 'paper', 'prepared', 'for', 'the', 'national', 'aboriginal', 'womens', 'summit', 'june', '2022', '2007', 'in', 'corner', 'broo

10543 1.0
English: ['millions'] 
 French: ['millions'] 


11518 1.0
English: ['total'] 
 French: ['total'] 


11707 1.0
English: ['europe'] 
 French: ['europe'] 


11907 1.0
English: ['manitoba'] 
 French: ['manitoba'] 


11912 1.0
English: ['yukon'] 
 French: ['yukon'] 


11919 1.0
English: ['ontario'] 
 French: ['ontario'] 


11920 1.0
English: ['nunavut'] 
 French: ['nunavut'] 


12311 1.0
English: ['introduction'] 
 French: ['introduction'] 


12314 1.0
English: ['conclusion'] 
 French: ['conclusion'] 


12799 1.0
English: ['sept'] 
 French: ['sept'] 


12800 1.0
English: ['nov'] 
 French: ['nov'] 


12849 0.875
English: ['en', '2005', 'lors', 'de', 'mes', 'études', 'de', 'premier', 'cycle', 'à', 'luniversité', 'de', 'la', 'colombiebritannique', 'je', 'me', 'suis', 'joint', 'au', 'laboratoire', 'du', 'dr', 'kurt', 'haas', 'au', 'brain', 'research', 'centre', 'en', 'tant', 'quadjoint', 'de', 'recherche', 'bénévole'] 
 French: ['en', '2005', 'dans', 'le', 'cadre', 'de', 'mes', 'étude

 French: ['metropolitan', 'infrastructure', 'sustainability', 'study', 'united', 'states', 'conference', 'of', 'mayors'] 


16488 0.8333333333333334
English: ['2009', 'april', 'new', 'natural', 'gas', 'pipeline', 'brings', 'cleaner', 'energy', 'to', 'whistler'] 
 French: ['new', 'natural', 'gas', 'pipeline', 'brings', 'cleaner', 'energy', 'to', 'whistler', 'avril', '2009'] 


16593 0.8181818181818182
English: ['four', 'doppelt', '2003', 'moore', '1994', 'tns', 'canada', '2009', 'lindberg', '2009', '2010', '2011', 'the', 'union', 'of', 'nova', 'scotia', 'municipalities', 'municipal', 'sustainability', 'office', '2011'] 
 French: ['quatre', 'doppelt', '2003', 'moore', '1994', 'tns', 'canada', '2009', 'lindberg', '2009', '2010', 'et', '2011', 'union', 'of', 'nova', 'scotia', 'municipalities', 'municipal', 'sustainability', 'office', '2011'] 


18634 0.9523809523809523
English: ['this', 'presentation', 'was', 'created', 'by', 'david', 'stickley', 'and', 'bruce', 'osborne', 'with', 'assista

18893 0.8181818181818182
English: ['each', 'region', 'plus', 'd', 'cdts', 'was', 'given', 'a', 'banner', 'to', 'bring', 'back', 'and', 'to', 'use', 'when', 'they', 'present', 'fortress', 'v2'] 
 French: ['each', 'region', 'plus', 'd', 'cdts', 'was', 'given', 'a', 'banner', 'to', 'bring', 'back', 'and', 'to', 'use', 'when', 'they', 'make', 'presentation', 'of', 'fortress', 'v2'] 


18902 1.0
English: ['information', 'about', 'fv2', 'will', 'have', 'to', 'come', 'from', 'the', 'organization', 'and', 'not', 'from', 'it', 'because', 'the', 'owner', 'is', 'the', 'organization', 'not', 'it'] 
 French: ['information', 'about', 'fv2', 'will', 'have', 'to', 'come', 'from', 'the', 'organization', 'and', 'not', 'from', 'it', 'because', 'the', 'owner', 'is', 'the', 'organization', 'and', 'not', 'it'] 


18903 0.9166666666666666
English: ['this', 'will', 'help', 'reinforce', 'the', 'feeling', 'of', 'ownership', 'of', 'fv2', 'in', 'the', 'organization'] 
 French: ['this', 'will', 'help', 'reinforce'

 French: ['grenade', 'riot', 'cs', 'no', '518', 'grenade', 'antiémeute', 'cs', 'no', '518'] 


19216 1.0
English: ['mine', 'ap', 'm16a1a2'] 
 French: ['mine', 'ap', 'm16a1a2', 'mine', 'ap', 'm16a1a2'] 


19222 1.0
English: ['mine', 'ffv028'] 
 French: ['mine', 'ffv028', 'mine', 'ffv028'] 


19232 1.0
English: ['capsules', 'cs'] 
 French: ['capsules', 'cs', 'capsules', 'cs'] 


19254 0.8
English: ['charge', 'explosive', 'training', 'c2a1'] 
 French: ['charge', 'explosive', 'training', 'c2a1', 'charge', 'explosive', 'dexercice', 'c2a1'] 


19262 0.8
English: ['match', 'fuzee', 'safety', 'bxs'] 
 French: ['match', 'fuzee', 'safety', 'bxs', 'allumettetison', 'bxs'] 


19300 0.8333333333333334
English: ['cartridge', 'lmde', 'n12', 'mk1', 'service'] 
 French: ['cartridge', 'lmde', 'n12', 'mk1', 'service', 'cart', 'lmde', 'n12', 'mk1', 'service'] 


19302 1.0
English: ['cart', '556', 'mm', 'x', '45', 'hpt', 'c83'] 
 French: ['cart', '556', 'mm', 'x', '45', 'hpt', 'c83', 'cart', '556', 'mm', '

29920 0.8181818181818182
English: ['encl', 'filed', 'at', 'gb', 'env', 'ohazmat', 'managementhma', 'reportshmra', 'delinquencies'] 
 French: ['encl', 'filed', 'at', 'gb', 'env', 'ohazmat', 'managementhma', 'reportshmra', 'delinquencies', 'bm', 'pellerin'] 


29921 0.8333333333333334
English: ['lists', 'a', 'b', 'c', 'e'] 
 French: ['lists', 'a', 'b', 'c', 'e', 'listes', 'a', 'b', 'c', 'e'] 


29972 0.875
English: ['when', 'should', 'you', 'use', 'a', 'controlled', 'fall'] 
 French: ['aoa_q155', 'when', 'should', 'you', 'use', 'a', 'controlled', 'fall'] 


29976 0.875
English: ['how', 'is', 'the', 'ski', 'joring', 'technique', 'performed'] 
 French: ['aoa_q158', 'how', 'is', 'the', 'ski', 'joring', 'technique', 'performed'] 


29978 0.9333333333333333
English: ['what', 'are', 'the', 'seven', 'enemies', 'of', 'survival', 'that', 'can', 'impact', 'your', 'will', 'to', 'survive'] 
 French: ['aoa_q332', 'what', 'are', 'the', 'seven', 'enemies', 'of', 'survival', 'that', 'can', 'impact', 'yo

32120 0.8571428571428571
English: ['dallas', 'tx', 'american', 'heart', 'association', '1994'] 
 French: ['dallas', 'tx', 'american', 'heart', 'association', '1994', 's'] 


32426 0.8333333333333334
English: ['laryngeal', 'mask', 'airway', 'limited', 'wwwlmanacom'] 
 French: ['laryngeal', 'mask', 'airway', 'limited', 'adresse', 'wwwlmanacom'] 


32690 1.0
English: ['to', 'dest'] 
 French: ['dest', 'to'] 


32694 1.0
English: ['comments', 'remarques'] 
 French: ['remarques', 'comments'] 


32877 0.8
English: ['en', 'réserve', 'avoir', 'garder', 'mettre', 'tenir', 'qqch', 'en', 'réserve', 'accumuler', 'amasser', 'garder'] 
 French: ['en', 'reserve', 'avoir', 'garder', 'mettre', 'tenir', 'qqch', 'en', 'reserve', 'accumuler', 'amasser', 'garder'] 


32878 0.92
English: ['the', 'random', 'house', 'dictionary', 'of', 'the', 'english', 'language', '2nd', 'ed', '1987', 'store', 'vt', '12', 'to', 'deposit', 'in', 'a', 'storehouse', 'warehouse', 'or', 'other', 'place', 'for', 'keeping'] 
 French

33087 0.9090909090909091
English: ['us', 'epa', 'ghg', 'regulations', 'for', 'the', 'oil', 'natural', 'gas', 'industry'] 
 French: ['min149848', 'us', 'epa', 'ghg', 'regulations', 'for', 'the', 'oil', 'natural', 'gas', 'industry'] 


33091 0.8333333333333334
English: ['albertas', 'greenhouse', 'gas', 'ghg', 'regulations'] 
 French: ['mos150080', 'albertas', 'greenhouse', 'gas', 'ghg', 'regulations'] 


33092 0.8571428571428571
English: ['final', 'code', 'of', 'practice', 'for', 'tetrabutyltin'] 
 French: ['min149451', 'final', 'code', 'of', 'practice', 'for', 'tetrabutyltin'] 


33093 0.8
English: ['ecoaction', 'community', 'funding', 'program'] 
 French: ['min149461', 'ecoaction', 'community', 'funding', 'program'] 


33098 0.8571428571428571
English: ['volatile', 'organic', 'compound', 'voc', 'concentration', 'limits'] 
 French: ['min150434', 'volatile', 'organic', 'compound', 'voc', 'concentration', 'limits'] 


33099 0.8
English: ['aboriginal', 'traditional', 'knowledge', 'subcommi

36000 0.7777777777777778
English: ['ils', 'ont', 'terminé', 'premier', 'et', 'deuxième', 'du', 'biathlon'] 
 French: ['ils', 'ont', 'terminé', 'premier', 'et', 'deuxième', 'au', 'biathlon'] 


36001 0.8
English: ['ils', 'ont', 'terminé', 'premier', 'et', 'deuxième', 'du', 'slalom'] 
 French: ['ils', 'ont', 'terminé', 'premier', 'et', 'deuxième', 'du', 'slalom', 'ski', 'alpin'] 


36004 0.8461538461538461
English: ['remporté', 'une', 'médaille', 'aux', 'jeux', 'dété', 'et', 'dhiver', 'la', 'même', 'année'] 
 French: ['remporté', 'une', 'médaille', 'à', 'la', 'fois', 'aux', 'jeux', 'dhiver', 'et', 'aux', 'jeux', 'dété', 'la', 'même', 'année'] 


36005 0.9
English: ['remporté', 'une', 'médaille', 'lors', 'de', 'quatre', 'jeux', 'olympiques', 'dhiver', 'consécutifs'] 
 French: ['remporté', 'une', 'médaille', 'lors', 'de', 'quatre', 'jeux', 'olympiques', 'consécutifs'] 


36007 0.8888888888888888
English: ['les', 'deux', 'femmes', 'à', 'avoir', 'remporté', 'deux', 'titres', 'olympiques', 'c

37853 0.8461538461538461
English: ['pour', 'annuler', 'une', 'réservation', 'vous', 'pouvez', 'suivre', 'le', 'lien', 'emedit', 'bookingem', 'approprié'] 
 French: ['pour', 'annuler', 'une', 'réservation', 'vous', 'pouvez', 'suivre', 'le', 'lien', 'emedit', 'bookingem', 'pertinent'] 


37867 0.8
English: ['envoyez', 'mot', 'de', 'passe'] 
 French: ['envoyez', 'un', 'mot', 'de', 'passe'] 


37880 0.8
English: ['réservations', 'pour', 'le', 'mois'] 
 French: ['réservations', 'pour', 'le', 'mois', 'courant'] 


37888 0.8
English: ['les', 'détails', 'de', 'la', 'réservation', 'ont', 'été', 'envoyés', 'à'] 
 French: ['les', 'détails', 'sur', 'la', 'réservation', 'ont', 'été', 'envoyés', 'à'] 


37911 0.85
English: ['si', 'vous', 'avez', 'oublié', 'votre', 'mot', 'de', 'passe', 'et', 'en', 'obtenir', 'un', 'nouveau', 'veuillez', 'entrer', 'votre', 'adresse', 'courriel', 'cidessous'] 
 French: ['si', 'vous', 'avez', 'oublié', 'votre', 'mot', 'de', 'passe', 'et', 'si', 'vous', 'souhaitez', 'en

48486 0.9166666666666666
English: ['engaging', 'the', 'private', 'sector', 'to', 'promote', 'homeland', 'security', 'law', 'enforcementprivate', 'security', 'partnerships'] 
 French: ['2005', 'engaging', 'the', 'private', 'sector', 'to', 'promote', 'homeland', 'security', 'law', 'enforcementprivate', 'security', 'partnerships'] 


48488 0.8
English: ['future', 'trends', 'in', 'policing'] 
 French: ['2014', 'future', 'trends', 'in', 'policing'] 


48496 0.8235294117647058
English: ['private', 'security', 'and', 'urban', 'crime', 'mitigation', 'a', 'bid', 'for', 'bids', 'criminal', 'justice', '53', '23355'] 
 French: ['vindevogel', 'f', '2005', 'private', 'security', 'and', 'urban', 'crime', 'mitigation', 'a', 'bid', 'for', 'bids', 'criminal', 'justice', '53', '23355'] 


48497 0.8666666666666667
English: ['58', 'believe', 'g4s', 'and', 'serco', 'should', 'be', 'banned', 'from', 'public', 'service', 'bids', 'september', '16'] 
 French: ['58', 'believe', 'g4s', 'and', 'serco', 'should', '

English: ['teacher', 'seeks', 'healing', 'through', 'truth', 'commission', '18', 'june', '2010', 'httpwwwcbccanewscanadamanitobastory20100618mbtruthreconciliationhealingteacherswinnipeghtml'] 
 French: ['teacher', 'seeks', 'healing', 'through', 'truth', 'commission', '18', 'juin', '2010', 'httpwwwcbccanewscanadamanitobastory20100618mbtruthreconciliationhealingteacherswinnipeghtml'] 


50869 0.8333333333333334
English: ['civilization', 'museum', 'now', 'the', 'canadian', 'museum', 'of', 'history', '12', 'december', '2013', 'httpwwwcbccanewscanadaottawacivilizationmuseumnowthecanadianmuseumofhistory12461738'] 
 French: ['civilization', 'museum', 'now', 'the', 'canadian', 'museum', 'of', 'history', '12', 'décembre', '2013', 'httpwwwcbccanewscanadaottawacivilizationmuseumnowthecanadianmuseumofhistory12461738'] 


50871 0.8666666666666667
English: ['murdered', 'and', 'missing', 'aboriginal', 'women', 'deserve', 'inquiry', 'rights', 'group', 'says', '12', 'january', '2015', 'httpwwwcbccanews

50921 0.8333333333333334
English: ['historic', 'agreement', 'signed', 'on', 'national', 'aboriginal', 'day', '21', 'june', '2013', 'httpumanitobacanewsblogsblog20130621historicagreementsignedonnationalaboriginalday'] 
 French: ['historic', 'agreement', 'signed', 'on', 'national', 'aboriginal', 'day', '21', 'juin', '2013', 'httpumanitobacanewsblogsblog20130621historicagreementsignedonnationalaboriginalday'] 


50923 0.8333333333333334
English: ['uwinnipeg', 'healing', 'quilt', 'gifted', 'to', 'trc', 'commissioners', '17', 'june', '2010', 'httpwwwuwinnipegcaindexuwnewsactionstory364titleuwinnipeghealingquiltgiftedtotrccommissioners'] 
 French: ['uwinnipeg', 'healing', 'quilt', 'gifted', 'to', 'trc', 'commissioners', '17', 'juin', '2010', 'httpwwwuwinnipegcaindexuwnewsactionstory364titleuwinnipeghealingquiltgiftedtotrccommissioners'] 


50924 0.8823529411764706
English: ['historic', 'childrens', 'paintings', 'on', 'display', 'at', 'the', 'bc', 'national', 'event', 'learning', 'centre', '1

English: ['laisser', '24', 'heures', 'au', 'représentant', 'du', 'ministère', 'représentant', 'de', 'cdc', 'consultant', 'pour', 'examiner', 'léchantillon', 'et', 'ne', 'pas', 'entreprendre', 'les', 'travaux', 'avant', 'que', 'celuici', 'ait', 'été', 'approuvé'] 
 French: ['laisser', '24', 'heures', 'au', 'représentant', 'du', 'ministère', 'représentant', 'de', 'cdc', 'consultant', 'pour', 'examiner', 'léchantillon', 'de', 'louvrage', 'et', 'ne', 'pas', 'entreprendre', 'les', 'travaux', 'avant', 'que', 'celuici', 'ait', 'été', 'approuvé'] 


55091 0.9375
English: ['une', 'fois', 'accepté', 'léchantillon', 'constituera', 'la', 'norme', 'de', 'qualité', 'à', 'respecter', 'pour', 'les', 'présents', 'travaux'] 
 French: ['une', 'fois', 'accepté', 'léchantillon', 'de', 'louvrage', 'constituera', 'la', 'norme', 'de', 'qualité', 'à', 'respecter', 'pour', 'les', 'présents', 'travaux'] 


55092 0.9230769230769231
English: ['remove', 'mockup', 'at', 'completion', 'of', 'work', 'as', 'directed', 

58099 0.9615384615384616
English: ['the', 'establishment', 'of', 'a', 'coordinated', 'entryexit', 'information', 'system', 'to', 'capture', 'reconcile', 'and', 'store', 'data', 'on', 'the', 'entry', 'and', 'exit', 'of', 'travellers', 'into', 'and', 'out', 'of', 'canada', 'in', 'the', 'air', 'and', 'land', 'modes'] 
 French: ['hhhhhhthe', 'establishment', 'of', 'a', 'coordinated', 'entryexit', 'information', 'system', 'to', 'capture', 'reconcile', 'and', 'store', 'data', 'on', 'the', 'entry', 'and', 'exit', 'of', 'travellers', 'into', 'and', 'out', 'of', 'canada', 'in', 'the', 'air', 'and', 'land', 'modes'] 


59687 0.8
English: ['je', 'suis', 'arriv', 'la', 'conclusion', 'que', 'vous', 'ne', 'satisfaites', 'pas', 'aux', 'exigences', 'dimmigration', 'au', 'canada', 'dans', 'cette', 'catgorie'] 
 French: ['je', 'suis', 'arrivé', 'la', 'conclusion', 'que', 'vous', 'ne', 'satisfaites', 'pas', 'aux', 'exigences', 'dimmigration', 'au', 'canada', 'dans', 'cette', 'catégorie'] 


59704 0.86956

English: ['the', 'current', 'decline', 'in', 'global', 'commodity', 'prices', 'is', 'expected', 'to', 'have', 'a', 'negative', 'impact', 'on', 'fiscal', 'accounts', 'and', 'to', 'reduce', 'foreign', 'investment', 'especially', 'in', 'capitalintensive', 'areas', 'such', 'as', 'mining', 'and', 'hydrocarbon', 'projects'] 
 French: ['the', 'current', 'decline', 'in', 'global', 'commodity', 'prices', 'is', 'expected', 'to', 'negatively', 'impact', 'fiscal', 'accounts', 'and', 'reduce', 'foreign', 'investment', 'particularly', 'in', 'capitalintensive', 'areas', 'such', 'as', 'mining', 'and', 'hydrocarbon', 'projects'] 


65139 0.8636363636363636
English: ['slower', 'growth', 'in', 'latin', 'america', 'including', 'honduras', 'combined', 'with', 'weaker', 'fiscal', 'finances', 'could', 'undo', 'poverty', 'reduction', 'achievements', 'achieved', 'in', 'recent', 'years'] 
 French: ['slower', 'growth', 'in', 'latin', 'america', 'including', 'honduras', 'combined', 'with', 'weaker', 'fiscal', 'fi

English: ['for', 'example', 'based', 'the', 'eventual', 'strategic', 'narrative', 'for', 'pch', 'which', 'may', 'well', 'result', 'in', 'adjustments', 'to', 'our', 'descriptions', 'of', 'the', 'strategic', 'outcomes', 'and', 'pa', 'we', 'may', 'determine', 'that', 'the', 'highlighted', 'boxes', 'insufficiently', 'contribute', 'to', 'the', 'achievement', 'of', 'the', 'mission'] 
 French: ['quatre', 'for', 'example', 'based', 'the', 'eventual', 'strategic', 'narrative', 'for', 'pch', 'which', 'may', 'well', 'result', 'in', 'adjustments', 'to', 'our', 'descriptions', 'of', 'the', 'strategic', 'outcomes', 'and', 'pa', 'we', 'may', 'determine', 'that', 'the', 'highlighted', 'boxes', 'insufficiently', 'contribute', 'to', 'the', 'achievement', 'of', 'the', 'mission'] 


68480 0.9285714285714286
English: ['lévaluation', 'a', 'également', 'pris', 'en', 'considération', 'toutes', 'informations', 'disponibles', 'se', 'rapportant', 'audelà', 'de', 'cette', 'période', 'afin', 'doffrir', 'un', 'port

English: ['de', '1997', 'à', '2008', 'en', 'tenant', 'compte', 'de', 'linflation', 'les', 'dépenses', 'des', 'canadiens', 'en', 'achat', 'de', 'livres', 'excluant', 'les', 'manuels', 'scolaires', 'ont', 'augmenté', 'de', '24', 'alors', 'que', 'la', 'population', 'canadienne', 'connaissait', 'une', 'croissance', 'de', '14', 'au', 'cours', 'de', 'la', 'même', 'période'] 
 French: ['de', '1997', 'à', '2008', 'en', 'tenant', 'compte', 'de', 'linflation', 'les', 'dépenses', 'des', 'canadiens', 'en', 'achat', 'de', 'livres', 'les', 'manuels', 'scolaires', 'non', 'compris', 'ont', 'augmenté', 'de', '24', 'alors', 'que', 'la', 'population', 'canadienne', 'connaissait', 'une', 'croissance', 'de', '14', 'au', 'cours', 'de', 'la', 'même', 'période'] 


68729 0.8125
English: ['lalberta', 'arrive', 'en', 'tête', 'de', 'toutes', 'les', 'provinces', 'en', 'matière', 'de', 'dépenses', 'par', 'habitant', 'sur', 'lachat', 'de', 'livres', '52', 'suivie', 'par', 'la', 'colombiebritannique', '47', 'le', 'm

English: ['13', 'ces', 'moyennes', 'sont', 'calculées', 'par', 'léquipe', 'dévaluation', 'à', 'partir', 'des', 'chiffres', 'provenant', 'de', 'la', 'base', 'de', 'données', 'de', 'la', 'dppm', 'pour', 'chacune', 'des', 'années', 'couvertes', 'par', 'lévaluation'] 
 French: ['12', 'ces', 'moyennes', 'sont', 'calculées', 'par', 'léquipe', 'dévaluation', 'à', 'partir', 'des', 'chiffres', 'provenant', 'de', 'la', 'base', 'de', 'données', 'de', 'la', 'dppm', 'pour', 'chacune', 'des', 'années', 'couvertes', 'par', 'lévaluation'] 


69328 0.8666666666666667
English: ['19', 'les', 'activités', 'liées', 'à', 'ce', 'volet', 'ont', 'pris', 'fin', 'le', '31', 'mars', '2013'] 
 French: ['18', 'les', 'activités', 'liées', 'à', 'ce', 'volet', 'ont', 'pris', 'fin', 'le', '31', 'mars', '2013'] 


69330 0.9310344827586207
English: ['21', 'entre', '20072008', 'et', '20112012', 'un', 'maximum', 'de', '22', 'maisons', 'de', 'disque', 'indépendantes', 'a', 'été', 'soutenu', 'financièrement', 'à', 'tous', 'l

74522 1.0
English: ['sarian', 'f'] 
 French: ['sarian', 'f', 'sarian', 'f'] 


74533 1.0
English: ['halifax'] 
 French: ['halifax', 'halifax'] 


74559 1.0
English: ['869', 'mhz'] 
 French: ['869', 'mhz', '869', 'mhz'] 


74609 1.0
English: ['150', 'vhf'] 
 French: ['vhf', '150'] 


74610 1.0
English: ['450', 'uhf'] 
 French: ['uhf', '450'] 


74611 1.0
English: ['800', 'uhf'] 
 French: ['uhf', '800'] 


74613 1.0
English: ['900', 'uhf'] 
 French: ['uhf', '900'] 


75163 1.0
English: ['radar', 'applications'] 
 French: ['applications', 'radar'] 


75283 1.0
English: ['maritime', 'mobile', 'service'] 
 French: ['service', 'mobile', 'maritime'] 


76409 0.9
English: ['41', 'the', 'public', 'safety', 'nationwide', 'interoperable', 'broadband', 'network', 'a', 'new', 'model', 'for', 'capacity', 'performance', 'and', 'cost', 'june', '2010', 'httptransitionfccgovpshsdocsreleasesdoc298799a1pdf'] 
 French: ['41', 'the', 'public', 'safety', 'nationwide', 'interoperable', 'broadband', 'network',

77475 0.7857142857142857
English: ['les', 'noms', 'ladresse', 'de', 'lentreprise', 'et', 'ladresse', 'postale', 'les', 'courriels', 'les', 'numéros', 'de', 'téléphone', 'et', 'les', 'noms', 'des', 'personnesressources'] 
 French: ['les', 'noms', 'ladresse', 'de', 'lentreprise', 'et', 'ladresse', 'postale', 'les', 'adresses', 'électroniques', 'les', 'numéros', 'de', 'téléphone', 'et', 'les', 'noms', 'des', 'personnesressources'] 


77480 0.782608695652174
English: ['la', 'liste', 'des', 'sites', 'des', 'usines', 'y', 'compris', 'le', 'nom', 'des', 'usines', 'ladresse', 'de', 'lentreprise', 'et', 'ladresse', 'postale', 'les', 'courriels', 'les', 'numéros', 'de', 'téléphone', 'et', 'les', 'noms', 'des', 'succursales'] 
 French: ['la', 'liste', 'des', 'sites', 'des', 'scieries', 'y', 'compris', 'le', 'nom', 'des', 'scieries', 'ladresse', 'de', 'lentreprise', 'et', 'ladresse', 'postale', 'les', 'adresses', 'électroniques', 'les', 'numéros', 'de', 'téléphone', 'et', 'les', 'noms', 'des', 'su

80320 1.0
English: ['johnpaul', 'boyd', 'ma', 'llb'] 
 French: ['johnpaul', 'boyd', 'ma', 'llb'] 


80321 1.0
English: ['joanne', 'j', 'paetsch', 'ba'] 
 French: ['joanne', 'j', 'paetsch', 'ba'] 


80322 1.0
English: ['lorne', 'd', 'bertrand', 'phd'] 
 French: ['lorne', 'd', 'bertrand', 'phd'] 


80323 1.0
English: ['zoe', 'suche', 'llb'] 
 French: ['zoe', 'suche', 'llb'] 


80333 1.0
English: ['introduction'] 
 French: ['introduction'] 


80366 1.0
English: ['idaho'] 
 French: ['idaho'] 


80367 1.0
English: ['arizona'] 
 French: ['arizona'] 


80369 1.0
English: ['alaska'] 
 French: ['alaska'] 


80370 1.0
English: ['vermont'] 
 French: ['vermont'] 


80371 1.0
English: ['massachusetts'] 
 French: ['massachusetts'] 


80385 1.0
English: ['figure', '31'] 
 French: ['figure', '31'] 


80387 1.0
English: ['figure', '32'] 
 French: ['figure', '32'] 


80389 1.0
English: ['figure', '33'] 
 French: ['figure', '33'] 


80391 1.0
English: ['figure', '34'] 
 French: ['figure', '34'] 


80395 

English: ['en', 'outre', 'les', 'avocats', 'spécialisés', 'en', 'droit', 'de', 'la', 'famille', 'devraient', 'songer', 'à', 'tenir', 'compte', 'des', 'répercussions', 'potentielles', 'de', 'telles', 'ordonnances', 'sur', 'des', 'procédures', 'subséquentes', 'donc', 'à', 'inclure', 'des', 'dispositions', 'comme', 'cellesci', 'sous', 'réserve', 'de', 'toute', 'ordonnance', 'rendue', 'subséquemment', 'par', 'un', 'tribunal', 'pénal', 'en', 'réaction', 'à', 'des', 'faits', 'survenus', 'après', 'la', 'date', 'de', 'la', 'présente', 'ordonnance', 'sous', 'réserve', 'de', 'toute', 'ordonnance', 'rendue', 'subséquemment', 'par', 'un', 'tribunal', 'pénal', 'après', 'avoir', 'pris', 'en', 'compte', 'le', 'présent', 'accord', 'ou', 'la', 'présente', 'ordonnance', 'sous', 'réserve', 'des', 'dispositions', 'relatives', 'à', 'la', 'communication', 'prises', 'après', 'la', 'date', 'de', 'la', 'présente', 'ordonnance', 'par', 'lorganisme', 'de', 'protection', 'de', 'la', 'jeunesse', 'sous', 'réserve',

 French: ['en', 'effet', 'les', 'instances', 'en', 'matière', 'de', 'protection', 'de', 'la', 'jeunesse', 'sont', 'régies', 'par', 'des', 'échéances', 'rigoureuses', 'sans', 'compter', 'quà', 'certains', 'endroits', 'au', 'pays', 'un', 'enfant', 'placé', 'en', 'famille', 'daccueil', 'doit', 'à', 'léchéance', 'dun', 'délai', 'prescrit', 'soit', 'être', 'rendu', 'à', 'sa', 'famille', 'soit', 'être', 'déclaré', 'pupille', 'de', 'la', 'couronnepupille', 'sous', 'tutelle', 'judiciaire'] 


81591 0.9636363636363636
English: ['lorsque', 'des', 'parents', 'se', 'réconcilient', 'et', 'que', 'laccusé', 'en', 'instance', 'pénale', 'refuse', 'dadmettre', 'le', 'moindre', 'acte', 'de', 'violence', 'familiale', 'il', 'ne', 'sera', 'probablement', 'pas', 'possible', 'daborder', 'les', 'questions', 'de', 'protection', 'de', 'la', 'jeunesse', 'dans', 'les', 'délais', 'prescrits', 'auquel', 'cas', 'lenfant', 'en', 'cause', 'pourra', 'devenir', 'pupille', 'de', 'la', 'couronnepupille', 'sous', 'tutelle',

81771 0.9166666666666666
English: ['le', 'safe', 'communities', 'project', 'projet', 'de', 'sécurité', 'des', 'collectivités', 'safecom', 'une', 'initiative', 'menée', 'en', 'alberta', 'encourage', 'la', 'collaboration', 'interministérielle', 'en', 'matière', 'de', 'prévention', 'de', 'la', 'criminalité', '386'] 
 French: ['le', 'safe', 'communities', 'project', 'projet', 'de', 'sécurité', 'des', 'collectivités', 'safecom', 'une', 'initiative', 'menée', 'en', 'alberta', 'encourage', 'la', 'collaboration', 'interministérielle', 'en', 'matière', 'de', 'prévention', 'de', 'la', 'criminalité', '355'] 


81772 0.8
English: ['autrement', 'dit', 'cette', 'démarche', 'permet', 'lintégration', 'des', 'services', '387'] 
 French: ['autrement', 'dit', 'cette', 'démarche', 'permet', 'lintégration', 'des', 'services', '356'] 


81773 0.8372093023255814
English: ['au', 'nouveaubrunswick', 'le', 'projet', 'pilote', 'de', 'tribunal', 'chargé', 'des', 'causes', 'de', 'violence', 'conjugale', 'de', 'mon

 French: ['50', 'y', 'compris', 'la', 'protection', 'contre', 'les', 'fouilles', 'perquisitions', 'et', 'saisies', 'abusives', 'et', 'contre', 'la', 'détention', 'arbitraire', 'durant', 'lenquête', 'articles', 'huit', 'et', 'neuf', 'le', 'droit', 'de', 'savoir', 'pourquoi', 'on', 'est', 'détenu', 'et', 'davoir', 'recours', 'à', 'lassistance', 'dun', 'avocat', 'article', '10', 'ainsi', 'que', 'le', 'droit', 'à', 'un', 'procès', 'juste', 'avec', 'présomption', 'dinnocence', 'et', 'le', 'droit', 'dêtre', 'jugé', 'dans', 'un', 'délai', 'raisonnable', 'articles', 'sept', 'et', '11'] 


81938 0.9
English: ['57', 'jeffrey', 'wilson', 'wilson', 'on', 'children', 'and', 'the', 'law', 'markham', 'lexis', 'nexis', '1994', 'recueil', 'de', 'feuillets', 'mobiles', 'mars', '2012', '31'] 
 French: ['51', 'jeffrey', 'wilson', 'wilson', 'on', 'children', 'and', 'the', 'law', 'markham', 'lexis', 'nexis', '1994', 'recueil', 'de', 'feuillets', 'mobiles', 'mars', '2012', '31'] 


81939 0.7931034482758621
E


82017 0.8823529411764706
English: ['136', 'conformément', 'à', 'larticle', '7261', 'du', 'code', 'criminel', 'sous', 'réserve', 'des', 'limites', 'prévues', 'à', 'larticle', '725', 'autres', 'infractions'] 
 French: ['121', 'conformément', 'à', 'larticle', '7261', 'du', 'code', 'criminel', 'sous', 'réserve', 'des', 'limites', 'prévues', 'à', 'larticle', '725', 'autres', 'infractions'] 


82018 0.875
English: ['137', 'us', 'department', 'of', 'justice', 'the', 'national', 'domestic', 'violence', 'fatality', 'review', 'initiative', 'en', 'ligne', 'httpwwwndvfriorg'] 
 French: ['122', 'us', 'department', 'of', 'justice', 'the', 'national', 'domestic', 'violence', 'fatality', 'review', 'initiative', 'en', 'ligne', 'httpwwwndvfriorg'] 


82019 0.9047619047619048
English: ['138ontario', 'ministère', 'de', 'la', 'sécurité', 'communautaire', 'et', 'des', 'services', 'correctionnels', 'comité', 'dexamen', 'des', 'décès', 'dus', 'à', 'la', 'violence', 'familiale', 'en', 'ligne', 'httpwwwmcscsju

 French: ['322', 'plutôt', 'que', 'de', 'linformation', 'sur', 'la', 'façon', 'dont', 'le', 'procureur', 'de', 'la', 'couronne', 'a', 'eu', 'recours', 'à', 'son', 'jugement', 'p', 'ex', 'dans', 'le', 'cadre', 'de', 'la', 'négociation', 'du', 'plaidoyer'] 


82231 0.9789473684210527
English: ['350', 'voir', 'par', 'ex', 'lhonorable', 'leonard', 'edwards', 'steve', 'baron', 'et', 'george', 'ferrick', 'a', 'comment', 'on', 'william', 'j', 'howe', 'and', 'hugh', 'mcisaacs', 'article', 'finding', 'the', 'balance', 'published', 'in', 'the', 'january', '2008', 'issue', 'of', 'family', 'court', 'review', '2008', '464', 'family', 'court', 'review', '586', 'anne', 'fuchs', 'considering', 'the', 'needs', 'of', 'domestic', 'violence', 'victims', 'the', 'exceptions', 'to', 'minnesotas', 'alternative', 'dispute', 'resolution', 'rule', '114', 'minneapolis', 'mn', 'the', 'battered', 'womens', 'justice', 'project', 'avril', '2011', 'en', 'ligne', 'en', 'anglais', 'httpwwwbwjporgfilesbwjparticlesconside

85319 0.8
English: ['report', 'on', 'plans', 'and', 'priorities', 'organizational', 'priorities', 'instructions', 'rapport', 'sur', 'les', 'plans', 'et', 'les', 'priorités', 'priorités', 'organisationnelles', 'directives'] 
 French: ['report', 'on', 'plans', 'and', 'priorities', 'organizational', 'priorities', 'instructions_x000d_rapport', 'sur', 'les', 'plans', 'et', 'les', 'priorités', 'priorités', 'organisationnelles', 'directives'] 


85322 0.875
English: ['two', 'pid', 'backfill', 'for', 'employee', 'going', 'on', 'assignment'] 
 French: ['pid', 'backfill', 'for', 'employee', 'going', 'on', 'assignment'] 


85874 0.7619047619047619
English: ['total', 'intake', 'in', '2013', '62927', 'q4', 'fy', '1213', '8921', 'q115196', 'q218220', 'q320590', 'total', 'processed', 'in', '2013', '56797', 'q4', 'fy', '1213', '5545', 'q113232', 'q216919', 'q321101'] 
 French: ['total', 'intake', 'in', '2013', '62927', 'q4', 'fy', '1213', '8921', 'q115196', 'q218220', 'q320590total', 'processed', 'in'

English: ['internally', 'cic', 'could', 'increase', 'issuance', 'of', 'tablets', 'or', 'similar', 'mobile', 'devices', 'to', 'create', 'a', 'more', 'agile', 'and', 'effective', 'public', 'service', 'that', 'supports', 'a', 'flexible', 'and', 'modern', 'workplace'] 
 French: ['internally', 'cic', 'could', 'increase', 'the', 'use', 'of', 'tablets', 'or', 'similar', 'mobile', 'devices', 'to', 'create', 'a', 'more', 'agile', 'and', 'effective', 'public', 'service', 'that', 'supports', 'a', 'flexible', 'and', 'modern', 'workplace'] 


86272 0.7857142857142857
English: ['corporate', 'communications', 'has', 'already', 'begun', 'work', 'to', 'improve', 'the', 'search', 'capability', 'of', 'the', 'cic', 'intranet', 'site', 'making', 'it', 'easier', 'for', 'employees', 'to', 'find', 'the', 'information', 'they', 'are', 'looking', 'for'] 
 French: ['corporate', 'communications', 'has', 'already', 'begun', 'work', 'to', 'improve', 'the', 'search', 'capability', 'of', 'the', 'cic', 'intranet', 'si

English: ['supply', 'bill', 'projet', 'de', 'loi', 'des', 'crédits'] 
 French: ['projet', 'de', 'loi', 'des', 'crédits', 'supply', 'bill'] 


88418 1.0
English: ['treasury', 'board', 'secretariat', 'tbs', 'secrétariat', 'du', 'conseil', 'du', 'trésor', 'du', 'canada', 'sct'] 
 French: ['secrétariat', 'du', 'conseil', 'du', 'trésor', 'du', 'canada', 'sct', 'treasury', 'board', 'secretariat', 'tbs'] 


88422 1.0
English: ['treasury', 'board', 'submission', 'présentation', 'au', 'conseil', 'du', 'trésor'] 
 French: ['présentation', 'au', 'conseil', 'du', 'trésor', 'treasury', 'board', 'submission'] 


88431 1.0
English: ['vote', 'crédit'] 
 French: ['crédit', 'vote'] 


88433 1.0
English: ['vote', 'realignment', 'réalignement', 'des', 'crédits'] 
 French: ['réalignement', 'des', 'crédits', 'vote', 'realignment'] 


88487 0.8571428571428571
English: ['prestations', 'demploi', 'et', 'mesures', 'de', 'soutien'] 
 French: ['pems', 'prestations', 'demploi', 'et', 'mesures', 'de', 'soutien'] 



 French: ['as', 'only', '38', 'of', 'the', 'true', 'positive', 'cows', 'can', 'be', 'detected', 'by', 'culture', 'at', 'first', 'sampling', 'because', 'of', 'low', 'shedding', '36', 'the', 'true', 'performance', 'of', 'the', 'dna', 'extraction', 'kit', 'b', 'is', 'thus', 'much', 'higher', 'than', 'kits', 'a', 'or', 'c'] 


92115 0.8666666666666667
English: ['the', 'reasons', 'for', 'the', 'discordant', 'results', 'from', 'spiked', 'and', 'naturally', 'infected', 'samples', 'are', 'not', 'clear'] 
 French: ['the', 'reasons', 'why', 'results', 'from', 'spiked', 'and', 'naturally', 'infected', 'samples', 'are', 'discordant', 'are', 'not', 'clear'] 


92117 0.8333333333333334
English: ['indeed', 'when', 'samples', 'are', 'spiked', 'heaps', 'of', 'map', 'due', 'to', 'clumping', 'are', 'largely', 'reduced', 'when', 'the', 'map', 'culture', 'is', 'homogenized'] 
 French: ['indeed', 'when', 'spiking', 'samples', 'heaps', 'of', 'map', 'due', 'to', 'clumping', 'are', 'largely', 'reduced', 'when'

92836 0.7857142857142857
English: ['the', 'information', 'that', 'you', 'provide', 'will', 'be', 'part', 'of', 'the', 'staffing', 'file'] 
 French: ['de', 'quand', 'à', 'quand', 'the', 'information', 'that', 'you', 'provide', 'will', 'be', 'part', 'of', 'the', 'staffing', 'file'] 


92839 1.0
English: ['page'] 
 French: ['page'] 


93093 1.0
English: ['tctaraerattctcgcca'] 
 French: ['tctaraerattctcgcca'] 


93272 0.9583333333333334
English: ['this', 'initiative', 'is', 'a', 'collaboration', 'not', 'only', 'between', 'the', 'canada', 'ontario', 'and', 'québec', 'governments', 'but', 'also', 'with', 'several', 'stakeholders', 'from', 'the', 'public', 'and', 'private', 'sectors'] 
 French: ['this', 'initiative', 'is', 'a', 'collaboration', 'not', 'only', 'between', 'the', 'canada', 'ontario', 'and', 'québec', 'governments', 'but', 'also', 'with', 'several', 'stakeholders', 'from', 'the', 'public', 'and', 'private', 'sectors', 'deux'] 


93273 0.9375
English: ['i', 'would', 'like', 'to', 

In [0]:
# Build a per-sentence "contains a number" annotation for the test sample and
# write it to CSV as rows of (index, english, french, label), where label is
# 0 when the English sentence contains a digit and 1 otherwise.

# Read both sides of the sample, one sentence per line (trailing newline removed).
with open('/data/rali5/Tmp/shiven/classification/data/test/sample.en','r') as r:
    test__en = [item.strip('\n') for item in tqdm(r.readlines())]

with open('/data/rali5/Tmp/shiven/classification/data/test/sample.fr','r') as r:
    test__fr = [item.strip('\n') for item in tqdm(r.readlines())]


# Tokenised versions of both sides. They are not used by the labelling below;
# kept because other cells may read them -- TODO confirm, otherwise drop.
eng = [nltk.word_tokenize(token) for token in tqdm(test__en)]
frc = [nltk.word_tokenize(token) for token in tqdm(test__fr)]


# Indices of English sentences that contain at least one digit character.
# NOTE(review): the check runs over the characters of the raw sentence, not
# over tokens, so a mixed token such as 'q115196' also counts -- confirm this
# is the intended definition of "has a number".
# Each index is recorded once (the previous loop appended it once per digit).
big_unmatched = [
    idx for idx, sentence in enumerate(test__en)
    if any(char.isdigit() for char in sentence)
]

# Set copy for O(1) membership tests; scanning the list for every sentence
# made the labelling pass quadratic in the corpus size.
digit_sentence_ids = set(big_unmatched)

# zip() pairs sentences by position and stops at the shorter of the two files.
data = [
    (idx, en, fr, 0 if idx in digit_sentence_ids else 1)
    for idx, (en, fr) in enumerate(zip(test__en, test__fr))
]


# No header row and no index column: the file holds only the four fields above.
pd.DataFrame(data).to_csv('/data/rali5/Tmp/shiven/classification/data/test/number_annotaion.csv', index=None, header=False) 