In [None]:
# Environment setup: shell commands run through the notebook's `!` escape.
# libomp-dev is the OpenMP development package; presumably needed by faiss
# (the original note here read "For CPU") -- TODO confirm it is still required.
!apt install libomp-dev
# Alternative faiss package, left disabled:
# !pip install faiss
# GPU build of faiss; the kNN helper in this notebook moves its index to the GPUs.
!pip install faiss-gpu
# Provides SentenceTransformer, used to load the embedding model.
!pip install sentence_transformers

In [None]:
# Input files: UTF-16 text, read line by line (one list entry per line).
source_file = "/content/drive/MyDrive/aligner-test-files/india_2020_en-ta/total_en_2020_January_sen.txt"
target_file = "/content/drive/MyDrive/aligner-test-files/india_2020_en-ta/total_ta_2020_January_sen.txt"

# Read each file and strip surrounding whitespace (including the newline) from every line.
with open(source_file, encoding="utf-16") as src_handle:
    source_sentences = [line.strip() for line in src_handle]
with open(target_file, encoding="utf-16") as tgt_handle:
    target_sentences = [line.strip() for line in tgt_handle]

# Report how many entries were loaded on each side (source first, then target).
print(len(source_sentences), len(target_sentences), sep="\n")

70199
2497


In [None]:
import faiss
import numpy as np
import time
import gzip
import lzma

########  Functions to find and score candidates
def score(x, y, fwd_mean, bwd_mean, margin):
    """Margin score of a single candidate pair.

    Calls ``margin`` with the dot product of ``x`` and ``y`` as first argument
    and the average of ``fwd_mean`` and ``bwd_mean`` as second argument, and
    returns whatever ``margin`` returns.
    """
    pair_value = x.dot(y)
    neighbourhood_mean = (fwd_mean + bwd_mean) / 2
    return margin(pair_value, neighbourhood_mean)


def score_candidates(x, y, candidate_inds, fwd_mean, bwd_mean, margin):
    """Score every (row, candidate) pair listed in ``candidate_inds``.

    ``candidate_inds[i, j]`` is an index ``k`` into ``y`` (and ``bwd_mean``);
    entry ``[i, j]`` of the result is ``score(x[i], y[k], fwd_mean[i],
    bwd_mean[k], margin)``.

    Returns:
        A float array with the same shape as ``candidate_inds``.
    """
    scores = np.zeros(candidate_inds.shape)
    for i, candidate_row in enumerate(candidate_inds):
        for j, k in enumerate(candidate_row):
            scores[i, j] = score(x[i], y[k], fwd_mean[i], bwd_mean[k], margin)
    return scores


def kNN(x, y, k):
    """Exact k-nearest-neighbour search of every row of ``x`` among the rows of ``y``.

    Neighbours are ranked by inner product (``faiss.IndexFlatIP``). The values
    returned as ``sim`` are therefore on the same scale as the ``x.dot(y)``
    used by ``score``, so their per-row mean is a valid denominator for the
    margin. An L2 index would instead return squared distances, which grow as
    vectors get *less* similar and must not be mixed with dot products.

    Args:
        x: 2-D array of query vectors, one per row.
        y: 2-D array of vectors to index; ``y.shape[1]`` is the dimensionality.
        k: number of neighbours to retrieve per query row.

    Returns:
        ``(sim, ind)`` as produced by the faiss ``search`` call: for each row
        of ``x`` the ``k`` largest inner products and the row indices into
        ``y`` they belong to.
    """
    start_time = time.time()
    print("Perform exact search")
    # faiss only accepts C-contiguous float32 matrices; this is a no-op for
    # inputs that already have that layout and dtype.
    queries = np.ascontiguousarray(x, dtype=np.float32)
    database = np.ascontiguousarray(y, dtype=np.float32)
    index = faiss.IndexFlatIP(database.shape[1])
    print("index done")
    # Shard the index over the GPUs when there are any; otherwise keep the CPU
    # index so the notebook also runs with a CPU-only faiss build/runtime.
    num_gpus = faiss.get_num_gpus() if hasattr(faiss, "get_num_gpus") else 0
    if num_gpus > 0:
        index = faiss.index_cpu_to_all_gpus(index)
    print("added to index")
    index.add(database)
    print("add successfull")
    sim, ind = index.search(queries, k)
    print("Done: {:.2f} sec".format(time.time()-start_time))
    return sim, ind

In [None]:
%%time
from sentence_transformers import SentenceTransformer, models
import numpy as np
import gzip
import tqdm
import torch
import pandas as pd

#Model we want to use for bitext mining. LaBSE achieves state-of-the-art performance
model_name = 'LaBSE'
# Run on the GPU when one is visible to torch; otherwise fall back to the CPU
# instead of failing at load time on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer(model_name, device=device)

HBox(children=(FloatProgress(value=0.0, max=1754318854.0), HTML(value='')))


CPU times: user 51.5 s, sys: 10.7 s, total: 1min 2s
Wall time: 2min 8s


In [None]:
%%time
# Embed every source sentence with the loaded model; the result is requested as a NumPy array.
source_sentences = [*source_sentences]
print("Encode source sentences", len(source_sentences))
source_embeddings = model.encode(
    source_sentences,
    convert_to_numpy=True,
    show_progress_bar=True,
)

Encode source sentences 70199


HBox(children=(FloatProgress(value=0.0, description='Batches', max=2194.0, style=ProgressStyle(description_wid…


CPU times: user 2min 20s, sys: 15 s, total: 2min 35s
Wall time: 2min 35s


In [None]:
%%time
# Embed every target sentence with the loaded model; the result is requested as a NumPy array.
target_sentences = [*target_sentences]
print("Encode target sentences", len(target_sentences))
target_embeddings = model.encode(
    target_sentences,
    convert_to_numpy=True,
    show_progress_bar=True,
)

Encode target sentences 2497


HBox(children=(FloatProgress(value=0.0, description='Batches', max=79.0, style=ProgressStyle(description_width…


CPU times: user 6.02 s, sys: 46.1 ms, total: 6.06 s
Wall time: 6.01 s


In [None]:
%%time

# Neighbourhood size: the scoring looks at the k nearest neighbours of each element
knn_neighbors = 2

# Lowest score a text pair may have and still be kept (scores are not capped at 1)
min_threshold = 1

x, y = source_embeddings, target_embeddings

# Source -> target search; keep the per-source mean of the neighbour values
x2y_sim, x2y_ind = kNN(x, y, knn_neighbors)
x2y_mean = np.mean(x2y_sim, axis=1)
print(x2y_mean[0])

# Target -> source search; keep the per-target mean of the neighbour values
y2x_sim, y2x_ind = kNN(y, x, knn_neighbors)
y2x_mean = np.mean(y2x_sim, axis=1)
print(y2x_mean[0])

# Ratio margin: the pair's value divided by the averaged neighbourhood mean
def margin(a, b):
    return a / b

# Score every retrieved candidate, once per search direction
fwd_scores = score_candidates(x, y, x2y_ind, x2y_mean, y2x_mean, margin)
bwd_scores = score_candidates(y, x, y2x_ind, y2x_mean, x2y_mean, margin)

# For each row, pick the candidate index with the highest score
src_range = np.arange(x.shape[0])
trg_range = np.arange(y.shape[0])
fwd_best = x2y_ind[src_range, fwd_scores.argmax(axis=1)]
bwd_best = y2x_ind[trg_range, bwd_scores.argmax(axis=1)]

# Each row of `indices` is a (source index, target index) pair:
# all forward pairs first, followed by all backward pairs; `scores` is aligned with it
src_column = np.concatenate([src_range, bwd_best])
trg_column = np.concatenate([fwd_best, trg_range])
indices = np.stack([src_column, trg_column], axis=1)
scores = np.concatenate([fwd_scores.max(axis=1), bwd_scores.max(axis=1)])

# Indices already emitted on each side, filled in while extracting pairs
seen_src = set()
seen_trg = set()

# Extract the list of parallel sentences, highest score first
src_out, tgt_out, score_out = [], [], []
sentences_written = 0
for i in np.argsort(-scores):
    src_ind, trg_ind = (int(v) for v in indices[i])

    # Pairs are visited in descending score order, so the first one below
    # the threshold ends the extraction
    if scores[i] < min_threshold:
        break

    # Every source and every target sentence is emitted at most once
    if src_ind in seen_src or trg_ind in seen_trg:
        continue

    seen_src.add(src_ind)
    seen_trg.add(trg_ind)
    # Tabs are replaced by spaces in the emitted sentences
    src_out.append(source_sentences[src_ind].replace("\t", " "))
    tgt_out.append(target_sentences[trg_ind].replace("\t", " "))
    score_out.append(scores[i])
    sentences_written += 1


Perform exact search
index done
added to index
add successfull
Done: 0.27 sec
1.1421453
Perform exact search
index done
added to index
add successfull
Done: 0.23 sec
0.96488225
CPU times: user 1.68 s, sys: 437 ms, total: 2.12 s
Wall time: 1.99 s


In [None]:
# Collect the mined pairs into a table: one row per pair, columns in source/target/score order.
df = pd.DataFrame(
    {
        'src_out': src_out,
        'tgt_out': tgt_out,
        'score_out': score_out,
    }
)
df

Unnamed: 0,src_out,tgt_out,score_out
0,"— Kunal Kamra (@kunalkamra88) January 28, 2020","— Kunal Kamra (@kunalkamra88) January 28, 2020",11.512191
1,"— Ben Smith (@BuzzFeedBen) January 13, 2020","— Ben Smith (@BuzzFeedBen) January 13, 2020",6.531665
2,— Hardeep Singh Puri (@HardeepSPuri) January 2...,— Hardeep Singh Puri (@HardeepSPuri) January 2...,6.116069
3,"Imam, who is pursuing PhD in Modern Indian His...",நவீன இந்திய வரலாற்றில் பிஎச்டி படித்து வரும் இ...,5.515931
4,"Imam, who is pursuing PhD in Modern Indian His...",நவீன இந்திய வரலாற்றில் பிஎச்டி படித்து வரும் இ...,5.515927
...,...,...,...
694,"In September 2013, a fast-track court held the...",2013ம் ஆண்டு விரைவு நீதிமன்றங்கள் மூலமாக கூட்ட...,1.004952
695,"Earlier this month, I had the opportunity to i...","இந்த மாதத்தின் தொடக்கத்தில், கருமமே கண்ணாக இரு...",1.004757
696,"At Delhi’s protest march, called by the JNUSU ...","ஆர்ப்பாட்டத்தில், முன்னாள் ஜே. என். யு. மாணவர்...",1.004210
697,BJP is ruling in India.,"டெல்லியை ஆளும் ஆம் ஆத்மிக்கும், மத்தியில் ஆளும...",1.002981


In [None]:
# Write the table to output_file.json as a list of row records, indented by 2;
# force_ascii=False keeps non-ASCII characters as-is instead of \u-escaping them.
df.to_json("output_file.json", force_ascii=False, indent=2, orient='records')