<a href="https://colab.research.google.com/github/arqavan94/Persian_NLP_Task/blob/main/parallel_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the laserembeddings package, which provides the `Laser` class imported below.
!pip install laserembeddings
# One-time setup, left disabled: downloads the LASER model files into the Drive
# folder that the model-loading cell later reads its three paths from.
# !python -m laserembeddings download-models /content/drive/MyDrive/lasermodels

In [None]:
import torch
from laserembeddings import Laser
import numpy as np
import random
from torchmetrics.functional import pairwise_cosine_similarity

# **Load Data**

In [None]:
# Read the two monolingual corpora: one sentence per line, with leading and
# trailing whitespace (including the newline) stripped from each line.
# The encoding is stated explicitly as UTF-8 so that umlauts and accented
# characters are decoded the same way on every platform, and so that reading
# agrees with the UTF-8 encoding used when the parallel file is written.
with open("/content/drive/MyDrive/task/de.unpar.txt", "r", encoding="utf-8") as f:
    # Iterating the file object yields the same lines as f.readlines()
    # without building an intermediate list first.
    german_sentences = [line.strip() for line in f]
print("The length of German sentences is: " ,len(german_sentences))

with open("/content/drive/MyDrive/task/pt.unpar.txt", "r", encoding="utf-8") as f:
    portuguese_sentences = [line.strip() for line in f]
print("The length of Portuguese sentences is : ", len(portuguese_sentences))

The length of German sentences is:  150
The length of Portuguese sentences is :  130


# **Load the pre-trained LASER embedding model**

In [None]:
# The BPE codes, BPE vocabulary and encoder files all live in the same
# Drive folder, so the folder is spelled out once and reused for each path.
laser_model_dir = '/content/drive/MyDrive/lasermodels'

path_to_bpe_codes = f'{laser_model_dir}/93langs.fcodes'
path_to_bpe_vocab = f'{laser_model_dir}/93langs.fvocab'
path_to_encoder = f'{laser_model_dir}/bilstm.93langs.2018-12-26.pt'

# Build the LASER embedder from the three local files instead of the
# package's default model location.
laser = Laser(
    path_to_bpe_codes,
    path_to_bpe_vocab,
    path_to_encoder,
)


In [None]:
# Encode each corpus with the loaded LASER model, passing the language code
# that belongs to the corpus ("de" for German, "pt" for Portuguese).
# The pairs are processed in order, so German is embedded first.
german_embeddings, portuguese_embeddings = (
    laser.embed_sentences(corpus, lang=language_code)
    for corpus, language_code in (
        (german_sentences, "de"),
        (portuguese_sentences, "pt"),
    )
)

In [None]:
# Display the shape of the German embedding array as the cell output.
german_embeddings.shape

(150, 1024)

In [None]:
# Display the shape of the Portuguese embedding array as the cell output.
portuguese_embeddings.shape

(130, 1024)

# **Compute pairwise cosine similarity between German and Portuguese sentences**

In [None]:
# Pairwise cosine similarity: entry [i, j] is the dot product of German
# embedding i and Portuguese embedding j divided by the product of their
# Euclidean norms. Rows index German sentences, columns Portuguese sentences.
german_norms = np.linalg.norm(german_embeddings, axis=1)
portuguese_norms = np.linalg.norm(portuguese_embeddings, axis=1)

# Guard against all-zero embedding vectors. Dividing by a zero norm gives
# 0/0 = NaN, and np.argsort orders NaN after every real value, so a
# "sort ascending, reverse, take the first k" selection would rank those
# meaningless pairs first. Clamping the norm makes such pairs score 0 instead;
# vectors with a non-zero norm are unaffected.
min_norm = 1e-12

cosine_similarities = np.matmul(german_embeddings, portuguese_embeddings.T)
cosine_similarities /= np.maximum(german_norms, min_norm)[:, None]
cosine_similarities /= np.maximum(portuguese_norms, min_norm)[None, :]

# **Pick the 100 most similar sentence pairs**

In [None]:
# Rank every (German, Portuguese) pair by similarity and keep the 100 best.
# The matrix is flattened row by row, so a flat position p corresponds to
# row p // n_columns (German index) and column p % n_columns (Portuguese index).
n_columns = cosine_similarities.shape[1]
similarity_array = cosine_similarities.flatten()

# argsort is ascending; reversing it puts the highest similarities first.
descending_order = np.argsort(similarity_array)[::-1]
top_100_indices = descending_order[:100]

# divmod returns (quotient, remainder), i.e. (row, column) for each flat index.
top_100_pairs = [divmod(flat_index, n_columns) for flat_index in top_100_indices]

# **Create parallel data**

In [None]:
# Write the selected pairs as one tab-separated line each:
# German sentence, a tab, the Portuguese sentence, then a newline.
with open('parallel_data.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        f'{german_sentences[german_idx]}\t{portuguese_sentences[portuguese_idx]}\n'
        for german_idx, portuguese_idx in top_100_pairs
    )