In [1]:
# Install the notebook's dependencies into the running kernel's environment.
# The explicit %pip magic is used instead of a bare `pip` so the cell does not
# depend on IPython's automagic being enabled.
%pip install -U sentence-transformers gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 1.8 MB/s 
Collecting gdown
  Downloading gdown-4.5.1.tar.gz (14 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 36.5 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 49.7 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.9 MB/s 
[?25hCollecting pyyam

In [2]:
# Download the Quran text file from Google Drive.
# The URL is quoted because it contains `?`, which the shell would otherwise
# treat as a glob character. -O fixes the output name that the loading cell
# opens, instead of relying on the name reported by Drive.
!gdown "https://drive.google.com/uc?id=1yWkmOg7isP7EnQtgsRuyEwrT5s28zxlr" -O quran-simple-clean.txt

Downloading...
From: https://drive.google.com/uc?id=1yWkmOg7isP7EnQtgsRuyEwrT5s28zxlr
To: /content/quran-simple-clean.txt
  0% 0.00/744k [00:00<?, ?B/s]100% 744k/744k [00:00<00:00, 114MB/s]


In [3]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model used for both the query and the verses.
model = SentenceTransformer('symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli')

# Read the Quran text: one entry per line of the downloaded file.
# The encoding is given explicitly because the file holds Arabic text, and the
# context manager guarantees the file handle is closed after reading.
with open("quran-simple-clean.txt", encoding="utf-8") as f:
  # Strip only the line terminator; slicing off the last character would
  # truncate a final line that has no trailing newline.
  verses = [line.rstrip("\n") for line in f]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.72k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/677 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/356 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Note: The cell below might take more than 10 minutes to execute.

In [4]:
from sentence_transformers import util

# Embed the search query. convert_to_tensor=True makes encode() return a
# PyTorch tensor, which is what util.cos_sim operates on.
query_embedding = model.encode('الوفاء بالعهد', convert_to_tensor=True)

# Embed every verse in a single batched call instead of one encode() call per
# verse; the per-verse loop is what made this cell so slow.
# Note that this model is trained on short statements :(
# NOTE(review): batched encoding pads inputs, so scores may differ from the
# one-at-a-time results in the last decimal places -- confirm if exact values matter.
passage_embeddings = model.encode(verses, convert_to_tensor=True, show_progress_bar=True)

# util.cos_sim(A, B) computes the cosine similarity between every row of A and
# every row of B. With a single query the result has one row, holding one
# score per verse in the same order as `verses`.
cosine_scores = util.cos_sim(query_embedding, passage_embeddings)[0].tolist()

# Pair each score with its verse and sort by descending cosine score.
# The result is a list of (score, verse) tuples.
score2verse = sorted(zip(cosine_scores, verses), key=lambda pair: -pair[0])

In [5]:
# Show the verses with the highest cosine similarity to the query.
TOP_K = 15  # number of best matches to display

# The header reads "Closest <TOP_K> sentences:". It is built from TOP_K so the
# announced count always matches the number of rows printed (it previously
# said 50 while the loop printed 15).
print(f"أقرب {TOP_K} جملة:\n")

# score2verse is already sorted best-first. Slicing instead of indexing over
# range(TOP_K) avoids an IndexError when fewer than TOP_K verses were scored.
for scored_verse in score2verse[:TOP_K]:
  print(scored_verse)

أقرب 50 جملة:

(0.5371328592300415, 'فالمدبرات أمرا')
(0.42029204964637756, 'وزروع ومقام كريم')
(0.4174216389656067, 'ذو العرش المجيد')
(0.4130367040634155, 'فضلا من ربك ذلك هو الفوز العظيم')
(0.4063193202018738, 'الملك يومئذ لله يحكم بينهم فالذين آمنوا وعملوا الصالحات في جنات النعيم')
(0.39888525009155273, 'وأما من آمن وعمل صالحا فله جزاء الحسنى وسنقول له من أمرنا يسرا')
(0.37838301062583923, 'ومن يأته مؤمنا قد عمل الصالحات فأولئك لهم الدرجات العلى')
(0.37209072709083557, 'قول معروف ومغفرة خير من صدقة يتبعها أذى والله غني حليم')
(0.3688563108444214, 'لكم دينكم ولي دين')
(0.3688267469406128, 'أن اعمل سابغات وقدر في السرد واعملوا صالحا إني بما تعملون بصير')
(0.3668937683105469, 'بسم الله الرحمن الرحيم عبس وتولى')
(0.3546406626701355, 'فضلا من الله ونعمة والله عليم حكيم')
(0.3503027558326721, 'لمثل هذا فليعمل العاملون')
(0.34599581360816956, 'وإن كلا لما ليوفينهم ربك أعمالهم إنه بما يعملون خبير')
(0.3401797413825989, 'فمن يعمل مثقال ذرة خيرا يره')
