**Copyright 2019 The TensorFlow Hub Authors.**

Licensed under the Apache License, Version 2.0 (the "License");

In [None]:
# Copyright 2019 The TensorFlow Hub Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Cross-Lingual Similarity and Semantic Search Engine with Multilingual Universal Sentence Encoder


<table align="left"><td>
  <a target="_blank"  href="https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/cross_lingual_similarity_with_tf_hub_multilingual_universal_encoder.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab
  </a>
</td><td>
  <a target="_blank"  href="https://github.com/tensorflow/hub/blob/master/examples/colab/cross_lingual_similarity_with_tf_hub_multilingual_universal_encoder.ipynb">
    <img width=32px src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
</td></table>


This notebook illustrates how to access the Multilingual Universal Sentence Encoder module and use it for sentence similarity across multiple languages. This module is an extension of the [original Universal Encoder module](https://tfhub.dev/google/universal-sentence-encoder/2).

The notebook is divided as follows:

*   The first section shows a visualization of sentences between pairs of languages. This is a more academic exercise.
*   In the second section, we show how to build a semantic search engine from a sample of a Wikipedia corpus in multiple languages.

## Citation

*Research papers that make use of the models explored in this colab should cite:*

### [Multilingual universal sentence encoder for semantic retrieval](https://arxiv.org/abs/1907.04307)
Yinfei Yang, Daniel Cer, Amin Ahmad, Mandy Guo, Jax Law, Noah Constant, Gustavo Hernandez Abrego, Steve Yuan, Chris Tar, Yun-Hsuan Sung, Brian Strope, and Ray Kurzweil. 2019.
 arXiv preprint arXiv:1907.04307

# Getting Started

This section sets up the environment for access to the Multilingual Universal Sentence Encoder Module and also prepares a set of English sentences and their translations. In the following sections, the multilingual module will be used to compute similarity *across languages*.

**IMPORTANT** Note: Please select "**Python 3**" _and_ "**GPU**" in the ***Runtime->Change Runtime type*** dropdown menu above _before_ running this notebook for faster execution.

In [None]:
%%capture
#@title Setup Environment
# Install the packages this notebook imports. `%%capture` silences all output
# of this cell, so a failed install will not be visible here.
# NOTE(review): tensorflow_text is installed first and tensorflow-gpu is
# upgraded afterwards; the two are presumably version-coupled, so confirm the
# upgrade does not leave them mismatched.
# NOTE(review): later cells also use sklearn and pd.read_excel on an .xlsx file,
# but neither scikit-learn nor an Excel reader is installed here — presumably
# they are preinstalled in the runtime; confirm.
!pip3 install tensorflow_text
!pip3 install --upgrade tensorflow-gpu
!pip install tensorflow-hub
!pip install bokeh
!pip install simpleneighbors
!pip install tqdm
!pip install nltk

In [None]:
#@title Setup common imports and functions
import bokeh
import bokeh.models
import bokeh.plotting
import numpy as np
import os
import pandas as pd
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer
import sklearn.metrics.pairwise
import nltk
nltk.download('punkt')
from simpleneighbors import SimpleNeighbors
from tqdm import tqdm
from tqdm import trange



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


This is additional boilerplate code where we import the pre-trained ML model we will use to encode text throughout this notebook.

In [None]:
# The 16-language multilingual module is the default but feel free
# to pick others from the list and compare the results.
# The trailing `#@param [...]` annotation on the next line lists the selectable
# module URLs (presumably rendered as a form dropdown in Colab) — keep its
# format intact when editing.
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3' #@param ['https://tfhub.dev/google/universal-sentence-encoder-multilingual/3', 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3']

# Load the selected module once; `model` is then called directly on input text
# by embed_text().
model = hub.load(module_url)

def embed_text(input):
  """Embed text with the encoder loaded into the module-level `model`.

  Args:
    input: The text to embed; it is handed to `model` unchanged.

  Returns:
    Whatever `model(input)` returns (the embeddings of the given text).
  """
  embeddings = model(input)
  return embeddings

# Visualize Text Similarity Between Languages
With the sentence embeddings now in hand, we can visualize semantic similarity across different languages.

## Computing Text Embeddings

We first define a set of sentences translated to various languages in parallel. Then, we precompute the embeddings for all of our sentences.

In [None]:
# Cosine-similarity matrix helper.
def cos_sim_data(data_feature_vectors, query_feature_vector):
  """Return the pairwise cosine similarities between two sets of vectors.

  Args:
    data_feature_vectors: First set of feature vectors (rows of the result).
    query_feature_vector: Second set of feature vectors (columns of the result).

  Returns:
    The matrix produced by `sklearn.metrics.pairwise.cosine_similarity` for
    the two arguments, in the order given.
  """
  pairwise = sklearn.metrics.pairwise
  return pairwise.cosine_similarity(data_feature_vectors, query_feature_vector)

In [None]:
# Load the hadith corpus from the 'Sahih Bukhari Without_Tashkel' workbook.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Location of the workbook, relative to the runtime's working directory.
# Change this single constant if the file lives elsewhere.
HADITH_XLSX_PATH = "Sahih Bukhari Without_Tashkel.xlsx"

try:
  df = pd.read_excel(HADITH_XLSX_PATH)
except FileNotFoundError as err:
  # The workbook is not shipped with the notebook, so tell the reader what to
  # do instead of surfacing a bare "file not found".
  raise FileNotFoundError(
      "Hadith workbook '{}' was not found. Upload it to the runtime's working "
      "directory (or update HADITH_XLSX_PATH) and re-run this cell.".format(
          HADITH_XLSX_PATH)
  ) from err

# Preview the first rows to confirm the expected text column is present.
df.head()

FileNotFoundError: ignored

In [None]:
# Build the word n-grams (2- to 5-grams by default) of each hadith.
import json


def make_grams(max_rows=1000, ngram_range=(2, 5)):
  """Collect the distinct word n-grams of each hadith in the global `df`.

  A `TfidfVectorizer` is fitted on one hadith at a time purely to enumerate
  that hadith's n-grams; the TF-IDF weights themselves are not used.

  Args:
    max_rows: Maximum number of leading rows to process. Defaults to 1000 (the
      limit this function always used); pass None to process every row. The
      value is capped at the number of rows actually available.
    ngram_range: `(min_n, max_n)` word n-gram sizes handed to the vectorizer.

  Returns:
    A dict mapping the row number as a string (e.g. '0') to a one-element list
    wrapping the list of n-gram strings of that hadith. Rows whose cell is not
    text, or that yield no n-gram at all (e.g. a single-word text), are
    omitted. The dict is JSON-serializable (e.g. via `json.dump`) but is not
    written to disk here.
  """
  texts = df['Sahih Bukhari Without_Tashkel']
  # Never index past the end of the corpus when it has fewer rows than max_rows.
  row_limit = len(texts) if max_rows is None else min(max_rows, len(texts))
  vectorizer = TfidfVectorizer(analyzer='word', ngram_range=ngram_range)

  all_grams_clusters = {}
  for i in range(row_limit):
    text = texts[i]
    if not isinstance(text, str):
      # Empty/NaN cells cannot be tokenized; skip them instead of crashing.
      continue
    try:
      vectorizer.fit([text])
    except ValueError:
      # Raised for an empty vocabulary, i.e. the text is too short to form
      # any n-gram in the requested range.
      continue
    # `get_feature_names` was removed from recent scikit-learn releases in
    # favour of `get_feature_names_out`; support both.
    if hasattr(vectorizer, 'get_feature_names_out'):
      grams = vectorizer.get_feature_names_out().tolist()
    else:
      grams = vectorizer.get_feature_names()
    # Keep the original value shape: a list holding the list of n-grams.
    all_grams_clusters[str(i)] = [grams]

  return all_grams_clusters

In [None]:
# Rank hadiths by how closely any of their n-grams matches the query
# (embeddings from embed_text, scores from cos_sim_data).
def _flatten_grams(grams_list):
  """Return the n-gram strings of one hadith as a single flat list."""
  flat = []
  for item in grams_list:
    if isinstance(item, str):
      flat.append(item)
    else:
      # A nested list of n-grams (the shape make_grams produces).
      flat.extend(item)
  return flat


def top_n_similarities(query, grams_clusters, n = 3):
  """Return the `n` hadiths whose n-grams are most similar to `query`.

  Each hadith is scored by the highest cosine similarity between the query
  embedding and the embeddings of that hadith's n-grams.

  Args:
    query: Query text; passed to `embed_text` as given.
    grams_clusters: Dict mapping a hadith key to its n-grams, either as a flat
      list of strings or as a list wrapping such a list.
    n: Number of results to return.

  Returns:
    A list of at most `n` `(hadith_key, score)` tuples, sorted by descending
    score. Hadiths without any n-gram are not scored. Empty if `n <= 0`.
  """
  if n <= 0:
    return []

  # The query embedding does not depend on the hadith, so compute it once.
  query_embedding = np.array(embed_text(query))

  best_scores = {}
  for hadith_num, grams_list in grams_clusters.items():
    grams = _flatten_grams(grams_list)
    if not grams:
      # Nothing to embed; taking a max over no scores would fail.
      continue
    similarities = cos_sim_data(query_embedding, np.array(embed_text(grams)))
    best_scores[hadith_num] = float(np.max(similarities))

  # Sort once after scoring; the sort is stable, so ties keep insertion order.
  ranked = sorted(best_scores.items(), key=lambda item: item[1], reverse=True)
  # Slice to exactly n results (the previous loop stopped one item early).
  return ranked[:n]


In [None]:
# Example query (Arabic) to match against the n-grams of the processed hadiths.
query = 'إنما الأعمال بالنيات'
# make_grams() computes the n-grams from `df` on every run.
grams_clusters = make_grams()
print(top_n_similarities(query, grams_clusters, n=5))


In [None]:
# Show each top match: 1-based hadith number, its full text, and its score.
list_of_similarities_scores = top_n_similarities(query, grams_clusters, n=5)
for hadith_key, score in list_of_similarities_scores:
  row = int(hadith_key)  # keys are row numbers stored as strings
  print('Hadith number in Bukhari: {}\nHadiths is:\n{}\nScore: {}\n{}'.format(
      row + 1, df['Sahih Bukhari Without_Tashkel'][row], score, '-'*200))