## Installing dependecies

In [0]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os 
try:
  os.chdir('drive/My Drive/Colab Notebooks/search')
  print('Changed directory')
  print(os.getcwd())
except:
  print('Cannot change directory')
  print(os.getcwd())

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
Changed directory
/content/drive/My Drive/Colab Notebooks/search


In [0]:
# !pip install tensorflow tensorflow_text -U
# !pip install flair==0.4.3 
# !curl -L https://anaconda.org/pytorch/faiss-cpu/1.6.0/download/linux-64/faiss-cpu-1.6.0-py36h6bb024c_0.tar.bz2 | tar xj
# !mv lib/python3.6/site-packages/* /usr/local/lib/python3.6/dist-packages/

In [0]:
from abc import ABCMeta, abstractmethod

import pandas as pd
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text

from flair.embeddings import BertEmbeddings, DocumentPoolEmbeddings
from flair.data import Sentence

import faiss

from tqdm import tqdm

Loading faiss with AVX2 support.


In [0]:
df = pd.read_csv('quora-question-pairs/train.csv')
df = df.sample(frac=0.01, random_state=1)
df.dropna(inplace=True)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
237030,237030,33086,348102,How can I stop playing video games?,Should I stop playing video games with my child?,0
247341,247341,73272,8624,Who is better Donald Trump or Hillary Clinton?,Why is Hillary Clinton a better choice than Do...,1
246425,246425,359482,359483,What do you think is the chance that sometime ...,Do you think there will be another world war/n...,1
306985,306985,1357,47020,Why are so many questions posted to Quora that...,Why do people write questions on Quora that co...,1
225863,225863,334315,334316,Can there even be a movie ever rated 10/10 on ...,What are your 10/10 movies?,0


In [0]:
questions = df.question1.values
len(questions)

4043

## Universal Sentence Encoder https://arxiv.org/pdf/1803.11175.pdf

### Languages Family
#### - Arabic (ar) Semitic
#### - Chinese (PRC) (zh) Sino-Tibetan
#### - Chinese (Taiwan) (zh-tw)
#### - Dutch(nl) English(en) Germanic
#### - German (de)
#### - French (fr) Italian (it) Latin
#### - Portuguese (pt) Spanish (es)
#### - Japanese (ja) Japonic
#### - Korean (ko) Koreanic
#### - Russian (ru) Polish (pl) Slavic
#### - Thai (th) Kra–Dai
#### - Turkish (tr) Turkic

In [0]:
class TFEncoder(metaclass=ABCMeta):
    """Base encoder to be used for all encoders."""
    def __init__(self, model_path:str):
        self.model = hub.load(model_path)
    
    @abstractmethod
    def encode(self, text:list):
        """Encodes text.
        Text: should be a list of strings to encode
        """
        
class USE(TFEncoder):
    """Universal sentence encoder"""
    def __init__(self, model_path):
        super().__init__(model_path)
        
    def encode(self, text):
        return self.model(text).numpy()
    
class USEQA(TFEncoder):
    """Universal sentence encoder trained on Question Answer pairs"""
    def __init__(self, model_path):
        super().__init__(model_path)
        
    def encode(self, text):
        return self.model.signatures['question_encoder'](tf.constant(s))['outputs'].numpy()
    
class BERT():
    """BERT models"""
    def __init__(self, model_name, layers="-2", pooling_operation="mean"):
        self.embeddings = BertEmbeddings(model_name, 
                                         layers=layers,
                                         pooling_operation=pooling_operation)

        self.document_embeddings = DocumentPoolEmbeddings([self.embeddings], fine_tune_mode='nonlinear')
        
    def encode(self, text):
        sentence = Sentence(text)
        self.document_embeddings.embed(sentence)
        return sentence.embedding.detach().numpy().reshape(1, -1)


# model_path = 'https://tfhub.dev/google/universal-sentence-encoder-qa/3'
# model_path = '../../models/universal-sentence-encoder-qa3/'

# https://arxiv.org/pdf/1803.11175.pdf
# model_path = '../../models/universal-sentence-encoder-large5/' #best for english

model_path = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3"
# model_path = '../../models/universal-sentence-encoder-multilingual-large3/'

# encoder = BERT('bert-base-uncased')
encoder = USE(model_path)

In [0]:
encoder.encode(['hello']).shape

(1, 512)

In [0]:
d = encoder.encode(['hello']).shape[-1]
d

512

## FAISS class

In [0]:
class FAISS:
    def __init__(self, dimensions:int):
        self.dimensions = dimensions
        self.index = faiss.IndexFlatL2(dimensions)
        self.vectors = {}
        self.counter = 0
    
    def add(self, text:str, v:list):
        self.index.add(v)
        self.vectors[self.counter] = (text, v)
        self.counter += 1
        
    def search(self, v:list, k:int=10):
        distance, item_index = self.index.search(v, k)
        for dist, i in zip(distance[0], item_index[0]):
            if i==-1:
                break
            else:
                print(f'{self.vectors[i][0]}, %.2f'%dist)

## Testing vector search

In [0]:
index = FAISS(d)

# index word
t1 = 'hello'
v1 = encoder.encode([t1])
index.add(t1, v1)

# index word
t1 = 'bye'
v1 = encoder.encode([t1])
index.add(t1, v1)

# search similar word
t1 = 'hi'
v1 = encoder.encode([t1])
print('word,  distance')
index.search(v1)

word,  distance
hello, 0.07
bye, 0.83


## Generate embeddings and index all questions

In [0]:
index = FAISS(d)

for q in tqdm(questions):
    emb = encoder.encode([q])
    index.add(q, emb)

100%|██████████| 4043/4043 [04:00<00:00, 16.83it/s]


In [0]:
def search(s, k=10):
    emb = encoder.encode([s])
    index.search(emb, k)

# Search Examples

In [0]:
search('how to lose weight?')

How do lose weight with healthy way?, 0.22
Can you offer me any advice on how to lose weight?, 0.28
Can you offer me any advice on how to lose weight?, 0.28
Can you offer me any advice on how to lose weight?, 0.28
What are the best ways to lose weight?, 0.29
How can I lose weight safely?, 0.31
How can I lose weight without doing excercise?, 0.32
I'm overweight. How can I begin to lose weight?, 0.33
I'm overweight. How can I begin to lose weight?, 0.33
How can I lose weight quickly? Need serious help., 0.38


In [0]:
print('Japanese')
search('体重を減らす方法は？')

Japanese
What are the best ways to lose weight?, 0.31
How do lose weight with healthy way?, 0.36
Can you offer me any advice on how to lose weight?, 0.37
Can you offer me any advice on how to lose weight?, 0.37
Can you offer me any advice on how to lose weight?, 0.37
How could I lose a few pounds quickly?, 0.39
How can I lose weight quickly? Need serious help., 0.42
How can I lose weight without doing excercise?, 0.43
How can I lose weight safely?, 0.43
What is the fastest way to lose weight successfully?, 0.43


In [0]:
print('Chinese')
search('如何减肥?')

Chinese
How do lose weight with healthy way?, 0.30
What are the best ways to lose weight?, 0.34
How can I lose weight without doing excercise?, 0.38
Can you offer me any advice on how to lose weight?, 0.40
Can you offer me any advice on how to lose weight?, 0.40
Can you offer me any advice on how to lose weight?, 0.40
How can I lose weight safely?, 0.42
How can I lose weight quickly? Need serious help., 0.45
How could I lose a few pounds quickly?, 0.46
What are the best ways to lose weight? What is the best diet plan?, 0.49


In [0]:
print('Spanish')
search('¿Cómo perder peso?')

Spanish
How do lose weight with healthy way?, 0.22
What are the best ways to lose weight?, 0.25
How can I lose weight without doing excercise?, 0.28
How can I lose weight safely?, 0.29
Can you offer me any advice on how to lose weight?, 0.30
Can you offer me any advice on how to lose weight?, 0.30
Can you offer me any advice on how to lose weight?, 0.30
How can I lose weight quickly? Need serious help., 0.31
How could I lose a few pounds quickly?, 0.32
I'm overweight. How can I begin to lose weight?, 0.36


In [0]:
print('German')
search('wie man Gewicht verliert?')

German
How do lose weight with healthy way?, 0.34
What are the best ways to lose weight?, 0.34
How could I lose a few pounds quickly?, 0.38
How can I lose weight without doing excercise?, 0.40
Can you offer me any advice on how to lose weight?, 0.41
Can you offer me any advice on how to lose weight?, 0.41
Can you offer me any advice on how to lose weight?, 0.41
What is the fastest way to lose weight successfully?, 0.42
How can I lose weight safely?, 0.44
How can I lose weight quickly? Need serious help., 0.46
