
<h1 style="text-align:center">Search Engine with PyTorch and Annoy</h1>

---




<h2> Creating a Model</h2>



> <h3>installing and importing modules</h3>







In [None]:
!pip install transformers[torch]==4.7
!pip install SentencePiece
!pip install bert-extractive-summarizer
!pip install spacy
!pip install sentence_transformers
!pip install annoy

In [None]:
import pandas as pd
import numpy as np
import torch
import transformers
from transformers import BertTokenizer, BertModel



> <h3>Creating a no-duplicates data loader (generator-based)</h3>






In [None]:
import math
class NoDuplicatesDataLoader:
    """Batch generator that never puts the same text twice into one batch.

    Each row of ``df`` is one training example made of text columns. While a
    batch is being filled, a row is skipped if any of its texts
    (whitespace-stripped, lower-cased) already occurs in that batch. This
    matters for in-batch-negative losses, where a duplicated text would act as
    a false negative.

    Args:
        df: pandas DataFrame whose cells are strings.
        batch_size: number of examples per yielded batch.
    """

    def __init__(self, df, batch_size):
        self.batch_size = batch_size
        self.data_pointer = 0
        # Work on a shuffled copy so the caller's frame is left untouched.
        self.df = df.sample(frac=1).reset_index(drop=True)

    def __iter__(self):
        """Yield ``len(self)`` batches, each a list of ``list(row)`` examples.

        Raises:
            ValueError: if a batch cannot be completed because every remaining
                row conflicts with texts already in the batch.
        """
        n_rows = self.df.shape[0]
        for _ in range(len(self)):
            batch = []
            texts_in_batch = set()
            rejected_in_a_row = 0

            while len(batch) < self.batch_size:
                example = self.df.iloc[self.data_pointer]
                normalized = [text.strip().lower() for text in example]

                if texts_in_batch.isdisjoint(normalized):
                    batch.append(list(example))
                    texts_in_batch.update(normalized)
                    rejected_in_a_row = 0
                else:
                    rejected_in_a_row += 1
                    # 2 * n_rows consecutive rejections always span one full
                    # pass over a shuffled copy, so no row can ever fit:
                    # fail loudly instead of looping forever.
                    if rejected_in_a_row >= 2 * n_rows:
                        raise ValueError(
                            "Cannot fill a batch of size "
                            f"{self.batch_size} without duplicate texts"
                        )

                self.data_pointer += 1
                if self.data_pointer >= n_rows:
                    # End of data reached: wrap around and reshuffle our own
                    # frame (not a global `df`).
                    self.data_pointer = 0
                    self.df = self.df.sample(frac=1).reset_index(drop=True)

            yield batch

    def __len__(self):
        """Number of full batches per pass over the data."""
        return self.df.shape[0] // self.batch_size



> <h3>Initializing Bert Model</h3>



In [None]:
class BertEncoder(torch.nn.Module):
    """Sentence encoder: BERT token embeddings averaged over non-padding tokens.

    Args:
        model_name: Hugging Face checkpoint used for both the tokenizer and
            the BERT weights.
    """

    def __init__(self, model_name='google/bert_uncased_L-6_H-512_A-8'):
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.bert = BertModel.from_pretrained(model_name)

    def forward(self, text_inputs):
        """Encode a string or a list of strings into one vector per text.

        Texts are padded to the longest one in the call and truncated to 256
        tokens. Returns a tensor of shape (number of texts, hidden size).
        """
        inputs = self.tokenizer(text_inputs,
                                return_tensors="pt",
                                padding=True,
                                truncation=True,
                                max_length=256)
        # The tokenizer builds CPU tensors; move them to wherever BERT lives.
        inputs = inputs.to(self.bert.device)

        token_embeddings = self.bert(**inputs)['last_hidden_state']
        # Cast the integer attention mask to the embedding dtype so the
        # multiplication and the float clamp below are well defined.
        mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)

        # Masked mean = sum over real tokens / number of real tokens.
        # (Using torch.mean here would divide by the padded length as well.
        # That only rescales each vector, so cosine/angular similarities
        # computed with earlier embeddings are unaffected.)
        summed = torch.sum(token_embeddings * mask, dim=1)
        token_counts = torch.clamp(mask.sum(dim=1), min=1e-9)
        return summed / token_counts

model = BertEncoder()

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/141M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bert_uncased_L-6_H-512_A-8 were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

<h2>Preparing Data</h2>

<h4>I took the Quora question triplets and MS MARCO triplets datasets from</h4>
https://www.sbert.net/examples/training/paraphrases/README.html
<h4>I tuned the MS MARCO dataset by filtering out long queries to make the model's performance on short queries more robust. I also removed long passages because of performance issues. From the remaining set I randomly sampled 400000 triplets.</h4>



<h2>Training a model</h2>





> <h3>Loss function</h3>



In [None]:
def MultiNegativeRankingLoss(q, p, n=None, scale=6):
    """Multiple-negatives ranking loss over a batch of (query, positive[, negative]).

    For query ``i`` the positive is ``p[i]``; every other row of ``p`` and,
    when given, every row of ``n`` acts as a negative. The loss is the mean
    over queries of ``log(sum_j exp(sim_ij)) - sim_ii``, where ``sim`` is the
    cosine similarity multiplied by ``scale``.

    Args:
        q: query embeddings, shape (B, D).
        p: positive passage embeddings, shape (B, D), aligned with ``q``.
        n: optional hard-negative embeddings, shape (N, D).
        scale: multiplier applied to the cosine similarities.

    Returns:
        Scalar torch tensor that supports ``backward()``.
    """
    q_p_sim = cosine_sim(q, p, scale)
    # Similarity of each query with its own positive sits on the diagonal.
    positive = torch.diagonal(q_p_sim)

    logits = q_p_sim
    # `if n:` is ambiguous for multi-element tensors, so test against None.
    if n is not None:
        logits = torch.cat([q_p_sim, cosine_sim(q, n, scale)], dim=1)

    # logsumexp == log(sum(exp(.))) but numerically stable, and everything
    # stays in torch so autograd can backpropagate through the loss.
    return torch.mean(torch.logsumexp(logits, dim=1) - positive)

def cosine_sim(queries, passages, par=6, eps=1e-8):
    """Scaled pairwise cosine similarity between two sets of row vectors.

    Args:
        queries: tensor of shape (Q, D).
        passages: tensor of shape (P, D).
        par: factor the cosine similarities are multiplied by.
        eps: lower bound on the product of norms; keeps an all-zero row from
            producing a division by zero (NaN).

    Returns:
        Tensor of shape (Q, P) with ``par * cos(queries[i], passages[j])``.
    """
    q_norms = queries.norm(dim=1, keepdim=True)                   # (Q, 1)
    p_norms = passages.norm(dim=1, keepdim=True).transpose(0, 1)  # (1, P)
    norm = torch.clamp(q_norms * p_norms, min=eps)                # (Q, P)
    dot_prod = torch.matmul(queries, passages.transpose(0, 1))
    return (dot_prod * par) / norm



> <h3>Training Loop</h3>



In [None]:
from transformers import AdamW, get_polynomial_decay_schedule_with_warmup
# Training configuration shared by every call to `train`.
num_epochs = 1

# AdamW over all encoder parameters.
optimizer = AdamW(model.parameters(), lr=0.7e-5)

# Warm up for 30 steps, then decay polynomially (power=1) over a
# 1000-step schedule.
lr_scheduler = get_polynomial_decay_schedule_with_warmup(
    optimizer,
    num_warmup_steps=30,
    num_training_steps=1000,
    power=1,
)

def train(train_generator, checkpoint_name, save_every=1):
    """Fine-tune the module-level ``model`` on batches of text triplets.

    Uses the module-level ``model``, ``optimizer``, ``lr_scheduler`` and
    ``num_epochs``.

    Args:
        train_generator: iterable of batches; each batch is a sequence of
            ``(query, positive, negative)`` text triplets.
        checkpoint_name: file name of the checkpoint written under
            ``./model_data/checkpoints/``.
        save_every: write the checkpoint after every ``save_every`` optimizer
            steps (and once more at the end of each epoch if steps are
            pending).

    The checkpoint is a dict with keys ``'epoch'``, ``'model_state_dict'`` and
    ``'optimizer_state_dict'``.
    """
    import os

    checkpoint_dir = './model_data/checkpoints'
    # torch.save does not create missing directories.
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = f'{checkpoint_dir}/{checkpoint_name}'

    def save_checkpoint(epoch):
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            }, checkpoint_path)

    # NOTE(review): the model is never switched to train mode here; if it was
    # loaded in eval mode, dropout stays disabled during fine-tuning —
    # confirm that this is intended.
    for t in range(num_epochs):
        unsaved_steps = False
        for i, train_batch in enumerate(train_generator):
            q = model([el[0] for el in train_batch])
            p = model([el[1] for el in train_batch])
            n = model([el[2] for el in train_batch])
            loss = MultiNegativeRankingLoss(q, p, n)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            print(i, loss.item())

            # Save after the update so the file always holds the newest weights.
            unsaved_steps = True
            if (i + 1) % save_every == 0:
                save_checkpoint(t)
                unsaved_steps = False
        if unsaved_steps:
            save_checkpoint(t)

In [None]:
# Fine-tune on the Quora triplets; the checkpoint is written under the name 'symmetric'.
# NOTE(review): `quora_data` is not defined in any cell of this notebook —
# presumably a NoDuplicatesDataLoader over the Quora triplets; add the cell
# that builds it so the notebook runs from a fresh kernel.
train(quora_data, 'symmetric')

In [None]:
# Fine-tune on the MS MARCO triplets; the checkpoint is written under the name 'asymmetric'.
# `train` works on the module-level `model`, `optimizer` and `lr_scheduler`,
# so this run starts from whatever state earlier training left them in.
# NOTE(review): `msmarco_data` is not defined in any cell of this notebook —
# presumably a NoDuplicatesDataLoader over the sampled MS MARCO triplets; confirm.
train(msmarco_data, 'asymmetric')

<h2>Creating Indexes</h2>



> <h3>Preparing Movie Data</h3>



In [None]:
import ast
import json

def _mapping_values_as_json(raw):
    """Parse a dict literal string and return its values as a JSON array string."""
    return json.dumps(list(ast.literal_eval(raw).values()))


# Both files are tab-separated and have no header row.
movies = pd.read_csv('./movie_data/movie.metadata.tsv', sep='\t', header=None)
plots = pd.read_csv('./movie_data/plot_summaries.txt', sep='\t', header=None)

# Give the positional columns that are used below readable names.
plots = plots.rename(columns={0: 'ID', 1: 'Plot'})
movies = movies.rename(
    columns={
        0: 'ID',
        2: 'Title',
        3: 'Release Date',
        4: 'Box Office',
        5: 'Duration',
        7: 'Countries',
        8: 'Genre',
    }
)

# Keep only movies that have a plot summary, with the columns in a fixed order.
df = movies.merge(plots, on='ID', how='inner')[
    ['ID', 'Title', 'Release Date', 'Duration',
     'Box Office', 'Countries', 'Genre', 'Plot']
]

# Countries and Genre hold dict literals; store just their values as JSON lists.
df['Countries'] = df['Countries'].apply(_mapping_values_as_json)
df['Genre'] = df['Genre'].apply(_mapping_values_as_json)



> <h3>Storing plots in indexes</h3>



In [None]:
from annoy import AnnoyIndex
import random

def create_index(model, data, index_name):
    """Encode every text in ``data`` and store the vectors in an Annoy index.

    Args:
        model: callable mapping one text to a tensor with 512 values
            (e.g. a ``BertEncoder``).
        data: pandas Series of texts; item ``i`` of the index is ``data.iloc[i]``.
        index_name: file name (without extension) of the saved ``.ann`` index.
    """
    f = 512  # vector length; must match the size of the model's output
    t = AnnoyIndex(f, 'angular')

    model.eval()
    # Inference only: skip building autograd graphs for every plot.
    with torch.no_grad():
        for i in range(len(data)):
            v = model(data.iloc[i])
            # Annoy expects a plain sequence of floats, not a tensor.
            t.add_item(i, v.flatten().tolist())

    t.build(100)
    t.save(f'/content/drive/MyDrive/Search Engine/movie_data/indexes/{index_name}.ann')

In [None]:
# Build the index for the model fine-tuned on symmetric (question-question) data.
symmetric_model = BertEncoder()
symmetric_checkpoint = torch.load('./model_data/checkpoints/symmetric', map_location='cpu')
# `train` saves a dict with the weights under 'model_state_dict'; fall back to
# the loaded object itself in case the file is a bare state dict.
symmetric_model.load_state_dict(
    symmetric_checkpoint.get('model_state_dict', symmetric_checkpoint)
)
create_index(symmetric_model, df['Plot'], 'symmetric_index')

In [None]:
# Build the index for the model fine-tuned on asymmetric (query-passage) data.
asymmetric_model = BertEncoder()
asymmetric_checkpoint = torch.load('./model_data/checkpoints/asymmetric', map_location='cpu')
# `train` saves a dict with the weights under 'model_state_dict'; fall back to
# the loaded object itself in case the file is a bare state dict.
asymmetric_model.load_state_dict(
    asymmetric_checkpoint.get('model_state_dict', asymmetric_checkpoint)
)
# Index with the asymmetric model (the symmetric one was passed here by mistake).
create_index(asymmetric_model, df['Plot'], 'asymmetric_index')

<h2>Storing Movie info in sqlite database</h2>

In [None]:
import numpy as np
import sqlite3
# sqlite3 cannot bind NumPy integer scalars directly, so convert them to
# built-in ints whenever they are passed as query parameters.
sqlite3.register_adapter(np.int64, int)
sqlite3.register_adapter(np.int32, int)

# Open the movie database and get a cursor for the statements below.
con = sqlite3.connect('/content/drive/MyDrive/Search Engine/movies_info.sqlite3')
cur = con.cursor()

In [None]:
# IF NOT EXISTS keeps this cell re-runnable: without it a second run fails
# because the table is already there.
cur.execute("""CREATE TABLE IF NOT EXISTS movies_info(
                                        id INT NOT NULL PRIMARY KEY,
                                        movie_id INT,
                                        title TEXT,
                                        release_date TEXT,
                                        duration INT,
                                        box_office REAL,
                                        countries TEXT,
                                        genre TEXT,
                                        plot TEXT
                                        );
            """)

In [None]:
# Insert one row per movie; `id` is the row position in `df`, which is also
# the item number the plot gets in the Annoy indexes.
movie_rows = (
    (row_id, *values)
    for row_id, values in enumerate(
        df[['ID', 'Title', 'Release Date', 'Duration',
            'Box Office', 'Countries', 'Genre', 'Plot']]
        .itertuples(index=False, name=None)
    )
)

# `with con` commits when every insert succeeded and rolls back otherwise;
# executemany avoids a Python-level execute call and printout per row.
with con:
    cur.executemany(
        '''INSERT INTO movies_info
           (id, movie_id, title, release_date, duration,
            box_office, countries, genre, plot)
           VALUES (?,?,?,?,?,?,?,?,?)''',
        movie_rows,
    )