<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Create-a-super-smart-search-engine-over-any-free-text-data-source" data-toc-modified-id="Create-a-super-smart-search-engine-over-any-free-text-data-source-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Create a super smart search engine over any free text data source</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Load-data-and-create-DF" data-toc-modified-id="Load-data-and-create-DF-1.0.1"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>Load data and create DF</a></span></li><li><span><a href="#Preprocess-and-tokenise" data-toc-modified-id="Preprocess-and-tokenise-1.0.2"><span class="toc-item-num">1.0.2&nbsp;&nbsp;</span>Preprocess and tokenise</a></span></li><li><span><a href="#Fast-text" data-toc-modified-id="Fast-text-1.0.3"><span class="toc-item-num">1.0.3&nbsp;&nbsp;</span>Fast text</a></span></li><li><span><a href="#Load-fasttext-and-query" data-toc-modified-id="Load-fasttext-and-query-1.0.4"><span class="toc-item-num">1.0.4&nbsp;&nbsp;</span>Load fasttext and query</a></span></li><li><span><a href="#Creating-BM25-document-vectors:" data-toc-modified-id="Creating-BM25-document-vectors:-1.0.5"><span class="toc-item-num">1.0.5&nbsp;&nbsp;</span>Creating BM25 document vectors:</a></span></li><li><span><a href="#Load-document-vectors,-build-index-and-search:" data-toc-modified-id="Load-document-vectors,-build-index-and-search:-1.0.6"><span class="toc-item-num">1.0.6&nbsp;&nbsp;</span>Load document vectors, build index and search:</a></span></li></ul></li></ul></li></ul></div>

# Create a super smart search engine over any free text data source

This code accompanies the blog posts published at https://medium.com/@thejoshtaylor


In [1]:
import os
import pandas as pd
import numpy as np
import pickle
import spacy
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.models.fasttext import FastText
# !pip install rank_bm25 --quiet #install BM25
# !pip install --no-binary :all: nmslib #install nmslib
from rank_bm25 import BM25Okapi
import nmslib
import time
from google.colab import drive

# drive.mount('/content/drive') # if you want to use G Drive
# Show full cell contents instead of truncating long strings. `None` is the
# supported "no limit" value; the old `-1` sentinel is deprecated and is
# rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn below.
plt.style.use('fivethirtyeight')


  from IPython.utils import traitlets as _traitlets


### Load data and create DF

In [2]:
# LINK TO DATASET USED:
# https://drive.google.com/file/d/13LLeNj9Fajk0PBd7U5kXEhsEpSRMWwbJ/view?usp=sharing

df = pd.read_csv('../data/export.csv')

# Build one searchable string per row from the title, description, locality
# and postcode. Missing values are replaced with '' and everything is cast to
# str first: with plain `+` on the raw columns, a single NaN (or a non-string
# value) in any of the four columns would make the whole `text` value NaN or
# raise, and that row would effectively drop out of the search corpus.
text_columns = ['tendertitle', 'tenderdescription', 'locality', 'postalCode']
text_parts = df[text_columns].fillna('').astype(str)
df['text'] = (
    text_parts['tendertitle']
    + ' ' + text_parts['tenderdescription']
    + ' ' + text_parts['locality']
    + ' ' + text_parts['postalCode']
)
df.shape


(212447, 6)

In [3]:
# Display the first row, including the combined `text` column built above.
df.head(1)

Unnamed: 0,externalid,tendertitle,tenderdescription,locality,postalCode,text
0,00227ecd-50ac-4f92-9e7f-d2675663efa2,TELEPHONY SERVICES,TELEPHONY SERVICES,Exeter,EX1 3PB,TELEPHONY SERVICES TELEPHONY SERVICES Exeter EX1 3PB


### Preprocess and tokenise

In [6]:
import ftfy

# Tokenise and do some cleaning: remove punctuation and white space,
# and convert the text to lowercase.

nlp = spacy.load("en_core_web_sm")
tok_text = []  # for our tokenised corpus

# Fill missing values before lower-casing so that a NaN row becomes an empty
# document rather than the literal token "nan".
text = df['text'].fillna('').astype(str).str.lower().tolist()
text = [ftfy.fix_text(i) for i in text]  # change bad unicode with good unicode

# Tokenising using SpaCy. Only the tokenizer is needed, so the tagger, parser
# and NER components are disabled for speed. The old `n_threads` argument is
# not passed: spaCy has ignored it since v2.1 and v3 no longer accepts it.
# `total` lets tqdm show a real progress bar instead of a bare counter.
for doc in tqdm(nlp.pipe(text, disable=["tagger", "parser", "ner"]), total=len(text)):
    # Keep ASCII-only tokens that are neither punctuation nor whitespace.
    tok = [t.text for t in doc if (t.is_ascii and not t.is_punct and not t.is_space)]
    tok_text.append(tok)


### Fast text

In [8]:
# Train FastText word vectors on the tokenised corpus and save them to disk.
from gensim.models.fasttext import FastText

# Hyper-parameters gathered in one place so they are easy to tune.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4 names it `vector_size`.
fasttext_params = dict(
    sg=1,         # use skip-gram: usually gives better results
    size=100,     # embedding dimension (default)
    window=10,    # window size: 10 tokens before and 10 tokens after to get wider context
    min_count=5,  # only consider tokens with at least n occurrences in the corpus
    negative=15,  # negative subsampling: bigger than default to sample negative examples more
    min_n=2,      # min character n-gram
    max_n=5,      # max character n-gram
)
ft_model = FastText(**fasttext_params)

# The vocabulary has to be built before training can start.
ft_model.build_vocab(tok_text)

# Six passes over the corpus; the totals come from the vocabulary scan above.
ft_model.train(
    tok_text,
    total_examples=ft_model.corpus_count,
    total_words=ft_model.corpus_total_words,
    epochs=6,
)

ft_model.save('_fasttext.model')

### Load fasttext and query

In [9]:
# Load the FastText model from the '_fasttext.model' file written by the training cell.
ft_model = FastText.load('_fasttext.model')

In [None]:
# Horizontal bar chart of the 20 words most similar to "m4", with the search
# restricted to the first 5000 entries of the vocabulary.
m4_neighbours = ft_model.wv.most_similar("m4", topn=20, restrict_vocab=5000)
m4_scores = pd.DataFrame(m4_neighbours, columns=['Word', 'Score'])
with plt.xkcd():
    m4_scores.plot.barh(x='Word', figsize=(6, 6), color=(0.3, 0.7, 0.7))

In [None]:
# Horizontal bar chart of the 10 words most similar to "rg9", with the search
# restricted to the first 10000 entries of the vocabulary.
rg9_neighbours = ft_model.wv.most_similar("rg9", topn=10, restrict_vocab=10000)
rg9_scores = pd.DataFrame(rg9_neighbours, columns=['Word', 'Score'])
with plt.xkcd():
    rg9_scores.plot.barh(x='Word', figsize=(6, 6), color=(0.3, 0.7, 0.7))

### Creating BM25 document vectors:

In [23]:
# Build one vector per document: the mean of its FastText word vectors, each
# scaled by that word's BM25 weight within the document.
bm25 = BM25Okapi(tok_text)
word_vectors = ft_model.wv  # index the KeyedVectors; `ft_model[word]` is deprecated
weighted_doc_vects = []

for i, doc in tqdm(enumerate(tok_text), total=len(tok_text)):
    term_freqs = bm25.doc_freqs[i]
    # Document-length normalisation term of BM25; constant for the whole document.
    length_norm = bm25.k1 * (1.0 - bm25.b + bm25.b * (bm25.doc_len[i] / bm25.avgdl))

    doc_vector = []
    for word in doc:
        tf = term_freqs[word]
        # BM25 term weight: idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * |d| / avgdl)).
        # Written on one logical line: splitting it around a bare `/` does not parse.
        weight = (bm25.idf[word] * ((bm25.k1 + 1.0) * tf)) / (length_norm + tf)
        doc_vector.append(word_vectors[word] * weight)

    if doc_vector:
        doc_vector_mean = np.mean(doc_vector, axis=0)
    else:
        # A document with no tokens would give a scalar NaN and break the later
        # np.vstack; a zero vector keeps row i aligned with row i of `df`.
        doc_vector_mean = np.zeros(word_vectors.vector_size, dtype=np.float32)
    weighted_doc_vects.append(doc_vector_mean)

  import sys
212447it [02:47, 1267.77it/s]


In [24]:
# Persist the document vectors. The context manager makes sure the file is
# flushed and closed, even if pickling raises part-way through.
with open("weighted_doc_vects.p", "wb") as f:
    pickle.dump(weighted_doc_vects, f)

### Load document vectors, build index and search:

In [10]:
# Read back the pickled document vectors (only unpickle files you created yourself).
with open("weighted_doc_vects.p", "rb") as vectors_file:
    weighted_doc_vects = pickle.load(vectors_file)

# Stack the per-document vectors into a single 2-D matrix, one row per document.
data = np.vstack(weighted_doc_vects)

# Initialise a new HNSW index on cosine similarity and fill it - can take a couple of mins.
index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(data)
index_params = {'post': 2}
index.createIndex(index_params, print_progress=True)

In [17]:
# querying the index:
input = 'defending'.lower().split()


query = [ft_model[vec] for vec in input]
query = np.mean(query,axis=0)

t0 = time.time()
ids, distances = index.knnQuery(query, k=10)
t1 = time.time()
print(f'Searched {df.shape[0]} records in {round(t1-t0,4) } seconds \n')
for i,j in zip(ids,distances):
    print(round(j,2))
    print(df.text.values[i])

Searched 212447 records in 0.0066 seconds 

0.27
CYBER SECURITY INFORMED SAFETY CASES CONTRACT FOR CYBER SECURITY INFORMED SAFETY CASES Newport NP10 8QQ
0.27
PROVISION OF CYBER RESILIENCE COURSE PROVISION OF CYBER RESILIENCE COURSE Glasgow G2 8EX
0.28
DEFENCE CYBER PROTECTION TOOL (OCTAVIAN) CONTINUATION OF THE SUPPORT/DEVELOPMENT WORK ON THE DEFENCE CYBER PROTECTION TOOL Corsham SN13 9NR
0.28
DISATER VICTIM IDENTIFICATION SOFTWARE SPECIALIST SOFTWARE FOR IDENTIFICATION OF DISASTER VICTIMS. London SW1 4DF
0.28
PROVISION OF A DEFENCE INCIDENT MANAGEMENT DATABASE PROVISION OF A DEFENCE INCIDENT MANAGEMENT DATABASE Corsham SN13 9NR
0.28
TRANSPARENCY NOTICE: PROVISION AND DELIVERY OF CYBER RESILIENCE TRAINING COURSE THE PROVISION AND DELIVERY OF A CYBER RESILIENCE TRAINING COURSE FOR THE MINISTRY OF DEFENCE. Glasgow G2 8EX
0.29
CAPABILITY DEVELOPMENT IN SUPPORT OF DEFENSIVE CYBER OPERATIONS CAPABILITY DEVELOPMENT IN SUPPORT OF DEFENSIVE CYBER OPERATIONS FOR NAVY COMMAND MARITIME DCO CONCEP

  """
