# Semantic Search Engine Using the MIND Dataset: Microsoft News Recommendations

In [1]:
import warnings
# Suppress all warnings
warnings.filterwarnings('ignore')

## 1. Elasticsearch Setup

In [2]:
import sys
import os
from dotenv import find_dotenv
from dotenv import load_dotenv

# Make modules two directories up importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry.
if '../..' not in sys.path:
    sys.path.append('../..')

# Read the local .env file into the process environment. The trailing semicolon
# hides the return value in the notebook without rebinding `_`, which IPython
# reserves for the previous cell's output.
load_dotenv(find_dotenv());

In [3]:
from elasticsearch import Elasticsearch

In [4]:
# Connection settings are taken from the environment; a missing variable
# raises KeyError right here instead of failing later at connect time.
host_name, api_key = (os.environ[key] for key in ("HOST_NAME", "API_KEY"))

In [5]:
# Build the client from the host and API key read from the environment.
es = Elasticsearch(
    hosts=host_name,
    api_key=api_key,
)

In [6]:
# Connectivity check for the client created above; the recorded output is True.
es.ping()

True

## 2. Data Preprocessing

In [7]:
import pandas as pd

# news.tsv has no header row (header=None), so columns are numbered 0..7 here.
# nrows=500 stops parsing after the first 500 articles (index 0..499) instead
# of reading the whole file and slicing it afterwards.
df = pd.read_csv("news.tsv", delimiter='\t', header=None, nrows=500)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [8]:
# Human-readable names for the eight positional columns of the news file,
# listed in file order.
news_column_names = (
    "News ID",
    "Category",
    "SubCategory",
    "Title",
    "Abstract",
    "URL",
    "Title Entities",
    "Abstract Entities",
)
df.columns = list(news_column_names)


In [9]:
# Preview the first rows to confirm the new column names were applied.
df.head()

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [10]:
# Count rows per missing-value pattern: each distinct combination of
# True/False flags across the columns becomes one line of the result.
df.isna().value_counts()

News ID  Category  SubCategory  Title  Abstract  URL    Title Entities  Abstract Entities
False    False     False        False  False     False  False           False                481
                                       True      False  False           False                 19
Name: count, dtype: int64

In [11]:
# Replace every missing value with the literal string "None" so each field
# holds a string; rebinding keeps the cell free of in-place mutation.
df = df.fillna("None")

## 3. Vectorization using Sentence-Transformers (MPNet)

In [12]:
from sentence_transformers import SentenceTransformer
# Name of the pretrained Sentence-Transformers checkpoint used for embeddings.
embedding_model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(embedding_model_name)

In [13]:
# Encode all titles in one batched call instead of one model.encode call per
# row; the model batches the inputs internally, which is much faster.
# Values may differ from per-row encoding only by floating-point noise.
title_embeddings = model.encode(df["Title"].tolist())

# Store one embedding (1-D array) per row, in the same order as the titles.
df["TitleVector"] = list(title_embeddings)

In [14]:
# Preview the frame again; it now includes the TitleVector column.
df.head()

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities,TitleVector
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],"[0.019095603, -0.0048817745, -0.008627319, 0.0..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[0.06197543, 0.04211074, 0.0388066, -0.0036604..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...","[-0.010508516, 0.07337326, 0.0089243185, 0.036..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...","[-0.028978547, 0.025465462, -0.022636574, -0.0..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[0.041656446, -0.038394276, -0.006969393, 0.00..."


In [15]:
# Re-check the connection before ingesting; the recorded output is True.
es.ping()

True

## 4. Create Index and Ingest Data

In [17]:
# Create the "all_news" index with the mapping defined in the local
# indexMapping module. The existence check makes the cell safe to re-run:
# creating an index that already exists would raise an error.
from indexMapping import indexMapping

if not es.indices.exists(index="all_news"):
    es.indices.create(index="all_news", mappings=indexMapping)

In [18]:
# One dict per article, keyed by column name, ready to be sent as documents.
record_list = df.to_dict(orient="records")

In [19]:
from elasticsearch import helpers

# Send all documents through the bulk API instead of one request per record.
# Each document keeps its News ID as the Elasticsearch _id, so re-running the
# cell overwrites existing documents rather than duplicating them.
bulk_actions = (
    {"_index": "all_news", "_id": record["News ID"], "_source": record}
    for record in record_list
)

# raise_on_error=False returns per-document failures instead of raising on the
# first one; transport-level problems (e.g. a lost connection) still raise
# instead of being printed once per record.
indexed_count, failed_items = helpers.bulk(es, bulk_actions, raise_on_error=False)
for failed_item in failed_items:
    print(failed_item)

# Make the new documents visible to search/count immediately, so the next
# cells give the same result on a straight top-to-bottom run.
es.indices.refresh(index="all_news")

In [20]:
# Number of documents currently in the "all_news" index (500 in the recorded run).
es.count(index="all_news")

ObjectApiResponse({'count': 500, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

## 5. Semantic Search with Query

In [21]:
# Embed the query text with the same model that produced TitleVector.
input_keyword = "Trump"
vector_of_input_keyword = model.encode(input_keyword)

# kNN clause: return the k best matches on the TitleVector field after
# examining num_candidates candidates per shard.
query = {
    "field": "TitleVector",
    # Send a plain list of floats so the request body is JSON-serializable
    # without relying on numpy support in the client.
    "query_vector": vector_of_input_keyword.tolist(),
    "k": 5,
    "num_candidates": 500,
}

# Use the `knn` option of the regular search API; the dedicated knn_search
# endpoint used previously is deprecated. Only Title and Abstract are returned.
res = es.search(index="all_news", knn=query, source=["Title", "Abstract"])
res["hits"]["hits"]

[{'_index': 'all_news',
  '_id': 'N56461',
  '_score': 0.7045567,
  '_source': {'Title': "President Trump's trillion-dollar hit to homeowners",
   'Abstract': 'People should consider one of the largely unexamined effects of the last tax bill, which Trump promised would help the middle class: Would you believe it has inflicted a trillion dollars of damage on homeowners?'}},
 {'_index': 'all_news',
  '_id': 'N41013',
  '_score': 0.6972394,
  '_ignored': ['Abstract Entities.keyword', 'Title Entities.keyword'],
  '_source': {'Title': "Doug Schoen: Hillary vs. Trump in 2020? If Clinton is serious, here's best way for her to defeat the president",
   'Abstract': 'Even when she is out of the running for president, Hillary Clinton is still one of the most prominent voices in the Democratic Party.'}},
 {'_index': 'all_news',
  '_id': 'N42474',
  '_score': 0.6900089,
  '_ignored': ['Abstract Entities.keyword', 'Title Entities.keyword'],
  '_source': {'Title': "Trump's Trustbusters Bring Microsof