# Build semantic document search engine with TF-IDF

In this notebook, we are using some of (7 of them) hamshahri newspapare dataset for our work.
At the beginning, we shoud store them in mongodb collection whose name in 'ham_2007'. we shoud preprocess our texts of news in order to find unique words and create tf-idf model with them.

**I run this notebook in vscode.**


In [1]:
from parsivar import Normalizer, Tokenizer, FindStems

# plot
import matplotlib.pyplot as plt

import pandas as pd
import string
import re
import numpy as np
np.random.seed(0)

# connet to mongodb
from pymongo import MongoClient

# Library for parsing XML
import xml.etree.ElementTree as ET
import re

import os

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [2]:
# Loading NLTk
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Create MongoDB collection

In [None]:
# connect to mongodb client
client = MongoClient('localhost', port=27017)
db = client.search_engine # create a database whose name is 'search_engine'
ham_2007 = db.ham_2007 # create a collection

In [2]:
#Store file names in the filename
filename = os.listdir(u"/Users/User/Flask_Basics/Search_Engine/TF-IDF/dataset")
print(f'XML files in path : {filename}')

XML files in path : ['HAM2-070101.xml', 'HAM2-070102.xml', 'HAM2-070103.xml', 'HAM2-070104.xml', 'HAM2-070106.xml', 'HAM2-070107.xml', 'HAM2-070110.xml']


In [None]:
filepath = "/Users/User/Flask_Basics/Search_Engine/TF-IDF/dataset"


def xml_to_mongodb(filepath, filename):
    # parse xml file
    for i in range(len(filename)):
        tree = ET.parse(os.path.join(filepath, filename[i]))
        root = tree.getroot()

        # write xml file on database 
        for elem in root.findall("DOC"):
            rows = []

            # parse DOCID
            doc_id = elem.find("DOCID")
            if doc_id != None:
                doc_id = doc_id.text
            rows.append(doc_id)

            # parse TITLE
            title = elem.find("TITLE")
            if title != None:
                title = re.sub('\n', ' ', title.text)
            rows.append(title)  

            # parse TEXT
            sent = elem.find("TEXT")
            if sent != None:
                sentence = list(sent)[-1].tail.strip() if list(sent) else sent.text.strip()
                sentence = re.sub('\n', ' ', sentence)
            rows.append(sentence)

            # save id, title and text of a news in json file
            ham_2007.insert_one({"doc_id":rows[0] , "doc_title":rows[1], "doc_text":rows[2]})



In [None]:
# store xml files in monogdb collection whose name is 'ham_2007'
xml_to_mongodb(filepath, filename)

## Create dataframe from monogdb data

In [3]:
df = pd.DataFrame()

In [3]:
from pymongo import MongoClient

# connect to mongodb client to accest to created database
client = MongoClient('localhost', port=27017)
db = client.search_engine # switch to database whose name is 'search_engine'
ham_2007= db.ham_2007 # switch to a collection whose name is 'ham_2007'

In [5]:
title = []
text = []

for record in ham_2007.find():
    
    title.append(record["doc_title"])
    text.append(record["doc_text"])

In [6]:
print(f'length of title : {len(title)}')
print(f'length of text : {len(text)}')

length of title : 731
length of text : 731


In [7]:
df['title'] = title
df['text'] = text

In [8]:
df.head()

Unnamed: 0,title,text
0,مديركل كتاب و كتابخواني وزارت فرهنگ و ارشاد ا...,فارس: مدير كل كتاب و كتاب خواني وزارت فرهنگ و...
1,مرگ هري پاتر و دردسر پستي انگلستان,ايسنا: در حالي كه تاريخ دقيق انتشار آخرين كتاب...
2,تغيير در اجراي اسكار 2007,ايسنا: نامزدهاي بهترين فيلم خارجي مراسم اسكار ...
3,اخبار كوتاه,چاپ 24 شازده كوچولو: ترجمه شازده كوچولوي محمد ...
4,مهران مديري: پا در كفش بزرگان كرده ام,فارس: مهران مديري گفت: ترانه هايي را كه در مجم...


In [9]:
df.shape

(731, 2)

Let's save this dataframe in order to use it in google colab for applying preprocessing function on the 'text' column with pandarallel library. (For using pandarallel in windows, we should install wsl (windows subsystem linux) on it.)

In [10]:
# # save dataframe
# df.to_pickle("/Users/User/Flask_Basics/Search_Engine/TF-IDF/df")

## Text preprocessing

### Stop Words

In [40]:
# define a function to read file
def readFile(filename):
  fileObj = open(filename, 'r', encoding ='utf-8') # open the file in read mode
  words = fileObj.read().splitlines() # puts the file into an array
  fileObj.close()
  return words

In [41]:
stopwords = readFile('/Users/User/Flask_Basics/persian_stopwords_kharazi.txt')
print(f'length of stop words : {len(stopwords)}')


length of stop words : 1370


In [42]:
print(f'some of stop words : {stopwords[:10]}')

some of stop words : ['!', '"', '#', '(', ')', '*', ',', '-', '.', '/']


### Preprocessing Function

Because, after stemming we have '&' in between of our verbs, we should delete them in function delete_and.

In [43]:
def delete_and(word):
  idx = word.find("&")
  if idx!=-1:
    word = word[:idx]
  return word

In [44]:
def data_preprocessing(review, stopwords):
  try:
    # replace half-space with ' '
    review =  re.sub('\u200c', ' ',review)

    # Normalizing the text
    # First we should normalize text in order to convert persian numbers into english numbers then
    # with following function (filter) delete them
    # Because, some comments are pinglish, we should set pinglish_conversion_needed = True
    normalizer = Normalizer() 
    review = normalizer.normalize(review)

    # because after normalization appear some '/u200c', we should replace them with space
    review =  re.sub('\u200c', ' ',review)

    # delete english characters and numbers from sentences
    review = filter(lambda x: x in string.whitespace or x not in string.printable, review)
    review = ''.join(ch for ch in list(review))
    
    if review != ' ':
      # word tokenization
      tokenizer = Tokenizer()
      words = tokenizer.tokenize_words(review)

      # stemming 
      stemmer = FindStems()
      review = [stemmer.convert_to_stem(word) for word in words]

      # we should delete '&', because after stemming we have '&' in between of our verbs
      review = [delete_and(word) for word in review]

      # remove stop words
      words_without_stopword = filter(lambda x: x not in stopwords, review)
      words_without_stopwords = list(words_without_stopword)
    
      # join words in preprocessed review
      review = ' '.join(words_without_stopwords)
    
    return review

  except TypeError:
    print(review)
    raise

In [13]:
stemmer = FindStems()
rev = stemmer.convert_to_stem('است')


In [14]:
rev

'اس'

### Apply preprocessing function on the dataset

This work took 38 seconds for me.

In [22]:
df['clean_text'] = df['text'].apply(lambda text: data_preprocessing(text, stopwords))

In [23]:
df.head(20)

Unnamed: 0,title,text,clean_text
0,مديركل كتاب و كتابخواني وزارت فرهنگ و ارشاد ا...,فارس: مدير كل كتاب و كتاب خواني وزارت فرهنگ و...,فارس مدیر کتاب کتاب خواند وزارت فرهنگ ارشاد اس...
1,مرگ هري پاتر و دردسر پستي انگلستان,ايسنا: در حالي كه تاريخ دقيق انتشار آخرين كتاب...,ایسنا تاریخ دقیق انتشار آخرین کتاب مجموعه داست...
2,تغيير در اجراي اسكار 2007,ايسنا: نامزدهاي بهترين فيلم خارجي مراسم اسكار ...,ایسنا نامزد فیلم خارجی مراسم اسکار مرحله معرفی...
3,اخبار كوتاه,چاپ 24 شازده كوچولو: ترجمه شازده كوچولوي محمد ...,چاپ شازده کوچولو ترجمه شازده کوچولو محمد قاضی ...
4,مهران مديري: پا در كفش بزرگان كرده ام,فارس: مهران مديري گفت: ترانه هايي را كه در مجم...,فارس مهران مدیر ترانه مجموعه باغ مظفر خوانده ا...
5,تصحيح و توضيح خبر فولادوند,روز شنبه در صفحه يك روزنامه، خبري با عنوان عزت...,شنبه صفحه روزنامه خبری عزت الله فولادوند دوران...
6,سايه روشن,مراسم روز هفت عبداللهي: مراسم گراميداشت هفتمين...,مراسم عبداللهی مراسم گرامیداشت هفتمین درگذشت ن...
7,برپايي نمايشگاه عكس سماع عاشقانوآرامگاه مولانا,گروه ادب و هنر براي اولين بار در كشور توسط يك ...,گروه ادب هنر کشور عکاس ایرانی مجموعه عکس سماع ...
8,عكس روز,عكس: رويترز‎/ ابوهيكل يك فروشگاه در فلسطين، عر...,عکس رویترز ابوهیکل فروشگاه فلسطین عروسک بن لاد...
9,در شهر چه خبر,نمايشگر هاي همشهري رسانه اي با 300 هزار بيننده...,نمایشگر همشهری رسانه بیننده نمایشگر خبری همشهر...


In [24]:
df.shape

(731, 3)

In [25]:
# # save dataframe
# df.to_pickle("/Users/User/Flask_Basics/Search_Engine/TF-IDF/df_clean")

In [4]:
#load the preprocessing dataframe
df= pd.read_pickle(r'/Users/User/Flask_Basics/Search_Engine/TF-IDF/df_clean')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       731 non-null    object
 1   text        731 non-null    object
 2   clean_text  731 non-null    object
dtypes: object(3)
memory usage: 17.3+ KB


Let's save clean textes a in mongodb collectin whose name is 'ham_clean'.

In [12]:
ham_clean = db.ham_clean

for i in range(df.shape[0]):
    ham_clean.insert_one({"title": df["title"][i], "text":df["clean_text"][i]})

## Document Search engine with TF-IDF

### Generated TF-IDF by using TfidfVectorizer from Sklearn

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create Vocabulary
vocabulary = set()

# find unique words in whole of the documents and append them to vocabulary
for doc in df.clean_text:
    vocabulary.update(doc.split())

In [18]:
vocabulary = list(vocabulary)
print(f'length of vocabulary : {len(vocabulary)}')
print(f'some of vocabulary : {vocabulary[:15]}')

length of vocabulary : 12975
some of vocabulary : ['کبیسه', 'ارضا', 'زاویه', 'خودویران', 'ناهماهنگ', 'خفگی', 'کونل', 'ماشائالله', 'تیرگی', 'شولم', 'رضاییان', 'موثربا', 'کالری', 'البیت', 'بهروز']


In [20]:
# Intializating the tfIdf model
tfidf = TfidfVectorizer(vocabulary=vocabulary)
tfidf_tran = tfidf.fit_transform(df['clean_text'])# Fit and Transform the TfIdf model

In [21]:
print(f'tfidf_tran shape : {tfidf_tran.shape}')

tfidf_tran shape : (731, 12975)


In [210]:
print(f'type of tfidf : {type(tfidf)}')
print(f'type of tfidf_tran : {type(tfidf_tran)}')

type of tfidf : <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
type of tfidf_tran : <class 'scipy.sparse.csr.csr_matrix'>


The above cells has created TF-IDF weight of the whole dataset, Now we have to create a vector for the input query in order to calculate inner product of input query and each doc to determine similarty of them.

### Query processing

we shoud define a function that applying data_preprocessing funtion on the input query and then create vector of shape (n_docs, n_vocabulary) for it. Then find similarity of input query vector and each document vector with cosine_similarity function .

This function returns top 10 results .

In [222]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similarity(query):
    #apply preprocessing on the input query
    clean_query = data_preprocessing(query, stopwords)
    # create query vector using tfidf model
    query_vec = tfidf.transform([clean_query]) # Ip -- (n_docs,x), Op -- (n_docs,n_Feats) 
    # calculate inner product of each document vector and query vector
    # tfidf_tran : matrix of documents -- shape (n_docs, n_vocabulary=n_feats)
    results = cosine_similarity(tfidf_tran, query_vec).reshape((-1,)) # Op -- (n_docs,1) -- Cosine Sim with each doc

    results_lst = list(results)# use for show the score of each doc
    results_lst.sort() 

    # create dataframe for our output
    result_df = pd.DataFrame()
    # sort results array then use 10 elements of it from the end of the list then reverse it to have increasing order
    # np.argsort :  Returns the indices that would sort an array
    # we can change 10 to any number we want 
    out = results.argsort()[-10:][::-1]
    
    # add columns to the dataframe
    # index: index of a document
    for i,index in enumerate(out):
        result_df.loc[i,'index'] = str(index)
        result_df.loc[i,'title'] = df['title'][index]
        result_df.loc[i, 'text'] = df['text'][index]
    for j,simScore in enumerate(results_lst[-10:][::-1]):
        result_df.loc[j,'Score'] = simScore

    return result_df

In [224]:
find_similarity('وزارت خانه')

Unnamed: 0,index,title,text,Score
0,366,كركس سياه اصفهان از خانه همسايه وارد مي شد,گروه حوادث- شكارچي زنان خانه دار اصفهاني كه با...,0.37388
1,226,محققان مي گويند,خانه اي كه كنار خيابان پر رفت و آمد شهر ساخته ...,0.21754
2,27,اعتبار بيماران كليويبه وزارت رفاه منتقل مي شود,فارس: معاون سلامت وزارت بهداشت از انتقال اعتبا...,0.187749
3,362,خلافكار حرفه اي، باجه فروش مواد راه انداخته بود,گروه حوادث- با دستگيري يك فروشنده مواد مخدر كه...,0.159569
4,588,خبر,اختصاص اعتباري ويژه براي كارآفريني در بودجه 86...,0.147407
5,612,دريچه بردن كار از اداره به خانه,آرش سيواني هوا كم كم داشت تاريك مي شد، در ادار...,0.146634
6,137,دستگيري سارق در هتل مجلل,گروه حوادث- سارق 400ميليون توماني براي آنكه مخ...,0.145817
7,124,مهلت 2 ماهه وزارت رفاه براي راه اندازي بانك ا...,فارس: عضو كميسيون اجتماعي مجلس از مهلت 2 ماهه ...,0.143144
8,243,اعتراف سارق پر رو در دادسراي شميرانات اموال م...,گروه حوادث- دزد شب رو كه پس از دستبرد به خانه ...,0.140551
9,136,راز تغيير چهره جنايتكار 3 ميليارد توماني,گروه حوادث- پرونده جنايتكاري كه بعد از قتل يك ...,0.136532
