# 1 Parsing

# 2 Indexing
In this part we build inverted indexes over the URL, title, and body of the parsed web pages.


## 2.1 Importing the Necessary Libraries and Data

In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
from PersianStemmer import PersianStemmer
from urllib.parse import urlparse
import pickle
import time
from math import log10

In [2]:
# Load the parsed pages. The first CSV column becomes the DataFrame index;
# its labels are later passed to the indexing functions as document IDs.
docs = pd.read_csv('Parsed_pages.csv', index_col=0)
# Preview the first rows.
docs.head()

Unnamed: 0_level_0,URL,Title,Body
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,http://abu.ut.ac.ir/,به سايت پرديس ابوريحان دانشگاه تهران خوش آمديد,http://abu.ut.ac.ir صفحه اصلي | گروههاي آموزشي...
1,http://abu.ut.ac.ir/arshiv.htm,صفحه مدیریت پردیس ابوریحان دانشگاه تهران,آرشيو اخبار پرديس ابوريحان دانشگاه تهران افتتا...
2,http://abu.ut.ac.ir/news/1001.htm,دبیرخانه خبرنامه انجمن علوم باغبانی ایران به گ...,http://abu.ut.ac.ir صفحه اصلي | گروههاي آموزشي...
3,http://abu.ut.ac.ir/news/rezayeean.htm,انتصاب دكتر محمد رضائيان به عنوان رئيس جديد پر...,http://abu.ut.ac.ir صفحه اصلي | گروههاي آموزشي...
4,http://abu.ut.ac.ir/persian/administration.htm,صفحه مدیریت پردیس ابوریحان دانشگاه تهران,مديريت پرديس ابوريحان دانشگاه تهران ریاست پردي...


In [3]:
# Number of rows (parsed pages) in the dataset.
len(docs)

10782

## 2.2 Preprocessing and Indexing the Data


In [4]:


# preprocess the string
def preprocess_string(doc_string):
    """Turn a raw text string into a list of cleaned, lowercased tokens.

    Steps, in order:
      1. Replace the Arabic letters kaf and yeh with their Persian forms.
      2. Tokenize with ``word_tokenize``.
      3. Delete ASCII punctuation plus the Persian comma and the en dash
         from every token, dropping tokens that become empty.
      4. Lowercase what is left.

    Parameters
    ----------
    doc_string : str
        Text to process.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Arabic -> Persian letter forms
    normalized = doc_string.replace('ك', 'ک').replace('ي', 'ی')

    # Deletion table: every listed character is removed by translate()
    strip_table = str.maketrans('', '', string.punctuation + '،–')

    cleaned_tokens = []
    for raw_token in word_tokenize(normalized):
        stripped = raw_token.translate(strip_table)
        # Tokens made only of punctuation are empty by now; skip them
        if stripped:
            cleaned_tokens.append(stripped.lower())

    return cleaned_tokens



In [12]:
# One empty index per page field (URL, title, body)
url_index, title_index, body_index = {}, {}, {}

# Stemmers applied to every token by the indexer
p_stemmer = PersianStemmer()  # Persian
e_stemmer = PorterStemmer()   # Porter stemmer from NLTK


def url_parse(docID, row, index):
    """Record a document under the hostname of its URL.

    Parameters
    ----------
    docID : hashable
        Identifier of the document.
    row : str
        The document's URL.
    index : dict
        Mapping ``hostname -> [docID, ...]``, updated in place. URLs for
        which ``urlparse`` finds no hostname are grouped under the key
        ``None``.
    """
    host = urlparse(row).hostname
    # Create the list on first sight of this host, then append either way
    index.setdefault(host, []).append(docID)







# define the indexer
def indexer(docID, row, index):
    """Add the tokens of one text field to a positional inverted index.

    Every token returned by ``preprocess_string`` is stemmed with the Persian
    stemmer and then with the Porter stemmer, and recorded under the
    resulting term as ``index[term] == [count, {docID: [positions]}]``.

    ``count`` is incremented once per occurrence, so it is the total number
    of occurrences of the term over everything indexed so far, not the
    number of documents that contain it. ``positions`` are token offsets in
    the preprocessed token list of ``row``.

    Parameters
    ----------
    docID : hashable
        Identifier of the document the text belongs to.
    row : object
        Text to index. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) add nothing to the index.
    index : dict
        Index to update in place.
    """
    # A missing field has no text. Skip it rather than stringifying it, which
    # would make the document searchable under the literal term "nan"/"none".
    if row is None or (isinstance(row, float) and row != row):
        return

    # Explicit conversion instead of a bare ``except:`` retry, so unrelated
    # errors (and KeyboardInterrupt during a long run) are no longer swallowed.
    text = row if isinstance(row, str) else str(row)

    for pos, token in enumerate(preprocess_string(text)):
        # Persian stemmer first, then Porter, applied to every token
        term = e_stemmer.stem(p_stemmer.stem(token))

        # Create [count, postings] on first sight of the term
        entry = index.setdefault(term, [0, {}])
        entry[0] += 1
        entry[1].setdefault(docID, []).append(pos)


In [13]:
# Build the URL, title and body indexes.
# Empty the indexes first so that re-running this cell rebuilds them instead
# of appending duplicate postings and taking the log of already-logged counts.
for stale_index in (url_index, title_index, body_index):
    stale_index.clear()

st = time.perf_counter()

# Iterate each column directly; the index label of a row is its document ID.
for doc_id, url in docs['URL'].items():
    url_parse(doc_id, url, url_index)
for doc_id, title in docs['Title'].items():
    indexer(doc_id, title, title_index)
for doc_id, body in docs['Body'].items():
    indexer(doc_id, body, body_index)

# Replace each term's occurrence count with its base-10 logarithm.
# NOTE(review): the stored count is total occurrences, not document
# frequency; confirm this is the intended statistic.
for text_index in (title_index, body_index):
    for entry in text_index.values():
        entry[0] = log10(entry[0])

ft = time.perf_counter()
print('time spent to index pages: ', '{:.2f}'.format(ft - st))


time spent to index pages:  649.44


In [7]:
# Persist the three indexes so they can be reloaded without re-indexing
for pickle_path, index_dict in (
    ('url_index_dic.pkl', url_index),
    ('title_index_dic.pkl', title_index),
    ('body_index_dic.pkl', body_index),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(index_dict, f)
 

In [2]:
       
# Restore all three saved indexes, so that a fresh kernel can skip the slow
# indexing step and the later cells (including len(url_index)) still work.
# Only unpickle files you created yourself: pickle.load can execute
# arbitrary code from a malicious file.
with open('url_index_dic.pkl', 'rb') as f:
    url_index = pickle.load(f)

with open('title_index_dic.pkl', 'rb') as f:
    title_index = pickle.load(f)

with open('body_index_dic.pkl', 'rb') as f:
    body_index = pickle.load(f)

In [14]:
# Inspect the entry stored for one title term: [log10 of count, {docID: [positions]}].
title_index['یزد']

[3.2528530309798933,
 {7547: [4],
  7556: [4],
  7558: [4],
  7559: [2],
  7560: [3],
  7561: [3],
  7562: [3],
  7563: [3],
  7564: [3],
  7565: [3],
  7566: [3],
  7567: [3],
  7568: [3],
  7569: [3],
  7570: [3],
  7571: [3],
  7572: [3],
  7573: [3],
  7574: [3],
  7575: [3],
  7576: [3],
  7577: [3],
  7578: [3],
  7579: [3],
  7580: [3],
  7581: [3],
  7582: [3],
  7583: [3],
  7584: [3],
  7585: [3],
  7586: [3],
  7587: [3],
  7588: [3],
  7589: [3],
  7590: [3],
  7591: [3],
  7592: [3],
  7593: [3],
  7594: [3],
  7595: [3],
  7596: [3],
  7597: [3],
  7598: [3],
  7599: [3],
  7600: [3],
  7601: [3],
  7602: [3],
  7603: [3],
  7604: [3],
  7605: [3],
  7606: [3],
  7607: [3],
  7608: [3],
  7609: [3],
  7610: [3],
  7611: [3],
  7612: [3],
  7613: [3],
  7614: [3],
  7615: [3],
  7616: [3],
  7617: [3],
  7618: [3],
  7619: [3],
  7620: [3],
  7621: [3],
  7622: [3],
  7623: [3],
  7624: [3],
  7625: [3],
  7626: [3],
  7627: [3],
  7628: [3],
  7629: [3],
  7630: [3],
  76

In [6]:
# Show the page whose index label is 7547, the first docID in the postings shown above.
docs.loc[7547]

URL                             http://www.tebyan-yazd.ir/
Title                      سازمان تبلیغات اسلامی استان یزد
Body     صفحه اول :: عضویت در سایت :: ايميل :: English ...
Name: 7547, dtype: object

In [15]:
# Number of distinct stemmed terms in the title index.
len(title_index)

6157

In [16]:
# Number of distinct stemmed terms in the body index.
len(body_index)

268765

In [9]:
# Number of distinct hostnames in the URL index.
len(url_index)

19