# Getting started

In [2]:
from whoosh.index import create_in
from whoosh.fields import *

In [3]:
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

In [4]:
print(schema)

<Schema: ['content', 'path', 'title']>


Make sure you have the directory "indexdir" created beforehand in the folder where you start the notebook

In [201]:
ix = create_in("indexdir_collection5", schema)
writer = ix.writer()

In [202]:
file_path = 'C:/Users/Bin/Desktop/collection.tsv'
with open(file_path,encoding='utf-8') as hfile:
    lines = hfile.read().splitlines()

In [194]:
# check split line detail
'''
count = 0
file_path = 'C:/Users/Bin/Desktop/test.txt'
with open(file_path,'w',encoding='utf-8') as f:
    for line in lines:
        if ('|' in line)& line.startswith('clueweb09-en')==False:
            f.write(line+'\n')
'''

     # Problem 4 (A) -- indexing the collection.tsv

In [203]:
text = ""
title = ""
path = ""
for line in lines:
    line = line.replace('||','')
    if ('|' in line)& line.startswith('clueweb09-en')==False:
        line = line.replace('|','')
    if ('|' not in line) & line.startswith('clueweb09-en'):
        writer.add_document(title="",path=line.split("\t")[0],content="")
        continue
    if line.startswith('clueweb09-en') & len(text)!=0:
        writer.add_document(title=title,path=path,content=text)
        text = ""
        title = line.split('|')[1] 
        path = line.split('|')[0].split('\t')[0]
    elif line.startswith('clueweb09-en')==False:
        text += line
    else :
        title = line.split('|')[1]
        path = line.split('|')[0].split('\t')[0]
writer.commit()

In [30]:
from whoosh.qparser import QueryParser

the query we will use now is "first"

In [209]:
with ix.searcher() as searcher:
 query = QueryParser("content", ix.schema).parse("baseball")
 results = searcher.search(query)
 print(results[0])


<Hit {'path': 'clueweb09-en0003-32-21337', 'title': 'Position:'}>


# Custom ranking functions

In [95]:
from whoosh import scoring
from whoosh import reading


In [121]:
w = scoring.TF_IDF()

In [152]:
with ix.searcher(weighting =w) as searcher:
 query = QueryParser("content", ix.schema).parse("first")
 results = searcher.search(query)
 for i in range(len(results)):
     print (results[i],results[i].score)

<Hit {'path': '/a', 'title': 'First document'}> 1.7836046756755068
<Hit {'path': '/b', 'title': 'Second document'}> 1.1890697837836712


You can define a custom scoring function too. pos_score_fn computes a score for a given document using only one field. Here the score is based on the first occurence (position) of the query term.

      # Problem 4 (A) -- tf-idf function

In [211]:
def tfidf_fn(searcher, fieldname, text, matcher):
     poses = matcher.value_as("positions")
     idf = searcher.idf(fieldname,text)
     tf = len(poses)
     return tf*idf
my_tfidf = scoring.FunctionWeighting(tfidf_fn)

In [222]:
with ix.searcher(weighting =my_tfidf) as searcher:
 query = QueryParser("content", ix.schema).parse("baseball")
 results = searcher.search(query)
 print(results)
 for i in range(10):
     print (results[i],results[i].score)

<Top 10 Results for Term('content', 'baseball') runtime=0.0008303585200337693>
<Hit {'path': 'clueweb09-en0003-32-21337', 'title': 'Position:'}> 28.459537349952047
<Hit {'path': 'clueweb09-en0006-84-09638', 'title': 'More Featured Items'}> 14.229768674976023
<Hit {'path': 'clueweb09-en0006-85-33143', 'title': "West Baden Springs Hotel's History:"}> 14.229768674976023
<Hit {'path': 'clueweb09-en0003-13-14436', 'title': 'Content-Length: 65536 Mit cloning human ear mitchell barlow woolworths queensgate  mitchell airport in milwaukee wi  misty voicings  misty vs super sonic 4 . Misty edwards worship music mistress anjolie  mistress of voltaire  mitch cain taxidermy  misty mundae in black opaque tights  mistletoe kiss  mistress desiree maine'}> 14.229768674976023
<Hit {'path': 'clueweb09-en0006-84-09637', 'title': 'More Featured Items'}> 9.486512449984016
<Hit {'path': 'clueweb09-en0006-84-09636', 'title': 'More Featured Items'}> 9.486512449984016
<Hit {'path': 'clueweb09-en0008-17-16833', 

      # Problem 4 (A) -- tf*position function

In [220]:
def tftp_fn(searcher, fieldname, text, matcher):
     poses = matcher.value_as("positions")
     tf = len(poses)
     sum = 0
     for p in poses:
        sum += tf*p 
     return sum
my_tftp = scoring.FunctionWeighting(tftp_fn)

In [221]:
with ix.searcher(weighting =my_tftp) as searcher:
 query = QueryParser("content", ix.schema).parse("baseball")
 results = searcher.search(query)
 print(results)
 for i in range(10):
     print (results[i],results[i].score)

<Top 10 Results for Term('content', 'baseball') runtime=0.0005324213634594344>
<Hit {'path': 'clueweb09-en0003-13-14436', 'title': 'Content-Length: 65536 Mit cloning human ear mitchell barlow woolworths queensgate  mitchell airport in milwaukee wi  misty voicings  misty vs super sonic 4 . Misty edwards worship music mistress anjolie  mistress of voltaire  mitch cain taxidermy  misty mundae in black opaque tights  mistletoe kiss  mistress desiree maine'}> 43656
<Hit {'path': 'clueweb09-en0011-33-12559', 'title': 'Home : Organize Blog'}> 7450
<Hit {'path': 'clueweb09-en0011-33-12559', 'title': 'Home : Organize Blog'}> 7450
<Hit {'path': 'clueweb09-en0003-32-21337', 'title': 'Position:'}> 6516
<Hit {'path': 'clueweb09-en0001-87-06458', 'title': 'Student Athlete'}> 3366
<Hit {'path': 'clueweb09-en0005-27-24996', 'title': 'On the Web: Read the Mitchell Report'}> 3270
<Hit {'path': 'clueweb09-en0001-86-15563', 'title': 'The Clarence M. Mitchell, Jr. School of Engineering'}> 2382
<Hit {'path'

# Indexing a collection and computing metrics

In [15]:
import csv

In [17]:
def unicode_csv_reader(utf8_data, dialect=csv.excel, **kwargs):
 csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
 for row in csv_reader:
     yield [unicode(cell, 'utf-8') for cell in row]

In [18]:
def read_file(file_path, delimiter='\t'):
 with open(file_path, 'r') as csvfile:
     reader = csv.reader(csvfile, delimiter=delimiter, quotechar='|', quoting=csv.QUOTE_MINIMAL)
 doc_list = []
 for row in reader:
     doc_list.append((row[0],row[1], row[2].replace('\n',' ')))

 return doc_list

In [19]:
doc_list = read_file("clueweb_clean_docs.tsv")

FileNotFoundError: [Errno 2] No such file or directory: 'clueweb_clean_docs.tsv'

In [29]:
print(len(doc_list),'\n', doc_list[0])

216910 
 ('\ufeff\ufeffclueweb09-en0010-82-12593', '18.0852279663', 'Roth 403b Plans Roth 403b Plans Included in the Bush Administrations Fiscal Year 2006 Budget were some changes to the way you can fund your 403b plan - the new Roth 403b.\xa0 In this publication we\'re going to discuss the benefits of a Roth 403b along with the rules of these plans including withdrawals / distributions, contributions, and income taxes. Background of the Roth 403b Starting in January 2006, operators of traditional 403b plans can offer their employees what\'s being called a Roth 403b plan.\xa0 In essence, offering "Roth" treatment of funds going into a 403b account. \xa0 Additional Resources Roth IRA versus 403b Plan A Roth 403b combines the contribution features of 403b plans with the tax-free growth advantage of Roth IRAs .\xa0 Employees currently offered the ability to fund their retirement plans via 403b plans may now be offered the option of participating in a Roth 403b.\xa0 With this new plan, you

In [30]:
schema = Schema(id=ID(stored=True), content=TEXT)
ix = create_in("cw_index", schema)
writer = ix.writer()

In [31]:
for doc in doc_list[:1000]:
 writer.add_document(id=doc[0],content=doc[2])
writer.commit()

In [32]:
query_str = "403b"
result_list = []

In [34]:
with ix.searcher() as searcher:
 query = QueryParser("content", ix.schema).parse(query_str)
 results = searcher.search(query, limit=None)
 print("Results found:", len(results))
 for result in results:
 print(result['id'], result.score)
 result_list.append(result['id'])

Results found: 288
ï»¿ï»¿clueweb09-en0010-82-12593 4.68235845867756
clueweb09-en0002-19-09466 4.606890901190984
clueweb09-en0010-82-12588 4.576296584299555
clueweb09-en0010-82-12589 4.574809817919019
clueweb09-en0010-82-12587 4.483470812852932
clueweb09-en0009-84-33862 4.450273110200792
clueweb09-en0008-24-06210 4.341685424381006
clueweb09-en0000-14-03360 4.234154273566411
clueweb09-en0004-01-03541 4.194615271966087
clueweb09-en0004-80-00508 4.193528868778884
clueweb09-en0008-41-10509 4.1769417838009195
clueweb09-en0009-52-27808 4.111156204215051
clueweb09-en0009-82-05607 4.097154045886272
clueweb09-en0007-55-00718 4.065995269601156
clueweb09-en0001-41-08510 4.035306840104864
clueweb09-en0010-82-12518 3.9562982164117613
clueweb09-en0007-55-00719 3.9464327968911745
clueweb09-en0008-24-06121 3.913863776352913
clueweb09-en0009-94-31189 3.8456964156242117
clueweb09-en0005-82-07314 3.813692823434889
clueweb09-en0008-62-39243 3.746270626291283
clueweb09-en0003-88-24169 3.738143499793505
clue

In [35]:
def read_qrels(file_path, delimiter=' '):
 with open(file_path, 'r') as csvfile:
 reader = csv.reader(csvfile, delimiter=delimiter, quotechar='|', quoting=csv.QUOTE_MINIMAL)
 qrels = {}
 for row in reader:
 qrels[row[0]] = int(row[1])

 return qrels

In [36]:
qrels_hash = read_qrels("403b-qrels.csv")

In [48]:
def precision(doc_list, qrels, k=10):
 f = lambda x: qrels[x] if x in qrels else 0
 vals = list(map(lambda q: 1 if q>0 else 0, map(f, doc_list[:k])))
 print(vals)
 return sum(vals)/k

In [49]:
precision(result_list, qrels_hash, k=15)

[0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1]


0.5333333333333333

In [52]:
result_list = []
with ix.searcher(weighting =w) as searcher:
 query = QueryParser("content", ix.schema).parse(query_str)
 results = searcher.search(query, limit=None)
 print("Results found:", len(results))
 for result in results:
 print(result['id'], result.score)
 result_list.append(result['id'])

Results found: 288
ï»¿ï»¿clueweb09-en0010-82-12593 105.34244377087613
clueweb09-en0010-82-12589 73.96384349870026
clueweb09-en0010-82-12588 60.51587195348203
clueweb09-en0010-82-12518 60.51587195348203
clueweb09-en0010-82-12587 56.03321477174262
clueweb09-en0002-19-09466 44.8265718173941
clueweb09-en0004-37-02167 40.34391463565469
clueweb09-en0004-01-03541 35.86125745391528
clueweb09-en0009-84-33862 26.89594309043646
clueweb09-en0007-18-30966 24.654614499566755
clueweb09-en0003-93-27146 20.171957317827346
clueweb09-en0011-52-03669 20.171957317827346
clueweb09-en0004-80-00508 17.93062872695764
clueweb09-en0003-95-21017 15.689300136087935
clueweb09-en0008-41-10509 11.206642954348524
clueweb09-en0011-67-00065 11.206642954348524
clueweb09-en0009-79-14364 11.206642954348524
clueweb09-en0007-55-00719 8.96531436347882
clueweb09-en0008-24-06210 8.96531436347882
clueweb09-en0010-82-12592 8.96531436347882
clueweb09-en0007-18-30977 8.96531436347882
clueweb09-en0003-88-24170 8.96531436347882
cluew

In [53]:
precision(result_list, qrels_hash, k=15)

[0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0]


0.4666666666666667