-
Notifications
You must be signed in to change notification settings - Fork 1
/
xapian_music.py
executable file
·179 lines (136 loc) · 6.3 KB
/
xapian_music.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/usr/bin/python
# -*- mode: Python; encoding: utf-8; indent-tabs-mode: nil; tab-width: 2 -*-
import xapian
from db.dirtree import get_songs
import json
import os
import time
# A mapping of query terms/aliases -> Xapian term prefixes.
# Most of these are also keys to the song[] dict.
# Prefixes from http://xapian.org/docs/omega/termprefixes.html
#
# NOTE: 'title' used to be listed twice ('S' and 'XTITLE'). In a dict
# literal the last duplicate wins, so 'XTITLE' has always been the prefix
# actually in effect; the dead 'S' entry is dropped so the literal says what
# it does. Existing databases are unaffected.
PREFIXES = {
    'artist': 'A',
    'title': 'XTITLE',
    'path': 'U',
    'album': 'XALBUM',
}

# Numeric fields. The position of a name in this list is also used as the
# Xapian value slot number for that field, so do not reorder entries without
# re-indexing.
NUMERIC_PREFIXES = ['year', 'mtime', 'tracknumber', 'rating']
def index(datapath, dbpath):
    """Index every song found under <datapath> into the database at <dbpath>.

    The database is created if it does not exist. Each song becomes one
    document whose text fields are indexed both with their prefix and
    without one (for general search), whose numeric fields are stored in
    value slots, and whose full JSON dump is kept as document data.
    """
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One stemming term generator is shared by all documents.
    generator = xapian.TermGenerator()
    generator.set_stemmer(xapian.Stem("en"))

    def make_value(s, term):
        "Convert the raw field <s> of kind <term> to a sortable serialised number."
        if term == 'mtime':
            # mtime arrives as a date string, in the default format
            # understood by time.strptime().
            number = time.mktime(time.strptime(s))
        elif term == 'rating':
            # A rating is a sequence of numbers; the highest one is kept.
            number = max([float(n) for n in s])
        else:
            # 'year' and every remaining numeric field are plain integers.
            number = int(s)
        return xapian.sortable_serialise(number)

    for song in get_songs(datapath):
        # Point the term generator at a fresh document for this song.
        document = xapian.Document()
        generator.set_document(document)

        # Prefixed terms, so that e.g. artist:foo only searches artists.
        for field, prefix in PREFIXES.items():
            generator.index_text(unicode(song[field]), 1, prefix)

        # Unprefixed terms for general search; the term position is bumped
        # after every field.
        for field in PREFIXES:
            generator.index_text(unicode(song[field]))
            generator.increase_termpos()

        # Numeric fields go into value slots; the slot number is the
        # field's position in NUMERIC_PREFIXES. Falsy values are skipped.
        for slot, field in enumerate(NUMERIC_PREFIXES):
            raw = song[field]
            if raw:
                document.add_value(slot, make_value(raw, field))

        # Keep every field around for display purposes.
        document.set_data(unicode(json.dumps(song)))

        # Tags could be added notmuch-style as boolean terms with a 'K'
        # prefix via document.add_term().

        # The identifier term makes sure each song ends up in the database
        # only once, however many times the indexer runs. The path is taken
        # relative to the data root to keep the term slightly shorter.
        # This may need better handling in the future, see the FAQ:
        # http://trac.xapian.org/wiki/FAQ/UniqueIds
        unique_term = "Q" + os.path.relpath(song['path'], datapath)
        document.add_boolean_term(unique_term)
        database.replace_document(unique_term, document)
def parse_query(q):
    """Turn the query string <q> into a query object for enquire.set_query().

    The parser stems English words (STEM_SOME strategy), knows the boolean
    prefix "tag" (term prefix "K"), every text prefix in PREFIXES, and a
    numeric value-range processor for each entry of NUMERIC_PREFIXES.
    """
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.add_boolean_prefix("tag", "K")

    # Free-text prefixes, e.g. artist:... or album:...
    for name, prefix in PREFIXES.items():
        parser.add_prefix(name, prefix)

    # Range searches over the numeric value slots; the slot number is the
    # field's position in NUMERIC_PREFIXES.
    for slot, name in enumerate(NUMERIC_PREFIXES):
        processor = xapian.NumberValueRangeProcessor(slot, name, True)
        parser.add_valuerangeprocessor(processor)

    return parser.parse_query(q)
def query(dbpath, querystring, order=None):
    """Run <querystring> against the database at <dbpath> and yield matches.

    This is mostly for internal use, as it yields raw xapian match
    objects. <order> may be None or the name of any numeric term listed in
    NUMERIC_PREFIXES; in the latter case results are sorted by that value
    and then by relevance.
    """
    database = xapian.Database(dbpath)
    parsed = parse_query(querystring)

    enquire = xapian.Enquire(database)
    enquire.set_query(parsed)
    # Document ID order is irrelevant here, so let Xapian optimise.
    enquire.set_docid_order(enquire.DONT_CARE)

    if order in NUMERIC_PREFIXES:
        # The value slot of a numeric term is its position in the list.
        enquire.set_sort_by_value_then_relevance(
            NUMERIC_PREFIXES.index(order), False)

    # Ask for as many results as there are documents, i.e. all of them.
    matches = enquire.get_mset(0, database.get_doccount())
    for match in matches:
        yield match
def search(dbpath, querystring, order=None):
    """Search the database at <dbpath> with <querystring>.

    Return a list with one dict per match, holding the document 'id', the
    1-based 'rank', the match 'percent' and the decoded JSON 'data' stored
    with the document. <order> is passed straight on to query().
    """
    results = []
    for hit in query(dbpath, querystring, order):
        results.append({
            'id': hit.docid,
            'rank': hit.rank + 1,
            'percent': hit.percent,
            'data': json.loads(unicode(hit.document.get_data())),
        })
    return results
def add_tag(dbpath, querystring, tag):
    """Add the tag <tag> to all songs matching <querystring>.

    For every matching document the boolean term 'K' + tag.lower() is
    added and <tag> is recorded in the 'tags' list of the stored JSON data.
    Tagging a song that already carries <tag> leaves its 'tags' list
    unchanged instead of appending a duplicate entry.

    Raises KeyError if a matching document's data has no 'tags' entry.
    """
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
    # The term is the same for every document, so build it once.
    tag_term = 'K' + tag.lower()
    for m in query(dbpath, querystring):
        doc = m.document
        data = json.loads(doc.get_data())
        tags = data['tags']
        # Do not store the same tag twice when a song is re-tagged.
        if tag not in tags:
            tags.append(tag)
        doc.add_boolean_term(tag_term)
        doc.set_data(unicode(json.dumps(data)))
        # This is to make sure the term was actually added BEFORE
        # modifying the database.
        assert tag_term in [t.term for t in doc.termlist()]
        db.replace_document(m.docid, doc)
def remove_tag(dbpath, querystring, tag):
    """Remove the tag <tag> from all songs matching <querystring>.

    For every matching document the boolean term 'K' + tag.lower() is
    dropped (if present) and every entry equal to <tag> is filtered out of
    the 'tags' list in the stored JSON data; other tags are kept.

    Note that the term is case-folded while the stored tag list is compared
    exactly, mirroring how add_tag() records tags.

    Raises KeyError if a matching document's data has no 'tags' entry.
    """
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
    # The term is the same for every document, so build it once.
    tag_term = 'K' + tag.lower()
    for m in query(dbpath, querystring):
        doc = m.document
        data = json.loads(doc.get_data())
        # The loop variable must not be named `tag`: that would compare each
        # entry with itself (dropping every tag) and, on Python 2, also
        # overwrite the <tag> argument for the rest of this function.
        data['tags'] = [t for t in data['tags'] if t != tag]
        # remove_term() raises if the term is not in the document, and the
        # query may well match songs that never had this tag.
        if tag_term in [t.term for t in doc.termlist()]:
            doc.remove_term(tag_term)
        doc.set_data(unicode(json.dumps(data)))
        # Make sure the term is really gone BEFORE modifying the database.
        assert tag_term not in [t.term for t in doc.termlist()]
        db.replace_document(m.docid, doc)
def all_songs(dbpath):
    """Return an iterator over all songs stored in the database <dbpath>.

    Each item is a dict with the document 'id' and the decoded JSON 'data'
    stored with it. Documents are fetched lazily while iterating.
    """
    database = xapian.Database(dbpath)
    # The posting list of the empty term is iterated to visit every document.
    postings = database.postlist("")
    return (
        {'id': document.get_docid(), 'data': json.loads(document.get_data())}
        for document in (database.get_document(posting.docid)
                         for posting in postings)
    )