Implement better indexing (re-use old entries). Add logging.

1 parent 841189e commit 91dbccdfb2a249340c3a450d025a6275a990e239 Albin Stjerna committed Mar 18, 2013
Showing with 216 additions and 133 deletions.
  1. +58 −41 db/dirtree.py
  2. +128 −63 db/xapian_music.py
  3. +30 −29 test/test.py
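
As a rough orientation, here is a minimal usage sketch of the reworked indexing flow this commit introduces. It is only a sketch: the music directory, database path and query string are illustrative assumptions, and the 'artist' prefix is assumed to be defined in PREFIXES.

# Sketch only: the paths and the query string below are assumptions.
import logging
from db import xapian_music

# The commit adds logging; enable it to see which files are skipped,
# merged or newly indexed.
logging.basicConfig(level=logging.INFO)

# index() now re-uses old entries: files already in the database are left
# alone unless their mtime is newer than the stored one, in which case the
# old entry is merged with the freshly parsed metadata.
xapian_music.index("/home/user/Music", "/home/user/.musicdb")

# Query the updated index; search() returns SongMatch objects (see the
# xapian_music.py diff below).
for match in xapian_music.search("/home/user/.musicdb", "artist:bowie"):
    print match['data']['title']
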
db/dirtree.py
@@ -1,5 +1,6 @@
# -*- mode: Python; encoding: utf-8; indent-tabs-mode: nil; tab-width: 2 -*-
+# FIXME: standardise naming conventions for functions.
import codecs
from mutagen.flac import FLAC
import mutagen
@@ -17,6 +18,23 @@ class FileFormatError(Exception):
pass
+class SongData(dict):
+ """A dictionary whose authorised keys are also set as attributes."""
+ def __init__(self, *args, **kwargs):
+ self.mtime = kwargs['mtime']
+ self.genre = kwargs['genre']
+ self.lastplayed = kwargs['lastplayed']
+ self.rating = kwargs['rating']
+ self.length = kwargs['length']
+ self.artist = kwargs['artist']
+ self.title = kwargs['title']
+ self.year = kwargs['year']
+ self.tracknumber = kwargs['tracknumber']
+ self.album = kwargs['album']
+ self.path = kwargs['path']
+ dict.__init__(self, *args, **kwargs)
+
+
def read_metadata_from_file(af):
extension = af.split(".")[-1]
if extension == "flac":
@@ -34,6 +52,41 @@ def get_files(p):
# fixme: find out the actual encoding of the file system
yield unicode(pathjoin(dirpath, f), encoding="utf-8")
+def parseFile(filePath):
+ "Parse a music file and return a SongData object describing it."
+
+ # FIXME: return some kind of pre-defined object. Using a hashmap
+ # for this is just silly.
+ mtime = time.ctime(getmtime(filePath))
+ metadata = read_metadata_from_file(filePath)
+ genre = metadata.get("genre", [None])[0]
+ lastplayed = None
+ rating = metadata.get("rating:banshee", [None])[0]
+ length = int(metadata.info.length)
+ artist = unicode(metadata["artist"][0])
+ title = unicode(metadata["title"][0])
+ year = metadata["date"][0]
+ track = metadata.get("tracknumber", [0])[0]
+ try:
+ tracknumber = int(track)
+ except ValueError:
+ # Handle those ugly "1/10" track number formats
+ tracknumber = int(str(track).split("/")[0])
+ album = unicode(metadata["album"][0])
+
+ return SongData(length=length,
+ artist=artist,
+ title=title,
+ mtime=mtime,
+ year=year,
+ album=album,
+ path=filePath,
+ tracknumber=tracknumber,
+ tags=["index"],
+ lastplayed=lastplayed,
+ rating=rating,
+ genre=genre)
+
def get_songs(tree, prefilter=None, postfilter=None):
"""Generator that returns a dictionary of metadata for a number of songs in that directory tree."""
@@ -45,50 +98,14 @@ def get_songs(tree, prefilter=None, postfilter=None):
continue
try:
- metadata = read_metadata_from_file(fl)
- except FileFormatError as e:
+ data = parseFile(fl)
+ except FileFormatError as e:
continue
except mutagen.flac.FLACNoHeaderError:
continue
-
- try:
- genre = metadata.get("genre", [None])[0]
- lastplayed = None
- rating = metadata.get("rating:banshee", [None])[0]
- length = int(metadata.info.length)
- artist = unicode(metadata["artist"][0])
- title = unicode(metadata["title"][0])
- year = metadata["date"][0]
- track = metadata.get("tracknumber", [0])[0]
- try:
- tracknumber = int(track)
- except ValueError:
- # Handle those ugly "1/10" track number formats
- tracknumber = int(str(track).split("/")[0])
- album = unicode(metadata["album"][0])
- except KeyError as e:
- #print "W: No metadata field \"%s\" in file %s" % (e[0], fl)
- continue
-
+
if postfilter:
- if not postfilter(fl, mtime, length, artist, title, year, album, tracknumber):
+ if not postfilter(data):
continue
- yield {"length" : length,
- "artist" : artist,
- "title" : title,
- "mtime" : mtime,
- "year" : year,
- "album" : album,
- "path" : fl,
- "tracknumber" : tracknumber,
- "tags" : ["index"],
- "lastplayed" : lastplayed,
- "rating" : rating,
- "genre" : genre
-
- # investigate which other metadata posts we should include.
- #"mdata" : metadata
- }
-
-
+ yield data
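
For context, a hedged sketch of how the new parseFile()/SongData pair above might be used on its own; the flac path is an illustrative assumption:

from db import dirtree

# parseFile() returns a SongData: a dict whose authorised keys are also
# set as attributes in __init__.
song = dirtree.parseFile(u"/home/user/Music/Heroes/01 - Heroes.flac")  # illustrative path

print song.artist, song.album, song.tracknumber   # attribute access
print song['length'], song['mtime']               # plain dict access still works
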
db/xapian_music.py
@@ -3,9 +3,12 @@
import xapian
from db.dirtree import get_songs
+import db.dirtree as dt
import json
import os
import time
+#import hashlib
+import logging
# a mapping of query term/aliases → xapian prefixes.
# most of these are also keys to the song[] dict.
@@ -20,78 +23,141 @@
NUMERIC_PREFIXES = ['year', 'mtime', 'lastplayed',
'tracknumber', 'rating', 'length']
-def index(datapath, dbpath):
- """Create or update the index stored in database <dbpath>, using
- the music file/directory structure in <datapath>."""
- # Create or open the database we're going to be writing to.
- db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
- # Set up a TermGenerator that we'll use in indexing.
- termgenerator = xapian.TermGenerator()
- termgenerator.set_stemmer(xapian.Stem("en"))
-
- def make_value(s, term):
- """Parse various string values and return suitable numeric
- representations."""
- if term == 'year':
- # This is in a date string format due to serialization.
- return xapian.sortable_serialise(int(s))
- if term == 'mtime':
- return xapian.sortable_serialise(time.mktime(time.strptime(s)))
- if term == 'rating':
- return xapian.sortable_serialise(float(s))
- else:
- return xapian.sortable_serialise(int(s))
+class SongMatch(dict):
+ def __init__(self, **kwargs):
+ self.data = dt.SongData(**kwargs['data'])
+ self.id = kwargs['id']
+ self.rank = kwargs['rank']
+ self.percent = kwargs['percent']
- for song in get_songs(datapath):
- # We make a document and tell the term generator to use this.
- doc = xapian.Document()
- termgenerator.set_document(doc)
+ setDict = {'data' : self.data, 'id' : self.id,
+ 'rank' : self.rank, 'percent' : self.percent}
+ dict.__init__(self, **setDict)
- # Index each field with a suitable prefix.
- for term in PREFIXES:
- termgenerator.index_text(unicode(song[term]), 1, PREFIXES[term])
- # Index fields without prefixes for general search.
- for term in PREFIXES:
- termgenerator.index_text(unicode(song[term]))
- termgenerator.increase_termpos()
+def addSong(db, songData):
+ """Add a song described by songData to the xapian WritableDatabase
+ db. Performs no check for an existing entry and will overwrite
+ existing data without warning."""
+ doc = xapian.Document()
- for data_slot, term in enumerate(NUMERIC_PREFIXES):
- if song[term]:
- doc.add_value(data_slot, make_value(song[term], term))
-
+ # Set up a TermGenerator that we'll use in indexing.
+ termGenerator = xapian.TermGenerator()
+ termGenerator.set_stemmer(xapian.Stem("en"))
+ termGenerator.set_document(doc)
- # Store all the fields for display purposes.
- doc.set_data(unicode(json.dumps(song)))
+ # Index each field with a suitable prefix.
+ for term in PREFIXES:
+ termGenerator.index_text(unicode(songData[term]),
+ 1, PREFIXES[term])
- # use doc.add_term(str.join(K, "my tag"), 0) to add tags the
- # way notmuch does
+ # Index fields without prefixes for general search.
+ for term in PREFIXES:
+ termGenerator.index_text(unicode(songData[term]))
+ termGenerator.increase_termpos()
- # We use the identifier to ensure each object ends up in the
- # database only once no matter how many times we run the
- # indexer.
+ for data_slot, term in enumerate(NUMERIC_PREFIXES):
+ if songData[term]:
+ doc.add_value(data_slot, make_value(songData[term], term))
+
+
+ # Store all the fields for display purposes.
+ doc.set_data(unicode(json.dumps(songData)))
+
+ # We use the identifier to ensure each object ends up in the
+ # database only once no matter how many times we run the
+ # indexer.
+
+ idterm = "P" + songData.path
+ # previous solution:
+ # hashlib.sha256(songData.path).hexdigest()
+ doc.add_boolean_term(idterm)
+ db.replace_document(idterm, doc)
+
+def mergeSongs(songA, songB):
+ """Merge the two song data sets. Fields present in either song are
+ kept in the result; songA's values are preferred when both songs
+ have a value."""
+ mergedSong = dict()
+ for key in songA:
+ if songA[key]:
+ # This check may look redundant, but it handles empty strings etc.
+ if songB[key]:
+ logging.warning("Throwing away %s value %s in merge"
+ % (key, songB[key]))
+ mergedSong[key] = songA[key]
+ elif songB[key]:
+ # This catches the case where there was an empty string
+ # at the key, but where songB has something.
+ mergedSong[key] = songB[key]
+
+ for key in songB:
+ if key not in mergedSong or not mergedSong[key]:
+ mergedSong[key] = songB[key]
+
+ return dt.SongData(**mergedSong)
+
+def pathInDB(db, path):
+ """Return True if a song with the given path exists in the xapian
+ database db, False otherwise."""
+ enquire = xapian.Enquire(db)
+ enquire.set_query(parse_query("path:" + path))
+ enquire.set_docid_order(enquire.DONT_CARE)
+ return bool(enquire.get_mset(0, db.get_doccount()))
+
+def make_value(s, term):
+ """Parse various string values and return suitable numeric
+ representations."""
+ if term == 'year':
+ # This is in a date string format due to serialization.
+ return xapian.sortable_serialise(int(s))
+ if term == 'mtime':
+ return xapian.sortable_serialise(time.mktime(time.strptime(s)))
+ if term == 'rating':
+ return xapian.sortable_serialise(float(s))
+ else:
+ return xapian.sortable_serialise(int(s))
- # Using relative paths to the data root to get slightly
- # shorter arguments.
+def index(datapath, dbpath):
+ """Create or update the index stored in database <dbpath>, using
+ the music file/directory structure in <datapath>."""
+ # Create or open the database we're going to be writing to.
+ db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
- # In the future, we might need to handle this better, see this
- # FAQ: http://trac.xapian.org/wiki/FAQ/UniqueIds
- idterm = "Q" + os.path.relpath(song['path'], datapath)
- doc.add_boolean_term(idterm)
- db.replace_document(idterm, doc)
+ # Make sure all songs in the directory are in the database.
+ for filePath in dt.get_files(datapath):
+ if not pathInDB(db, filePath):
+ addSong(db, dt.parseFile(filePath))
+ else:
+ mtimeFile = os.path.getmtime(filePath)
+ dbEntry = search(dbpath, "path:" + filePath)[0]
+ mtimeDB = time.mktime(time.strptime(dbEntry['data']['mtime']))
+ if mtimeFile > mtimeDB:
+ logging.warning("File %s has changed." % filePath)
+ addSong(db, mergeSongs(dbEntry['data'], dt.parseFile(filePath)))
+ else:
+ logging.info("File %s hasn't changed." % filePath)
+
+ # Now, make sure no songs have disappeared.
+ songFiles = dt.get_files(datapath)
+ for song in all_songs(dbpath):
+ songPath = song['data']['path']
+ if not songPath in songFiles:
+ logging.warning("Song file %s has disappeared!" % songPath)
+ # FIXME: delete the stale database entry here.
def parse_query(q):
"""Parse the query <q> and return a query ready for use with
enquire.set_query()."""
-
+
# Set up a QueryParser with a stemmer and suitable prefixes
queryparser = xapian.QueryParser()
queryparser.set_stemmer(xapian.Stem("en"))
queryparser.set_stemming_strategy(queryparser.STEM_SOME)
queryparser.add_boolean_prefix("tag", "K")
-
+
for term in PREFIXES:
queryparser.add_prefix(term, PREFIXES[term])
@@ -101,7 +167,7 @@ def parse_query(q):
)
# And parse the query
- return queryparser.parse_query(q)
+ return queryparser.parse_query(q)
def query(dbpath, querystring, order=None):
"""Query the database at path <dbpath> with the string
@@ -127,18 +193,18 @@ def query(dbpath, querystring, order=None):
yield match
def search(dbpath, querystring, order=None):
- "Search the database at dbpath with querystring. Return list of matches."
+ """Search the database at dbpath with querystring. Return a list of
+ SongMatch objects."""
- return [({'id': match.docid,
- 'rank' : match.rank + 1,
- 'percent' : match.percent,
- 'data' : json.loads(unicode(match.document.get_data()))})
+ return [SongMatch(id=match.docid, rank=(match.rank + 1),
+ percent=match.percent,
+ data=(json.loads(unicode(match.document.get_data()))))
for match in query(dbpath, querystring, order)]
def add_tag(dbpath, querystring, tag):
"Add the tag <tag> to all songs matching <querystring>."
db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
-
+
for m in query(dbpath, querystring):
doc = m.document
data = json.loads(doc.get_data())
@@ -153,7 +219,7 @@ def add_tag(dbpath, querystring, tag):
assert 'K' + tag.lower() in [t.term for t in doc.termlist()]
db.replace_document(m.docid, doc)
-
+
def remove_tag(dbpath, querystring, tag):
"""Remove the tag <tag> (if existing) from all entries in the
database at dbpath matching querystring."""
@@ -177,7 +243,6 @@ def all_songs(dbpath):
documents = (db.get_document(post.docid)
for post in db.postlist(""))
-
- return ({'id' : doc.get_docid(), 'data' : json.loads(doc.get_data())}
- for doc in documents)
+ return ({'id' : doc.get_docid(), 'data' : json.loads(doc.get_data())}
+ for doc in documents)
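
Finally, a small sketch of how the new SongMatch results combine with the existing tag helpers. The database path, query and tag name are illustrative, and the 'album' prefix is assumed to be present in PREFIXES.

from db import xapian_music

DBPATH = "/home/user/.musicdb"   # illustrative

# search() now returns SongMatch objects: dicts that also expose id,
# rank, percent and data (a SongData) as attributes.
for m in xapian_music.search(DBPATH, "album:heroes"):
    print m.rank, m.percent, m.data['title']

# Tag every match, then query the tag back via the boolean 'tag' prefix.
xapian_music.add_tag(DBPATH, "album:heroes", "favourites")
favourites = xapian_music.search(DBPATH, "tag:favourites")
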
