# Finding instances from one corpus in Hathi

This shows how to churn through two corpuses to find copies of one in the other.


In [2]:
import SRP
import numpy as np
from scipy.spatial.distance import cdist

In [2]:
%load_ext autoreload
%autoreload 2

This assumes that you've already created the txtlab file described in the notebook "Hash a corpus of text files into SRP space"

In [3]:
txtlab = SRP.Vector_file("txtlab.bin").to_matrix()


In [4]:
def hathi_chunker(max_size=1000):
    hathi = SRP.Vector_file("/home/bschmidt/vector_models/hathi.bin")
    id_cache = []
    row_cache = np.zeros((max_size,hathi.dims),"<f4")
    for id,row in hathi:
        row_cache[len(id_cache)] = row
        id_cache.append(id)
        if len(id_cache) == max_size:
            yield (id_cache, row_cache)
            id_cache = []
            row_cache = np.zeros((max_size,hathi.dims),"<f4")        
            


Use an array to store the neighbors of each index: store the top ten items to start.

In [6]:
hathi_chunks = hathi_chunker()

knn = 20
neighbors = []
for i in range(len(txtlab["names"])):
    neighbors.append([])
    for j in range(knn):
        neighbors[-1].append((float("Inf"),"nothing"))

n_chunked = 0
for ids,rows in hathi_chunks:
    n_chunked += 1
    if n_chunked % 1000 == 0:
        print "checked {} million in hathi\r".format(n_chunked/1000)
    pairwise = cdist(txtlab["matrix"], rows, "cosine")
    closest = np.argpartition(pairwise,knn,1)
    for i,row in enumerate(closest):
        for ix in range(knn):
            dist = pairwise[i][row[ix]]
            if dist < neighbors[i][-1][0]:
                neighbors[i][-1] = (dist,ids[row[ix]])
                neighbors[i].sort()
            elif dist < .05:
                # Catch everything that close
                neighbors[i].append((dist,ids[ix]))
                neighbors[i].sort()

checked 1 million in hathi
checked 2 million in hathi
checked 3 million in hathi
checked 4 million in hathi
checked 5 million in hathi
checked 6 million in hathi
checked 7 million in hathi
checked 8 million in hathi
checked 9 million in hathi
checked 10 million in hathi
checked 11 million in hathi
checked 12 million in hathi
checked 13 million in hathi


In [7]:
nearly = []
for i,neighbor in enumerate(neighbors):
    name = txtlab["names"][i]
    for dist, hathi in neighbor:
        if dist < .1:
            nearly.append((dist,name,hathi))
nearly.sort()
for dist,name,hathi in nearly[25]:
    print u"{} is only {:0.4f} from {}".format(name,dist, hathi)

EN_1900_Barr,Amelia_TheMaidofMaidenLane_Novel is only 0.0021 from uc1.$b249538
DE_1932_Roth,Joseph_Radetzkymarsch_Novel is only 0.0021 from wu.89093642122
EN_1900_Barr,Amelia_TheMaidofMaidenLane_Novel is only 0.0023 from nyp.33433074833421
EN_1900_Barr,Amelia_TheMaidofMaidenLane_Novel is only 0.0025 from hvd.hn1m5d
EN_1851_Hawthorne,Nathaniel_TheHouseoftheSevenGables_Novel is only 0.0032 from hvd.hn6n6v
EN_1818_Shelley,Mary_Frankenstein_Novel is only 0.0035 from pst.000049200850
EN_1837_Disraeli,Benjamin_Venetia_Novel is only 0.0035 from nnc1.0055235000
EN_1851_Hawthorne,Nathaniel_TheHouseoftheSevenGables_Novel is only 0.0036 from hvd.hn6nhb
EN_1850_Aguilar,Grace_ValeofCedars_Novel is only 0.0039 from nyp.33433074945985
EN_1900_Barr,Amelia_TheMaidofMaidenLane_Novel is only 0.0040 from hvd.hn5fcn
EN_1851_Hawthorne,Nathaniel_TheHouseoftheSevenGables_Novel is only 0.0042 from hvd.32044011866720
EN_1837_Disraeli,Benjamin_Venetia_Novel is only 0.0042 from nyp.33433074937099
EN_1813_Austen,J

In [27]:
import urllib2
import ujson as json
from IPython.display import HTML

#hathi_cache = {}

def jsonify(id, force = False):
    global hathi_cache
    if id in hathi_cache and not force:
        return hathi_cache[id]
    sons = "\n".join(urllib2.urlopen("http://catalog.hathitrust.org/api/volumes/brief/htid/%s.json" %id.replace("+",":").replace("=","/")).readlines())
    hathi_cache[id] = json.loads(sons)
    return hathi_cache[id]

def descend(record):
    # Parse a hathi API call response.
    a = record['records']
    try:
        return a[a.keys()[0]]
    except IndexError:
        print record
        raise
        
def pretty_print(htid,text):
    output_string = ""#u"<ul>"
    try:
        a = descend(jsonify(htid))
        a['url'] = u"https://babel.hathitrust.org/cgi/pt?id=" + htid
        try:
            output_string += u"<li><a href={}>{} ({})</a><br>{}</li>".format(
                a['url'],a['titles'][0].encode("ascii","ignore"),a['publishDates'][0],text.encode("ascii","ignore"))
        except:
            print a
    except IndexError:
        print ('no index',p)
        pass
    except:
        print ""
        raise
    return HTML(output_string + "")#)"</ul>")

class Hathi_Book():
    def __init__(self,htid,text=""):
        self.htid = htid
        self.desc = descend(jsonify(htid))
        self.text = text
    def _repr_html_(self):
        self.desc['url'] = u"https://babel.hathitrust.org/cgi/pt?id=" + self.htid
        output_string = u"<li><a href={}>{} ({})</a><br>{}</li>".format(
                self.desc['url'],self.desc['titles'][0].encode("ascii","ignore"),self.desc['publishDates'][0],self.text.decode("utf-8","ignore"))
        return output_string
    def title(self):
        return self.desc['titles'][0]
    
Hathi_Book("inu.30000026383574","Some sample text to go with, ❤").title()

u'Yearbook of German-American studies.'

This is code to debug the matches that I find. It's involved in the way that research code can be.

Essentially, though, it spends most of its time on data cleaning and cutoff. The big challenge is 
that I don't want it to flag for me as a problem when Hathi has a "The Works of Charles Dickens, vol 3" 
and the textlab has "Great Expectations."

So it doesn't bother to compare matches for uninformative Hathi titles.

Then it does some string replacement to normalize words or strings like "and", "roman", and "œ":
finally, it can compare the titles from Hathi to see if they're the same as those in the textlab. If not,
it prints to console suggesting that we check up.

In many cases, this reveals problems in the original data: the textlab called a book "The Vicar of Wrexham", but it's actuall *The vicar of Wrexhill*. The machine is a decent proofreader!

In [117]:
nearly = []
for i,neighbor in enumerate(neighbors):
    name = txtlab["names"][i]
    for dist, hathi in neighbor:
        if dist < .25:
            nearly.append((dist,name,hathi))
nearly.sort()
seen = set()
last_dist = 0

for dist,name,hathi in nearly:
    if dist > .1 and last_dist <= .1:
        print "***seen {} at .1 distance, the conservative cutoff.".format(len(seen))
    if dist > .18 and last_dist <= .18:
        print "***seen {} at .18 distance, the hand-picked cutoff for best performance at this task".format(len(seen))
    last_dist = dist
    if name in seen:
        # The first match for a book is the best.
        continue
    try:
        hathi_title = Hathi_Book(hathi).title()
    except: 
        continue
    broken = False
    for workmarker in [
        u"sämmtliche", u"Novels and tales",u"works of", "novels of",
        u"Werke", u"Gesammelte", u"Romane und Erzählungen", "werke", "Romane", u"Erzählungen",
        u"Works", u"Life and works", u"v.",u"O︠e︡uvres", u"complètes", u"complètes","gesammelt",u"Sämmtliche",
        u"OEuvres", "The writings of", "Tales and novels", u"Œuvres", "Waverley novels", u"Erzählungen",
        u"Oeuvres", "gesammelte Romane", "Standard novels", "uvres comple", u"sämtliche", u"sämliche","Samtliche",
    "Deutsche Literatur", "prose tales", "Romans", "ovels of",
        "in philology", "Agora", # These are both 20C journals I can't check to see if they published an old novel.
        "Dichtungen und Schriften"]:
        if workmarker in hathi_title:
            broken = True
    if broken:
        # Don't make me check "Works v. 4"
        continue
    import sys   
    seen.add(name)
    mcgill_title = name.split("_")[3]
    mt = mcgill_title
    ht = hathi_title
    for find, replace in [
        (u"'",""),
        (u"œ", "oe"),
        ("the", ""),
        (" ",""),
        (u"è", "e"),
        ("-",""),
        (u"é","e"),
        ("man","men"),
        ("dela", ""),
        ("de", ""),
        (u"ß","ss"),
        (",roman","")
    ]:
        mt = mt.lower().replace(find, replace)
        ht = ht.lower().replace(find, replace)
    if mt in ht:
        sys.stdout.write(".")
        continue
    print u"{} is {:0.4f} from {} ({} - {})".format(mcgill_title, dist, hathi_title, hathi, name)
    

# As with the library metadata, using textual features instead of metadata reveals several places where the metadata itself  is inaccurate.
# Jan Vedder's wife is listed as Jan Veeder's Wife; Effi Briest is spelled "Effie Briest"; The Vicar of Wrexhill is title "The Vicar of Wrexham."
# The metadata identifies a book as Rachilde's Nono, when in fact it is actually Monsieur Venus.
# The algorithm also identifies a copy of what the Hathi catalog describes as Adele Schopenhauer's Haus, Wald, und Feldmaerchen as Anna; the catalog metadata (hvd.hnxstq) appears not to note that that 350 novel is bound into the same covers as the 150 page fairy tales.


........JanVeeder'sWife is 0.0045 from Jan Vedder's wife, (hvd.hwssci - EN_1885_Barr,Amelia_JanVeeder'sWife_Novel)
.....EffieBriest is 0.0053 from Effi Briest : roman / (mdp.39015054099133 - DE_1895_Fontane,Theodor_EffieBriest_Novel)
........{u'records': {}, u'items': []}
........LesPleiadesroman is 0.0066 from Les Pléïades. (uc1.$b183061 - FR_1874_Gobineau,Arthur,comtede_LesPleiadesroman_Novel)
..........TheVicarofWrexham is 0.0074 from The vicar of Wrexhill / (inu.32000002550467 - EN_1837_Trollope,FrancesMilton_TheVicarofWrexham_Novel)
............PeterPan is 0.0085 from Peter and Wendy, (mdp.39015008403183 - EN_1911_Barrie,J.M._PeterPan_Novel)
..............LaMaternelle,Roman is 0.0093 from La maternelle / (wu.89099436701 - FR_1904_Frapie,Leon_LaMaternelle,Roman_Novel)
..........AventuresdeMelleMariette is 0.0100 from Les aventures de Mademoiselle Mariette, par Champfleury. (hvd.32044087054235 - FR_1853_Champfleury_AventuresdeMelleMariette_Novel)
.HeidesLehrundWanderjahre is 0.010

412