In [1]:
import pycurl
import json

In [2]:
import BeautifulSoup
import re

In [3]:
import pandas as pd

In [4]:
from StringIO import StringIO
from urllib import urlencode

In [5]:
from collections import defaultdict

In [6]:
from collections import Counter

In [7]:
def myCurl(url):
    ''' Fetch `url` with pycurl (following redirects) and parse the body as JSON.

    url -- address to request.

    Returns whatever json.loads produces for the response body.
    Errors raised by pycurl or by json.loads propagate to the caller.
    '''
    buff = StringIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, buff.write)
        c.setopt(c.FOLLOWLOCATION, True)
        c.perform()
    finally:
        # Always release the curl handle, even when the transfer fails.
        c.close()
    return json.loads(buff.getvalue())


In [8]:
def getEditionsISBN(isbn):
    ''' Look up the editions of `isbn` through the WorldCat xISBN web service.

    Returns three lists (isbns, langs, oclcs) collected from the entries of
    response['list'] that carry all of 'isbn', 'lang' and 'oclcnum'.
    Three empty lists are returned when the reported status is not 'ok'.
    '''
    print('CallgetEditionsISBN - expensive! - ' + isbn)
    template = 'http://xisbn.worldcat.org/webservices/xid/isbn/#ISBN#?method=getEditions&format=json&fl=lang,isbn,oclcnum'
    response = myCurl(template.replace('#ISBN#', isbn))

    # Bail out early on any status other than 'ok'.
    if response['stat'] != 'ok':
        print('Warning: stat not ok for (editions) ISBN: ' + isbn)
        return [], [], []

    wanted = ('isbn', 'lang', 'oclcnum')
    isbns, langs, oclcs = [], [], []
    for entry in response['list']:
        # Entries missing any of the three fields are ignored entirely.
        if all(field in entry for field in wanted):
            isbns.extend(entry['isbn'])
            langs.append(entry['lang'])
            oclcs.extend(entry['oclcnum'])
    return isbns, langs, oclcs

In [9]:
class OverLimitException(Exception):
    ''' Raised when the lookup service answers with stat 'overlimit' '''

def getLangsISBN(isbn, useISSN=False):
    ''' Collect the ISBNs and languages of the editions of `isbn`.

    isbn    -- identifier substituted into the request URL.
    useISSN -- when true the request goes to the xissn.worldcat.org host
               instead of xisbn.worldcat.org (the URL path is left as is).

    Returns (isbns, langs) built from the entries of response['list'] that
    have both an 'isbn' and a 'lang' field; two empty lists are returned
    (after printing a warning and the response) for any status other than
    'ok' or 'overlimit'.
    Raises OverLimitException when the status is 'overlimit'.
    '''
    url = 'http://xisbn.worldcat.org/webservices/xid/isbn/#ISBN#?method=getEditions&format=json&fl=lang,isbn'
    if useISSN:
        url = url.replace('xisbn.worldcat.org', 'xissn.worldcat.org')
    response = myCurl(url.replace('#ISBN#', isbn))

    stat = response['stat']
    if stat == 'overlimit':
        raise OverLimitException('Over limit for isbn: ' + isbn)
    if stat != 'ok':
        print('Warning: stat not ok for (editions) ISBN: ' + isbn)
        print(response)
        return [], []

    # Only editions that report both fields contribute to the result.
    editions = [entry for entry in response['list']
                if 'isbn' in entry and 'lang' in entry]
    isbns = []
    for entry in editions:
        isbns.extend(entry['isbn'])
    langs = [entry['lang'] for entry in editions]
    return isbns, langs

In [10]:
def thingISBN(isbn):
    ''' Ask the LibraryThing thingISBN API for ISBNs related to `isbn`.

    Returns the text of every <isbn> element in the response, with `isbn`
    itself appended when the response did not already contain it.
    '''
    page = myWget('http://www.librarything.com/api/thingISBN/#ISBN#'.replace('#ISBN#', isbn))
    pageSoup = BeautifulSoup.BeautifulSoup(page)

    isbns = [tag.getText() for tag in pageSoup.findAll('isbn')]
    # Make sure the ISBN we asked about is always part of the answer.
    if isbn not in isbns:
        isbns.append(isbn)
    return isbns

In [11]:
import time 

In [12]:
def getLangsLTWC(seedISBN, useISSN=False, verbose=False, doDelay=False):
    ''' Count edition languages for a book using LibraryThing (LT) plus
    WorldCat (WC).

    thingISBN(seedISBN) supplies the candidate ISBNs; every candidate that has
    not been seen yet is passed to getLangsISBN and its languages are tallied.

    seedISBN -- ISBN used to seed the LibraryThing lookup.
    useISSN  -- forwarded unchanged to getLangsISBN.
    verbose  -- print progress messages when true.
    doDelay  -- sleep 5 seconds before each getLangsISBN call when true.

    Returns a Counter mapping language -> number of occurrences.
    Exceptions raised by the lookups (e.g. OverLimitException) propagate.
    '''
    isbns = thingISBN(seedISBN)
    # A set keeps the "already seen?" test O(1) however many ISBNs pile up.
    visitedISBNs = set()
    allLangs = Counter()

    if verbose:
        print('LibraryThing returned %d isbns' % len(isbns))

    for isbn in isbns:
        if isbn in visitedISBNs:
            if verbose:
                print('Skipped ISBN: %s' % isbn)
            continue

        if doDelay:
            time.sleep(5)
        if verbose:
            print('Calling for isbn: %s' % isbn)
        newIsbns, newLangs = getLangsISBN(isbn, useISSN)

        # Record the queried ISBN as well as the ones returned for it, so a
        # repeated candidate is never queried (and counted) a second time even
        # when the service does not echo the ISBN back.
        visitedISBNs.add(isbn)
        visitedISBNs.update(newIsbns)
        allLangs.update(newLangs)
        if verbose:
            print('Extended by %d for isbn %s' % (len(newLangs), isbn))
    return allLangs

In [13]:
def myWget(url):
    ''' Fetch `url` with pycurl (following redirects) and return the raw body.

    url -- address to request.

    Returns the response body accumulated in a StringIO buffer.
    Errors raised by pycurl propagate to the caller.
    '''
    buff = StringIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, buff.write)
        c.setopt(c.FOLLOWLOCATION, True)
        c.perform()
    finally:
        # Always release the curl handle, even when the transfer fails.
        c.close()
    return buff.getvalue()

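Search WorldCat for the book, replacing #ISBN# with the ISBN:
http://www.worldcat.org/search?qt=worldcat_org_bks&q=#ISBN#&fq=dt%3Abks

On the results page, click on " View all formats and languages » ", which leads to an editions page such as:
http://www.worldcat.org/title/menuet/oclc/50698918/editions?editionsView=true&referer=br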

DO:
Find DIV <div id="LanguageRefinement">
  if contains <a rel="nofollow" href="LINK"><strong>Show more ...</strong></a>
    follow link
    goto DO
  else
    scrape:
    foreach <li><a rel="nofollow" title="LANG" href="LINK">LANG</a> (NUM)</li>
      dict[LANG] = NUM

In [None]:
def worldCatSearch(isbn):
    ''' Scrape worldcat.org for the languages in which `isbn` is available.

    Fetches the search-results page for `isbn`, then keeps following the
    "View all held editions and formats for this item" link and any
    "Show more ..." entry of the LanguageRefinement box until a page with
    language data is reached.

    Returns a dict keyed by language name. Values parsed from the
    LanguageRefinement list are the text found between the parentheses (or
    the int 1 when there are none); values from the itemLanguage fallback
    are int occurrence counts. Returns {} when the "no results" div is
    present or when no language data could be located.
    '''
    baseURL = 'http://www.worldcat.org/'
    searchURL = baseURL + 'search?qt=worldcat_org_bks&q=#ISBN#&fq=dt%3Abks'
    searchURL = searchURL.replace('#ISBN#', isbn)
    page = myWget(searchURL)
    
    pageSoup = BeautifulSoup.BeautifulSoup(page)

    # The results page carries this div when the search matched nothing.
    if pageSoup.find('div', {'id': 'div-results-none'}):
        print 'ISBN not found: ' + isbn
        return {}

    # doLoop is re-armed only by the branches that fetched a new page; every
    # other path either returns or lets the loop end.
    # NOTE(review): there is no cap on the number of pages followed — confirm
    # the site cannot keep presenting a followable link forever.
    doLoop = True
    while(doLoop):
        doLoop = False
        aLangs = pageSoup.find('a', {'title': 'View all held editions and formats for this item'})
        langDiv = pageSoup.find('div', {'id': 'LanguageRefinement'})
        itemLangs = pageSoup.findAll('span', { 'class': 'itemLanguage' })
        
        # Case 1: the page links to the "all held editions and formats" view
        if aLangs:
            # Follow the link and loop again on the new page
            # NOTE(review): baseURL already ends with '/'; if the href starts
            # with '/' the joined URL has a double slash — confirm it resolves.
            langsURL = baseURL + aLangs.get('href')
            page = myWget(langsURL)
            pageSoup = BeautifulSoup.BeautifulSoup(page)
            doLoop = True
        elif langDiv: # Case 2: the page has the LanguageRefinement div
            items = langDiv.findAll('li')
            # The list is abbreviated when one of its <strong> tags reads "Show more ..."
            if 'Show more ...' in [strong.text for strong in langDiv.findAll('strong')]:
                # Follow the "Show more ..." entry and loop again.
                # If no <li> text equals 'Show more ...' exactly, nothing is
                # fetched, doLoop stays False and the function falls through
                # to the failure message below.
                for item in items:
                    if 'Show more ...' == item.getText():
                        moreLink = item.find('a')
                        redirectURL = baseURL + moreLink.get('href')
                        page = myWget(redirectURL)
                        pageSoup = BeautifulSoup.BeautifulSoup(page)
                        doLoop = True
            else:
                counts = {}
                # Full list available: parse and return
                for item in items:
                    # Split "lang(num)" into lang,num
                    splits = re.split('\(|\)', item.getText())
                    # if item.getText() is "lang" (i.e. no num), default num to 1
                    # (num stays a string when parsed, but is the int 1 by default)
                    lang,num = splits[0],(splits[1] if len(splits)>1 else 1)
                    counts[lang] = num
                return counts
        elif itemLangs:
            # Case 3: no refinement box; count the itemLanguage spans instead
            counts = defaultdict(int)
            for item in pageSoup.findAll('span', { 'class': 'itemLanguage' }):
                counts[item.getText()] += 1
            return dict(counts)
        # else: none of the known page layouts matched; the loop ends
    print 'Could not find data for: ' + isbn
    return {}

In [14]:
# Read both corpus sheets from the same workbook and stack them into one frame.
cKarina, cSanders = (
    pd.read_excel('../data/Book_selection.xlsx', sheet)
    for sheet in ('CorpusOverview-Karina.csv', 'Corpus-Sanders')
)
transData = pd.concat([cKarina, cSanders])

In [15]:
# Lookup results keyed by ISBN string. The placeholder ISBN 'unknown' is
# pre-seeded with an empty result so it counts as already handled.
translations = {'unknown': {}}
# ISBNs whose lookup raised an unexpected error.
isbnErr = []
# Passed on to the lookups; starts out disabled.
useISSN = False

In [24]:
isbnList = transData['isbn'].tolist()
i = 0

# `i` only advances once an ISBN has been handled, so an ISBN that ran into
# the request limit is retried on the next pass with useISSN switched on.
while i < len(isbnList):
    isbn = str(isbnList[i])
    try:
        # ISBNs already present in `translations` are not looked up again.
        if isbn not in translations:
            langs = getLangsLTWC(isbn, useISSN, False, True)
            translations[isbn] = langs
        i += 1
    except OverLimitException:
        if useISSN:
            # Already switched once: nothing left to fall back on.
            print('Request limit reached!!! Breaking -- start again tomorrow...')
            break
        # First time over the limit: flip the flag and retry the same ISBN.
        useISSN = True
        print('Switch to use ISSN!')
    except Exception as e:
        print('Problem with:  %s' % isbn)
        print('  >  %s' % (e,))
        isbnErr.append(isbn)
        # Any other failure ends the run by jumping past the last index.
        i = len(isbnList)

{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unknownId'}
{u'stat': u'unkn

In [39]:
def getCounts(isbn):
    ''' Language counts stored in `translations` for this ISBN '''
    key = str(isbn)
    return translations[key]

def getNumLangs(counts):
    ''' How many distinct languages appear in `counts` '''
    numLangs = len(counts)
    return numLangs

def getNameLangs(counts):
    ''' The language names, i.e. the keys of `counts` '''
    names = counts.keys()
    return names

def getTotalTrans(counts):
    ''' Sum of all per-language counts, each converted with int() '''
    total = 0
    for num in counts.values():
        total += int(num)
    return total

# Per-book language counts, then three summaries derived from those counts.
transData['lang counts'] = transData['isbn'].apply(getCounts)
for newColumn, summarise in [('num_languages2', getNumLangs),
                             ('langs', getNameLangs),
                             ('Total translations', getTotalTrans)]:
    transData[newColumn] = transData['isbn'].apply(lambda isbn: summarise(getCounts(isbn)))

In [40]:
def getCountsList(counts):
    ''' Render `counts` as one "lang(count)" entry per line '''
    entries = []
    for lang in counts:
        entries.append('%s(%s)' % (lang, counts[lang]))
    return '\n'.join(entries)


# Text rendering of each book's language counts, produced by getCountsList.
transData['lang counts list'] = transData['lang counts'].apply(getCountsList)

In [41]:
# Union of every language name that occurs for any book.
allLangs = set()
for langs in transData['langs']:
    allLangs.update(langs)

In [42]:
def getLangCount(langs, lang):
    ''' Count stored for `lang` in `langs`, or 0 when `lang` is absent '''
    if lang not in langs:
        return 0
    return langs[lang]

# One column per language holding that language's count for each book
# (0 when the book has no entry for it).
for lang in allLangs:
    transData[lang] = transData['lang counts'].apply(
        lambda counts, lang=lang: getLangCount(counts, lang))

In [43]:
# Fixed metadata and summary columns first, then one column per language.
columns = [
    'bookid', 'isbn', 'auteur', 'titel', 'jaar', 'genre',
    'num_languages2', 'lang counts list', 'Total translations',
]
columns.extend(allLangs)

# Write the selected columns to the 'Translations' sheet of the workbook.
writer = pd.ExcelWriter('../data/TranslationsV2.xlsx')
transData.to_excel(writer, sheet_name='Translations', columns=columns)
writer.save()

In [47]:
import pickle
# Pickle data is binary, so the file is opened in 'wb' mode; the with-block
# closes the file even if dumping fails.
with open('../data/TranslationsV2.pkl', 'wb') as pklFile:
    pickle.dump(transData, pklFile)