In [1]:
from __future__ import division

import BeautifulSoup
import numpy as np
import itertools
import mwclient
import pandas as pd

import pickle

import btb.utils.tools as btbtools
import btb.utils.wikiquery as wq

In [7]:
# Handles for the Dutch and English Wikipedia sites; the cells below use
# wikiNL to look up titles and wikiEN to query page contributions.
wikiNL = mwclient.Site('nl.wikipedia.org')
wikiEN = mwclient.Site('en.wikipedia.org')
# Result of the project helper wq.getAllBots for the English wiki (presumably
# the known bot accounts -- confirm in btb.utils.wikiquery). It is passed to
# btbtools.prepareData inside wikiCountryInterest().
bots = wq.getAllBots(wikiEN)

In [8]:
# In-memory caches shared by the lookup helpers in this notebook.
# syncCaches() merges them with their pickled counterparts on disk.
__NAME_CACHE = {}      # original title -> translated title
__INTEREST_CACHE = {}  # page title -> (nl_o, ca_o, conf) tuple

def getFromGlobalNameCache(name):
    '''Return the value cached under `name` in the name cache, or None on a miss.'''
    try:
        return __NAME_CACHE.get(name)
    except TypeError:
        # Unhashable key: report a miss, but do not swallow unrelated errors
        # (KeyboardInterrupt, SystemExit, ...) like a bare `except:` would.
        return None

def saveToGlobalNameCache(name, value):
    '''Store `value` under `name` in the name cache, replacing any previous entry.'''
    __NAME_CACHE[name] = value

def getFromGlobalInterestCache(name):
    '''Return the value cached under `name` in the interest cache, or None on a miss.'''
    try:
        return __INTEREST_CACHE.get(name)
    except TypeError:
        # Unhashable key: report a miss (see getFromGlobalNameCache).
        return None

def saveToGlobalInterestCache(name, value):
    '''Store `value` under `name` in the interest cache, replacing any previous entry.'''
    __INTEREST_CACHE[name] = value

def syncCaches():
    global __NAME_CACHE
    global __INTEREST_CACHE
    try:
        nameCacheFile = pickle.load(open('NAME_CACHE.pkl','r'))
        __NAME_CACHE = dict(__NAME_CACHE.items() + nameCacheFile.items())
        
        sizePre = len(nameCacheFile)
        sizePost = len(__NAME_CACHE)
        if sizePre!=sizePost:
            print 'Cache grew by: ',(sizePost - sizePre)
    except:
        pass
    
    try:
        interestCacheFile = pickle.load(open('INTEREST_CACHE.pkl','r'))
        __INTEREST_CACHE = dict(__INTEREST_CACHE.items() + interestCacheFile.items())
        
        sizePre = len(interestCacheFile)
        sizePost = len(__INTEREST_CACHE)
        if sizePre!=sizePost:
            print 'Cache 2 grew by: ',(sizePost - sizePre)
    except:
        pass

    pickle.dump(__NAME_CACHE, open('NAME_CACHE.pkl', 'w'))
    pickle.dump(__INTEREST_CACHE, open('INTEREST_CACHE.pkl', 'w'))


In [9]:
def getName(nym_xml):
    forms = nym_xml.findAll('form', {'type': 'nym'})
    if len(forms)==1:
        return forms[0].getText()
    else:
        print '!!! getName(nym_xml) -- length = ',len(forms)
        return None

def getNymData(nym_xml):
    '''
    Extract the summary triple for one <nym> element.

    Returns (in this order):
      freq    The largest value among the element's <usg type="frequency">
              entries, i.e. the highest frequency observed for any form
      name    The normalized form of the text, as returned by getName()
              (None when no unique form exists)
      nstype  Value of the 'ns:type' attribute: the Namescape entity type
              (person, location, etc.)
    '''
    usages = nym_xml.findAll('usg', { 'type': 'frequency' })
    counts = np.array([ int(usage.getText()) for usage in usages ])
    return counts.max(), getName(nym_xml), nym_xml.get('ns:type')

def getLangTitle(sourceWiki, title, targetLang='en'):
    '''Return the title of page `title` in language `targetLang`, or None.

    The page is looked up on `sourceWiki`, redirects are resolved, and the
    first language link whose language code equals `targetLang` is returned.
    '''
    resolved = sourceWiki.Pages[title].resolve_redirect()
    matching = (linkTitle
                for langCode, linkTitle in resolved.langlinks()
                if langCode == targetLang)
    return next(matching, None)

def getAllTitleOptions(title):
    '''Return every lower-case / title-case spelling of `title`.

    The title is split on single spaces; each word is taken either lower-cased
    or title-cased, giving 2**n variants for n words. Variants are ordered with
    the lower-case choice first, the last word varying fastest.
    '''
    choicesPerWord = [(word.lower(), word.title()) for word in title.split(' ')]
    return [' '.join(variant) for variant in itertools.product(*choicesPerWord)]

def findFirstTitleMatch(sourceWiki, origTitle, targetLang='en', debug=False):
    cached = getFromGlobalNameCache(origTitle)
#    if cached is not None:
#        print 'Cached saved time!'
#        return cached
    
    titles = getAllTitleOptions(origTitle)
    for title in titles:
        title = mwclient.page.Page.normalize_title(title)
        candidate = getLangTitle(sourceWiki, title, targetLang=targetLang)
        
        if debug:
            print title,'-->',candidate
        
        if candidate is not None:
            saveToGlobalNameCache(origTitle, candidate)
            return candidate
    return None # No translation was found !

def wikiCountryInterest(wiki, pageTitle, useCache=False):
    '''Return the (nl_o, ca_o, conf) triple for `pageTitle` on `wiki`.

    The page's contributions are fetched with wq.getContributionsForPage,
    prepared with btbtools.prepareData (using the notebook-level `bots`), and
    compared against wq.getTotalContributions() with btbtools.compareEdits.
    nl_o and ca_o are the second elements of the 'NL' and 'CA' entries of that
    comparison; conf is the second value returned by prepareData. The result
    is stored in the global interest cache.

    Any exception is reported on stdout and (None, None, None) is returned, so
    callers must be prepared for missing values.

    Parameters:
      wiki       Wiki site object to query
      pageTitle  Title of the page to analyse
      useCache   When true, return a previously cached triple without querying
                 the wiki. Off by default, which matches the earlier behaviour
                 where the cache was written but never consulted.
    '''
    if useCache:
        cached = getFromGlobalInterestCache(pageTitle)
        if cached is not None:
            return cached

    try:
        ips, usrs, nrevs = wq.getContributionsForPage(wiki, pageTitle)
        knwRevs, conf, nIP, nUsr, nBot, nUnkn = btbtools.prepareData(ips, usrs, bots)
        expEdits = wq.getTotalContributions()

        cmpEdits = btbtools.compareEdits(expEdits, knwRevs)

        # Only the middle element of each 3-tuple is used here.
        nl_e, nl_o, nl_m = cmpEdits['NL']
        ca_e, ca_o, ca_m = cmpEdits['CA']

        result = (nl_o, ca_o, conf)
        saveToGlobalInterestCache(pageTitle, result)
        return result
    except Exception as err:
        print 'PATCH IT! Error for word "'+pageTitle+'":',err.__class__,' ->',err
        return None,None,None

In [15]:
# Path of the XML input file processed by the cells below. The commented-out
# assignment is an alternative input from the same corpus directory.
# karinaFile = '../data/corpus.karina.nerINL.2012-10-26/nl.ns.d.9789020417159.k.xml'
karinaFile = '../data/corpus.karina.nerINL.2012-10-26/nl.ns.d.9789044301717.k.xml'


In [17]:
# Read the whole XML document; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open(karinaFile) as teiFile:
    tei = teiFile.read()
# Parse the document so <nym> elements can be queried.
teiSoup = BeautifulSoup.BeautifulSoup(tei)

In [18]:
# Collect every <nym> element of the parsed document and reduce each one to
# its (freq, name, nstype) triple via getNymData().
nymList = teiSoup.findAll('nym')
nymData = [getNymData(nym) for nym in nymList]

In [None]:
dataEn = []

for freq,name,nstype in nymData:
    try:
        match = findFirstTitleMatch(wikiNL, name, targetLang='en')
        if match is not None:
            dataEn.append((name,match,nstype,freq))
    except Exception as err:
        print 'Error for word "'+name+'":',err.__class__,' ->',err
    

In [83]:
# One row per translated name; the columns follow the tuple order used when
# filling dataEn: (name, match, nstype, freq).
data = pd.DataFrame(dataEn, columns=['OrigWord','Word','Type','Count'])

In [85]:
# Query the English wiki once per translated title; every result is a
# (nl_o, ca_o, conf) triple, kept in a scratch column while it is unpacked.
data['TEMP'] = data['Word'].apply(lambda pageTitle: wikiCountryInterest(wikiEN, pageTitle))
# Spread the triple over its three target columns, in tuple order.
for position, column in enumerate(['NLO', 'CAO', 'Conf']):
    data[column] = data['TEMP'].apply(lambda triple: triple[position])
del data['TEMP']

In [19]:
# Fetch the total contributions once and keep the 'NL' and 'CA' entries
# (presumably the expected edit shares per country -- confirm in
# btb.utils.wikiquery).
expEdits = wq.getTotalContributions()
NLE = expEdits['NL']
CAE = expEdits['CA']

# Row by row, combine the country total with the row's NLO / CAO value through
# btbtools.relativeInterest; the exact formula lives in that helper.
data['Interest-NL'] = data.apply(lambda x: btbtools.relativeInterest(NLE, x['NLO']), axis=1)
data['Interest-CA'] = data.apply(lambda x: btbtools.relativeInterest(CAE, x['CAO']), axis=1)

In [89]:
# Sum of all counts; used to express each row's Count as a percentage share.
nItems = data['Count'].sum()
# Total-XX = 100 * Count / nItems * Conf * Interest-XX. The notebook imports
# `division` from __future__, so `/` is true division even for integer counts.
data['Total-NL'] = data.apply(lambda x: 100 * x['Count'] / nItems * x['Conf'] * x['Interest-NL'],axis=1)
data['Total-CA'] = data.apply(lambda x: 100 * x['Count'] / nItems * x['Conf'] * x['Interest-CA'],axis=1)

In [34]:
# Persist the resulting data frame to a temporary pickle in the working directory.
data.to_pickle('data-karina.tmp.pkl')

In [5]:
def processKarinaFile(inFile_XML, outFile_PKL):
    syncCaches()
    tei = open(inFile_XML).read()
    teiSoup = BeautifulSoup.BeautifulSoup(tei)

    nymList = teiSoup.findAll('nym')
    nymData = [getNymData(nym) for nym in nymList]

    dataEn = []

    for freq,name,nstype in nymData:
        try:
            match = findFirstTitleMatch(wikiNL, name, targetLang='en')
            if match is not None:
                dataEn.append((name,match,nstype,freq))
            else:
                print 'Warning: No English data for word: ',name
        except Exception as err:
            print 'Error for word "'+name+'":',err.__class__,' ->',err
    syncCaches()
        
    data = pd.DataFrame(dataEn, columns=['OrigWord','Word','Type','Count'])

    data['TEMP'] = data['Word'].apply(lambda x: wikiCountryInterest(wikiEN, x))
    data['NLO'] = data['TEMP'].apply(lambda x: x[0])
    data['CAO'] = data['TEMP'].apply(lambda x: x[1])
    data['Conf'] = data['TEMP'].apply(lambda x: x[2])
    del data['TEMP']

    expEdits = wq.getTotalContributions()
    NLE = expEdits['NL']
    CAE = expEdits['CA']

    data['Interest-NL'] = data.apply(lambda x: btbtools.relativeInterest(NLE, x['NLO']), axis=1)
    data['Interest-CA'] = data.apply(lambda x: btbtools.relativeInterest(CAE, x['CAO']), axis=1)

    nItems = data['Count'].sum()
    data['Total-NL'] = data.apply(lambda x: 100 * x['Count'] / nItems * x['Conf'] * x['Interest-NL'],axis=1)
    data['Total-CA'] = data.apply(lambda x: 100 * x['Count'] / nItems * x['Conf'] * x['Interest-CA'],axis=1)

    data.to_pickle(outFile_PKL)

In [6]:
import glob
import os.path

In [7]:
# All XML inputs of the corpus directory, each paired with an output pickle
# next to it. Only the file extension is swapped: str.replace would also
# rewrite any other '.xml' occurring earlier in the path.
inFiles = glob.glob('../data/corpus.karina.nerINL.2012-10-26/*.xml')
outFiles = [ os.path.splitext(inFile)[0] + '.pkl' for inFile in inFiles ]

#inFiles = [
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9022914186.s.xml', 
##    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9021413353.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9021413396.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9023431294.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9023404319.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9044604279.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9085420415.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9041410252.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9029026561.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9021412411.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9023404114.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.902340744x.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9024292646.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9029561610.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.902230292X.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9023431251.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9023404866.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9029505249.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9053333029.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9029518219.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9064811091.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9023409272.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9057134829.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9021485060.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9029554606.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9025426158.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9029530421.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9074336825.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9029098961.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9041409068.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.906291232X.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9029528893.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.905000802X.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9027420351.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9023432592.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9023431189.s.xml', 
#    '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9025802109.s.xml'
#]
# outFiles = [ inFile.replace('.xml', '.pkl') for inFile in inFiles ]

In [None]:
for fin, fout in zip(inFiles, outFiles):
    if not os.path.isfile(fout):
        print 'Processing ',fin,'...'
        processKarinaFile(fin, fout)
    else:
        print 'Skipping ',fin,'...'


# PATCH !
First, check which result files contain missing values...

In [2]:
import glob

# Total contributions used by the patch cells below; only the 'NL' and 'CA'
# entries are needed (they feed btbtools.relativeInterest).
expEdits = wq.getTotalContributions()
NLE = expEdits['NL']
CAE = expEdits['CA']

In [3]:
def needsPatching(data):
    '''Return the 'Word' values of all rows of `data` whose 'NLO' is missing.

    The result is a pandas Series (keeping the original row index) when at
    least one row is affected, and an empty list otherwise.
    '''
    wordsMissing = data.loc[data['NLO'].isnull(), 'Word']
    if len(wordsMissing) > 0:
        return wordsMissing
    return []

In [15]:
# for pklFile in glob.glob('../data/corpus.karina.nerINL.2012-10-26/*.pkl'):
for pklFile in glob.glob('../data/corpus.sanders.nerINL.2012-10-24/*.pkl'):
    data = pickle.load(open(pklFile, 'r'))
    toPatch = needsPatching(data)
    if len(toPatch)>0:
        print pklFile, toPatch

If any files are reported, fix their missing values:
 - Load the file with the missing values
 - Compute the missing values
 - Write the computed values into the data frame
 - Save the data frame back to the original file

In [12]:
# Word whose values are missing, and the result file that contains it.
word = 'Kyphosis'
pklFile = '../data/corpus.sanders.nerINL.2012-10-24/nl.ns.d.9057592576.s.pkl'

# Binary mode is the portable way to read pickles; the context manager closes
# the handle. NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(pklFile, 'rb') as pklHandle:
    data = pickle.load(pklHandle)
# Show the affected row before patching.
data[data['Word']==word]

Unnamed: 0,OrigWord,Word,Type,Count,NLO,CAO,Conf,Interest-NL,Interest-CA,Total-NL,Total-CA
16,BOCHEL,Kyphosis,person,1,,,,,,,


In [13]:
# Recompute the country interest triple for the word being patched.
nlo,cao,conf = wikiCountryInterest(wikiEN, word)

# Locate the single row that holds this word.
idx = data[data['Word']==word].index
assert len(idx)==1 # If more than 1 index, something is wrong...
idx = idx[0]

inl = btbtools.relativeInterest(NLE, nlo)
ica = btbtools.relativeInterest(CAE, cao)
nItems = data['Count'].sum()
count = data.loc[idx, 'Count']

# Same formula as the main pipeline: 100 * Count / nItems * Conf * Interest.
weight = 100 * count / nItems * conf
tnl = weight * inl
tca = weight * ica

# Write all recomputed values into the row, one column at a time.
for column, value in [('NLO', nlo),
                      ('CAO', cao),
                      ('Conf', conf),
                      ('Interest-NL', inl),
                      ('Interest-CA', ica),
                      ('Total-NL', tnl),
                      ('Total-CA', tca)]:
    data.loc[idx, column] = value

# Show the row after patching.
data[data['Word']==word]


Unnamed: 0,OrigWord,Word,Type,Count,NLO,CAO,Conf,Interest-NL,Interest-CA,Total-NL,Total-CA
16,BOCHEL,Kyphosis,person,1,0.016327,0.065306,0.528017,0.51,0.173125,0.075431,0.025606


In [14]:
# Write the patched data frame back to its original file. Binary mode is the
# portable way to write pickles, and the context manager guarantees the data
# is flushed and the handle closed.
with open(pklFile, 'wb') as pklHandle:
    pickle.dump(data, pklHandle)