In [79]:
import requests
from bs4 import BeautifulSoup
from sys import stdout
from time import sleep
import pickle
import os

# WHO SCRAPER

In [80]:
def get_links_by_year(list_of_years=None, proxies={'http': 'http://fw-bln.rki.local:8020'}):
    '''Get (all) the anual links of the WHO DONs
    
    list_of_years -- a list of years (YYYY format) you want to parse (default None)
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    '''

    page = requests.get('http://www.who.int/csr/don/archive/year/en/',proxies=proxies)
    soup = BeautifulSoup(page.content,'html.parser')
    archiv_years = soup.find('ul',attrs={'class':'list'})
    years_links_html = archiv_years.find_all('a')
    if list_of_years:
        return ['http://www.who.int' + link.get('href') for link in years_links_html if any(year in link for year in list_of_years)]
    else:
        return ['http://www.who.int' + link.get('href') for link in years_links_html]
    

In [81]:
# Get all provided links per year

def get_links_per_year(years_links, list_of_months=None, proxies={'http': 'http://fw-bln.rki.local:8020'}):
    '''Take a list of links to the annual archive and return a list of DON links of these years
    
    years_links -- a list of links of the anual archive to parse 
    list_of_months -- a list of months (MMM* format) you want to parse (default None)
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    '''
    all_links = []
    
    for year_link in years_links:
        page_year = requests.get(year_link,proxies=proxies)
        soup_year = BeautifulSoup(page_year.content,'html.parser')
        archive_year = soup_year.find('ul',attrs={'class':'auto_archive'})
        daily_links = ['http://www.who.int' + link.get('href') for link in archive_year.find_all('a')]
        all_links.extend(daily_links)
    
    if list_of_months:
        all_links = [link for link in all_links if any(month in link for month in map(lambda s:s.lower(),list_of_months))]
    return all_links

In [82]:
headers = {
    'User-Agent': 'Auss Abbood, www.rki.de',
    'From': 'abbooda@rki.de'
}

In [83]:
def scrape_from_links(all_links,headers,proxies={'http': 'http://fw-bln.rki.local:8020'},num_last_reports=None):
    '''Take a list of links of WHO DONs and return a list of their content 
    
    all_links -- a list of links of the WHO DONs to parse 
    headers -- a header dictonary to be indentifiable as a parser
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    '''
    all_text = []
    if num_last_reports:
        all_links = all_links[-num_last_reports:]
    for step,single_report in enumerate(all_links):
        page_single_report = requests.get(single_report,proxies=proxies,headers=headers)
        soup_single_report = BeautifulSoup(page_single_report.content,'html.parser')
        text_single_report = [parse.get_text() for parse in soup_single_report.find_all('span') if (len(parse.get_text()) > 65)]
        all_text.extend(text_single_report)

        stdout.write("\r%d" % step + '/'+ str(len(all_links)) + ' links processed | ' 
                     + str('%.0f' % (step/len(all_links) *100)) + '% complete' )
        stdout.flush()
        sleep(0.1)
    return all_text

In [86]:
import datetime
date = date = str(datetime.datetime.now())[:-7].replace(' ','-')
def scrape(years=None,months=None,num_last_reports=None,headers=None):
    years = get_links_by_year(list_of_years=years)
    all_links = get_links_per_year(years,list_of_months=months)
    all_text = scrape_from_links(all_links,headers=headers)
    with open("who_crawl_{}.p".format(date), "wb") as fp:   #Pickling
        pickle.dump(all_text, fp)

In [87]:
scrape([2018],headers)

IOError: [Errno 22] invalid mode ('wb') or filename: 'who_crawl_2018-10-31-14:40:51.p'

In [12]:
with open("who_crawl.p", "rb") as fp:   # Unpickling
    b = pickle.load(fp)


In [35]:
import jpype
import urllib2
import socket
import charade
import threading

socket.setdefaulttimeout(15)
lock = threading.Lock()

InputSource        = jpype.JClass('org.xml.sax.InputSource')
StringReader       = jpype.JClass('java.io.StringReader')
HTMLHighlighter    = jpype.JClass('de.l3s.boilerpipe.sax.HTMLHighlighter')
BoilerpipeSAXInput = jpype.JClass('de.l3s.boilerpipe.sax.BoilerpipeSAXInput')

class Extractor(object):
    """
    Extract text. Constructor takes 'extractor' as a keyword argument,
    being one of the boilerpipe extractors:
    - DefaultExtractor
    - ArticleExtractor
    - ArticleSentencesExtractor
    - KeepEverythingExtractor
    - KeepEverythingWithMinKWordsExtractor
    - LargestContentExtractor
    - NumWordsRulesExtractor
    - CanolaExtractor
    """
    extractor = None
    source    = None
    data      = None
    headers   = {'User-Agent': 'Mozilla/5.0'}
    
    def __init__(self, extractor='DefaultExtractor', **kwargs):
        if kwargs.get('url'):
            proxy = urllib2.ProxyHandler({'http': 'http://fw-bln.rki.local:8020'})
            opener = urllib2.build_opener(proxy)
            urllib2.install_opener(opener)
            urllib2.urlopen('http://www.google.com')
            request     = urllib2.Request(kwargs['url'], headers=self.headers)
            connection  = urllib2.urlopen(request)
            self.data   = connection.read()
            encoding    = connection.headers['content-type'].lower().split('charset=')[-1]
            if encoding.lower() == 'text/html':
                encoding = charade.detect(self.data)['encoding']
            self.data = unicode(self.data, encoding)
        elif kwargs.get('html'):
            self.data = kwargs['html']
            if not isinstance(self.data, unicode):
                self.data = unicode(self.data, charade.detect(self.data)['encoding'])
        else:
            raise Exception('No text or url provided')

        try:
            # make it thread-safe
            if threading.activeCount() > 1:
                if jpype.isThreadAttachedToJVM() == False:
                    jpype.attachThreadToJVM()
            lock.acquire()
            
            self.extractor = jpype.JClass(
                "de.l3s.boilerpipe.extractors."+extractor).INSTANCE
        finally:
            lock.release()
    
        reader = StringReader(self.data)
        self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(self.source)
    
    def getText(self):
        return self.source.getContent()
    
    def getHTML(self):
        highlighter = HTMLHighlighter.newExtractingInstance()
        return highlighter.process(self.source, self.data)
    
    def getImages(self):
        extractor = jpype.JClass(
            "de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE
        images = extractor.process(self.source, self.data)
        jpype.java.util.Collections.sort(images)
        images = [
            {
                'src'   : image.getSrc(),
                'width' : image.getWidth(),
                'height': image.getHeight(),
                'alt'   : image.getAlt(),
                'area'  : image.getArea()
            } for image in images
        ]
        return images

In [90]:
your_url = 'http://www.who.int/csr/don/15-october-2018-chikungunya-sudan/en/'
extractor = Extractor(extractor='ArticleExtractor', url=your_url)

In [91]:
extracted_text = extractor.getText()

In [97]:
from epitator.annotator import AnnoDoc
from epitator.geoname_annotator import GeonameAnnotator
from epitator.count_annotator import CountAnnotator
from epitator.date_annotator import DateAnnotator
from epitator.resolved_keyword_annotator import ResolvedKeywordAnnotator
from epitator.structured_incident_annotator import StructuredIncidentAnnotator
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [96]:
anno_doc = AnnoDoc(extracted_text,date=None)

In [98]:
anno_doc.add_tier(ResolvedKeywordAnnotator())
logger.info('resolved')

In [99]:
anno_doc.add_tier(DateAnnotator())
logger.info('dates annotated')

In [101]:
anno_doc.add_tier(CountAnnotator())
logger.info('counts annotated')

In [102]:
anno_doc.add_tier(GeonameAnnotator())
logger.info('geonames annotated')

In [103]:
anno_doc.add_tier(StructuredIncidentAnnotator())
logger.info('structured incidents annotated')

In [105]:
anno_doc.filter_overlapping_spans(tier_names=['dates','geonames','diseases','hosts','modes','pathogens','symptoms'])



In [112]:
anno_doc.tiers['counts']


AnnoTier([AnnoSpan(129-149, four suspected cases), AnnoSpan(351-365, suspected case), AnnoSpan(450-454, male), AnnoSpan(501-534, cases have been reported in three), AnnoSpan(622-624, 24), AnnoSpan(650-652, 22), AnnoSpan(787-804, an additional 100), AnnoSpan(853-856, ten), AnnoSpan(1142-1163, total of 13 978 cases)])