In [3]:
from datetime import timedelta, date, datetime
from tqdm import tqdm
import json
import asyncio
import aiohttp
from lxml.html import fromstring

def date_range(start_date, end_date):
    """
    Yield every Monday-to-Friday date between two dates, both ends included.

    :param start_date: first date to consider
    :param end_date: last date to consider; nothing is yielded if it lies
        before ``start_date``
    :return: generator of the dates whose ``weekday()`` is below 5
    """
    span = (end_date - start_date).days
    for offset in range(span + 1):
        day = start_date + timedelta(days=offset)
        # weekday() is 0 for Monday ... 6 for Sunday, so 5 and 6 are skipped.
        if day.weekday() < 5:
            yield day

def generate_links(dates, url):
    """
    Lazily build one link per date.

    :param dates: iterable of values to substitute into the template
    :param url: template string; each value is inserted with ``url.format``
    :return: generator of the formatted links, in the order of ``dates``
    """
    yield from (url.format(day) for day in dates)

async def fetch(session, url):
    """
    Download one page and return the rows of its tables as a JSON string.

    :param session: client session; ``session.get(url)`` must be usable with
        ``async with`` and yield a response with an awaitable ``text()``
    :param url: address to download; its last 10 characters become the single
        key of the resulting JSON object
    :return: JSON text (indent 4, sorted keys) of ``{url[-10:]: [row, ...]}``,
        each row being a dict that maps the column names below to the texts of
        one ``<tr>`` element
    :raises asyncio.TimeoutError: if the download takes more than 10 seconds
    """
    async def download():
        async with session.get(url) as response:
            return await response.text()

    # asyncio.wait_for replaces the former ``with aiohttp.Timeout(10)``: that
    # helper is not provided by current aiohttp releases, while wait_for is
    # standard library and raises the same asyncio.TimeoutError.
    html = await asyncio.wait_for(download(), timeout=10)

    tree = fromstring(html)
    # One list of cell texts per <tr>: the row text is split on newlines, each
    # piece stripped and cleared of non-breaking spaces.
    rows = [
        [cell.strip().replace(u'\xa0', u'') for cell in tr.text_content().split('\n')]
        for tr in tree.xpath('//tr')
    ]

    names = [
        "not_used1",
        "Short",
        "ISIN",
        "Currency",
        "Open",
        "High",
        "Low",
        "Closing",
        "% daily change *",
        "Volume",
        "Number of trades",
        "Turnover (thous.)",
        "not_used2"]

    # The first <tr> is skipped (presumably the table header - verify against
    # the page). zip() stops at the shorter of names / row.
    output = {url[-10:]: [dict(zip(names, row)) for row in rows[1:]]}

    return json.dumps(output, indent=4, sort_keys=True)
    
async def fetch_all(loop, urls, sem):
    """
    Schedule a ``fetch`` task for every URL and collect all their outcomes.

    :param loop: event loop handed to ``aiohttp.ClientSession``
    :param urls: iterable of addresses; consumed lazily, one per acquired
        semaphore slot
    :param sem: semaphore limiting how many fetch tasks are in flight at once
    :return: list with one entry per URL, in the order the URLs were read:
        the value returned by ``fetch`` or, for a failed task, the exception
        instance (``return_exceptions=True``)
    """
    async with aiohttp.ClientSession(loop=loop) as session:
        tasks = []
        # tqdm wraps the URL iteration, so the bar counts scheduled tasks.
        for url in tqdm(urls):
            # Wait for a free slot before scheduling; the slot is handed back
            # by the done-callback below when the task finishes.
            await sem.acquire()
            task = asyncio.ensure_future(fetch(session, url))
            task.add_done_callback(lambda t: sem.release())
            task.add_done_callback(save_to_file)
            # Finished tasks stay in the list: removing them on completion
            # made gather() below see only the tasks that were still pending,
            # so the returned results silently lacked every earlier download.
            tasks.append(task)

        # return_exceptions=True: failures come back as values, not raised.
        results = await asyncio.gather(*tasks, return_exceptions=True)

    # for testing purposes only: report every task that ended with an exception
    for result in results:
        if isinstance(result, BaseException):
            print('ERROR: ', result)

    return results

def save_to_file(data_save):
    """
    Append the result of a finished future to ``warsaw_futures1.txt``.

    :param data_save: finished future or task; ``data_save.result()`` must be
        a string or an iterable of strings
    :raises: whatever ``data_save.result()`` raises (the exception of a failed
        task, or the cancellation error of a cancelled one)
    """
    # Resolve the result before opening the file, so that a failed or
    # cancelled task does not create or touch the output file.
    payload = data_save.result()
    with open("{}.txt".format('warsaw_futures1'), "a") as f:
        if isinstance(payload, str):
            # Single write instead of one write call per character.
            f.write(payload)
        else:
            f.writelines(payload)
    
if __name__ == '__main__':

    start = date(2016, 9, 17)
    end = date(2016, 10, 18)
    # index
    # url = 'https://www.gpw.pl/notowania_archiwalne_full_en?type=1&date={}'
    # futures
    url = 'https://www.gpw.pl/notowania_archiwalne_full_en?type=35&date={}'
    
    headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4',
    'Upgrade-Insecure-Requests': 1,
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    'Referer': 'https://www.gpw.pl/notowania_archiwalne_en?type=1&date=2016-10-12&show.x=32&show.y=12',
    'Cookie': 'PHPSESSID=9ms5i5ot338lkmorctqp7uf0o4; SID=!Khgk9sVQQ1zJmCEx3T9eKbrcTnC4jFE0LGIk/49Mys+u4PdrgyyJb0IcJBmyj7v7VF/taLxvsP5lqyaqEOoMZ6TrVZn9eTFrkLPO6nsSigfv2CbqJEbbVNDh9ZvRKVhSC6KG8zuzU3TIMe/hWIBqmwKcELYW7uw=; lang_code=EN; TS014040d7=016c1ed7ff34b3ee8cd0041f3724d83f5f0b455c8c9163223d541f4b3fbd0dfaf0f8ef6af01d96c806f14c8fd0f025c4cc8135b8ca32fae5a197c21b401a054bd3992bbdedcd89131a55b62bb9536890c98fda94c7',
    'Connection': 'keep-alive'}
    
    loop = asyncio.get_event_loop()
    sem = asyncio.Semaphore(10, loop=loop)
    urls = generate_links(date_range(start, end), url)
    future = asyncio.ensure_future(fetch_all(loop, urls, sem))
    res = %time loop.run_until_complete(future)


22it [00:02,  5.27it/s]


CPU times: user 690 ms, sys: 170 ms, total: 860 ms
Wall time: 9.97 s
