In [None]:
from datetime import timedelta, date, datetime
from tqdm import tqdm
import json
import asyncio
import aiohttp
from lxml.html import fromstring

def date_range(start_date, end_date):
    """
    Iterate over the weekdays between two dates, both ends included.

    Only Saturdays and Sundays are filtered out; public holidays are not.

    :param start_date: first date to consider
    :param end_date: last date to consider
    :return: generator yielding every Monday-Friday date in the span
    """
    span = int((end_date - start_date).days) + 1
    candidates = (start_date + timedelta(offset) for offset in range(span))
    for day in candidates:
        # weekday() numbers Saturday as 5 and Sunday as 6
        if day.weekday() < 5:
            yield day

def generate_links(dates, url):
    """
    Build one link per date by substituting it into the URL template.

    :param dates: iterable of values to insert into the template
    :param url: format string containing a ``{}`` placeholder
    :return: generator of formatted links, in the order of ``dates``
    """
    yield from (url.format(day) for day in dates)

async def fetch(session, url):
    """
    Download one page and turn its table rows into a JSON string.

    :param session: object whose ``get(url)`` is used as an async context
        manager (an aiohttp session in this file)
    :param url: page address; its last 10 characters become the key of the
        resulting JSON object
    :return: JSON text (indent 4, sorted keys) mapping that key to a list of
        dicts, one per ``<tr>`` after the first
    """
    # Column labels assigned to the cells of a row by position.
    columns = (
        "not_used1",
        "Short",
        "ISIN",
        "Currency",
        "Open",
        "High",
        "Low",
        "Closing",
        "% daily change *",
        "Volume",
        "Number of trades",
        "Turnover (thous.)",
        "not_used2",
    )

    # Both the download and the parsing run under the same 10 second limit.
    with aiohttp.Timeout(10):
        async with session.get(url) as response:
            page = fromstring(await response.text())

            rows = []
            for tr in page.xpath('//tr'):
                cells = tr.text_content().split('\n')
                # drop surrounding whitespace and non-breaking spaces
                rows.append([cell.strip().replace(u'\xa0', u'') for cell in cells])

            # The first <tr> is skipped (presumably the table header).
            records = [dict(zip(columns, row)) for row in rows[1:]]

            return json.dumps({url[-10:]: records}, indent=4, sort_keys=True)

def got_result(future):
    """Print the result of a finished future; propagates whatever ``result()`` raises."""
    outcome = future.result()
    print(outcome)
    
async def fetch_all(loop, urls, sem):
    """
    Fetch every URL concurrently and return the outcomes in the order of ``urls``.

    A task is only started after ``sem`` has been acquired, and the semaphore
    is released when that task finishes, so the number of requests in flight
    is bounded by the semaphore's initial value. Every task that finishes
    successfully is handed to ``save_to_file``.

    :param loop: event loop passed to the aiohttp session
    :param urls: iterable of URLs (a generator works)
    :param sem: asyncio.Semaphore throttling how many tasks run at once
    :return: list with one entry per URL, in input order: the value returned
        by ``fetch`` or the exception it raised
    """

    def _save_if_successful(done):
        # save_to_file() calls done.result(), which would raise inside the
        # done-callback for a failed or cancelled task. Those failures are
        # reported below from the gathered results instead.
        if done.cancelled() or done.exception() is not None:
            return
        save_to_file(done)

    async with aiohttp.ClientSession(loop=loop) as session:
        tasks = []
        for url in tqdm(urls):
            # Blocks here while the maximum number of tasks is already running.
            await sem.acquire()
            task = asyncio.ensure_future(fetch(session, url))
            task.add_done_callback(lambda _done: sem.release())
            task.add_done_callback(_save_if_successful)

            # Finished tasks must stay in the list: removing them on
            # completion meant gather() only saw the few tasks still pending
            # at the end and silently dropped every other result.
            tasks.append(task)

        # return_exceptions=True: failures come back as values instead of
        # aborting the whole batch (the default would raise the first one).
        results = await asyncio.gather(*tasks, return_exceptions=True)

    # gather() preserves the order of ``tasks``, i.e. the order of ``urls``.
    for outcome in results:
        if isinstance(outcome, BaseException):
            print('ERROR: ', outcome)

    return results

def save_to_file(data_save):
    """
    Append the result of a finished future to ``warsaw_index.txt``.

    :param data_save: object exposing ``result()``; the returned value is
        iterated and each item is written to the file without any separator
    """
    target = "{}.txt".format('warsaw_index')
    with open(target, "a") as f:
        # writelines() writes each item of the iterable in turn
        f.writelines(data_save.result())
    
if __name__ == '__main__':
    # Entry point: build one URL per weekday in [start, end], download them
    # with at most 10 requests in flight, and report the elapsed wall time.
    import time  # local import: only this entry point needs it

    start = date(1994, 4, 18)
    end = date(2016, 10, 18)
    # index
    url = 'https://www.gpw.pl/notowania_archiwalne_full_en?type=1&date={}'
    # futures
    # url = 'https://www.gpw.pl/notowania_archiwalne_full_en?type=35&date={}'

    # NOTE(review): these browser-like headers are not passed to the session
    # anywhere in this block, so the requests go out without them.
    # NOTE(review): the Cookie value is a hard-coded session identifier;
    # consider loading it from configuration instead of keeping it in source.
    headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4',
    # header values must be strings, not ints
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    'Referer': 'https://www.gpw.pl/notowania_archiwalne_en?type=1&date=2016-10-12&show.x=32&show.y=12',
    'Cookie': 'PHPSESSID=9ms5i5ot338lkmorctqp7uf0o4; SID=!Khgk9sVQQ1zJmCEx3T9eKbrcTnC4jFE0LGIk/49Mys+u4PdrgyyJb0IcJBmyj7v7VF/taLxvsP5lqyaqEOoMZ6TrVZn9eTFrkLPO6nsSigfv2CbqJEbbVNDh9ZvRKVhSC6KG8zuzU3TIMe/hWIBqmwKcELYW7uw=; lang_code=EN; TS014040d7=016c1ed7ff34b3ee8cd0041f3724d83f5f0b455c8c9163223d541f4b3fbd0dfaf0f8ef6af01d96c806f14c8fd0f025c4cc8135b8ca32fae5a197c21b401a054bd3992bbdedcd89131a55b62bb9536890c98fda94c7',
    'Connection': 'keep-alive'}

    loop = asyncio.get_event_loop()
    # No ``loop=`` argument: it was removed from asyncio.Semaphore in Python
    # 3.10, and older versions fall back to the current event loop anyway.
    sem = asyncio.Semaphore(10)

    urls = generate_links(date_range(start, end), url)
    # Bind the task to ``loop`` explicitly rather than relying on the
    # implicit current loop.
    future = asyncio.ensure_future(fetch_all(loop, urls, sem), loop=loop)

    # Plain-Python replacement for the IPython ``%time`` magic, so the cell
    # also runs as a regular script.
    started = time.perf_counter()
    res = loop.run_until_complete(future)
    print('Wall time: {:.2f} s'.format(time.perf_counter() - started))



81it [00:59,  5.75s/it]
92it [01:00,  2.94s/it]
93it [01:01,  2.22s/it]
25it [00:02,  9.78it/s][A
94it [01:01,  1.72s/it]
95it [01:02,  1.37s/it]
96it [01:03,  1.12s/it]
97it [01:03,  1.05it/s]
31it [00:04,  2.66it/s][A
98it [01:04,  1.18it/s]
99it [01:04,  1.32it/s]
100it [01:05,  1.45it/s]
101it [01:05,  1.52it/s]
102it [01:06,  1.59it/s]
37it [00:07,  2.15it/s][A
104it [01:07,  1.92it/s]
39it [00:07,  2.22it/s][A
105it [01:07,  1.94it/s]
106it [01:08,  1.90it/s]
107it [01:08,  1.88it/s]
108it [01:09,  1.88it/s]
44it [00:10,  1.94it/s][A
115it [01:10,  2.36it/s]
46it [00:11,  2.28it/s][A
49it [00:11,  2.97it/s][A
118it [01:11,  2.31it/s]
119it [01:12,  2.32it/s]
120it [01:12,  2.23it/s]
121it [01:13,  2.16it/s]
122it [01:13,  2.23it/s]
124it [01:14,  2.19it/s]
126it [01:15,  2.23it/s]
127it [01:15,  2.29it/s]
128it [01:16,  2.23it/s]
129it [01:16,  2.17it/s]
130it [01:17,  2.30it/s]
132it [01:17,  2.61it/s]
133it [01:18,  2.43it/s]
134it [01:18,  2.33it/s]
135it [01:19,  2.25

['{\n    "1994-04-20": []\n}', '{\n    "1994-04-22": []\n}']
