## ALL

In [1]:
from grab import Grab
from datetime import timedelta, date, datetime
import time
from random import randint
import requests

def date_range(start_date, end_date):
    """
    Yield every weekday between two dates, both ends included.

    :param start_date: first calendar day to consider
    :param end_date: last calendar day to consider
    :return: generator with the days that fall on Monday through Friday
    """
    span = (end_date - start_date).days
    offset = 0
    while offset <= span:
        current = start_date + timedelta(days=offset)
        # weekday() is 0 for Monday ... 6 for Sunday; 5 and 6 are skipped
        if current.weekday() < 5:
            yield current
        offset += 1
            
            
def save_to_file(data_save, filename):
    """
    Append rows to ``<filename>.txt`` as ', '-separated lines.

    Every value is converted with ``str`` and has ',' and '-' characters
    removed. The first value of each row is then parsed as a ``%Y%m%d`` date
    and written as ``%Y-%m-%d``; the remaining values are written as they are.

    :param data_save: iterable of rows (sequences); the first item of a row is
        a date such as '20160913', 20160913 or '2016-09-13'
    :param filename: output path without the '.txt' extension
    :raises ValueError: if the first item of a row is not a ``%Y%m%d`` date
        once ',' and '-' are removed
    """

    with open("{}.txt".format(filename), "a") as f:
        for raw in data_save:
            fields = []
            for position, obj in enumerate(raw):
                # exclude not numerical symbols
                num = str(obj).replace(',', '').replace('-', '')
                # Decide by position, not by value: comparing values against
                # raw[0] / raw[-1] dropped the separator whenever two fields
                # of a row happened to be equal.
                if position == 0:
                    # convert to excel datetime format
                    dt = datetime.strptime(num, "%Y%m%d")
                    num = dt.strftime("%Y-%m-%d")
                fields.append(num)
            # join() puts ', ' between fields only, never after the last one
            f.write(', '.join(fields))
            f.write('\n')
    print("Saved to file {}".format(f.name))


def czce(dates):
    """
    Scrape daily totals from CZCE and append them to czce.txt.

    For each date the English daily futures page is fetched; every table row
    whose text contains 'Total' contributes a tuple made of the date string
    and the 2nd and 3rd space-separated tokens of that row. Collected rows
    are written through save_to_file on every third link and once more at
    the end.

    Only the link format in use from 20151008 is built here. The older format
    was
    'http://english.czce.com.cn/enportal/exchange/marketdata/{0}/datadaily/{1}.htm'.

    :param dates: iterable of date objects to download
    """

    url = 'http://english.czce.com.cn/enportal/DFSStaticFiles/Future/{0}/{1}/EnglishFutureDataDaily.htm'
    # Keep each date string next to its link instead of slicing it back out
    # of the URL later.
    targets = []
    for single_date in dates:
        day = single_date.strftime("%Y%m%d")
        targets.append((day, url.format(single_date.strftime("%Y"), day)))

    g = Grab()
    _data = []
    for indx, (day, link) in enumerate(targets):
        try:
            g.go(link)
            for row in g.doc.select('//*[@id="senfe"]/tr'):
                text = row.text()
                if 'Total' in text:
                    cells = text.split(' ')
                    _data.append((day, cells[1], cells[2]))
                    print('added', day)

            # flush on every 3rd link (indices 0, 3, 6, ...)
            if indx % 3 == 0:
                save_to_file(_data, 'czce')
                _data.clear()

            # pause after every request, not only after a flush
            time.sleep(randint(0, 3))  # try to avoid ban
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C still stops the run
            print('error', day, exc)
            continue
    save_to_file(_data, 'czce')


def shfe(dates):
    """
    Scrape daily totals from SHFE and append them to shfe.txt.

    For each date the 'kx<YYYYMMDD>.dat' JSON document is downloaded and the
    'VOLUME' and 'OPENINTEREST' fields of the last 'o_curinstrument' entry
    are recorded (presumably the exchange-wide total row — TODO confirm).
    Collected rows are written through save_to_file on every third date and
    once more at the end.

    :param dates: iterable of date objects to download
    """

    url = 'http://www.shfe.com.cn/data/dailydata/kx/kx{0}.dat'

    _data = []
    for indx, single_date in enumerate(dates):
        # formatted once and reused for the link, the record and the messages
        day = single_date.strftime("%Y%m%d")
        try:
            r = requests.get(url.format(day)).json()
            time.sleep(randint(0, 3))  # try to avoid ban
            last = r['o_curinstrument'][-1]
            _data.append((day, last['VOLUME'], last['OPENINTEREST']))
            print('added', day)
            if indx % 3 == 0:
                save_to_file(_data, 'shfe')
                _data.clear()
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C still stops the run
            print('error', day, exc)
            continue
    save_to_file(_data, 'shfe')

    
def dce(dates):
    """
    Scrape daily totals from DCE and append them to dce.txt.

    For each date a POST request is sent to the DCE servlet; every table row
    whose text contains 'Total' contributes a tuple made of the date (as an
    int) and the 2nd and 3rd space-separated tokens of that row. Collected
    rows are written through save_to_file on every third date and once more
    at the end.

    :param dates: iterable of date objects to query
    """

    g = Grab()
    url = 'http://www.dce.com.cn/PublicWeb/MainServlet'

    res = []
    for indx, single_date in enumerate(dates):
        # computed outside the try so the error handler can always report it
        cur_date = int(single_date.strftime("%Y%m%d"))
        try:
            # post request
            _data = {'action': 'Pu00231_result',
                     'Pu00231_Input.trade_date': cur_date,
                     'Pu00231_Input.variety': 'all',
                     'Pu00231_Input.trade_type': 0,
                     'Submit': 'Go'}
            g.go(url, post=_data)
            time.sleep(randint(0, 3))  # try to avoid ban
            # find 'total' element and grab its Volume and OI
            for row in g.doc.select('//body/table/tr/td/*/tr'):
                text = row.text()
                if 'Total' in text:
                    d = text.split(' ')
                    res.append((cur_date, d[1], d[2]))
            # NOTE(review): printed even when no 'Total' row matched —
            # confirm this is intended
            print('added', cur_date)
            # save all, every 3rd element
            if indx % 3 == 0:
                save_to_file(res, 'dce')
                res.clear()
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C still stops the run
            print('error', cur_date, exc)
            continue
    save_to_file(res, 'dce')


def cffex(dates):
    """
    Scrape daily totals from CFFEX and append them to cffex.txt.

    For each date the 'index.xml' document is fetched and the integer parts
    of all 'volume' and all 'openinterest' elements are summed into one
    (date string, total volume, total open interest) tuple. Collected rows
    are written through save_to_file on every third date and once more at
    the end.

    :param dates: iterable of date objects to download
    """

    url = 'http://www.cffex.com.cn/fzjy/mrhq/{0}/{1}/index.xml'

    g = Grab()
    _data = []

    for indx, single_date in enumerate(dates):
        day = single_date.strftime("%Y%m%d")
        link = url.format(single_date.strftime("%Y%m"), single_date.strftime("%d"))
        try:
            g.go(link)
            tree = g.doc.build_xml_tree()

            daily_vol = 0
            daily_oi = 0

            for child in tree:
                for node in child.iter():
                    # values are read as text; only the part before '.' is kept
                    if node.tag == 'volume':
                        daily_vol += int(node.text.split('.')[0])
                    elif node.tag == 'openinterest':
                        daily_oi += int(node.text.split('.')[0])
            _data.append((day, daily_vol, daily_oi))
            print('added: ', _data[-1])
            if indx % 3 == 0:
                save_to_file(_data, 'cffex')
                _data.clear()
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C still stops the run
            print('error ', day, exc)
            continue
    save_to_file(_data, 'cffex')

    
def szse_main(dates):
    """
    Scrape the daily share volume of the SZSE main board into szse_main.txt.

    For each date a POST request with TABKEY 'tab2' is sent; the cells of
    every table row whose text contains 'Shares' are collected and the second
    collected cell is recorded as that day's volume. Dates for which no such
    cell exists are reported as errors and skipped. Collected rows are
    written through save_to_file on every third date and once more at the
    end.

    :param dates: iterable of date objects to query
    """

    g = Grab()
    url = 'http://www.szse.cn/szseWeb/FrontController.szse?randnum=0.06895916919770584'

    res = []
    for indx, single_date in enumerate(dates):
        # formatted outside the try so the error handler can always report it
        cur_date = single_date.strftime("%Y-%m-%d")
        try:
            # post request
            post_data = {'ACTIONID': 7,
                         'AJAX': 'AJAX-TRUE',
                         'CATALOGID': 1849,
                         'txtQueryDate': cur_date,
                         'TABKEY': 'tab2'}
            g.go(url, post=post_data)
            time.sleep(randint(0, 3))  # try to avoid ban
            # find 'shares traded' element and grab its Volume
            volume = []
            for tr in g.doc.select('//table[@class="cls-data-table"]/tr'):
                if 'Shares' in tr.text():
                    volume.extend(td.text() for td in tr.select('td'))
            # volume[1] raises IndexError when nothing matched; the handler
            # below then reports the date as an error
            print(cur_date, volume[1])
            res.append((cur_date, volume[1]))
            # save all, every 3rd element
            if indx % 3 == 0:
                save_to_file(res, 'szse_main')
                res.clear()
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C still stops the run
            print('error', cur_date, exc)
            continue
    save_to_file(res, 'szse_main')

    
def szse_sme(dates):
    """
    Scrape the daily share volume of the SZSE SME board into szse_sme.txt.

    For each date a POST request with TABKEY 'tab3' is sent; the cells of
    every table row whose text contains 'Shares' are collected and the second
    collected cell is recorded as that day's volume. Dates for which no such
    cell exists are reported as errors and skipped. Collected rows are
    written through save_to_file on every third date and once more at the
    end.

    :param dates: iterable of date objects to query
    """

    g = Grab()
    url = 'http://www.szse.cn/szseWeb/FrontController.szse?randnum=0.06895916919770584'

    res = []
    for indx, single_date in enumerate(dates):
        # formatted outside the try so the error handler can always report it
        cur_date = single_date.strftime("%Y-%m-%d")
        try:
            # post request
            post_data = {'ACTIONID': 7,
                         'AJAX': 'AJAX-TRUE',
                         'CATALOGID': 1849,
                         'txtQueryDate': cur_date,
                         'TABKEY': 'tab3'}
            g.go(url, post=post_data)
            time.sleep(randint(0, 3))  # try to avoid ban
            # find 'shares traded' element and grab its Volume
            volume = []
            for tr in g.doc.select('//table[@class="cls-data-table"]/tr'):
                if 'Shares' in tr.text():
                    volume.extend(td.text() for td in tr.select('td'))
            # volume[1] raises IndexError when nothing matched; the handler
            # below then reports the date as an error
            print(cur_date, volume[1])
            res.append((cur_date, volume[1]))
            # save all, every 3rd element
            if indx % 3 == 0:
                save_to_file(res, 'szse_sme')
                res.clear()
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C still stops the run
            print('error', cur_date, exc)
            continue
    save_to_file(res, 'szse_sme')
    
if __name__ == '__main__':
    # Period to download, both ends included.
    start = date(2016, 9, 13)
    end = date(2016, 9, 21)

    # Each scraper gets its own fresh generator of weekdays, in this order.
    for scraper in (czce, shfe, dce, cffex, szse_main, szse_sme):
        scraper(date_range(start, end))

added 20160913
Saved to file czce.txt
added 20160914
Saved to file czce.txt
added 20160919
added 20160920
added 20160921
Saved to file czce.txt
Saved to file czce.txt
added 20160913
Saved to file shfe.txt
added 20160914
error 20160915
error 20160916
added 20160919
added 20160920
added 20160921
Saved to file shfe.txt
Saved to file shfe.txt
added 20160913
Saved to file dce.txt
added 20160914
added 20160915
added 20160916
Saved to file dce.txt
added 20160919
added 20160920
added 20160921
Saved to file dce.txt
Saved to file dce.txt
added:  ('20160913', 79387, 165853)
Saved to file cffex.txt
added:  ('20160914', 76526, 165359)
error  20160915
error  20160916
added:  ('20160919', 55500, 156292)
added:  ('20160920', 44597, 159836)
added:  ('20160921', 49804, 162021)
Saved to file cffex.txt
Saved to file cffex.txt
2016-09-13 6,765,656,012
Saved to file szse_main.txt
2016-09-14 6,202,817,898
error 2016-09-15
error 2016-09-16
2016-09-19 5,843,585,362
2016-09-20 5,835,569,072
2016-09-21 5,991,643

# czce

The link format is different after 20151006; the code below builds the new format, which is in use from 20151008.

In [None]:
from grab import Grab
from datetime import timedelta, date
import time
from random import randint


def date_range(start_date, end_date):
    """
    Yield every weekday between two dates, both ends included.

    :param start_date: first calendar day to consider
    :param end_date: last calendar day to consider
    :return: generator with the days that fall on Monday through Friday
    """
    span = (end_date - start_date).days
    offset = 0
    while offset <= span:
        current = start_date + timedelta(days=offset)
        # weekday() is 0 for Monday ... 6 for Sunday; 5 and 6 are skipped
        if current.weekday() < 5:
            yield current
        offset += 1


def find_links(dates):
    """
    Build the CZCE daily-data link for every given date.

    :param dates: iterable of dates to be parsed
    :return: list with one link per date, in input order
    """
    # Link format in use from 20151008. The older format was
    # 'http://english.czce.com.cn/enportal/exchange/marketdata/{0}/datadaily/{1}.htm'
    template = 'http://english.czce.com.cn/enportal/DFSStaticFiles/Future/{0}/{1}/EnglishFutureDataDaily.htm'
    return [
        template.format(day.strftime("%Y"), day.strftime("%Y%m%d"))
        for day in dates
    ]


def parse_data(linkz):
    """
    Download every link and collect the 'Total' rows.

    Every table row whose text contains 'Total' contributes a tuple made of
    the date string taken from the link and the 2nd and 3rd space-separated
    tokens of that row. Links that fail are reported and skipped.

    :param linkz: list with links in the new (from 20151008) format
    :return: list with (date, volume, open interest) tuples
    """
    g = Grab()
    _data = []
    for l in linkz:
        # New link format: the date is the path segment right before
        # '/EnglishFutureDataDaily.htm'. (The old format needed l[-12:-4].)
        day = l[-35:-27]
        try:
            g.go(l)
            for row in g.doc.select('//*[@id="senfe"]/tr'):
                text = row.text()
                if 'Total' in text:
                    cells = text.split(' ')
                    _data.append((day, cells[1], cells[2]))
                    print('added', day)
            time.sleep(randint(0, 3))  # try to avoid ban
        except Exception as exc:
            # Report the same date slice as the success path; the old-format
            # slice printed part of the file name instead of the date.
            # `Exception` (not a bare except) so Ctrl-C still stops the run.
            print('error', day, exc)
            continue
    return _data


def save_to_file(data_save):
    """
    Write a header line and all rows to czce_new.txt, replacing the file.

    :param data_save: iterable of rows; each row is written via ``str`` on
        its own line
    """
    header = "(Date, Total_Volume, Total_Open_Interest)"
    with open("czce_new.txt", "w") as out:
        out.write(header)
        out.write('\n')
        out.writelines(str(row) + '\n' for row in data_save)
    print("Saved to file {}".format(out.name))


if __name__ == '__main__':
    # Period to download, both ends included.
    start, end = date(2015, 10, 8), date(2016, 8, 21)
    # Build the links, download them all, then write the result in one go.
    links = find_links(date_range(start, end))
    data = parse_data(links)
    save_to_file(data)

# shfe

In [None]:
from datetime import timedelta, date
import time
from random import randint
import requests


def date_range(start_date, end_date):
    """
    Yield every weekday between two dates, both ends included.

    :param start_date: first calendar day to consider
    :param end_date: last calendar day to consider
    :return: generator with the days that fall on Monday through Friday
    """
    span = (end_date - start_date).days
    offset = 0
    while offset <= span:
        current = start_date + timedelta(days=offset)
        # weekday() is 0 for Monday ... 6 for Sunday; 5 and 6 are skipped
        if current.weekday() < 5:
            yield current
        offset += 1


def find_links(dates):
    """
    Build the SHFE daily-data link for every given date.

    :param dates: iterable of dates to be parsed
    :return: list with one link per date, in input order
    """
    template = 'http://www.shfe.com.cn/data/dailydata/kx/kx{}.dat'
    return [template.format(day.strftime("%Y%m%d")) for day in dates]


def parse_data(linkz):
    """
    Download every link, record the daily totals and save them in batches.

    For each link the JSON document is downloaded and the 'VOLUME' and
    'OPENINTEREST' fields of the last 'o_curinstrument' entry are recorded
    (presumably the exchange-wide total row — TODO confirm). Rows are written
    through save_to_file on every third link and once more at the end, so
    everything returned has already been saved.

    :param linkz: list with links ending in '<YYYYMMDD>.dat'
    :return: list with all (date, volume, open interest) tuples parsed
    """
    parsed = []  # every row of this run, handed back to the caller
    _data = []   # rows not yet written to disk
    for i, l in enumerate(linkz):
        # the 8 characters before '.dat' are the date
        day = l[-12:-4]
        try:
            r = requests.get(l).json()
            time.sleep(randint(0, 3))  # try to avoid ban
            last = r['o_curinstrument'][-1]
            row = (day, last['VOLUME'], last['OPENINTEREST'])
            _data.append(row)
            parsed.append(row)
            print('added', day)
            if i % 3 == 0:
                save_to_file(_data)
                _data.clear()
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C still stops the run
            print('error', day, exc)
            continue
    save_to_file(_data)
    # The pending batch is emptied after each flush, so returning it would
    # hand back only the final partial batch.
    return parsed


def save_to_file(data_save):
    """
    Append all rows to shfe.txt; no header line is written.

    :param data_save: iterable of rows; each row is written via ``str`` on
        its own line
    """
    with open("shfe.txt", "a") as out:
        out.writelines(str(row) + '\n' for row in data_save)
    print("Saved to file {}".format(out.name))


if __name__ == '__main__':
    # Period to download, both ends included.
    start, end = date(2016, 4, 11), date(2016, 8, 22)
    links = find_links(date_range(start, end))
    # parse_data saves to file itself, so no separate save call is made here
    data = parse_data(links)

# DCE

In [None]:
from grab import Grab
from datetime import timedelta, date
import time
from random import randint


def date_range(start_date, end_date):
    """
    Yield every weekday between two dates, both ends included.

    :param start_date: first calendar day to consider
    :param end_date: last calendar day to consider
    :return: generator with the days that fall on Monday through Friday
    """
    span = (end_date - start_date).days
    offset = 0
    while offset <= span:
        current = start_date + timedelta(days=offset)
        # weekday() is 0 for Monday ... 6 for Sunday; 5 and 6 are skipped
        if current.weekday() < 5:
            yield current
        offset += 1


def grab_data(dates):
    """
    Scrape daily totals from DCE and append them to the output file.

    For each date a POST request is sent to the DCE servlet; every table row
    whose text contains 'Total' contributes a tuple made of the date (as an
    int) and the 2nd and 3rd space-separated tokens of that row. Collected
    rows are written through save_to_file on every third date and once more
    at the end.

    :param dates: iterable of date objects to query
    """
    g = Grab()
    url = 'http://www.dce.com.cn/PublicWeb/MainServlet'

    res = []
    for indx, single_date in enumerate(dates):
        # computed outside the try so the error handler can always report it
        cur_date = int(single_date.strftime("%Y%m%d"))
        try:
            # post request
            _data = {'action': 'Pu00231_result',
                     'Pu00231_Input.trade_date': cur_date,
                     'Pu00231_Input.variety': 'all',
                     'Pu00231_Input.trade_type': 0,
                     'Submit': 'Go'}
            g.go(url, post=_data)
            time.sleep(randint(0, 3))  # try to avoid ban
            # find 'total' element and grab its Volume and OI
            for row in g.doc.select('//body/table/tr/td/*/tr'):
                text = row.text()
                if 'Total' in text:
                    d = text.split(' ')
                    res.append((cur_date, d[1], d[2]))
            # NOTE(review): printed even when no 'Total' row matched —
            # confirm this is intended
            print('added', cur_date)
            # save all, every 3rd element
            if indx % 3 == 0:
                save_to_file(res)
                res.clear()
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C still stops the run
            print('error', cur_date, exc)
            continue
    save_to_file(res)

    
def save_to_file(data_save):
    """
    Append all rows to dce.txt; no header line is written.

    :param data_save: iterable of rows; each row is written via ``str`` on
        its own line
    """
    with open("dce.txt", "a") as out:
        out.writelines(str(row) + '\n' for row in data_save)
    print("Saved to file {}".format(out.name))


if __name__ == '__main__':
    # Period to download, both ends included.
    start, end = date(2016, 8, 15), date(2016, 8, 21)
    grab_data(date_range(start, end))

# cffex

In [None]:
from grab import Grab
from datetime import timedelta, date


def date_range(start_date, end_date):
    """
    Yield every weekday between two dates, both ends included.

    :param start_date: first calendar day to consider
    :param end_date: last calendar day to consider
    :return: generator with the days that fall on Monday through Friday
    """
    span = (end_date - start_date).days
    offset = 0
    while offset <= span:
        current = start_date + timedelta(days=offset)
        # weekday() is 0 for Monday ... 6 for Sunday; 5 and 6 are skipped
        if current.weekday() < 5:
            yield current
        offset += 1


def save_to_file(data_save):
    """
    Append all rows to cffex.txt; no header line is written.

    :param data_save: iterable of rows; each row is written via ``str`` on
        its own line
    """
    with open("cffex.txt", "a") as out:
        out.writelines(str(row) + '\n' for row in data_save)
    print("Saved to file {}".format(out.name))


def create_links(dates):
    """
    Build the CFFEX daily 'index.xml' link for every given date.

    :param dates: iterable of dates to be parsed
    :return: list with one link per date, in input order
    """
    template = 'http://www.cffex.com.cn/fzjy/mrhq/{0}/{1}/index.xml'
    # {0} is year+month, {1} is the zero-padded day of month
    return [
        template.format(day.strftime("%Y%m"), day.strftime("%d"))
        for day in dates
    ]


def grab_data(links):
    """
    Download every CFFEX link, sum the daily totals and save them in batches.

    For each link the XML document is fetched and the integer parts of all
    'volume' and all 'openinterest' elements are summed into one
    (date string, total volume, total open interest) tuple. Collected rows
    are written through save_to_file on every third link and once more at
    the end. Links that fail are reported and skipped.

    :param links: iterable of links ending in '<YYYYMM>/<DD>/index.xml'
    """
    g = Grab()
    _data = []

    for indx, l in enumerate(links):
        # rebuild YYYYMMDD from the '<YYYYMM>/<DD>/index.xml' tail of the link
        day = l[-19:-13] + l[-12:-10]
        try:
            g.go(l)
            tree = g.doc.build_xml_tree()

            daily_vol = 0
            daily_oi = 0

            for child in tree:
                for node in child.iter():
                    # values are read as text; only the part before '.' is kept
                    if node.tag == 'volume':
                        daily_vol += int(node.text.split('.')[0])
                    elif node.tag == 'openinterest':
                        daily_oi += int(node.text.split('.')[0])
            _data.append((day, daily_vol, daily_oi))
            print('added: ', _data[-1])
            if indx % 3 == 0:
                save_to_file(_data)
                _data.clear()
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C still stops the run
            print('error ', day, exc)
            continue
    save_to_file(_data)


if __name__ == '__main__':
    # Period to download, both ends included.
    start, end = date(2014, 8, 17), date(2016, 8, 22)
    all_links = create_links(date_range(start, end))
    grab_data(all_links)

# szse main_board

In [None]:
from grab import Grab
from datetime import timedelta, date
import time
from random import randint


def date_range(start_date, end_date):
    """
    Yield every weekday between two dates, both ends included.

    :param start_date: first calendar day to consider
    :param end_date: last calendar day to consider
    :return: generator with the days that fall on Monday through Friday
    """
    span = (end_date - start_date).days
    offset = 0
    while offset <= span:
        current = start_date + timedelta(days=offset)
        # weekday() is 0 for Monday ... 6 for Sunday; 5 and 6 are skipped
        if current.weekday() < 5:
            yield current
        offset += 1


def grab_data(dates):
    """
    Scrape the daily share volume from SZSE (TABKEY 'tab2') and save it.

    For each date a POST request is sent; the cells of every table row whose
    text contains 'Shares' are collected and the second collected cell is
    recorded as that day's volume. Dates for which no such cell exists are
    reported as errors and skipped. Collected rows are written through
    save_to_file on every third date and once more at the end.

    :param dates: iterable of date objects to query
    """
    g = Grab()
    url = 'http://www.szse.cn/szseWeb/FrontController.szse?randnum=0.06895916919770584'

    res = []
    for indx, single_date in enumerate(dates):
        # formatted outside the try so the error handler can always report it
        cur_date = single_date.strftime("%Y-%m-%d")
        try:
            # post request
            post_data = {'ACTIONID': 7,
                         'AJAX': 'AJAX-TRUE',
                         'CATALOGID': 1849,
                         'txtQueryDate': cur_date,
                         'TABKEY': 'tab2'}
            g.go(url, post=post_data)
            time.sleep(randint(0, 3))  # try to avoid ban
            # find 'shares traded' element and grab its Volume
            volume = []
            for tr in g.doc.select('//table[@class="cls-data-table"]/tr'):
                if 'Shares' in tr.text():
                    volume.extend(td.text() for td in tr.select('td'))
            # volume[1] raises IndexError when nothing matched; the handler
            # below then reports the date as an error
            print(cur_date, volume[1])
            res.append((cur_date, volume[1]))
            # save all, every 3rd element
            if indx % 3 == 0:
                save_to_file(res)
                res.clear()
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C still stops the run
            print('error', cur_date, exc)
            continue
    save_to_file(res)

    
def save_to_file(data_save):
    """
    Append all rows to szse_main.txt.

    :param data_save: iterable of rows; each row is written via ``str`` on
        its own line
    """
    with open("szse_main.txt", "a") as out:
        out.writelines(str(row) + '\n' for row in data_save)
    print("Saved to file {}".format(out.name))


if __name__ == '__main__':
    # Period to download, both ends included.
    start, end = date(2014, 8, 15), date(2016, 8, 24)
    grab_data(date_range(start, end))

# szse SME board

In [None]:
from grab import Grab
from datetime import timedelta, date
import time
from random import randint


def date_range(start_date, end_date):
    """
    Yield every weekday between two dates, both ends included.

    :param start_date: first calendar day to consider
    :param end_date: last calendar day to consider
    :return: generator with the days that fall on Monday through Friday
    """
    span = (end_date - start_date).days
    offset = 0
    while offset <= span:
        current = start_date + timedelta(days=offset)
        # weekday() is 0 for Monday ... 6 for Sunday; 5 and 6 are skipped
        if current.weekday() < 5:
            yield current
        offset += 1


def grab_data(dates):
    """
    Scrape the daily share volume from SZSE (TABKEY 'tab3') and save it.

    For each date a POST request is sent; the cells of every table row whose
    text contains 'Shares' are collected and the second collected cell is
    recorded as that day's volume. Dates for which no such cell exists are
    reported as errors and skipped. Collected rows are written through
    save_to_file on every third date and once more at the end.

    :param dates: iterable of date objects to query
    """
    g = Grab()
    url = 'http://www.szse.cn/szseWeb/FrontController.szse?randnum=0.06895916919770584'

    res = []
    for indx, single_date in enumerate(dates):
        # formatted outside the try so the error handler can always report it
        cur_date = single_date.strftime("%Y-%m-%d")
        try:
            # post request
            post_data = {'ACTIONID': 7,
                         'AJAX': 'AJAX-TRUE',
                         'CATALOGID': 1849,
                         'txtQueryDate': cur_date,
                         'TABKEY': 'tab3'}
            g.go(url, post=post_data)
            time.sleep(randint(0, 3))  # try to avoid ban
            # find 'shares traded' element and grab its Volume
            volume = []
            for tr in g.doc.select('//table[@class="cls-data-table"]/tr'):
                if 'Shares' in tr.text():
                    volume.extend(td.text() for td in tr.select('td'))
            # volume[1] raises IndexError when nothing matched; the handler
            # below then reports the date as an error
            print(cur_date, volume[1])
            res.append((cur_date, volume[1]))
            # save all, every 3rd element
            if indx % 3 == 0:
                save_to_file(res)
                res.clear()
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C still stops the run
            print('error', cur_date, exc)
            continue
    save_to_file(res)

    
def save_to_file(data_save):
    """
    Append all rows to szse_sme.txt.

    :param data_save: iterable of rows; each row is written via ``str`` on
        its own line
    """
    with open("szse_sme.txt", "a") as out:
        out.writelines(str(row) + '\n' for row in data_save)
    print("Saved to file {}".format(out.name))


if __name__ == '__main__':
    # Period to download, both ends included.
    start, end = date(2014, 8, 15), date(2016, 8, 24)
    grab_data(date_range(start, end))