In [15]:
from datetime import date, datetime
import time
from calendar import monthrange
from time import sleep
from glob import glob
import os
import sys
import requests
import re
from pyquery import PyQuery
import lxml

REQUEST_INTERVAL = 0.01    # Length of delay between each request in seconds, to avoid server banning the spider

# Date related settings and help functions

YEAR_START   = 2011     # 2011-04-06 is the earliest date accessible with the format
MONTH_START  = 4        # http://cctv.cntv.cn/lm/xinwenlianbo/20110406.shtml
DAY_START    = 6

# The crawl ends today. Read the clock exactly once: three separate
# date.today() calls could straddle midnight and combine into a
# year/month/day triple that does not correspond to any real "today".
YEAR_END, MONTH_END, DAY_END = date.today().timetuple()[:3]

def normalizeDate(date_unit):
    """Render a date component as text, left-padding a single character with '0'.

    The value is converted with ``str()``. If the resulting text is exactly one
    character long a '0' is prepended; any other length (for example a
    four-digit year) is returned unchanged.
    """
    text = str(date_unit)
    return '0' + text if len(text) == 1 else text

In [9]:
# Networking helper functions

BASE_URL = 'http://cctv.cntv.cn/lm/xinwenlianbo/'
URL_SUFFIX = '.shtml'

def getIndexURL(year, month, day, base_url=BASE_URL, concat_f=None, suffix=URL_SUFFIX):
    """Build the URL of the index page for one day.

    Parameters:
        year, month, day: the date components; passed to ``concat_f``.
        base_url: text placed before the date part.
        concat_f: optional callable ``(year, month, day) -> str`` producing the
            date part of the URL. When omitted, the three components are
            zero-padded with ``normalizeDate`` and concatenated.
        suffix: text placed after the date part.

    Returns:
        ``base_url + <date part> + suffix``.
    """
    # Identity test: `== None` would defer to a custom __eq__ on the callable.
    if concat_f is None:
        date_part = normalizeDate(year) + normalizeDate(month) + normalizeDate(day)
    else:
        date_part = concat_f(year, month, day)

    return base_url + date_part + suffix

In [10]:
# Generate the list of index pages

index_list = []

print('Generating a list of index pages for each day')

for year in range(YEAR_START, YEAR_END + 1):

    # Only the first and the last year of the range are partial years.
    first_month = 1
    if year == YEAR_START:
        first_month = MONTH_START

    last_month = 12
    if year == YEAR_END:
        last_month = MONTH_END

    print('\nThe year is', year, '|',
          'first month=', normalizeDate(first_month), '|',
          'last month=', normalizeDate(last_month))

    for month in range(first_month, last_month + 1):

        # Likewise, only the very first and very last month are partial months.
        in_start_month = (year == YEAR_START and month == MONTH_START)
        in_end_month = (year == YEAR_END and month == MONTH_END)

        first_day = DAY_START if in_start_month else 1
        # monthrange() returns (weekday of the 1st, number of days in the month).
        last_day = DAY_END if in_end_month else monthrange(year, month)[1]

        print('', 'the month is', normalizeDate(month), '|',
              'first day is', normalizeDate(first_day), '|',
              'last day is', normalizeDate(last_day))

        index_list.extend(getIndexURL(year, month, day)
                          for day in range(first_day, last_day + 1))

Generating a list of index pages for each day

The year is 2011 | first month= 04 | last month= 12
 the month is 04 | first day is 06 | last day is 30
 the month is 05 | first day is 01 | last day is 31
 the month is 06 | first day is 01 | last day is 30
 the month is 07 | first day is 01 | last day is 31
 the month is 08 | first day is 01 | last day is 31
 the month is 09 | first day is 01 | last day is 30
 the month is 10 | first day is 01 | last day is 31
 the month is 11 | first day is 01 | last day is 30
 the month is 12 | first day is 01 | last day is 31

The year is 2012 | first month= 01 | last month= 12
 the month is 01 | first day is 01 | last day is 31
 the month is 02 | first day is 01 | last day is 29
 the month is 03 | first day is 01 | last day is 31
 the month is 04 | first day is 01 | last day is 30
 the month is 05 | first day is 01 | last day is 31
 the month is 06 | first day is 01 | last day is 30
 the month is 07 | first day is 01 | last day is 31
 the month is 08

In [11]:
DEFAULT_CODING = 'utf-8'

def getEncoding(html, mode='xinwenlianbo'):
    """Return the character set declared inside an HTML document.

    Handles declarations such as
        <meta http-equiv="Content-Type" content="text/html; charset=gbk" />
        <meta charset="gbk">
    The attribute name is matched case-insensitively and the value may be
    quoted or unquoted.

    Parameters:
        html: the document text to inspect.
        mode: only 'xinwenlianbo' is supported.

    Returns:
        The declared charset name exactly as written in the document, or
        DEFAULT_CODING when the document carries no charset declaration.

    Raises:
        NotImplementedError: for any other ``mode``.
    """
    if mode == 'xinwenlianbo':

        # A regex, rather than str.find() arithmetic, so that a missing
        # declaration is detected instead of producing a slice built from -1.
        declaration = re.search(r'charset\s*=\s*["\']?\s*([A-Za-z0-9._:-]+)',
                                html,
                                flags=re.IGNORECASE)
        if declaration is None:
            return DEFAULT_CODING

        return declaration.group(1)

    else:
        raise NotImplementedError("Unknown mode of operation:" +  mode)

def getHTML(url, encoding=DEFAULT_CODING, timeout=30):
    """Download ``url`` and return its body as text.

    The text is decoded with the charset the page itself declares (as found by
    ``getEncoding``), because Requests can't correctly guess the encoding of
    these pages.

    Parameters:
        url: address to fetch.
        encoding: charset used when the page's own declaration is not a codec
            name known to Python.
        timeout: seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the crawl indefinitely.

    Returns:
        The decoded response body.

    Raises:
        RuntimeError: if the server answers with a status code other than 200.
        Exceptions raised by ``requests.get`` itself (connection errors,
        timeouts) are not caught here.
    """
    import codecs  # local import: only needed to validate the declared charset

    response = requests.get(url, timeout=timeout)

    # Check the status first; there is no point in decoding an error page.
    if response.status_code != 200:
        raise RuntimeError('Fail to get url ' + url + ' | Status code=' + str(response.status_code))

    declared = getEncoding(response.text)
    try:
        codecs.lookup(declared)
    except (LookupError, TypeError, ValueError):
        # The declaration is missing or is not a codec name: use the fallback.
        declared = encoding

    response.encoding = declared
    return response.text

def getDateFromURL(url, mode='xinwenlianbo-index'):
    """Extract the date part from an index-page URL.

    For ``http://cctv.cntv.cn/lm/xinwenlianbo/20110406.shtml`` this returns
    ``'20110406'``: the text between the last '/' and the last '.shtml'.
    If the URL has no '.shtml' after the last '/', everything after the last
    '/' is returned.

    Parameters:
        url: the URL to inspect.
        mode: only 'xinwenlianbo-index' is implemented.

    Raises:
        NotImplementedError: for any other ``mode`` (including
            'xinwenlianbo-post', which has no implementation yet).
    """
    if mode != 'xinwenlianbo-index':
        # Fail explicitly instead of falling through to an unassigned local.
        raise NotImplementedError("Unknown mode of operation:" + mode)

    name_start = url.rfind('/') + 1
    name_end = url.rfind('.shtml')
    if name_end < name_start:
        # rfind() returned -1 (or matched before the last '/'): no suffix to cut.
        name_end = len(url)

    return url[name_start:name_end]



In [237]:
# Download all index pages and save them onto local storage

count = 0

# Make sure the output directory exists before the first write.
os.makedirs('index_pages', exist_ok=True)

for url in index_list:

    try:
        html = getHTML(url)
    except RuntimeError as error:
        # One unavailable day must not abort the whole crawl: report and move on.
        print(error)
        sleep(REQUEST_INTERVAL)
        continue

    date_url = getDateFromURL(url)

    # `with` closes the file even if the write fails.
    with open('index_pages/xinwenlianbo_index_' + date_url + '.html', 'w', encoding='utf-8') as f:
        f.write(html)

    print('Downloaded ' + url)

    count += 1
    sleep(REQUEST_INTERVAL)

print('Downloaded ' + str(count) + ' pages in total')

Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110406.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110407.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110408.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110409.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110410.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110411.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110412.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110413.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110414.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110415.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110416.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110417.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110418.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110419.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110420.shtml
Downloaded http://cctv.cntv.cn/lm/xinwenlianbo/20110421.shtml
Download

In [17]:
# Download each post from an index page
# Currently, these functions only work on the Xinwen Lianbo site as structured in October 2015

def getPostUrl(html, mode='xinwenlianbo-b'):
    """Return the list of post URLs referenced by an index page.

    Xinwen Lianbo index pages list their posts in one of two ways, and each
    needs its own extraction method. The break point seems to be 2013-07-15:
    before and on that date the anchors are inserted with JavaScript (period
    B1); afterwards they are generated on the server side (period B2).

    Parameters:
        html: text of the index page.
        mode: only 'xinwenlianbo-b' is supported.

    Returns:
        A list of URL strings, in document order. Markers without a complete
        URL after them, and anchors without an ``href``, are skipped.

    Raises:
        NotImplementedError: for any other ``mode``.
    """
    if mode != 'xinwenlianbo-b':
        raise NotImplementedError("Unknown mode of operation:" +  mode)

    URL_START_MARKER = 'http://'
    POST_LIST_MARKER_JS = 'new title_array_01'

    post_url_list = []

    if POST_LIST_MARKER_JS in html:

        # Period B1: the post anchors are inserted with JavaScript.
        # re.escape() because the marker is literal text, not a pattern.
        for marker in re.finditer(re.escape(POST_LIST_MARKER_JS), html):

            url_start_pos = html.find(URL_START_MARKER, marker.start())
            if url_start_pos == -1:
                continue

            # Look for the suffix after the URL start (not after the marker),
            # so a suffix appearing earlier cannot produce a reversed slice.
            url_end_pos = html.find(URL_SUFFIX, url_start_pos)
            if url_end_pos == -1:
                continue

            post_url_list.append(html[url_start_pos:url_end_pos + len(URL_SUFFIX)])

    else:

        # Period B2: the post anchors are generated on the server side.
        d = PyQuery(html)
        for post_anchor in d('ul.title2 a'):
            href = post_anchor.get('href')
            if href:
                post_url_list.append(href)

    return post_url_list


def getTitle(html, mode='xinwenlianbo-b'):
    """Return the title of a post with the site boilerplate removed.

    Two page layouts are handled: pages whose <title> is written by JavaScript
    (``document.write("<title>..."+``) and pages whose <title> element is
    present in the markup.

    Parameters:
        html: text of the post page.
        mode: only 'xinwenlianbo-b' is supported.

    Returns:
        The title text with every entry of BOILERPLATES removed. An empty
        string is returned when the page has no <title> element or the element
        is empty.

    Raises:
        NotImplementedError: for any other ``mode``.
    """
    BOILERPLATES = ('[视频]', '_新闻频道_央视网(cctv.com)')

    if mode == 'xinwenlianbo-b':

        TITLE_MARKER_JS = 'document.write("<title>'
        TITLE_END_MARKER = '"+'

        # Dynamically inserted <title>
        if TITLE_MARKER_JS in html:
            title_start_pos = html.find(TITLE_MARKER_JS) + len(TITLE_MARKER_JS)
            title_end_pos = html.find(TITLE_END_MARKER, title_start_pos)
            if title_end_pos == -1:
                # No string concatenation follows the title text: stop at the
                # closing tag instead of slicing with -1.
                title_end_pos = html.find('</title>', title_start_pos)
            if title_end_pos == -1:
                title_end_pos = len(html)
            title = html[title_start_pos:title_end_pos]

        # <title> is generated by server
        else:
            d = PyQuery(html)
            title_nodes = d('title')
            # Guard both failure points: no <title> at all (empty selection)
            # and an empty <title></title> (its .text is None).
            title = (title_nodes[0].text or '') if len(title_nodes) else ''

        for plate in BOILERPLATES:
            title = title.replace(plate, '')

        return title

    else:
        raise NotImplementedError("Unknown mode of operation:" +  mode)
    
def getMainText(html, mode='xinwenlianbo-b'):
    """Return the main text of a post as a single line.

    The text content of the first ``div#content_body`` element is extracted,
    the strings in GARBAGES are removed, every line is stripped of surrounding
    whitespace, and the lines are joined without a separator.

    Parameters:
        html: text of the post page.
        mode: only 'xinwenlianbo-b' is supported.

    Returns:
        The cleaned text, or an empty string when the page has no
        ``div#content_body`` element.

    Raises:
        NotImplementedError: for any other ``mode``.
    """
    if mode == 'xinwenlianbo-b':

        # A bare `import lxml` does not load the etree submodule; import it
        # explicitly rather than relying on another package having done so.
        import lxml.etree

        GARBAGES = ('var para_count=1', )

        d = PyQuery(html)
        body_nodes = d('div#content_body')
        if len(body_nodes) == 0:
            # Nothing to extract; avoid an IndexError on the empty selection.
            return ''

        text = lxml.etree.tostring(body_nodes[0], method='text', encoding='utf-8').decode('utf-8')

        for garbage in GARBAGES:
            text = text.replace(garbage, '')

        return ''.join(line.strip() for line in text.splitlines())

    else:
        raise NotImplementedError("Unknown mode of operation:" +  mode)

        
def downloadAllPostsFromIndexHTML(html_index, index_path, mode='xinwenlianbo-b'):
    """Download every post linked from one index page and save it to disk.

    For each post URL found in ``html_index`` two files are written, grouped
    by year:
        posts/<year>/post_<date>_visited=<timestamp>_<quoted url>
            the post's HTML (encoded with utf-8)
        texts/<year>/text_<date>_visited=<timestamp>_<quoted url>.txt
            the title, a newline, then the main text
    ``<date>`` is taken from ``index_path`` and ``<timestamp>`` is
    ``time.time()`` at the moment the post is saved.

    Parameters:
        html_index: text of the index page.
        index_path: path of the index file, e.g.
            index_pages/xinwenlianbo_index_20110406.html
        mode: forwarded to getPostUrl, getTitle and getMainText.

    Side effects:
        Increments the module-level ``post_count`` once per saved post, and
        sleeps REQUEST_INTERVAL seconds after every request.

    A post whose download raises RuntimeError is reported and skipped.
    """

    def getDateFromIndexPath(path):

        # Extract date from path like /index_pages/xinwenlianbo_index_20110406.html
        # (raw string: the pattern contains backslashes meant for the regex engine)
        return re.search(r'_20.*\.', path).group(0)[1:-1]

    global post_count

    # The date depends only on the index file, so compute it once.
    date_index = getDateFromIndexPath(index_path)

    # Group files by year
    directory = date_index[0:4]

    for url_post in getPostUrl(html_index, mode):

        try:
            html_post = getHTML(url_post)
        except RuntimeError as error:
            print(error)
            # Skip this post: continuing would either fail on an unbound
            # html_post or save the previous post's content under this URL.
            continue
        finally:
            # Pace the requests, as the index-page download loop does.
            sleep(REQUEST_INTERVAL)

        title = getTitle(html_post, mode)
        main_text = getMainText(html_post, mode)

        # Percent-encode the whole URL so it can be used as a file name.
        path = requests.utils.quote(url_post, safe='')

        print('        Downloading ' + url_post)

        visited_at = str(time.time())

        # Save original post in HTML (encoded with utf-8)
        os.makedirs('posts/' + directory, exist_ok=True)
        with open('posts/' + directory + '/post_'+ date_index + '_' + 'visited=' + visited_at + '_' + path,
                  'w',
                  encoding='utf-8') as f_post:
            f_post.write(html_post)

        # Save the title and main text
        os.makedirs('texts/' + directory, exist_ok=True)
        with open('texts/' + directory + '/text_' + date_index + '_' + 'visited=' + visited_at + '_' + path + '.txt',
                  'w',
                  encoding='utf-8') as f_text:
            f_text.write(title + '\n' + main_text)

        post_count += 1
        
index_count = 0
post_count = 0

# glob() returns files in arbitrary order; sorting makes the processing order
# (and the ordinal printed for each index page) reproducible between runs.
for path in sorted(glob('index_pages/*.html')):

    try:
        print('\n Processing the ' + str(index_count) + 'th index pages: ' + path + '\n')
        # `with` closes the index file instead of leaking one handle per page.
        with open(path, 'r', encoding='utf-8') as file_index:
            html_index = file_index.read()
        downloadAllPostsFromIndexHTML(html_index, path)
        index_count += 1

    except Exception:
        # Prevent problems from stopping the whole scraping process.
        # `Exception` rather than a bare except, so Ctrl-C (KeyboardInterrupt)
        # can still interrupt a long crawl.
        print(sys.exc_info())

print('**Downloaded ' + str(post_count) + ' posts in total**')


 Processing the 0th index pages: index_pages/xinwenlianbo_index_20110406.html

        Downloading http%3A%2F%2Fnews.cntv.cn%2Fprogram%2Fxwlb%2F20110406%2F104557.shtml
        Downloading http%3A%2F%2Fnews.cntv.cn%2Fprogram%2Fxwlb%2F20110406%2F104380.shtml
        Downloading http%3A%2F%2Fnews.cntv.cn%2Fprogram%2Fxwlb%2F20110406%2F104382.shtml
        Downloading http%3A%2F%2Fnews.cntv.cn%2Fprogram%2Fxwlb%2F20110406%2F104383.shtml
        Downloading http%3A%2F%2Fnews.cntv.cn%2Fprogram%2Fxwlb%2F20110406%2F104394.shtml
        Downloading http%3A%2F%2Fnews.cntv.cn%2Fprogram%2Fxwlb%2F20110406%2F104393.shtml
        Downloading http%3A%2F%2Fnews.cntv.cn%2Fprogram%2Fxwlb%2F20110406%2F104396.shtml
        Downloading http%3A%2F%2Fnews.cntv.cn%2Fprogram%2Fxwlb%2F20110406%2F104410.shtml
        Downloading http%3A%2F%2Fnews.cntv.cn%2Fprogram%2Fxwlb%2F20110406%2F104409.shtml
        Downloading http%3A%2F%2Fnews.cntv.cn%2Fprogram%2Fxwlb%2F20110406%2F104412.shtml
        Downloading http%3A%2F