In [None]:
import re
import os
import json
import requests
from bs4 import BeautifulSoup
import ffmpy3 # 多媒体视频处理工具 ffmpeg
from multiprocessing.dummy import Pool as ThreadPool
import time
import random
from tqdm import tqdm

In [None]:
def get_lessons_name_and_url(course_url, timeout=30):
    '''
    Fetch the list of video lessons from a course home page.

    Only ``<a class="course-lesson">`` entries that contain a
    ``<div id="timeLength">`` element are returned; entries without one are
    skipped.

    Args:
        course_url: URL of the course home page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(lesson_title, lesson_url)`` tuples. Each title is the
        lesson name followed by its duration text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    '''
    # NOTE(review): the session cookie below is hard-coded; consider loading it
    # from an environment variable instead of keeping it in the notebook.
    request_headers = {
        'Host': 'www.chinahadoop.cn',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'Referer': 'https://www.chinahadoop.cn/course/1336',
        'Cookie': 'PHPSESSID=860283qvq54bt74tmf2h5tuet6; zg_did=%7B%22did%22%3A%20%221723490fc59c3d-0d04f18cdc6d1e-70236753-1fa400-1723490fc5a397%22%7D; zg_727f75a76e954bc385156eb7ff3fb110=%7B%22sid%22%3A%201590019816541%2C%22updated%22%3A%201590021822158%2C%22info%22%3A%201590019816545%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22account.xiaoxiangxueyuan.com%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fwww.chinahadoop.cn%2F%22%2C%22cuid%22%3A%20%22184938%22%7D'
    }
    req = requests.get(course_url, headers=request_headers, timeout=timeout)
    # Fail loudly on an error page instead of parsing it into an empty list.
    req.raise_for_status()
    req.encoding = 'utf-8'
    bs_html = BeautifulSoup(req.text, 'lxml')
    lessons = []
    for lesson in bs_html.find_all('a', class_='course-lesson'):
        time_length = lesson.find('div', id="timeLength")
        if not time_length:
            continue
        # Keep the duration text up to and including the first '秒' (seconds)
        # marker. Named `duration` so the imported `time` module is not shadowed.
        duration = time_length.text[:20].split('秒')[0] + '秒'
        title_tag = lesson.find('span', class_='title')
        url = 'https://www.chinahadoop.cn' + lesson['href']
        title = title_tag.text.strip() + duration.strip()
        lessons.append((title, url))

    return lessons

In [None]:
def get_media_uri(json_url, timeout=30):
    '''
    Fetch the media identifier of a lesson; it is used to build the final
    video link.

    Args:
        json_url: URL that returns the lesson description as JSON.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block a worker thread forever.

    Returns:
        The value stored under the ``mediaUri`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response has no ``mediaUri`` entry.
    '''
    # NOTE(review): the session cookie below is hard-coded; consider loading it
    # from an environment variable instead of keeping it in the notebook.
    request_headers = {
        'Host': 'www.chinahadoop.cn',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://www.chinahadoop.cn',
        'Connection': 'keep-alive',
        'Referer': 'https://www.chinahadoop.cn/course/1336/learn',
        'Cookie': 'PHPSESSID=860283qvq54bt74tmf2h5tuet6; zg_did=%7B%22did%22%3A%20%221723490fc59c3d-0d04f18cdc6d1e-70236753-1fa400-1723490fc5a397%22%7D; zg_727f75a76e954bc385156eb7ff3fb110=%7B%22sid%22%3A%201590019816541%2C%22updated%22%3A%201590023388307%2C%22info%22%3A%201590019816545%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22account.xiaoxiangxueyuan.com%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fwww.chinahadoop.cn%2F%22%2C%22cuid%22%3A%20%22184938%22%7D',
        'Cache-Control': 'max-age=0'
    }

    req = requests.get(json_url, headers=request_headers, timeout=timeout)
    # Surface HTTP errors directly rather than as a confusing JSON/Key error.
    req.raise_for_status()
    data = req.json()
    return data['mediaUri']


def get_video_json(json_url, timeout=30):
    '''
    Fetch the JSON document that describes a video (the caller extracts the
    video URL from it).

    Args:
        json_url: URL of the play-info endpoint for one video.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block a worker thread forever.

    Returns:
        The decoded JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    '''
    request_header = {
        'Host': 'playvideo.qcloud.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Origin': 'https://www.chinahadoop.cn',
        'Connection': 'keep-alive',
        'Referer': 'https://www.chinahadoop.cn/course/1336/learn'
    }
    req = requests.get(json_url, headers=request_header, timeout=timeout)
    # Surface HTTP errors directly rather than as a JSON decoding error.
    req.raise_for_status()
    return req.json()


def get_m3u8(filename, m3u8_url):
    '''
    Download an m3u8 playlist into ``filename`` by running ffmpeg via ffmpy3.

    Failures are reported on stdout instead of being raised. If the failed run
    created ``filename``, that partial file is removed so that a later scan of
    the download directory does not mistake it for a finished video. A file
    that already existed before the call is never deleted.

    Args:
        filename: Path of the output file.
        m3u8_url: URL of the m3u8 playlist.

    Returns:
        Always 0.
    '''
    existed_before = os.path.exists(filename)
    try:
        ffmpy3.FFmpeg(inputs={m3u8_url: None},
                      outputs={filename: None}).run()
    except Exception as e:
        print("Failed: " + filename)
        print(e)
        # Only clean up output that this call produced.
        if not existed_before and os.path.exists(filename):
            try:
                os.remove(filename)
            except OSError as cleanup_error:
                print(cleanup_error)
    return 0


def get_mp4(filename, mp4_url, chunk_size=1024 * 1024, timeout=30):
    '''
    Download an mp4 file over HTTP and report progress on stdout.

    The body is streamed into ``filename + '.part'`` and renamed to
    ``filename`` only after it was received completely, so an interrupted
    download never leaves a truncated ``.mp4`` behind.

    Args:
        filename: Path of the output file.
        mp4_url: URL of the mp4 file.
        chunk_size: Number of bytes requested per read.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    '''
    start_time = time.time()
    part_name = filename + '.part'

    def report(received, total):
        # Guard against a zero elapsed time on coarse clocks.
        elapsed = max(time.time() - start_time, 1e-6)
        line = 'downloading %d KB/s - %.2f MB， 共 %.2f MB'
        # When the server did not announce a size, fall back to the amount
        # received so far for the "total" field.
        print(line % (received / 1024 / elapsed,
                      received / 1024 / 1024,
                      (total or received) / 1024 / 1024))

    # Plain request without browser-like headers.
    with requests.get(mp4_url, stream=True, timeout=timeout) as req:
        # Do not store an error page under a ``.mp4`` name.
        req.raise_for_status()
        try:
            total = int(req.headers.get('Content-Length', 0))
        except ValueError:
            total = 0
        downsize = 0
        last_report = time.time()
        try:
            with open(part_name, 'wb') as f:
                for chunk in req.iter_content(chunk_size=chunk_size):
                    if not chunk:
                        continue
                    f.write(chunk)
                    downsize += len(chunk)
                    now = time.time()
                    # Throttle progress output to roughly one line per second.
                    if now - last_report >= 1:
                        report(downsize, total)
                        last_report = now
            os.replace(part_name, filename)
        finally:
            # After a successful rename the temporary file no longer exists;
            # after a failure this removes the incomplete data.
            if os.path.exists(part_name):
                os.remove(part_name)
    report(downsize, total)


def download_video(filename, url):
    '''
    Download ``url`` to ``filename + '.mp4'``, choosing the downloader by link
    type.

    Links whose path ends in ``m3u8`` (compared case-insensitively, ignoring
    any query string or fragment) are handed to ``get_m3u8``; every other link
    is handed to ``get_mp4``.

    Args:
        filename: Output path without the ``.mp4`` extension.
        url: Link to the video or playlist.
    '''
    filename = filename + '.mp4'
    # A link may carry '?query' or '#fragment' after the file name, so only
    # the part before them decides the link type.
    path = url.split('#', 1)[0].split('?', 1)[0]
    if path.lower().endswith('m3u8'):
        get_m3u8(filename, url)
    else:
        get_mp4(filename, url)


def get_downloaded(download_path):
    '''
    List the videos that are already present in the download directory.

    Walks ``download_path`` recursively and returns, for every file whose name
    ends in ``.mp4``, that name with the last four characters (the extension)
    removed.
    '''
    suffix = '.mp4'
    return [
        name[:-len(suffix)]
        for _root, _subdirs, names in os.walk(download_path)
        for name in names
        if name.endswith(suffix)
    ]


def get_undownloaded(targets, downloaded):
    '''
    Compare the wanted lessons with the downloaded ones.

    Returns, in their original order, the entries of ``targets`` whose first
    element (stripped of surrounding whitespace) is not contained in
    ``downloaded``.
    '''
    pending = []
    for target in targets:
        if target[0].strip() in downloaded:
            continue
        pending.append(target)
    return pending


def get_data(chapter):
    '''
    Resolve and download the video that belongs to one lesson.

    ``chapter`` is indexed as ``(title, url)``. The lesson URL, with
    ``'learn#'`` removed, is queried for the media identifier; the identifier
    is appended to the play-info endpoint to obtain the video description. The
    ``sourceVideo`` URL is preferred; when the description has no
    ``sourceVideo`` the first ``transcodeList`` entry is used. The video is
    then downloaded under the lesson title.
    '''
    lesson_title, lesson_url = chapter[0], chapter[1]

    media_uri = get_media_uri(lesson_url.replace('learn#', ''))
    play_info = get_video_json(
        'https://playvideo.qcloud.com/getplayinfo/v2/1253931042/' + media_uri)

    video_info = play_info['videoInfo']
    source_video = video_info.get('sourceVideo')
    if source_video is not None:
        video_url = source_video['url']
    else:
        video_url = video_info['transcodeList'][0]['url']

    download_video(lesson_title, video_url)

In [None]:
# Download a course
# Course home pages keyed by course id; pick one explicitly instead of
# overwriting `course_url` with successive assignments.
COURSE_URLS = {
    1336: 'https://www.chinahadoop.cn/course/1336',     # preparatory course
    1244: 'https://www.chinahadoop.cn/course/1244',     # NLP algorithms in depth
}
course_url = COURSE_URLS[1244]
chapters = get_lessons_name_and_url(course_url)
# Download with a pool of 12 worker threads
pool = ThreadPool(12)
try:
    results = pool.map(get_data, chapters)
finally:
    # Release the worker threads even when a download raised.
    pool.close()
    pool.join()

In [None]:
# Check which lessons are already on disk and download the missing ones again
downloaded = get_downloaded('./')
undownloaded = get_undownloaded(chapters, downloaded)
pool = ThreadPool(12)
try:
    results = pool.map(get_data, undownloaded)
finally:
    # Release the worker threads even when a download raised.
    pool.close()
    pool.join()

In [None]:
def get_lessons(course_url, timeout=30):
    '''
    Fetch the lesson list from a lesson-list page.

    Every ``<a class="course-lesson">`` element yields one entry. Titles are
    prefixed with the zero-based position of the lesson on the page.

    Args:
        course_url: URL of the lesson-list page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(title, url)`` tuples.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    '''
    # NOTE(review): the session cookie below is hard-coded; consider loading it
    # from an environment variable instead of keeping it in the notebook.
    request_headers = {
        'Host': 'www.chinahadoop.cn',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'Referer': 'https://www.chinahadoop.cn/course/1382/learn',
        'Cookie': 'PHPSESSID=860283qvq54bt74tmf2h5tuet6; zg_did=%7B%22did%22%3A%20%221723490fc59c3d-0d04f18cdc6d1e-70236753-1fa400-1723490fc5a397%22%7D; zg_727f75a76e954bc385156eb7ff3fb110=%7B%22sid%22%3A%201590019816541%2C%22updated%22%3A%201590021822158%2C%22info%22%3A%201590019816545%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22account.xiaoxiangxueyuan.com%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fwww.chinahadoop.cn%2F%22%2C%22cuid%22%3A%20%22184938%22%7D'
    }
    req = requests.get(course_url, headers=request_headers, timeout=timeout)
    # Fail loudly on an error page instead of parsing it into an empty list.
    req.raise_for_status()
    req.encoding = 'utf-8'
    bs_html = BeautifulSoup(req.text, 'lxml')
    chapters = []
    for idx, lesson in enumerate(bs_html.find_all('a', class_='course-lesson')):
        href = lesson['href']
        title = lesson['title']
        chapters.append((str(idx) + '.' + title, 'https://www.chinahadoop.cn' + href))

    return chapters               # a list of (title, url)

In [None]:
# Lesson-list pages keyed by course id; pick one explicitly instead of
# overwriting `course_url` with successive assignments.
LESSON_LIST_URLS = {
    1382: 'https://www.chinahadoop.cn/lessonplugin/lesson/hflist/classroom?courseId=1382',     # L2 stage live classes
    1426: 'https://www.chinahadoop.cn/lessonplugin/lesson/hflist/classroom?courseId=1426',     # L2 live classes
}
course_url = LESSON_LIST_URLS[1426]
lessons = get_lessons(course_url)

In [None]:
# Download every lesson with a pool of 12 worker threads
pool = ThreadPool(12)
try:
    results = pool.map(get_data, lessons)
finally:
    # Release the worker threads even when a download raised.
    pool.close()
    pool.join()

In [None]:
# Check which lessons are already on disk and download the missing ones again
downloaded = get_downloaded('./')
undownloaded = get_undownloaded(lessons, downloaded)
pool = ThreadPool(12)
try:
    results = pool.map(get_data, undownloaded)
finally:
    # Release the worker threads even when a download raised.
    pool.close()
    pool.join()

In [None]:
# Download a single lesson by hand: the entry at index 3 of `lessons`.
get_data(lessons[3])