In [274]:
import requests
import bs4
import re


In [275]:
# Fetch news search results from CCTV (search.cctv.com)

def fetchNews(keyword, pagination, timeout=10):
    """POST a search query to the CCTV search endpoint and return the response.

    Args:
        keyword: Search text, sent as the ``qtext`` form field.
        pagination: Result page to request, sent as the ``page`` form field.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises. Without a timeout a stalled connection would block
            the whole scrape indefinitely.

    Returns:
        The response object returned by ``requests.post``. The HTTP status
        code is not checked here.
    """
    url = 'https://search.cctv.com/search.php'

    data = {
        "qtext": keyword,
        "sort": 'relevance',
        "type": 'web',
        'page': pagination
    }

    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
    }

    return requests.post(url, data=data, headers=headers, timeout=timeout)
    


In [276]:
# Construct data structure
def _text_after_label(node, separator='：'):
    """Return ``node.text`` with a leading ``label<separator>`` prefix removed.

    Only the first separator is treated as the label boundary, so a value
    that itself contains the separator is kept whole. If the separator is
    absent the full text is returned; if ``node`` is None the result is ''.
    """
    if node is None:
        return ''
    head, found, tail = node.text.partition(separator)
    return tail if found else head


def getCleanData(element):
    """Extract title, link, source and date from one search-result item.

    Args:
        element: A parsed result item exposing ``find`` and ``h3`` the way a
            BeautifulSoup tag does.

    Returns:
        A dict with keys ``title``, ``url``, ``source`` and ``date`` (in that
        order). Each value is ``{'name': <display label>, 'value': <str>}``.
        A piece of markup that is missing yields an empty string rather than
        an exception, so one malformed item does not abort the whole scrape
        and every value can safely be concatenated when written out.

    Side effects:
        Prints each label/value pair.
    """
    link = element.find('a')
    title = link.text.replace('\n', '') if link is not None else ''

    # The article URL is read from the ``lanmu1`` attribute of the span
    # inside the item's <h3>.
    heading = element.h3
    span = heading.span if heading is not None else None
    href = span.get('lanmu1') if span is not None else None

    data = {}
    data['title'] = {'name': '标题', 'value': title}
    data['url'] = {'name': '链接', 'value': href or ''}
    data['source'] = {'name': '来源', 'value': _text_after_label(element.find(class_='src'))}
    data['date'] = {'name': '日期', 'value': _text_after_label(element.find(class_='tim'))}

    for field in data.values():
        print(field['name'], field['value'])
    return data

In [277]:

# Use bs4 to parse returned document
def parseElements(response):
    """Turn a search response into a list of cleaned news records.

    The body (``response.text``) is parsed with BeautifulSoup's
    ``html.parser``. Every ``<li>`` inside the ``<ul>`` of the ``div`` with
    class ``tuwenjg`` is handed to ``getCleanData``, and a blank separator is
    printed after each item.

    Returns:
        A list with one ``getCleanData`` result per ``<li>``, in page order.
    """
    document = bs4.BeautifulSoup(response.text, 'html.parser')
    container = document.html.find('div', class_='tuwenjg')
    entries = container.ul.find_all('li')

    records = []
    for entry in entries:
        record = getCleanData(entry)
        records.append(record)
        print('\n')
    return records

In [278]:

# Collect up to `length` news records for one keyword. The server returns only
# a limited number of results per request, so further pages are requested
# until enough records are gathered or a page comes back empty.

def getNewsData(keyword, length, pagination=1):
    """Fetch and parse result pages for ``keyword`` until ``length`` records exist.

    Args:
        keyword: Search text passed to ``fetchNews``.
        length: Maximum number of records to return. A value of 0 or less
            returns an empty list without issuing a request.
        pagination: Page number to start from; each further request uses the
            next page number.

    Returns:
        A list of at most ``length`` records as produced by ``parseElements``.
        Fewer are returned when the results run out first.
    """
    collected = []
    page = pagination

    while len(collected) < length:
        batch = parseElements(fetchNews(keyword, page))
        if not batch:
            # An empty page means there is nothing more to fetch. Without
            # this stop condition the function would keep requesting further
            # pages forever when fewer than `length` results exist.
            break
        collected.extend(batch)
        page += 1

    return collected[:length]

In [279]:
#  Get whole news list data based on keywords and the length requirements
#  write the data into news.txt
def scrape(keywords, length=1, path='news.txt'):
    """Scrape news for every keyword and write a plain-text report.

    Args:
        keywords: Iterable of search strings; each is passed to
            ``getNewsData``.
        length: Maximum number of records to collect per keyword.
        path: Output file path. It is overwritten on every call.

    Side effects:
        Prints progress banners to stdout and writes the report to ``path``.
        The file is only written after all keywords have been scraped.
    """
    dataList = []
    for keyword in keywords:
        print('\n' + '*' * 20 + '\n')
        print('Start Scraping \n' )
        print('keyword: ' + keyword + '\n')
        dataList.append({'keyword': keyword, 'result': getNewsData(keyword, length)})
        print('Scrapping finished')
        print('\n' + '*' * 20 + '\n')

    banner = '\n' + '*' * 20 + '\n'

    # The encoding is explicit because the report contains Chinese text and
    # the platform default encoding may not be able to represent it. The
    # context manager closes the file even if a write fails part-way.
    with open(path, 'w', encoding='utf-8') as file:
        for data in dataList:
            file.write(banner)
            file.write('\nkeyword: ' + data['keyword'] + '\n')
            file.write('Total: ' + str(len(data['result'])) + '\n')
            file.write(banner)

            for record in data['result']:
                file.write('\n')
                for field in record.values():
                    file.write(field['name'] + ': ' + field['value'] + '\n')

In [280]:

# Perform the scraping
# Search terms to query. scrape() prints each parsed result as it goes and
# writes the collected records (at most 100 per keyword) to news.txt.
keywords = ['阿里巴巴', '京东', '万科集团', '腾讯', '小米', '新东方']
scrape(keywords, length=100)


********************

Start Scraping 

keyword: 阿里巴巴

标题 阿里巴巴揭幕新制造数字工厂
链接 https://5gai.cctv.com/2021/12/02/ARTIXuFfdD5mEuPLe9Ggvo0o211123.shtml
来源 AI频道
日期 2021-12-02 16:25:29


标题 我省与阿里巴巴集团签署深化战略合作协议
链接 https://news.cctv.com/2022/03/01/ARTIoEqtJT4nLoiRqYzFI6K4220301.shtml
来源 新闻
日期 2022-03-01 10:33:27


标题 阿里巴巴集团将投入1000亿元助力共同富裕
链接 https://jingji.cctv.com/2021/09/02/ARTIsx8pKxlgzY044s9MRJeD210902.shtml
来源 经济
日期 2021-09-02 18:56:44


标题 阿里巴巴完成回购
链接 http://jingji.cntv.cn/2012/09/20/ARTI1348105226614563.shtml
来源 经济
日期 2012-09-20 09:41:24


标题 荷兰国王造访阿里巴巴
链接 http://jingji.cctv.com/2015/10/30/ARTI1446144863192918.shtml
来源 经济
日期 2015-10-30 03:02:58


标题 为了阿里巴巴们不再流失
链接 http://jingji.cntv.cn/2014/09/10/ARTI1410308446335850.shtml
来源 经济
日期 2014-09-10 08:21:12


标题 阿里巴巴关闭26家网店
链接 http://jingji.cntv.cn/2015/03/26/ARTI1427325970978565.shtml
来源 经济
日期 2015-03-26 07:26:39


标题 成就阿里巴巴的还有谁？
链接 http://opinion.cntv.cn/2014/09/25/ARTI1411604515624652.shtml
来源 新闻
日期 2014-09-25 08:22:40


标题 阿里巴巴将赴纽交所上市
链接 htt