In [4]:
def scrape_news(news_date=None, use_current_date=False):  # 'YYYY-mm-dd'
    """Download one day's articles from the uCrawler API and save them as CSV.

    Every configured source is queried with ``start_date == end_date ==``
    the run date. The combined result is sorted by ``pub_time`` (newest
    first), reduced to the columns ``title``, ``text``, ``pub_time``,
    ``meta_images``, ``source``, ``url`` plus a derived ``full_text``, and
    written to ``./datasets/ucrawler/<run date>/<run date>_rawnews.csv``.

    Parameters
    ----------
    news_date : str, optional
        Date to fetch, formatted ``'YYYY-mm-dd'``. Required unless
        ``use_current_date`` is true, in which case it is ignored.
    use_current_date : bool, default False
        When true, fetch today's date (local clock) instead of ``news_date``.

    Returns
    -------
    None
        The result is written to disk; progress is reported via ``print``.

    Raises
    ------
    ValueError
        If neither ``news_date`` nor ``use_current_date`` is supplied.
    requests.HTTPError
        If the API answers with a 4xx/5xx status for any source.
    """
    # Imports stay function-local so this cell remains self-contained.
    import os
    import warnings
    from datetime import datetime

    import pandas as pd
    import requests

    if use_current_date:
        run_date = str(datetime.now().date())
    elif news_date is None:
        raise ValueError("news_date is required unless use_current_date=True")
    else:
        run_date = news_date

    # uCrawler source codes; labels are taken from the original notes.
    source_codes = [
        '9567/',  # hindu (india)
        '9532/',  # xinhua (china)
        '9531/',  # scmp (hongkong)
        '9536/',  # reuters (world)
        '9530/',  # cna
        '9630/',  # straits
    ]

    # FIXME(security): this token is committed in the notebook. Set the
    # UCRAWLER_API_TOKEN environment variable, rotate the token, and then
    # delete this fallback; it is kept only so existing calls keep working.
    api_token = os.environ.get(
        'UCRAWLER_API_TOKEN', 'd144474219d292d9739e60ac76149ff2c94527e1'
    )
    headers = {'Authorization': f'Token {api_token}'}

    params = {
        'size': 200,
        'start_date': run_date,
        'end_date': run_date,
        'format': 'json'
    }

    url_base = 'https://console.ucrawler.app/api/query/source/'

    print('Downloading news articles...')

    # Collect one frame per source and concatenate once, rather than
    # re-copying a growing DataFrame on every iteration.
    frames = []
    for source_code in source_codes:
        response = requests.get(
            url_base + source_code, headers=headers, params=params, timeout=60
        )
        # Fail loudly on HTTP errors instead of hitting KeyError: 'data' below.
        response.raise_for_status()
        source_df = pd.DataFrame(response.json()['data'])
        if not source_df.empty:
            frames.append(source_df)

    all_news = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    print('Total number of articles: ' + str(len(all_news)))

    if all_news.empty:
        # Without rows none of the expected columns exist; skip the transform
        # and leave any previously saved file untouched.
        print(f'No articles returned for {run_date}; nothing saved.')
        return

    # Suppress warnings only for the transform step instead of muting them
    # for the whole kernel session.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # convert publication time to datetime and sort newest first
        all_news['pub_time'] = pd.to_datetime(all_news['pub_time'])
        all_news = all_news.sort_values(by='pub_time', ascending=False)

        # source name comes from the first entry of each row's 'sources' list
        all_news['source'] = all_news['sources'].apply(lambda x: x[0]['source_name'])

        # .copy() makes the subset an independent frame, so the column
        # assignments below do not trigger SettingWithCopyWarning.
        all_mini = all_news[
            ['title', 'text', 'pub_time', 'meta_images', 'source', 'url']
        ].copy()

        # keep the first image only; missing/empty values become ''
        all_mini['meta_images'] = all_mini['meta_images'].apply(
            lambda x: x[0] if hasattr(x, '__len__') and len(x) > 0 else ''
        )

        # full text is title + text, except for source 'FT' where only the
        # title is used
        all_mini['full_text'] = all_mini.apply(
            lambda x: x['title'] + x['text'] if x['source'] != 'FT' else x['title'],
            axis=1,
        )

    # Use run_date (not news_date) so use_current_date=True saves under the
    # date that was actually fetched.
    folder_path = f'./datasets/ucrawler/{run_date}'

    if not os.path.exists(folder_path):
        os.makedirs(folder_path, exist_ok=True)
        print("Folder created successfully")
    else:
        print("Folder already exists")

    print(f'Saving to folder path - {folder_path}')
    all_mini.to_csv(f'{folder_path}/{run_date}_rawnews.csv', index=False)  # store to csv
    print('success!')

In [5]:
# Run the scraper for a single day; the date format is YYYY-mm-dd.
scrape_news(news_date='2023-03-02')

Downloading news articles...
Total number of articles: 166
Folder already exists
Saving to folder path - ./datasets/ucrawler/2023-03-02
success!
