In [1]:
# This script preprocesses the dataset.
# 1. break the whole json file into separate json files (each one represents a news)
# 2. remove news that are identified as not relevant to the task
# 3. extract news id, title, content and other possible fields
# 4. do a shallow cleaning of the content (only removing HTML tags, not removing stop words)

In [2]:
import os
import re
import json

In [3]:
def shallow_clean(text):
    '''
    Strip HTML tags from a piece of text.

    Everything from a "<" up to the next ">" is removed, including tags whose
    attributes wrap onto several lines. The text between tags, HTML entities
    (e.g. "&amp;") and whitespace are left untouched.

    Args:
        text: the string to clean
    Returns:
        the string with all HTML tags removed
    '''
    # "[^>]" (unlike ".") also matches newlines, so a tag that is broken
    # across lines is still removed as a whole.
    return re.sub(r'<[^>]*>', '', text)

In [4]:
def extract_news_fields(news):
    '''
    Extract the id, title, author, category and cleaned text of a news.

    Args:
        news: a news (dictionary) to be processed. It must have the keys
            "id", "title" and "author"; "contents" is expected to be a list
            of dictionaries (entries may be None).
    Returns:
        a new dictionary with the keys "id", "title", "author", "category"
        and "content". "category" is None when the first element of
        "contents" is not a "kicker"; "content" is the space-joined text of
        all "sanitized_html" elements after a shallow clean.
    Raises:
        KeyError: if "id", "title" or "author" is missing from the news.
    '''
    news_parsed = {
        "id": news["id"],
        "title": news["title"],
        "author": news["author"],
    }

    # "contents" is an array of blocks; treat a missing/None array as empty
    contents = news.get("contents") or []

    # the news category is the first element of the array, marked with the
    # "kicker" type; it does not necessarily exist, and the first element
    # may itself be None, so check before reading it
    first = contents[0] if contents else None
    if first is not None and first.get("type") == "kicker":
        news_parsed["category"] = first.get("content")
    else:
        news_parsed["category"] = None

    # collect the text of every "sanitized_html" block
    content_text_list = []
    for c in contents:
        if c is None or c.get("type") != "sanitized_html":
            continue
        html = c.get("content")
        if html is None:
            # nothing to clean for a block without text
            continue
        content_text_list.append(shallow_clean(html))
    news_parsed["content"] = " ".join(content_text_list)

    return news_parsed

In [5]:
def check_news_type(news):
    '''
    Check whether the news is relevant to the task.

    The news type is read from the "content" field of the first element of
    news["contents"]. A news is considered not relevant only when that value
    is one of the excluded types; a news whose type cannot be determined
    (no contents, a None first element, or no "content" field) is kept.

    Args:
        news: a news (dictionary)
    Returns:
        True if the news is relevant, False otherwise
    '''
    # a tuple (not a set) so that an unhashable "content" value can still
    # be compared without raising
    non_rel_news_types = ("Opinion", "Letters to the Editor", "The Post's View")

    # extract news type, tolerating an empty array or a None first element
    contents = news.get("contents") or []
    first = contents[0] if contents else None
    news_type = first.get("content") if first is not None else None

    return news_type not in non_rel_news_types

In [6]:
def preprocessing(file_path, folder_path):
    '''
    Preprocess all the news.

    Reads file_path line by line (one JSON document per line), keeps the
    news that check_news_type() accepts, extracts their fields with
    extract_news_fields() and writes each one to "<folder_path>/<id>.json".
    Progress is printed every 1000 news, and a summary at the end.

    Args:
        file_path: path of the input file with one JSON news per line
        folder_path: folder where the parsed news files are saved; it is
            created if it does not exist yet
    Returns:
        None
    '''
    print("Start processing.")

    # make sure the output folder exists before writing into it
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)

    # keep track of how many documents
    total_news_count = 0
    total_rel_news_count = 0

    with open(file_path, "r", encoding="utf-8") as f:
        # iterate over the file lazily instead of loading every line into memory
        for line in f:
            # skip blank lines (e.g. a trailing empty line), which are not valid JSON
            if not line.strip():
                continue
            news = json.loads(line)
            if check_news_type(news):
                # extract information and save to the folder
                news_parsed = extract_news_fields(news)
                # os.path.join works whether or not folder_path ends with a separator
                news_path = os.path.join(folder_path, news_parsed["id"] + ".json")
                with open(news_path, "w", encoding="utf-8") as fp:
                    json.dump(news_parsed, fp, ensure_ascii=False)
                total_rel_news_count += 1
            total_news_count += 1
            # checkpoint
            if total_news_count % 1000 == 0:
                print("{} news processed.".format(total_news_count))
    print("{} news processed, {} relevant news saved.".format(total_news_count, total_rel_news_count))

In [7]:
def main(file_path="/Users/jiamingqu/Desktop/NewsRecProj/data/wapo.jl",
         folder_path="/Users/jiamingqu/Desktop/NewsRecProj/news_json/"):
    '''
    Run the preprocessing on the dataset.

    Args:
        file_path: path of the input file; be sure to change it (or pass
            your own value) to where the dataset is on your machine
        folder_path: folder where you want to save the parsed news files
    '''
    preprocessing(file_path, folder_path)

In [None]:
# Run the preprocessing when this file/cell is executed as the main module.
if __name__ == "__main__":
    main()
    # As reported by the original author: there should be a total of 595,037 news,
    # of which 571,963 are relevant and kept for this task.
    # It takes approximately 8 minutes to finish on the author's laptop.