# Notebook to fetch the dataset and prepare it for use
- Dataset location [Data for PAN at SemEval 2019 Task 4: Hyperpartisan News Detection](https://zenodo.org/records/1489920)
- Also see [Hyperpartisan News Detection 2019](https://pan.webis.de/semeval19/semeval19-web/#data) and [SemEval-2019 Task 4: Hyperpartisan News Detection](https://aclanthology.org/S19-2145.pdf)

In [1]:
import os
import wget
import zipfile

In [2]:
# Maps each local file name to its Zenodo download URL. Every file lives under
# the same record, so the URL is derived from the file name.
datasetlinks = {
    fname: f"https://zenodo.org/records/1489920/files/{fname}?download=1"
    for fname in (
        "article.xsd",
        "ground-truth.xsd",
        "articles-training-byarticle-20181122.zip",
        "articles-validation-bypublisher-20181122.zip",
        "ground-truth-training-byarticle-20181122.zip",
        "ground-truth-training-bypublisher-20181122.zip",
        "ground-truth-validation-bypublisher-20181122.zip",
    )
}

# Left out of the tuple above, to save time, is the main training set
# (~600k articles). The validation set (150k articles) is used instead,
# with an 80/20 split.
# To include the main training set, add this file name to the tuple:
# "articles-training-bypublisher-20181122.zip",

In [3]:
# Download every entry of datasetlinks into ./dataset (relative to the current
# working directory), skipping any file that is already present.
dataset_dir = os.path.join(os.getcwd(), "dataset")
os.makedirs(dataset_dir, exist_ok=True)

for fname, url in datasetlinks.items():
    filepath = os.path.join(dataset_dir, fname)

    if os.path.isfile(filepath):
        print(f"file {filepath} already exists")
    else:
        print(f"downloading {fname} from {url}")
        # filepath already contains dataset_dir; joining it with dataset_dir a
        # second time would only work while dataset_dir is an absolute path.
        wget.download(url, filepath)

file /home/adhavle/hnd/dataset/article.xsd already exists
file /home/adhavle/hnd/dataset/ground-truth.xsd already exists
file /home/adhavle/hnd/dataset/articles-training-byarticle-20181122.zip already exists
file /home/adhavle/hnd/dataset/articles-validation-bypublisher-20181122.zip already exists
file /home/adhavle/hnd/dataset/ground-truth-training-byarticle-20181122.zip already exists
file /home/adhavle/hnd/dataset/ground-truth-training-bypublisher-20181122.zip already exists
file /home/adhavle/hnd/dataset/ground-truth-validation-bypublisher-20181122.zip already exists


In [4]:
# Maps each downloaded archive name to the name of the XML file that is
# expected to exist once the archive has been extracted. Both names share the
# same base name and differ only in extension.
unzip_targets = {
    f"{stem}.zip": f"{stem}.xml"
    for stem in (
        "articles-training-byarticle-20181122",
        "articles-validation-bypublisher-20181122",
        "ground-truth-training-byarticle-20181122",
        "ground-truth-training-bypublisher-20181122",
        "ground-truth-validation-bypublisher-20181122",
    )
}

In [5]:
# Extract each archive into dataset_dir unless the XML file it is expected to
# produce is already there.
for archive_name, xml_name in unzip_targets.items():
    xml_path = os.path.join(dataset_dir, xml_name)

    if not os.path.isfile(xml_path):
        archive_path = os.path.join(dataset_dir, archive_name)
        print(f"unzipping {archive_path} to {xml_path}")
        with zipfile.ZipFile(archive_path, mode='r') as archive:
            archive.extractall(path=dataset_dir)
        continue

    print(f"file {xml_path} already unzipped")

file /home/adhavle/hnd/dataset/articles-training-byarticle-20181122.xml already unzipped
file /home/adhavle/hnd/dataset/articles-validation-bypublisher-20181122.xml already unzipped
file /home/adhavle/hnd/dataset/ground-truth-training-byarticle-20181122.xml already unzipped
file /home/adhavle/hnd/dataset/ground-truth-training-bypublisher-20181122.xml already unzipped
file /home/adhavle/hnd/dataset/ground-truth-validation-bypublisher-20181122.xml already unzipped


Since I'm planning to just use the validation set, I only actually need 2 of these files:
 - `articles-validation-bypublisher-20181122.xml` (150k articles - will use in an 80/20 test/train split)
 - `ground-truth-validation-bypublisher-20181122.xml` (the target/bias values for these articles, scored as `left`, `left-center`, `least`, `right-center`, `right`)
 - Note: the first file contains unescaped HTML tags within the article contents. To avoid confusing Pandas' XML loader, I'll convert all angle brackets within the articles to their HTML entity codes (so `<` becomes `&lt;` and `>` becomes `&gt;`)
 - I am only doing this cleaning step on `articles-validation-bypublisher-20181122.xml`, but anyone planning to use the other 2 'article' files will need to do the same on them as well.

In [6]:
# Copy the validation articles file to a new file in which every `<` and `>`
# inside an article body is replaced by `&lt;` / `&gt;`. The opening
# `<article ...">` tag and the closing `</article>` tag of each article are
# kept verbatim; every line outside an article is copied unchanged.
infile = os.path.join(dataset_dir, "articles-validation-bypublisher-20181122.xml")
outfile = os.path.join(dataset_dir, "articles-validation-bypublisher-20181122-html-escaped.xml")

start_tag = u'<article'
end_tag = u'</article>\n'
articles = 0


def _escape_angle_brackets(text):
    """Return *text* with `<` replaced by `&lt;` and `>` replaced by `&gt;`."""
    return text.replace("<", "&lt;").replace(">", "&gt;")


with open(infile, "r", encoding='utf-8') as inf:
    with open(outfile, "w", encoding='utf-8') as outf:

        # True while we are between an opening article tag and its closing tag
        in_article = False

        for line in inf:
            if not in_article:
                # `<articles>` also begins with start_tag, so it is excluded
                # explicitly. This lets the XML declaration and the root tag
                # be copied through like any other non-article line, whatever
                # the first lines of the file look like.
                if not line.startswith(start_tag) or line.startswith(u"<articles"):
                    outf.write(line)
                    continue

                articles += 1

                # The opening tag is taken to end at the first `">` on the
                # line, i.e. it is assumed to carry at least one attribute and
                # to sit entirely on this line.
                close_article_tag = line.find("\">")
                outf.write(line[:close_article_tag + 2])

                # The rest of the line is article content; fall through so it
                # is handled exactly like a continuation line.
                line = line[close_article_tag + 2:]
                in_article = True

            if line.endswith(end_tag):
                # last line of the article: escape everything before the
                # closing tag and emit the closing tag untouched
                outf.write(_escape_angle_brackets(line[:-len(end_tag)]) + end_tag)
                in_article = False
            else:
                # article content continues on the next line
                outf.write(_escape_angle_brackets(line))

# Reaching end-of-file inside an article means the input is truncated or
# malformed; report it instead of looping forever waiting for a closing tag.
if in_article:
    raise ValueError(
        f"{infile} ended inside an <article> element (no closing </article> tag found)"
    )

print(f"Cleaned {articles} articles")

Cleaned 150000 articles
