## Loading and parsing the data from zipped xml files

We create two versions of the input data in the folder `reuters-csv`: one with the full-length headline + text (`inputs.csv` for the training data, `test.csv` for the test data) and one with the headline + text truncated to a fixed length (`inputs_trunc.csv` or `test_trunc.csv`). In addition, we write the topic codes to `topic_codes.txt` in the same folder.

The text is lower-cased, runs of whitespace are collapsed into a single space, stop words are removed and the remaining words are lemmatized.


In [28]:
import os
import zipfile
import xml.etree.ElementTree as ET
import re
import pandas as pd
import nltk
# word_tokenize (used when extracting the content) needs the Punkt tokenizer
# data and raises LookupError on a fresh environment without it. Older NLTK
# releases look the models up as 'punkt', newer ones as 'punkt_tab', so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
# 'wordnet' backs WordNetLemmatizer; 'stopwords' provides the English stop-word list.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jaakkovilenius/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jaakkovilenius/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Single WordNet lemmatizer instance, reused for every token during extraction
lemmatizer = WordNetLemmatizer()
# English stop words held in a set so membership tests are fast
stop_words = set(stopwords.words('english'))

In [37]:
# Run configuration: which corpus is read and how the text is shaped
TESTING = True                     # True -> read the test data, False -> read the training data
TRAIN_DIR = 'REUTERS_CORPUS_2'     # Name of the dir for the training source files
TEST_DIR = 'reuters-test-data'     # Name of the dir for the test source files

# Directory the news zip files are read from (relative to the working directory)
if TESTING:
    zipdir = './' + TEST_DIR
else:
    zipdir = './' + TRAIN_DIR
print(zipdir)

# Number of leading characters kept in the truncated input text
LEN_TRUNCATED = 64

# Matches any run of whitespace so it can be replaced by a single space
WS_REMOVAL = re.compile(r"\s+")

./reuters-test-data


In [38]:
# Names of the zip files to be extracted (the actual news files)
zipfiles = []

# Topic codes and topic names; both lists use the same indexing
topics = []
topic_names = []

# Per news item data; all lists below use the same indexing
itemids = []         # news item ids
inputs = []          # inputs (headline + text)
inputs_trunc = []    # truncated inputs (headline + text)
labels = []          # input labels in 'many-hot' notation e.g. [0, 1, 0, 0, 1, 1, ...]
label_codes = []     # lists of the inputs' topic codes e.g. [['C11'], ['6INS', 'C17'], ...]

In [39]:
# Read topics into a list of topic codes and a list of topic names.
# codes.zip is always taken from the training directory, also when TESTING is set.
# The context managers close the archive and the member even if reading fails.
with zipfile.ZipFile('./' + TRAIN_DIR + '/codes.zip', 'r') as zipc:
    with zipc.open('topic_codes.txt') as c:
        strc = c.read().decode('utf-8')
# splitlines() also consumes the '\r' of Windows line endings, which
# split('\n') would leave attached to the end of every topic name
strarr = strc.splitlines()
for t in strarr:
    if len(t) > 0 and t[0] != ';': # Discard empty lines and header rows
        topic = t.split('\t')      # <code> TAB <description>
        topics.append(topic[0])
        topic_names.append(topic[1])

In [40]:
# Make a list of data zip-files in source directory
for root, dirs, files in os.walk(zipdir):
    for file in files:
        if file.startswith('1997') and file.endswith('.zip'):
            zipfiles.append(file)

In [41]:
# Then extract content
for zipf in zipfiles:
    zipd = zipfile.ZipFile(zipdir + '/' + zipf, 'r')
    for fname in zipd.namelist():
        f = zipd.open(fname)
        xmlroot = ET.fromstring(f.read())
        headline = ''
        text = ''
        codes = []
        itemids.append(xmlroot.attrib.get('itemid'))
        for level1 in xmlroot:
            if level1.tag == 'headline':
                headline = (level1.text if level1.text is not None else '').lower()
            if level1.tag == 'text':
                for level2 in level1:
                    text += (level2.text if level2.text is not None else '').lower() + ' '
            if level1.tag == 'metadata':
                for level2 in level1:
                    if level2.tag == 'codes' and level2.attrib.get('class') == 'bip:topics:1.0':
                        for level3 in level2:
                            codes.append(level3.attrib.get('code'))
        inp = WS_REMOVAL.sub(' ', (headline + ' ' + text)).strip()
        words = word_tokenize(inp)
        lemmad = set()
        for word in words:
            if word not in stop_words:
                lemmad.add(lemmatizer.lemmatize(word))
        processed = " ".join(lemmad)
        inputs.append(processed)
        inputs_trunc.append(processed[:LEN_TRUNCATED])
        codes.sort()
        label_codes.append(codes)
        labs = [0] * len(topics)
        i = -1
        for code in codes:
            for j in range(i + 1, len(topics)):
                if code == topics[j]:
                    labs[j] = 1
                    i = j
                    break
        labels.append(labs)


In [42]:
# Write the full-length inputs, the truncated inputs and the topic codes to reuters-csv/.

# to_csv does not create missing directories, so make sure the target exists
os.makedirs('reuters-csv', exist_ok=True)

# Passing the column names to the constructor also works when no items were
# extracted; assigning .columns on an empty frame raises a length-mismatch error
item_columns = ['id', 'text', 'target', 'codes']

inputs_df = pd.DataFrame(list(zip(itemids, inputs, labels, label_codes)), columns=item_columns)
inputs_df.to_csv(f'reuters-csv/{"test" if TESTING else "inputs"}.csv', index = False, sep = ';')

inputs_trunc_df = pd.DataFrame(list(zip(itemids, inputs_trunc, labels, label_codes)), columns=item_columns)
inputs_trunc_df.to_csv(f'reuters-csv/{"test" if TESTING else "inputs"}_trunc.csv', index = False, sep = ';')

topics_df = pd.DataFrame(list(zip(topics, topic_names)), columns=['CODE', 'DESCRIPTION'])
topics_df.to_csv('reuters-csv/topic_codes.txt', index = False, sep = '\t')
