In [16]:
import os
import zipfile
import xml.etree.ElementTree as ET
import re

In [17]:
# Notebook configuration: tunable constants first, then the shared containers
# that the following cells fill in.

# --- Constants -------------------------------------------------------------
SOURCE_DIR = 'REUTERS_CORPUS_2'  # Directory (relative to the notebook) holding the source zip files
USE_TRUNCATED = True             # True: cut input text short (development); False: keep full text
LEN_TRUNCATED = 256              # Maximum length of the input text when truncation is enabled
WS_REMOVAL = re.compile(r"\s+")  # Matches any run of whitespace characters

# --- Source location ---------------------------------------------------------
zipdir = './' + SOURCE_DIR
zipfiles = list()                # Names of the zip files to be extracted (the actual news files)

# --- Topics (both lists share the same indexing) -----------------------------
topics, topic_names = [], []     # Topic codes / topic names

# --- Samples (all three lists share the same indexing) -----------------------
# inputs:      headline + text of each news item
# labels:      'many-hot' vector per item, e.g. [0, 1, 0, 0, 1, 1, ...]
# label_codes: list of topic codes per item, e.g. [['C11'], ['6INS', 'C17'], ...]
inputs, labels, label_codes = [], [], []

In [18]:
# Read topics into a list of topic codes and a list of topic names.
# The archive and its member are opened with context managers so both file
# handles are released as soon as the text has been read.
with zipfile.ZipFile(zipdir + '/codes.zip', 'r') as zipc:
    with zipc.open('topic_codes.txt') as c:
        strc = c.read().decode('utf-8')

# Empty the lists in place so that re-running this cell does not append every
# topic a second time (which would also change the width of the label vectors).
del topics[:]
del topic_names[:]

# splitlines() also copes with '\r\n' line endings; split('\n') would leave a
# trailing '\r' on every topic name in that case.
strarr = strc.splitlines()
for t in strarr:
    if len(t) > 0 and t[0] != ';': # Discard header rows
        topic = t.split('\t')
        if len(topic) < 2:
            # Not a "code<TAB>name" row (e.g. whitespace-only line): skip it
            # instead of failing with an IndexError.
            continue
        topics.append(topic[0])
        topic_names.append(topic[1])

In [19]:
# Make a list of data zip-files in source directory.
# Entries are stored relative to zipdir, so an archive found in a
# sub-directory can still be opened as zipdir + '/' + name; for archives that
# sit directly in zipdir this is just the file name.
del zipfiles[:]  # start empty so re-running the cell does not duplicate entries
for root, dirs, files in os.walk(zipdir):
    dirs.sort()  # os.walk honours in-place changes: visit sub-dirs in a fixed order
    for file in sorted(files):  # fixed order => reproducible document indexing
        if file.startswith('1997') and file.endswith('.zip'):
            zipfiles.append(os.path.relpath(os.path.join(root, file), zipdir))

In [20]:
# Then extract content

def _parse_news_item(xml_bytes):
    """Parse one news XML document into (text, codes).

    text  -- the headline followed by the text of every child of each 'text'
             element, with whitespace runs collapsed to one space and the
             result stripped.
    codes -- sorted list of the 'code' attributes found under
             metadata / codes[class="bip:topics:1.0"]; children without a
             'code' attribute are ignored.

    Raises xml.etree.ElementTree.ParseError if xml_bytes is not valid XML.
    """
    xmlroot = ET.fromstring(xml_bytes)
    headline = ''
    parts = []
    codes = []
    for level1 in xmlroot:
        if level1.tag == 'headline':
            headline = level1.text or ''
        elif level1.tag == 'text':
            parts.extend(level2.text or '' for level2 in level1)
        elif level1.tag == 'metadata':
            for level2 in level1:
                if level2.tag == 'codes' and level2.attrib.get('class') == 'bip:topics:1.0':
                    for level3 in level2:
                        code = level3.attrib.get('code')
                        # A missing attribute yields None, which cannot be
                        # sorted together with strings.
                        if code is not None:
                            codes.append(code)
    codes.sort()
    text = WS_REMOVAL.sub(' ', ' '.join([headline] + parts)).strip()
    return text, codes


# Position of every topic code in `topics` (first occurrence wins). A direct
# lookup sets the right label no matter how `topics` is ordered.
topic_index = {}
for idx, topic_code in enumerate(topics):
    topic_index.setdefault(topic_code, idx)

# Empty the result lists in place so re-running the cell rebuilds the data
# instead of appending a second copy of every document.
del inputs[:]
del labels[:]
del label_codes[:]

for zipf in zipfiles:
    # The context manager closes each archive before the next one is opened.
    with zipfile.ZipFile(zipdir + '/' + zipf, 'r') as zipd:
        for fname in zipd.namelist():
            if fname.endswith('/'):
                continue  # directory entry: nothing to parse
            inp, codes = _parse_news_item(zipd.read(fname))
            inputs.append(inp[:LEN_TRUNCATED] if USE_TRUNCATED else inp)
            label_codes.append(codes)
            # Many-hot vector: 1 at the index of every known topic code.
            labs = [0] * len(topics)
            for code in codes:
                j = topic_index.get(code)
                if j is not None:  # codes missing from `topics` are ignored
                    labs[j] = 1
            labels.append(labs)


In [21]:
# Show the first (up to) 10 samples: many-hot label vector, topic codes, input
# text, and then each set label index together with its topic code.
# Slicing instead of range(0, 10) avoids an IndexError when fewer than 10
# documents were loaded.
PREVIEW_COUNT = 10
for sample_labels, sample_codes, sample_input in zip(
        labels[:PREVIEW_COUNT], label_codes[:PREVIEW_COUNT], inputs[:PREVIEW_COUNT]):
    print(sample_labels)
    print(sample_codes)
    print(sample_input)
    for topic_idx, val in enumerate(sample_labels):
        if val == 1:
            print(topic_idx, topics[topic_idx])



[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['G15', 'GCAT']
OFFICIAL JOURNAL CONTENTS - OJ C 126 OF APRIL 23, 1997. * (Note - contents are displayed in reverse order to that in the printed Journal) * Notice of a standing call for awarding contracts for distilling operations in respect of apples withdrawn from the m
80 G15
90 GCAT
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,