# Reuters corpus visualization

I will make a dataframe of the XML files to get data for each of the document. I will want to model these variables of  each Document:


item_id
topic_codes
topic_descs
date
headline
text: Document text
essential: This column will have the headline and document text without stop words
location
country


In [1]:
import os
import zipfile
import xml.etree.ElementTree as ET
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to /home/hdheli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/hdheli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# constants and other needed initializations

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

CORPUS_DIR = './REUTERS_CORPUS_2' # Name of the dir for the source files
WS_REMOVAL = re.compile(r"\s+")


In [3]:
zipfiles = []         # This collects the names of the zip files to be extracted (the actual news files)
all_topic_codes = []  # This holds a list of the topic codes
all_topic_descs = []       # This holds a list of the topic descriptions
                      # topics and topic_names use the same indexing

item_ids = []         # This holds a list of the news item ids
topic_code_lists = [] # This holds a list of lists of the inputs topic codes e.g. [['C11'], ['6INS', 'C17'], ...]
topic_desc_lists = []
dates = []            # date of the document
headlines = []
texts = []            # This holds a list of the texts
essentials = []       # This column will have the headline and document text without stop words
locations = []
countries = []



In [4]:
# Read topics into a list of topic codes and a list of topic names
zipc = zipfile.ZipFile('./' + CORPUS_DIR + '/codes.zip', 'r')
c = zipc.open('topic_codes.txt')
strc = c.read().decode('utf-8')
strarr = strc.split('\n')
for t in strarr:
    if len(t) > 0 and t[0] != ';': # Discard header rows
        topic = t.split('\t')
        all_topic_codes.append(topic[0])
        all_topic_descs.append(topic[1])

In [6]:
# make a topic dictionary

topics = {all_topic_codes[i]: all_topic_descs[i] for i in range(len(all_topic_codes))}
topics

{'1POL': 'CURRENT NEWS - POLITICS',
 '2ECO': 'CURRENT NEWS - ECONOMICS',
 '3SPO': 'CURRENT NEWS - SPORT',
 '4GEN': 'CURRENT NEWS - GENERAL',
 '6INS': 'CURRENT NEWS - INSURANCE',
 '7RSK': 'CURRENT NEWS - RISK NEWS',
 '8YDB': 'TEMPORARY',
 '9BNX': 'TEMPORARY',
 'ADS10': 'CURRENT NEWS - ADVERTISING',
 'BNW14': 'CURRENT NEWS - BUSINESS NEWS',
 'BRP11': 'CURRENT NEWS - BRANDS',
 'C11': 'STRATEGY/PLANS',
 'C12': 'LEGAL/JUDICIAL',
 'C13': 'REGULATION/POLICY',
 'C14': 'SHARE LISTINGS',
 'C15': 'PERFORMANCE',
 'C151': 'ACCOUNTS/EARNINGS',
 'C1511': 'ANNUAL RESULTS',
 'C152': 'COMMENT/FORECASTS',
 'C16': 'INSOLVENCY/LIQUIDITY',
 'C17': 'FUNDING/CAPITAL',
 'C171': 'SHARE CAPITAL',
 'C172': 'BONDS/DEBT ISSUES',
 'C173': 'LOANS/CREDITS',
 'C174': 'CREDIT RATINGS',
 'C18': 'OWNERSHIP CHANGES',
 'C181': 'MERGERS/ACQUISITIONS',
 'C182': 'ASSET TRANSFERS',
 'C183': 'PRIVATISATIONS',
 'C21': 'PRODUCTION/SERVICES',
 'C22': 'NEW PRODUCTS/SERVICES',
 'C23': 'RESEARCH/DEVELOPMENT',
 'C24': 'CAPACITY/FACILIT

In [8]:
# Make a list of data zip-files in source directory
for root, dirs, files in os.walk(CORPUS_DIR):
    for file in files:
        if file.startswith('1997') and file.endswith('.zip'):
            zipfiles.append(file)

In [10]:
# Then extract content
for zipf in zipfiles:
    zipd = zipfile.ZipFile(CORPUS_DIR + '/' + zipf, 'r')
    for fname in zipd.namelist():
        f = zipd.open(fname)
        xmlroot = ET.fromstring(f.read())
        headline = ''
        text = ''
        topic_codes = []
        topic_descs = []
        item_ids.append(xmlroot.attrib.get('itemid'))
        dates.append(xmlroot.attrib.get('date'))
        for level1 in xmlroot:
            if level1.tag == 'headline':
                headline = (level1.text if level1.text is not None else '')
                headlines.append(headline)
            if level1.tag == 'text':
                for level2 in level1:
                    text += (level2.text if level2.text is not None else '') + ' '
                    texts.append(text)
            if level1.tag == 'metadata':
                for level2 in level1:
                    if level2.tag == 'codes' and level2.attrib.get('class') == 'bip:topics:1.0':
                        for level3 in level2:
                            code = level3.attrib.get('code')
                            topic_codes.append(code)
                            topic_descs.append(topics[code])
                    if level2.tag == 'dc' and level2.attrib.get('element') == 'dc.creator.location':
                        location = level2.attrib.get('value')
                        locations.append(location)
                    if level2.tag == 'dc' and level2.attrib.get('element') == 'dc.creator.location.country.name':
                        country = level2.attrib.get('value')
                        countries.append(country)
                        
        inp = WS_REMOVAL.sub(' ', (headline + ' ' + text)).strip()
        words = word_tokenize(inp)
        lemmad = set()
        for word in words:
            if word not in stop_words:
                lemmad.add(lemmatizer.lemmatize(word))
        processed = " ".join(lemmad)
        essentials.append(processed)
        topic_code_lists.append(topic_codes)
        topic_desc_lists.append(topic_descs)
        

In [11]:

reuters_df = pd.DataFrame(list(zip(item_ids, dates, locations, countries, topic_code_lists, topic_desc_lists, headlines, texts, essentials)))
reuters_df.columns = ['id', 'date', 'location', 'country', 'codes', 'descs', 'headlines', 'texts', 'essentials']
reuters_df.to_csv(f'csv/reuters.csv', index = False, sep = ';')

topics_df = pd.DataFrame(list(zip(all_topic_codes, all_topic_descs)))
topics_df.columns = ['CODE', 'DESCRIPTION']
topics_df.to_csv('csv/topic_codes.txt', index = False, sep = '\t')


In [12]:
reuters_df

Unnamed: 0,id,date,location,country,codes,descs,headlines,texts,essentials
0,635751,1997-06-04,TORONTO,EU,"[G15, GCAT]","[EUROPEAN COMMUNITY, GOVERNMENT/SOCIAL]",Official Journal contents - OJ C 169 of June 4...,* (Note - contents are displayed in reverse or...,COMMISSION 14 4 displayed OJ Commission within...
1,635752,1997-06-04,QUEBEC CITY,EU,"[G15, GCAT]","[EUROPEAN COMMUNITY, GOVERNMENT/SOCIAL]",Official Journal contents - OJ L 144 of June 4...,* (Note - contents are displayed in reverse or...,4 displayed 29 Regulation Annexes Latvia Lithu...
2,635753,1997-06-04,OTTAWA,CANADA,"[M14, M143, MCAT]","[COMMODITY MARKETS, ENERGY MARKETS, MARKETS]",Suncor lowers Canada heavy oil price.,* (Note - contents are displayed in reverse or...,29.3 oil 142.00 .... BLEND 06/04/97 IMPERIAL B...
3,635754,1997-06-04,TORONTO,CANADA,"[M14, M143, MCAT]","[COMMODITY MARKETS, ENERGY MARKETS, MARKETS]",Suncor cuts Canada light oil prices.,* (Note - contents are displayed in reverse or...,25.76 oil .... SWEET 163.00 SOUR 25.44 06/04/9...
4,635755,1997-06-04,TORONTO,CANADA,"[M11, MCAT]","[EQUITY MARKETS, MARKETS]","Toronto stocks end softer, more consolidation ...",* (Note - contents are displayed in reverse or...,said earnings DJI earlier * -42.49 -0.70 . mar...
...,...,...,...,...,...,...,...,...,...
281004,699748,1997-07-01,PARIS,UK,"[M14, M141, MCAT]","[COMMODITY MARKETS, SOFT COMMODITIES, MARKETS]",Euro veg oils little changed despite U.S. 4 yr...,"France's new leftist government, in its first ...",said coconut 4 mixed 512.50 Newsroom south old...
281005,699749,1997-07-01,PARIS,UK,"[C31, CCAT, M14, MCAT]","[MARKETS/MARKETING, CORPORATE/INDUSTRIAL, COMM...",London shipsales.,"France's new leftist government, in its first ...","10 inspection 23,803 survey ORE vessel final S..."
281006,699750,1997-07-01,PARIS,UK,"[GCAT, GENV, GWEA]","[GOVERNMENT/SOCIAL, ENVIRONMENT AND NATURAL WO...",Britain's June rainfall highest since 1860.,"France's new leftist government, in its first ...",said replenished left come flat reservoir Afri...
281007,699751,1997-07-01,BEIJING,UK,"[C21, C24, CCAT]","[PRODUCTION/SERVICES, CAPACITY/FACILITIES, COR...",Hess N.Sea Durward field sees small delay.,"France's new leftist government, in its first ...",said 32 Plc Newsroom 7930 half suffered Group ...
