In [1]:
import os, glob
import xml.etree.ElementTree as ET
from collections import Counter
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import tarfile
import shutil
import codecs

# Process files

In [None]:
corpus_location = "D:/newspaper-front-page-project/nyt_corpus"
# Everything directly under <corpus>/data is treated as one year folder;
# sorted so that repeated runs visit the years in the same order.
xml_files = sorted(glob.glob(os.path.join(corpus_location, "data/*")))
tag_class = ".//classifier[@class='online_producer'][@type='taxonomic_classifier']"


def _parse_article(xml_file, tag_path=tag_class):
    """Build the record (a dict) for one article XML file.

    Keys written:
      * 'tags'     -- text of every element matching ``tag_path``.
      * 'doc_id'   -- ``id-string`` attribute of the first ``doc-id`` element,
                      or None when the file has no such element.
      * <name>     -- one key per ``<meta>`` tag, mapping its ``name``
                      attribute to its ``content`` attribute.
      * 'headline' -- text of the ``hedline`` element, only when present.

    Raises whatever ``ET.parse`` / ``open`` raise for an unreadable file.
    """
    article_chunk = {}

    ## parse tags
    doc_etree = ET.parse(xml_file)
    article_chunk['tags'] = [tag.text for tag in doc_etree.findall(path=tag_path)]
    doc_id_node = doc_etree.find('.//doc-id')
    # Guard against a missing <doc-id>: record None instead of crashing.
    article_chunk['doc_id'] = (
        doc_id_node.attrib.get('id-string') if doc_id_node is not None else None
    )

    ## parse metadata
    # newline='' keeps line endings untranslated, and the `with` block closes
    # the handle (the file is opened once per article).
    with open(xml_file, encoding='utf-8', newline='') as xml_handle:
        content = xml_handle.read()
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed and warns about it.
    soup = BeautifulSoup(content, 'html.parser')

    for metadata in soup.find_all('meta'):
        name = metadata.get('name')
        if name is None:
            # A <meta> without a name cannot become a column; skip it.
            continue
        article_chunk[name] = metadata.get('content')

    hedline = soup.find('hedline')
    if hedline is not None:
        article_chunk['headline'] = hedline.get_text()

    return article_chunk


for year_folder in tqdm(xml_files):
    year_name = os.path.basename(year_folder) + '_tags'
    year_csv_fname = os.path.join(corpus_location, 'csvs', year_name) + '.csv'
    if os.path.exists(year_csv_fname):
        # The CSV doubles as the "done" marker for this year.
        print('already processed %s' % year_csv_fname)
        continue

    print('processing %s' % year_csv_fname)
    # Create the output folder up front so the final write cannot fail on a
    # missing directory after the whole year has been parsed.
    os.makedirs(os.path.dirname(year_csv_fname), exist_ok=True)
    article_chunks = []

    ## clean up if last round failed...
    for last_run in glob.glob(os.path.join(year_folder, '*')):
        if last_run.endswith('.tgz'):
            continue
        print('removing %s...' % last_run)
        if os.path.isdir(last_run):
            shutil.rmtree(last_run)
        else:
            # shutil.rmtree raises on a regular file.
            os.remove(last_run)

    ## iterate through each month
    for month_tgz in tqdm(sorted(glob.glob(os.path.join(year_folder, '*.tgz')))):
        print('processing %s...' % month_tgz)
        # Strip only the trailing extension; str.replace would also rewrite
        # any '.tgz' occurring earlier in the path.
        month_dir = month_tgz[:-len('.tgz')]
        try:
            ## untar folder
            with tarfile.open(name=month_tgz, mode="r:gz") as tar:
                tar.extractall(path=year_folder)

            ## extract data from XML files
            day_xml_files = glob.glob(os.path.join(month_dir, '*', '*.xml'))

            ## read through each article
            for xml_file in day_xml_files:
                article_chunks.append(_parse_article(xml_file))
        finally:
            ## clean up, even when parsing raised
            shutil.rmtree(month_dir, ignore_errors=True)

    ## dump contents
    pd.DataFrame(article_chunks).to_csv(year_csv_fname)

# Explore files

In [13]:
# Load every per-year CSV from the csvs folder, one DataFrame per file.
# sorted() fixes the order of the list; the '*.csv' pattern skips any stray
# non-CSV file in that folder.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the warnings recorded under this
# cell look like DtypeWarning (mixed-type columns) -- confirm they disappear.
csv_files = [
    pd.read_csv(csv_file, low_memory=False)
    for csv_file in sorted(glob.glob('D:/newspaper-front-page-project/nyt_corpus/csvs/*.csv'))
]

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [138]:
# Tally how often each tag occurs across all values of `doc_tags`.
# NOTE(review): `doc_tags` is not defined in any cell visible here; this
# presumably expects a mapping whose values are lists of tag strings -- confirm
# and add the defining cell so the notebook survives Restart & Run All.
tag_counter = Counter(
    tag
    for doc_tag_list in doc_tags.values()
    for tag in doc_tag_list
)

In [146]:
# Turn the tag tally into a two-column table ('tag', 'count'),
# ordered from the most frequent tag to the least frequent.
tag_series = pd.Series(tag_counter)
tag_series_desc = tag_series.sort_values(ascending=False)
column_names = {'index': 'tag', 0: 'count'}
tag_counts_df = tag_series_desc.reset_index().rename(columns=column_names)

In [155]:
# Show only the ten most frequent tags rather than the whole table
# (the frame is already sorted by count, descending).
tag_counts_df.head(10)

Unnamed: 0,tag,count
0,Top/News,2943
1,Top/News/New York and Region,1541
2,Top/Classifieds/Paid Death Notices,1293
3,Top/Features/Travel/Guides/Destinations/North ...,1021
4,Top/Features/Travel/Guides/Destinations/North ...,990
5,Top/Opinion,989
6,Top/Opinion/Opinion,989
7,Top/News/Business,863
8,Top/Features/Arts,827
9,Top/News/Sports,708
