In [43]:
import shutil
from tqdm.auto import tqdm
import glob
import os
import tarfile
import codecs
from bs4 import BeautifulSoup
import pandas as pd 

# Extract Metadata, Classifiers, Headlines, and Body Text from the NYT Annotated Corpus

In [58]:
def _parse_article(xml_path):
    """Parse one NYT Annotated Corpus article file into a flat record.

    Parameters
    ----------
    xml_path : str
        Path to a single article ``.xml`` file.

    Returns
    -------
    dict
        One entry per ``<meta>`` tag (``name`` -> ``content``), followed by
        ``'classification'`` (dict mapping ``"<class tokens>.<type>"`` to the
        classifier text), ``'headline'`` (only when a ``<hedline>`` tag is
        present) and ``'body'`` (list of paragraph strings; empty when the
        article has no ``<body.content>`` element).

    Raises
    ------
    KeyError
        If a ``<meta>`` tag lacks ``name``/``content`` or a ``<classifier>``
        tag lacks ``class``/``type``.
    """
    # Context manager so the handle is released immediately; this runs once
    # per article, so relying on garbage collection piles up open files.
    with codecs.open(xml_path, encoding='utf-8') as xml_handle:
        content = xml_handle.read()

    # No parser is pinned here, so bs4 picks whichever one is installed.
    # NOTE(review): the classifier key building below expects `class` to come
    # back as a list of tokens (HTML-style parsing) -- confirm before pinning
    # or switching parsers.
    soup = BeautifulSoup(content)

    record = {meta['name']: meta['content'] for meta in soup.find_all('meta')}

    classifications = {}
    for classifier in soup.find_all('classifier'):
        key_parts = list(classifier.attrs['class'])
        key_parts.append(classifier.attrs['type'])
        classifications['.'.join(key_parts)] = classifier.text
    record['classification'] = classifications

    # Look the headline up once instead of searching the tree twice.
    headline = soup.find('hedline')
    if headline is not None:
        record['headline'] = headline.get_text()

    body_content = soup.find('body.content')
    if body_content is None:
        # Articles without a body would otherwise abort the whole run with
        # an AttributeError on None.
        paragraphs = []
    else:
        full_text = body_content.find('block', attrs={'class': 'full_text'})
        if full_text is not None:
            paragraphs = [p.get_text() for p in full_text.find_all('p')]
        else:
            # No full_text block: fall back to all text under body.content.
            paragraphs = [body_content.get_text()]
    record['body'] = paragraphs
    return record


csv_output_dir = '../data/nyt_corpus/csvs/'
# Make sure the dump target exists so a long run does not fail at the very end.
os.makedirs(csv_output_dir, exist_ok=True)

for year_folder in tqdm(glob.glob('../data/nyt_corpus/data/*')):
    article_chunks = []

    ## clean up if last round failed...
    ## (only directories: rmtree raises on stray plain files)
    for last_run in glob.glob(os.path.join(year_folder, '*')):
        if '.tgz' not in last_run and os.path.isdir(last_run):
            shutil.rmtree(last_run)

    ## iterate through each month (archives only, in a stable order)
    for month_tgz in sorted(glob.glob(os.path.join(year_folder, '*.tgz'))):
        ## folder the archive unpacks into: the archive path minus '.tgz'
        month_dir = os.path.splitext(month_tgz)[0]

        ## untar folder; the context manager closes the archive handle
        with tarfile.open(name=month_tgz, mode="r:gz") as tar:
            tar.extractall(path=year_folder)

        ## extract data from XML files (<month>/<day>/<article>.xml)
        day_xml_files = glob.glob(os.path.join(month_dir, '*', '*.xml'))

        ## read through each article
        for xml_file in day_xml_files:
            # Kept as a notebook-level name: a later cell displays it.
            article_chunk = _parse_article(xml_file)
            article_chunks.append(article_chunk)

        ## clean up
        shutil.rmtree(month_dir)

    ## dump contents
    year_name = os.path.basename(year_folder)
    year_csv_fname = os.path.join(csv_output_dir, year_name) + '.csv'
    pd.DataFrame(article_chunks).to_csv(year_csv_fname)

  0%|          | 0/21 [00:00<?, ?it/s]

In [45]:
# Display the record built for the last article the extraction loop parsed (quick sanity check).
article_chunk 

{'publication_day_of_month': '25',
 'publication_month': '9',
 'publication_year': '1988',
 'publication_day_of_week': 'Sunday',
 'dsk': 'Society Desk',
 'print_page_number': '62',
 'print_section': '1',
 'print_column': '4',
 'online_sections': 'Style',
 'classification': {'indexing_service.descriptor': 'WEDDINGS AND ENGAGEMENTS',
  'online_producer.taxonomic_classifier': 'Top/Features/Style/Fashion and Style/Weddings and Celebrations',
  'online_producer.general_descriptor': 'Weddings and Engagements'},
 'headline': '\nWendy Olsoff Marries\n',
 'body': "\n\nLEAD: The marriage of Wendy G. Olsoff, a co-owner of the P.P.O.W. Gallery in New York, to Gregg L. Deering, an artist, took place last evening at the National Arts Club in New York. Rabbi A. Allen Block officiated. The parents of the couple are Mr. and Mrs. Bernard Olsoff of New York and Mr. and Mrs. Robert L.\n\n\nLEAD: The marriage of Wendy G. Olsoff, a co-owner of the P.P.O.W. Gallery in New York, to Gregg L. Deering, an artist