In [1]:
import glob
import os
import tarfile
from bs4 import BeautifulSoup
import pandas as pd 
from tqdm import tqdm_notebook as tqdm
import shutil
import codecs

# Process Data

In [8]:
# Convert every year of the NYT corpus into one CSV of article records.
#
# For each year folder: untar each monthly archive, parse every article XML
# file into a dict (all <meta> name/content pairs plus 'headline' and 'body'),
# remove the extracted files again, and write the year's records to
# ../nyt_corpus/csvs/<year>.csv.
#
# NOTE: `article_chunks` and `soup` are intentionally left as notebook-level
# names -- the sample cells further down read them after this loop finishes.
# Folders and files are visited in sorted order so the CSV row order (and the
# leftover `soup` / `article_chunks`) do not depend on filesystem ordering.
for year_folder in tqdm(sorted(glob.glob('../nyt_corpus/data/*'))):
    article_chunks = []

    ## clean up if last round failed...
    ## (only directories are removed; shutil.rmtree raises on a regular file)
    for last_run in glob.glob(os.path.join(year_folder, '*')):
        if '.tgz' not in last_run and os.path.isdir(last_run):
            shutil.rmtree(last_run)

    ## iterate through each month (archives only, so stray files are ignored)
    for month_tgz in sorted(glob.glob(os.path.join(year_folder, '*.tgz'))):
        month_dir = month_tgz.replace('.tgz', '')

        ## untar folder; the context manager closes the archive handle
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive -- only run this on archives from a trusted source.
        with tarfile.open(name=month_tgz, mode="r:gz") as tar:
            tar.extractall(path=year_folder)

        ## extract data from XML files (<month>/<day>/<article>.xml)
        day_xml_files = sorted(glob.glob(os.path.join(month_dir, '*', '*') + '.xml'))

        ## read through each article
        for xml_file in day_xml_files:
            with codecs.open(xml_file, encoding='utf-8') as xml_handle:
                content = xml_handle.read()

            # Name the parser explicitly so the result does not depend on
            # which parsers happen to be installed.
            # NOTE(review): 'lxml' is chosen because the <html><body> wrapper
            # in the sample output below matches it -- confirm it is installed.
            soup = BeautifulSoup(content, 'lxml')

            ## parse HTML
            article_chunk = {}
            for metadata in soup.find_all('meta'):
                name = metadata.get('name')
                if name is None:
                    # a <meta> without a name cannot become a column
                    continue
                article_chunk[name] = metadata.get('content')

            # Headline and body are optional: a missing element leaves the key
            # out, which pandas turns into NaN for that row.
            hedline = soup.find('hedline')
            if hedline is not None:
                article_chunk['headline'] = hedline.get_text()

            body_content = soup.find('body.content')
            if body_content is not None:
                article_chunk['body'] = body_content.get_text()

            article_chunks.append(article_chunk)

        ## clean up
        shutil.rmtree(month_dir)

    ## dump contents
    csv_dir = '../nyt_corpus/csvs/'
    if not os.path.isdir(csv_dir):
        # to_csv does not create missing parent directories
        os.makedirs(csv_dir)
    year_name = os.path.basename(year_folder)
    year_csv_fname = os.path.join(csv_dir, year_name) + '.csv'
    pd.DataFrame(article_chunks).to_csv(year_csv_fname)

HBox(children=(IntProgress(value=0, max=13), HTML(value='')))

# Sample Article XML

In [10]:
# `soup` is left over from the processing loop: it is the parse tree of the
# last XML article that loop read, so this cell only works after that cell ran.
soup

<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE nitf SYSTEM "http://www.nitf.org/IPTC/NITF/3.3/specification/dtd/nitf-3-3.dtd">
<html><body><nitf change.date="June 10, 2005" change.time="19:30" version="-//IPTC//DTD NITF 3.3//EN">
<title>Corrections: For the Record</title>
<meta content="19botcorrex-007" name="slug"/>
<meta content="19" name="publication_day_of_month"/>
<meta content="6" name="publication_month"/>
<meta content="2007" name="publication_year"/>
<meta content="Tuesday" name="publication_day_of_week"/>
<meta content="Metropolitan Desk" name="dsk"/>
<meta content="2" name="print_page_number"/>
<meta content="A" name="print_section"/>
<meta content="Corrections; New York and Region" name="online_sections"/>
<docdata>
<doc-id id-string="1855670"></doc-id>
<doc.copyright holder="The New York Times" year="2007"></doc.copyright>
<identified-content>
<classifier class="online_producer" type="types_of_material">Correction</classifier>
<classifier class="online_producer" type="tax

# Sample Article CSV

In [13]:
# Preview the first five parsed articles, transposed so each article is a column.
# `article_chunks` comes from the processing loop, which resets it for every
# year folder, so it holds only the articles of the last year processed.
pd.DataFrame(article_chunks).head(5).transpose()

Unnamed: 0,0,1,2,3,4
alternate_url,,,,,
banner,,,,,
body,\n\nBLUMENTHAL--Martin. A New York business ma...,"\n\nBRADLEY--Carol L., 84, of Tinton Falls, NJ...","\n\nCRAWFORD--Perry Jr., died at 89 on Decembe...","\n\nFLOOD--Robert Francis, husband of the late...","\n\nGEISLER--Enid (Friedman), on December 29, ..."
column_name,,,,,
correction_date,,,,,
dsk,Classified,Classified,Classified,Classified,Classified
feature_page,,,,,
headline,"\nPaid Notice: Deaths BLUMENTHAL, MARTIN\n","\nPaid Notice: Deaths BRADLEY, CAROL L.\n","\nPaid Notice: Deaths CRAWFORD, PERRY JR.\n","\nPaid Notice: Deaths FLOOD, ROBERT FRANCIS\n","\nPaid Notice: Deaths GEISLER, ENID (FRIEDMA..."
online_sections,Paid Death Notices,Paid Death Notices,Paid Death Notices,Paid Death Notices,Paid Death Notices
print_column,3,3,3,3,3
