In [6]:
import glob
import os
import tarfile
from bs4 import BeautifulSoup
import pandas as pd 
from tqdm.auto import tqdm
import shutil
import codecs

In [None]:
for year_folder in tqdm(glob.glob('../data/nyt-ldc/nyt_corpus/data/*')):
    article_chunks = []
    
    ## clean up if last round failed...
    for last_run in filter(lambda x: '.tgz' not in x, glob.glob(os.path.join(year_folder, '*'))):
        shutil.rmtree(last_run)
    
    ## iterate through each month
    for month_tgz in glob.glob(os.path.join(year_folder, '*')):
        ## untar folder
        tar = tarfile.open(name=month_tgz, mode="r:gz")
        tar.extractall(path=year_folder)

        ## extract data from XML files
        day_xml_files = glob.glob(os.path.join(month_tgz.replace('.tgz', ''), '*', '*') + '.xml')

        ## read through each article
        for xml_file in day_xml_files:
            content = codecs.open(xml_file, encoding='utf-8').read()
            soup = BeautifulSoup(content, features="xml")

            ## parse HTML 
            article_chunk = {}
            for metadata in soup.find_all('meta'):
                name = metadata['name']
                cont = metadata['content']
                article_chunk[name] = cont
            
            if soup.find('hedline'):
                article_chunk['headline'] = soup.find('hedline').get_text()
            article_chunk['body'] = soup.find('body.content').get_text()

            article_chunks.append(article_chunk)

        ## clean up
        shutil.rmtree(month_tgz.replace('.tgz', ''))
        
    ## dump contents
    year_name = os.path.basename(year_folder)
    year_csv_fname = os.path.join('../data/nyt-ldc/nyt_corpus/csvs/', year_name) + '.csv'
    pd.DataFrame(article_chunks).to_csv(year_csv_fname)

  0%|          | 0/21 [00:00<?, ?it/s]

# Look at Data

In [23]:
import glob

In [25]:
csv_files = glob.glob('../data/nyt-ldc/nyt_corpus/*')

In [77]:
all_data = []
for f in tqdm(csv_files):
    data_df = pd.read_csv(f, index_col=0)
    data_df = (
        data_df
            .loc[lambda df: df['body'].notnull()]
            .loc[lambda df: ~df['body'].str.contains('COMPANY REPORTS')]
    )
    
    data_df['body'] = (
        data_df['body']
         .str.strip()
         .str.split('\n')
         .apply(lambda x: list(filter(lambda y: not y.startswith('LEAD:'), x)))
         .apply(lambda x: list(filter(lambda y: y.strip() != '', x)))
         .str.join('\n')
    )
    all_data.append(data_df)

  0%|          | 0/21 [00:00<?, ?it/s]

  data_df = pd.read_csv(f, index_col=0)
  data_df = pd.read_csv(f, index_col=0)
  data_df = pd.read_csv(f, index_col=0)
  data_df = pd.read_csv(f, index_col=0)
  data_df = pd.read_csv(f, index_col=0)
  data_df = pd.read_csv(f, index_col=0)


In [127]:
all_data_df = pd.concat(all_data)
all_data_df = (all_data_df
                   .loc[lambda df: df['online_sections'].notnull()]
                   .loc[lambda df: ~df['online_sections'].str.contains('Paid Death Notices')]
                   .loc[lambda df: ~df['online_sections'].str.contains('Corrections')]               
              )
all_data_df = all_data_df.loc[lambda df: ~df['headline'].str.strip().str.lower().isin(headlines_to_exclude)]
all_data_df = (
    all_data_df
         .loc[lambda df: df['body'].str.len() > 3_500]
         .loc[lambda df: df['body'].str.len() < 10_000]
)

In [186]:
import string
import re

In [198]:
def replace_punct(s, include_space=False):
    for p in string.punctuation:
        if include_space:
            s = s.replace(p, ' ')
        else:
            s = s.replace(p, '')
    return s
    
def make_id(x):
    ## make section
    section = x['dsk']
    if pd.isnull(section):
        section = 'other'
    else:
        section = replace_punct(section, include_space=True).lower().strip().split()[0]

    ## make headline slug
    headline = x['headline']
    if pd.isnull(headline):
        headline = 'no-headline'
    else:
        headline = replace_punct(headline, include_space=True)
        headline = re.sub('\s+', ' ', headline)
        headline = '-'.join(headline.split()[:8]).lower()

    year = str(int(x['publication_year'])).zfill(4)
    month = str(int(x['publication_month'])).zfill(2)
    day = str(int(x['publication_day_of_month'])).zfill(2)
    return f"{year}/{month}/{day}/{section}/{headline}"

In [200]:
all_data_df['id'] = all_data_df.apply(make_id, axis=1)

In [201]:
all_data_df.to_csv('../data/nyt-ldc/nyt-ldc-docs-to-score.csv.gz', compression='gzip')

In [202]:
all_data_df.head()

Unnamed: 0,publication_day_of_month,publication_month,publication_year,publication_day_of_week,dsk,print_page_number,print_section,print_column,online_sections,headline,body,banner,correction_date,series_name,slug,feature_page,column_name,alternate_url,id
10,3.0,2.0,1990.0,Saturday,Style Desk,35.0,1,1.0,Style,\nCONSUMER'S WORLD: Coping; With Tired Bathr...,Your 1950's bathroom is showing its age. It lo...,,,,,,,,1990/02/03/style/consumer-s-world-coping-with-...
11,3.0,2.0,1990.0,Saturday,Metropolitan Desk,31.0,1,1.0,New York and Region,"\nDinkins Names 6 Officials, Including Head of...",Mayor David N. Dinkins said yesterday that he ...,,,,,,,,1990/02/03/metropolitan/dinkins-names-6-offici...
13,3.0,2.0,1990.0,Saturday,Style Desk,35.0,1,4.0,Style,\nCar Makers Fight to Reclaim Market in Replac...,An embattled automobile industry is mounting a...,,,,,,,,1990/02/03/style/car-makers-fight-to-reclaim-m...
18,3.0,2.0,1990.0,Saturday,Foreign Desk,3.0,1,1.0,World,\nPinochet Gets Role in Inauguration of New Le...,After initially seeking ways to avoid taking p...,,,,,,,,1990/02/03/foreign/pinochet-gets-role-in-inaug...
21,3.0,2.0,1990.0,Saturday,Sports Desk,49.0,1,1.0,Sports,\nOne on One With Magic And Michael\n,A one-on-one basketball game between Magic Joh...,,,,,,,,1990/02/03/sports/one-on-one-with-magic-and-mi...


In [144]:
pd.read_csv('../data/nyt-ldc/nyt-ldc-docs-to-score.csv.gz', skiprows=range(1,1), nrows=10)

Unnamed: 0.1,Unnamed: 0,publication_day_of_month,publication_month,publication_year,publication_day_of_week,dsk,print_page_number,print_section,print_column,online_sections,headline,body,banner,correction_date,series_name,slug,feature_page,column_name,alternate_url
0,10,3.0,2.0,1990.0,Saturday,Style Desk,35.0,1,1.0,Style,\nCONSUMER'S WORLD: Coping; With Tired Bathr...,Your 1950's bathroom is showing its age. It lo...,,,,,,,
1,11,3.0,2.0,1990.0,Saturday,Metropolitan Desk,31.0,1,1.0,New York and Region,"\nDinkins Names 6 Officials, Including Head of...",Mayor David N. Dinkins said yesterday that he ...,,,,,,,
2,13,3.0,2.0,1990.0,Saturday,Style Desk,35.0,1,4.0,Style,\nCar Makers Fight to Reclaim Market in Replac...,An embattled automobile industry is mounting a...,,,,,,,
3,18,3.0,2.0,1990.0,Saturday,Foreign Desk,3.0,1,1.0,World,\nPinochet Gets Role in Inauguration of New Le...,After initially seeking ways to avoid taking p...,,,,,,,
4,21,3.0,2.0,1990.0,Saturday,Sports Desk,49.0,1,1.0,Sports,\nOne on One With Magic And Michael\n,A one-on-one basketball game between Magic Joh...,,,,,,,
5,49,3.0,2.0,1990.0,Saturday,Financial Desk,38.0,1,1.0,Business,\nYour Money; Planning Now For '90 Taxes\n,As taxpayers go over their financial records i...,,,,,,,
6,56,3.0,2.0,1990.0,Saturday,Financial Desk,1.0,1,2.0,Business,\nCompany News; U.S. Biotechnology Leader to...,"Genentech Inc., the crown jewel of the United ...",,,,,,,
7,57,3.0,2.0,1990.0,Saturday,Metropolitan Desk,29.0,1,5.0,Health; New York and Region,\nCUNY Panel Says Doctors Fail to Serve As Pro...,City College opened the Sophie Davis School of...,,,,,,,
8,64,3.0,2.0,1990.0,Saturday,National Desk,11.0,1,1.0,U.S.,\nFight On Cleanup Of Harbor Goes On\n,Long after the last vote was counted in the 19...,,,,,,,
9,65,3.0,2.0,1990.0,Saturday,Metropolitan Desk,29.0,1,2.0,New York and Region,\nSurprise Witness for Gotti: Victim of the '8...,A former union official who prosecutors say wa...,,,,,,,
