# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from w266_common import utils, vocabulary

# Data

In [2]:
# Load the NYT corpus sample and keep its row count in `n`.
df = pd.read_csv('data/nyt_corpus.csv')
n = len(df)
df.shape

(10000, 32)

In [3]:
# List every column available in the loaded corpus dataframe.
df.columns

Index(['abstract', 'author_info', 'copyright_holder', 'copyright_year',
       'descriptor', 'desk', 'full_text', 'general_descriptor', 'headline',
       'id', 'indexing_descriptor', 'indexing_location', 'indexing_org',
       'indexing_person', 'lead_paragraph', 'length', 'length_unit',
       'normalized_byline', 'online_headline', 'online_sections',
       'print_byline', 'print_column', 'print_page_number', 'print_section',
       'publication_day_of_month', 'publication_day_of_week',
       'publication_month', 'publication_year', 'series_name',
       'taxonomic_classifier', 'title', 'types_of_material'],
      dtype='object')

### Nouns

In [8]:
from nltk.corpus import treebank

In [14]:
# Peek at one pre-tokenized sentence from NLTK's treebank corpus reader.
treebank.sents()[100]

['Alan',
 'Spoon',
 ',',
 'recently',
 'named',
 '*',
 'Newsweek',
 'president',
 ',',
 'said',
 '0',
 'Newsweek',
 "'s",
 'ad',
 'rates',
 'would',
 'increase',
 '5',
 '%',
 'in',
 'January',
 '.']

### Lemmas

### Labels

In [4]:
# Candidate label columns (last 10 rows shown) - `desk` works best.
df.loc[:, ['desk', 'general_descriptor', 'online_sections', 'taxonomic_classifier']].tail(10)

Unnamed: 0,desk,general_descriptor,online_sections,taxonomic_classifier
9990,Editorial Desk,"['Budgets and Budgeting', 'National Debt (US)'...",Opinion,"['Top/Opinion', 'Top/Opinion/Opinion', 'Top/Ne..."
9991,Foreign Desk,"['Immigration and Refugees', 'Jews', 'Music', ...",World,"['Top/News', 'Top/News/World/Countries and Ter..."
9992,Foreign Desk,['Airlines and Airplanes'],,['Top/Classifieds/Job Market/Job Categories/Ma...
9993,Metropolitan Desk,"['Murders and Attempted Murders', 'Basketball'...",New York and Region,"['Top/News/U.S./U.S. States, Territories and P..."
9994,Book Review Desk,['Books and Literature'],Arts; Books,"['Top/Features/Books/Book Reviews', 'Top/Featu..."
9995,Editorial Desk,,Opinion,"['Top/Opinion/Opinion/Letters', 'Top/Opinion',..."
9996,National Desk,,U.S.,"['Top/News/U.S.', 'Top/News']"
9997,Metropolitan Desk,,New York and Region,['Top/News/New York and Region']
9998,Classified,,Paid Death Notices,['Top/Classifieds/Paid Death Notices']
9999,Leisure/Weekend Desk,,Arts,['Top/Features/Arts']


In [5]:
# Fraction of missing values in each candidate label column.
df[['desk', 'general_descriptor', 'online_sections', 'taxonomic_classifier']].isnull().mean()

desk                    0.0041
general_descriptor      0.2140
online_sections         0.0244
taxonomic_classifier    0.0144
dtype: float64

In [6]:
# Article counts per raw desk name; these names will be cleaned up
# for the final dataframe.
df['desk'].groupby(df['desk']).count()

desk
A Nation Challenged                                                          1
Adventure Sports                                                             2
Arts & Ideas/Cultural Desk                                                  18
Arts & Leisure Desk                                                         29
Arts and Leisure Desk                                                      129
Automobiles                                                                  9
Book Review Desk                                                           176
Business World Magazine                                                      1
Business/Finance Desk                                                        1
Business/Financial Desk                                                    628
Business\Financial Desk                                                      1
Cars                                                                         4
Circuits                                       

In [237]:
def clean_labels(c):
    """Normalize raw desk names into a smaller set of lowercase labels.

    The values are lowercased and stripped, then an ordered list of
    substitutions is applied, and the result is stripped again.

    Every substitution states explicitly whether its pattern is a regular
    expression. The pandas default for ``Series.str.replace(regex=...)`` is
    ``False`` from pandas 2.0 on, so relying on the default would make the
    alternation/wildcard rules below silently match nothing.

    Args:
        c: pandas Series of desk names (anything exposing the ``.str``
            accessor). Missing values are passed through unchanged.

    Returns:
        A Series of the same length holding the cleaned labels.
    """
    # (pattern, replacement, is_regex). Order matters: later rules rely on the
    # output of earlier ones (e.g. ' and ' -> ' & ' must run before 'arts & .*',
    # and 'new jersey.*' before the '... weekly' merge).
    rules = [
        ('desk', '', False),
        (';', '', False),
        (' and ', ' & ', False),
        ('\\', '/', False),  # a single literal backslash
        (r'arts & .*|cultural|museums|the arts/cultural|.*weekend.*', 'arts', True),
        ('automobiles', 'cars', False),
        (r'classifed|classifieds|job market', 'classified', True),
        (r'.*dining out.*', 'dining', True),
        (r'education.*', 'education', True),
        (r'business/financ.*|business world magazine|e-commerce|.*money.*financ.*|sundaybusiness',
         'business', True),
        ('health&fitness', 'health & fitness', False),
        # NOTE(review): 'home' also matches inside a value that already reads
        # 'home & garden', which would yield 'home & garden & garden' - confirm
        # against the full list of desk names.
        (r'home|house & home/style', 'home & garden', True),
        ('metropolitian', 'metropolitan', False),
        (r'new jersey.*', 'new jersey weekly', True),
        (r'connecticut weekly|new jersey weekly|long island weekly|the city weekly.*|westchester weekly',
         'city & region weekly', True),
        (r'thursday styles|styles of the times', 'style', True),
        (r'.*design.*magazine|.*fashion.*magazine|.*style.*magazine|.*travel.*magazine|t: \w+.*',
         't magazine', True),
        (r'adventure sports|sports sports', 'sports', True),
        (r'circuits|flight', 'technology', True),
    ]

    c = c.str.lower().str.strip()
    for pattern, replacement, is_regex in rules:
        c = c.str.replace(pattern, replacement, regex=is_regex)
    # Removing 'desk' and similar fragments can leave stray edge whitespace.
    return c.str.strip()

# Clean the desk names into a one-column label frame, then recount per label.
labels = clean_labels(df['desk']).to_frame()
labels.groupby('desk')['desk'].count()

desk
a nation challenged                                                          1
arts                                                                       882
book review                                                                176
business                                                                   725
cars                                                                        13
city & region weekly                                                       558
classified                                                                 734
dining                                                                      47
editorial                                                                  666
education                                                                   13
escapes                                                                     18
financial                                                                 1108
foreign                                        

In [243]:
# Number of distinct labels that have at most 10 articles each.
# Observed on this sample:
#   >10 articles:  28 categories with 9,935 articles
#   <=10 articles: 14 categories with 24 articles
#   null category: 41 articles
(
    labels
    .groupby('desk')
    .filter(lambda group: group['desk'].count() <= 10)
    .groupby('desk')['desk']
    .count()
    .count()
)

14

In [143]:
# Assemble the working frame: cleaned label plus the two text fields.
df_final = pd.concat(
    [
        labels,
        df['full_text'],
        df['lead_paragraph'],
        # df.nouns,   # currently disabled
        # df.lemmas,  # currently disabled
    ],
    axis=1,
)
df_final.head()

Unnamed: 0,desk,full_text,lead_paragraph
0,financial,"The Bethlehem Steel Corporation, after report...","The Bethlehem Steel Corporation, after report..."
1,foreign,Seeking to raise morale at home and improve i...,Seeking to raise morale at home and improve i...
2,editorial,By the time Lord Elgin obtained the authority...,By the time Lord Elgin obtained the authority...
3,classified,SPEIER-Claire. Born New York City. Age 70. Gr...,SPEIER-Claire. Born New York City. Age 70. Gr...
4,national,LEAD: The Japanese videotape begins with the ...,LEAD: The Japanese videotape begins with the ...


In [242]:
# Inspect how rows from the 'The Business of Green' desk are classified
# by the other candidate label columns.
df.loc[
    df['desk'] == 'The Business of Green',
    ['desk', 'general_descriptor', 'online_sections', 'taxonomic_classifier'],
].tail(10)

Unnamed: 0,desk,general_descriptor,online_sections,taxonomic_classifier
4690,The Business of Green,"['Oil (Petroleum) and Gasoline', 'Chemicals', ...",Business,"['Top/News/Business', 'Top/News/Science/Enviro..."


### Convert to Vocabulary Objects

In [90]:
from nltk.tokenize.treebank import TreebankWordTokenizer

def create_vocab(articles):
    """Build a vocabulary object from a collection of article texts.

    Null entries are skipped. Each remaining article is split with NLTK's
    TreebankWordTokenizer; the per-article token lists are flattened with
    utils.flatten, passed through utils.canonicalize_words, and used to
    construct a vocabulary.Vocabulary with size=None.

    Args:
        articles: object exposing .tolist() (e.g. a pandas Series) whose
            items are article strings or nulls.

    Returns:
        The Vocabulary instance, with an extra `token_count` attribute set to
        the number of canonicalized tokens.
    """
    word_tokenizer = TreebankWordTokenizer()

    tokenized = []
    for text in articles.tolist():
        if pd.isnull(text):
            continue
        tokenized.append(word_tokenizer.tokenize(text))

    canonical = utils.canonicalize_words(utils.flatten(tokenized))
    vocab = vocabulary.Vocabulary(canonical, size=None)
    # Stored on the instance so callers can report corpus size next to vocab size.
    vocab.token_count = len(canonical)
    return vocab

In [91]:
# Build the vocabularies from the first 1,000 articles only.
full_text_vocab = create_vocab(df_final['full_text'].head(1000))
lead_paragraph_vocab = create_vocab(df_final['lead_paragraph'].head(1000))
# noun_vocab = create_vocab(df_final.nouns)    # currently disabled
# lemma_vocab = create_vocab(df_final.lemmas)  # currently disabled

In [92]:
from IPython.display import display, HTML

# Summarize each vocabulary (size and token count) in a small HTML table.
vocab_rows = [
    ('Full Text', full_text_vocab),
    ('Lead Paragraph', lead_paragraph_vocab),
    # ('Nouns', noun_vocab),    # currently disabled
    # ('Lemmas', lemma_vocab),  # currently disabled
]

html = '<table><tr><th>Article Type</th><th>Vocabulary Size</th><th>Token Count</th></tr>'
for row_label, row_vocab in vocab_rows:
    html += '<tr><td>{}</td><td>{:,}</td><td>{:,}</td></tr>'.format(
        row_label, row_vocab.size, row_vocab.token_count)
# Every row closes its own <tr>, so only the table itself is closed here
# (the previous version appended a stray '</tr>' first).
html += '</table>'
display(HTML(html))

Article Type,Vocabulary Size,Token Count
Full Text,45060,632031
Lead Paragraph,15296,91684


In [161]:
# NOTE(review): the saved output below is a bare integer, which presumably came
# from a different expression (e.g. a length) - re-run this cell to confirm.
full_text_vocab.wordset

46063