In [18]:
import os
import pandas as pd
import seaborn as sns
import glob
from tqdm import tqdm_notebook as tqdm
from datetime import datetime
import matplotlib.pyplot as plt
import unidecode
import swifter
import spacy
import util
nlp = spacy.load('en_core_web_sm')
import nltk.corpus
import re
from sklearn.model_selection import train_test_split
%matplotlib inline

In [19]:
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
print_sections = ['A', 'B', 'C', 'D', 'E', '1', '2', '3', '4']
fnames = glob.glob('../nyt_corpus/csvs/*')

In [20]:
import uuid

In [25]:
modeling_data = []
testing_data = []

for fname in tqdm(fnames):
    year = int(os.path.basename(fname).replace('.csv', ''))  
    article_df = pd.read_csv(fname, index_col=0)
    
    ## clean up 
    article_df = (article_df
      .loc[lambda df: df[['publication_year', 'publication_month', 'publication_day_of_month']].notnull().any(axis=1)]
    )

    ## extra auxiliary columns
    article_df['print_section_and_page'] = (article_df
     .loc[lambda x: x['print_page_number'].notnull()]
     .apply(lambda x: '%s-%03.f' % (x['print_section'] , int(x['print_page_number'])), axis=1)
    )

    article_df['pub_date'] = ((article_df
        .apply(lambda x: datetime(
            year, int(x['publication_month']), 
            int(x['publication_day_of_month'])
        ), axis=1)
    ))

    ## add id column
    article_df['id'] = article_df.apply(lambda x: uuid.uuid1(), axis=1)
    
    ## filter to weekdays and top sections
    data_df = (article_df
     .loc[lambda df: df['publication_day_of_week'].isin(weekdays[:-2])]
     .loc[lambda df: df['print_section'].isin(print_sections[:-4])]
    )

    ## set index
    pub_date_df = data_df.set_index('pub_date')

    ## take sample
    for day in pub_date_df.index.unique():
        day_articles = pub_date_df.loc[day]

        if len(day_articles.shape) > 1:
            a_1 = (
                day_articles
                  .loc[lambda df: df['print_section_and_page'] == 'A-001']
                  .loc[lambda df: df['body'].notnull()==True]
                  .assign(label=1)
            )

            not_a_1 = (
                day_articles
                  .loc[lambda df: ~df['print_section_and_page'].isin(['A-001', 'A-002'])]
                  .loc[lambda df: df['body'].notnull()==True]
                  .sample(len(a_1) * 5)
                  .assign(label=0)
            )

            modeling_data.append(a_1.reset_index())
            modeling_data.append(not_a_1.reset_index())
    
    testing_data.append(pub_date_df)

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)





In [26]:
modeling_data_df = pd.concat(modeling_data)
modeling_data_df = modeling_data_df.reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [27]:
testing_data_df = pd.concat(testing_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [8]:
modeling_data_df.to_csv("exploratory_analysis/unprocessed_sampled_data_df.csv")

In [30]:
testing_data_df.to_csv("exploratory_analysis/unprocessed_full_data_df.csv")

# Split by time

In [59]:
time_balanced_train_df, time_balanced_test_df = (modeling_data_df
 .sort_values('pub_date')
 .pipe(lambda df: (df.iloc[:int(df.shape[0] * .75)], df.iloc[int(df.shape[0] * .75):]))
)

In [60]:
testing_data_df = testing_data_df.reset_index()

In [62]:
time_unbalanced_test_df = testing_data_df.loc[lambda df: df['pub_date'] > time_balanced_train_df['pub_date'].max()]

In [63]:
time_unbalanced_test_df.to_csv("exploratory_analysis/unprocessed_test_time_unbalanced_df.csv")
time_balanced_train_df.to_csv("exploratory_analysis/unprocessed_train_time_balanced_df.csv")
time_balanced_test_df.to_csv("exploratory_analysis/unprocessed_test_time_balanced_df.csv")

In [2]:
time_balanced_test_df = pd.read_csv("exploratory_analysis/unprocessed_test_time_balanced_df.csv", index_col=0)

# Preprocess data

In [4]:
modeling_data_df = pd.read_csv("exploratory_analysis/unprocessed_sampled_data_df.csv", index_col=0)

In [3]:
page_num_regex = re.compile('\s[a-f]\d+(\s|$)')
specific_stop_words = [
    'article',
    'page',
    'sportsmonday',
    'sportstuesday',
    'sportswednesday',
    'sportsthursday',
    'sportsfriday',
    'sportssaturday',
    'sportssunday',
    'times',
    'caption',
    'science times',
    'business day',
    'editing error page',
    'ap sports',
    'ap',
    'reuters',
    'op ed contributor',
    'books times',
    'music review',
    'op ed',
    'sports times',
    'articles , pages',
    'articles pages',
    'special today',
    'science f1',
    'art review',
    'television review',
    'articles series',
    'ed contributor',
    'news briefs',
    'articles series',
    'news analysis',
    'sports people',
    'company news',
    'metro : new york',
    'metro : new jersey',
    'metro : new york city',
    'metro : new york state',
    'lead : editor',
    'op - ed',
    'company reports',
    'dance review',
    'theater review',
    'public lives',
    'world business , section w',
    'world business briefing : europe',
    'world business briefing : asia',
    'world business briefings : middle east',
    'world business briefing : africa',
    'world business briefing : americas',
    'world business briefings : europe',
    'world business briefings : asia',
    'world business briefing : world trade'
]

english_stopwords = [item.strip('\n') for item in nltk.corpus.stopwords.open('english')]
stopwords = specific_stop_words + english_stopwords

def preprocess(body):
    """preprocess with spacy."""
    try:
        ### spacy split
        text = body.split()
        text = ' '.join(text)
        doc = nlp(text)
        text = [word.text for word in doc]
        body = ' '.join(text).replace('\' \'', '"')

        ### other cleanup
        body = body.lower()
        body = body.strip()
        
        ### replace stopwords
        for stopword in stopwords:
            ## stopword in body
            body = body.replace(' ' + stopword + ' ', ' ')
            ## stopword at start
            if body[:len(stopword + ' ')] == (stopword + ' '):
                body = body[len(stopword + ' '):]
            ## stopword at end
            if body[-len(' ' + stopword):] == (' ' + stopword):
                body = body[:-len(' ' + stopword)]

        ### replace page numbers
        body = re.sub(page_num_regex, ' ', body)
       
        return unidecode.unidecode(body)
    except:
        return ''
    
    
def preprocess_lite(body):
    """preprocess without spacy."""
    body = body.strip()
    
    ### replace stopwords
    for stopword in stopwords:
        ## stopword in body
        body = body.replace(' ' + stopword + ' ', ' ')
        ## stopword at start
        if body[:len(stopword + ' ')] == (stopword + ' '):
            body = body[len(stopword + ' '):]
        ## stopword at end
        if body[-len(' ' + stopword):] == (' ' + stopword):
            body = body[:-len(' ' + stopword)]
        
        
    ### replace page numbers
    body = re.sub(page_num_regex, ' ', body)
        
    return unidecode.unidecode(body)

In [52]:
processed_bodies = []

for processed_body in tqdm(util.multiprocess(modeling_data_df['body'], preprocess), total=len(modeling_data_df)):
    processed_bodies.append(processed_body)

HBox(children=(IntProgress(value=0, max=91054), HTML(value='')))

In [273]:
modeling_data_df['processed_bodies'] = modeling_data_df['processed_bodies'].swifter.apply(preprocess_lite)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=91054, style=ProgressStyle(description_wid…

In [257]:
modeling_data_df['processed_bodies'] = pd.Series(processed_bodies)

In [274]:
modeling_data_df.to_csv("exploratory_analysis/processed_sampled_data_df.csv")

# Write for fasttext

In [297]:
lines = []
for line in modeling_data_df.apply(lambda x: '%s __label__%d' % (x['processed_bodies'], x['label']), axis=1):
    lines.append(line)

In [302]:
## write full dataset
with open('exploratory_analysis/fasttext_processed_data.txt', 'w') as f:
    for line in lines:
        f.write(line)
        f.write('\n')

In [300]:
lines_train, lines_test = train_test_split(lines)

In [303]:
## write training
with open('exploratory_analysis/fasttext_processed_data_train.txt', 'w') as f:
    for line in lines_train:
        f.write(line)
        f.write('\n')
        
## write test
with open('exploratory_analysis/fasttext_processed_data_test.txt', 'w') as f:
    for line in lines_test:
        f.write(line)
        f.write('\n')

In [13]:
### write time-stratified data

In [15]:
time_balanced_train_df = pd.read_csv("data/processed_train_time_balanced_df.csv", index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [17]:
lines = []
for line in time_balanced_train_df.apply(lambda x: '%s __label__%d' % (x['processed_bodies'], x['label']), axis=1):
    lines.append(line)

with open('data/fasttext_processed_data_train_balanced.txt', 'w') as f:
    for line in lines:
        f.write(line)
        f.write('\n')

In [8]:
time_unbalanced_test_df = pd.read_csv("data/processed_test_time_unbalanced_df.csv", index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
lines = []
for line in time_unbalanced_test_df.apply(lambda x: '%s __label__%d' % (x['processed_bodies'], x['label']), axis=1):
    lines.append(line)

with open('data/fasttext_processed_data_test_unbalanced.txt', 'w') as f:
    for line in lines:
        f.write(line)
        f.write('\n')

# Preprocess time splits

In [69]:
time_unbalanced_test_df['processed_bodies'] = time_unbalanced_test_df['body'].swifter.apply(preprocess)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=276882, style=ProgressStyle(description_wi…




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [None]:
time_unbalanced_test_df = (
    time_unbalanced_test_df
    .loc[lambda df: df['body'].notnull()==True]
    .loc[lambda df: df['print_section_and_page'] != 'A-002']
)
time_unbalanced_test_df['label'] = (
    time_unbalanced_test_df
    .apply(lambda x: 1 if (x['print_section_and_page'] == 'A-001') else 0, axis=1)
)

In [71]:
time_unbalanced_test_df.to_csv("exploratory_analysis/processed_test_time_unbalanced_df.csv")

In [72]:
time_balanced_train_df['processed_bodies'] = time_balanced_train_df['body'].swifter.apply(preprocess)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=68290, style=ProgressStyle(description_wid…




In [73]:
time_balanced_train_df.to_csv("exploratory_analysis/processed_train_time_balanced_df.csv")

In [5]:
time_balanced_test_df['processed_bodies'] = time_balanced_test_df['body'].swifter.apply(preprocess)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=22764, style=ProgressStyle(description_wid…




In [6]:
time_balanced_test_df.to_csv("exploratory_analysis/processed_test_time_balanced_df.csv")