# DSCI 511-900 Final Project
#### Name: Alec Peterson
#### Email: ap3842@drexel.edu

# Import modules

In [1]:
# requests, json - for API calls
import requests
import json

# Pandas - for dataframes
import pandas as pd

# spaCy - for nlp
import spacy

# timeit - for timing operations
import timeit

# time - for pausing between loop iterations to adhere to API call rate limit
import time

import pyarrow as pa # For changing dataframes to Apache Arrow tables
import pyarrow.parquet as pq # For saving Arrow tables as Apache Parquet files

In [2]:
# Modify spacy pipeline to only include tokenizer and named entity recognition (ner)
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])

# Define functions

## `gen_url(year_no, month_no)`
__Make URL for API call to New York Times' Archive API__

In [3]:
# Define a function to generate a url given input year number (year_no) and month number (month_no, a number 1 through 12)
def gen_url(year_no, month_no):
    
    url = "https://api.nytimes.com/svc/archive/v1/{}/{}.json?api-key={api_key}".format(str(year_no),
                                                                                str(month_no),
                                                                                api_key = "jQd6lZrlLaHAN4oAvYIvGzkq6tGBYTXO") # api_key is unique to user. Please do not distribute.
    
    return url

## `doc_df(docs)`
__Iterate over document list (articles) and return a pandas dataframe__

In [4]:
def doc_df(docs):
    import pandas as pd
    from collections import defaultdict
    
    doc_keys = ["pub_date", "abstract", "web_url", 
                "lead_paragraph","document_type", "news_desk",
                "section_name", "type_of_material", "source", 
                "headline", "byline", "_id", 
                "word_count"]
        
    doc_dict = defaultdict(list)
        
    for key in doc_keys:
        for doc in docs:
            
            # headline is a dict
            if key == "headline":
                doc_dict[key].append(doc[key]["main"])
                
            # get "original" string instead of dealing with "person" dictionary which may not be present
            elif key == "byline":
                doc_dict[key].append(doc[key]["original"])
                
            # all other keys
            else:
                doc_dict[key].append(doc[key])
    
    return pd.DataFrame(doc_dict)

## `parse_docs(year_no, month_no)`
__Generate URL -> call API -> call doc_df() to generate dataframe__

In [5]:
def parse_docs(year_no, month_no):
    
    # Generate the url
    url = gen_url(year_no, month_no)
    response = requests.get(url)
    results = response.json()
    docs = results["response"]["docs"] # a list of documents, each item is a dictionary
    
    # Generate the dataframe
    df = doc_df(docs)
    
    return df

## `make_df_NYTarchive(year_start, year_end)`
__Iterate over every month for all years from year_start to year_end (exclusive), and return a large dataframe. Also prints time it takes to do so (~2 minutes / year)__

In [6]:
def make_df_NYTarchive(year_start, year_end):
    
    # Timer start
    start_time = timeit.default_timer()

    # Collect dataframes for each year and month, calling the parse_docs function
    df_list = []
    for year in range(year_start, year_end):
        for month in range(1, 13):
            # Start timer
            parse_time_start = timeit.default_timer()
            
            # Call parse_docs, then append the resulting dataframe to df_list
            df = parse_docs(year, month)
            df_list.append(df)
            
            # End timer
            parse_time_end = timeit.default_timer()
            
            # Wait at least 6 seconds between API calls
            parse_time_total = parse_time_end - parse_time_start
            if parse_time_total > 6:
                continue
            else:
                time.sleep(6 - parse_time_total)
    
    # Make a "big" dataframe combining the dataframes for each year-month's articles
    df_big = df_list[0]
    for i in range(1, len(df_list)):
        df_big = pd.concat([df_big, df_list[i]])
    
    ## Reindex the big dataframe
    df_big.index = pd.RangeIndex(len(df_big))
    
    ## Change pub_date to date, add "year" and "month" column
    df_big["pub_date"] = pd.to_datetime(df_big["pub_date"])
    df_big["pub_date"] = df_big["pub_date"].apply(lambda x: x.date())
    df_big["year"] = df_big["pub_date"].apply(lambda x: x.year)
    df_big["month"] = df_big["pub_date"].apply(lambda x: x.month)
    
    ## Reorder the columns
    df_big = df_big.loc[:, ["_id", "pub_date", "year", "month", "headline", "abstract", "lead_paragraph", "byline", 
                            "word_count", "document_type", "news_desk", "section_name", "type_of_material", "source", "web_url"]]
    
    '''
    ## Optional: Change specific text columns to dtype = "string[pyarrow]" for more efficient memory representation / downstream processing
    df_big["headline"] = df_big["headline"].astype("string[pyarrow]")
    df_big["abstract"] = df_big["abstract"].astype("string[pyarrow]")
    df_big["lead_paragraph"] = df_big["lead_paragraph"].astype("string[pyarrow]")
    '''
    
    # Timer elapsed
    elapsed = timeit.default_timer() - start_time
    print("Elapsed time: {} minutes".format(elapsed/60))
    return df_big

# Generate Desired Dataframe

Make a smaller "test" dataframe with only a year's worth of data...

In [7]:
df_test = make_df_NYTarchive(1990, 1991) # Should take ~2 minutes to run, will get all articles from 1990 up to 1991

Elapsed time: 1.592971148333333 minutes


<br>
...Or make a "big" dataframe with a decade's worth of data.

In [None]:
df_big = make_df_NYTarchive(1990, 2000) # Should take ~20 - 25 minutes to run, will get all articles from 1990 up to 2000

# Saving as .parquet file with `pyarrow`

* Apache Parquet files (.parquet, https://parquet.apache.org/) are a columnar storage format that have more efficient storage and can be read in faster than the equivalent .csv file
* Parquet files can be efficiently read and written in Python with the `pyarrow` package, which takes advantage of a language-independent in-memory columnar data Apache Arrow (https://arrow.apache.org/) format
    * pandas can also read in Parquet files using `pandas.read_parquet()`, which is faster than pandas.read_csv() (by ~3x)
   
A .csv for a decade's worth of data could be ~0.8 to ~1 gigabyte. Parquet is a more efficient storage format. Optionally, analyses maybe done with the more efficient in-memory Arrow representation through packages like `pyarrow`, `ibis`, or `polars`.

## `save_as_parquet(year_start, year_end`
__Call `make_df_NYTarchive()` and save the table as a .parquet file in local directory__

In [13]:
def save_as_parquet(year_start, year_end):
    import pyarrow as pa # import pyarrow
    import pyarrow.parquet as pq # shorten parquet module from pyarrow
    
    df_big = make_df_NYTarchive(year_start, year_end)
    
    table = pa.Table.from_pandas(df_big)
    pq.write_table(table ,"{}_to_{}.parquet".format(year_start, year_end))
    
    print("Dataframe saved as {}_to_{}.parquet".format(year_start, year_end))
    return df_big

# Generate `spaCy` doc objects for - headline, abstract, lead_paragraph

## headline_docs, abstract_docs, lead_para_docs

In [8]:
print(nlp.pipeline) # Verify pipepline components

[('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x000001D8FC8C32E0>)]


### Either operate on df_test (and reassign to df_big), df_big generated in earlier cell, or load .parquet file

In [9]:
# Run if using df_test from earlier cell i.e. importing only 1 year's worth of data
df_big = df_test

In [40]:
# If didn't want to wait for make_df_NYT_archive(), can load local .parquet file
df_big = pd.read_parquet("1990_to_2000.parquet") # Example...

### Make list of spaCy doc objects with `nlp.pipe()`

In [None]:
# Make list of spaCy doc objects from text columns
headline_docs = list(nlp.pipe(df_big["headline"]))

abstract_docs = list(nlp.pipe(df_big["abstract"]))

lead_para_docs = list(nlp.pipe(df_big["lead_paragraph"]))


# nlp.pipe() allows multithreading with the n_process argument. For this task, using n_process = 2 gave 40-50% speed increase
# Using n_process = 3 or higher gave diminishing returns on my computer
'''
headline_docs = list(nlp.pipe(df_big["headline"], n_process=2))

abstract_docs = list(nlp.pipe(df_big["abstract"], n_process=2))

lead_para_docs = list(nlp.pipe(df_big["lead_paragraph"], n_process=2))
'''

### Make a dictionary, `article_dict` containing article id, pub_date, headline, abstract, lead_paragraph, and associated lemmas and entities that will eventually become a dataframe

In [20]:
article_dict = {}
for i in range(len(df_big["_id"])):
    article_dict[df_big["_id"][i]] = {"pub_date": df_big["pub_date"][i],
                                  
                                      # headline and its tokens, and entities
                                      "headline": df_big["headline"][i],

                                      "lemmas_headline": [token.lemma_ for token in headline_docs[i] 
                                                          if not token.is_stop and not token.is_punct and not token.is_space and not token.like_num],
                                      "entities_headline": list(headline_docs[i].ents),

                                      # abstract and its tokens and entities
                                      "abstract": df_big["abstract"][i],

                                      "lemmas_abstract": [token.lemma_ for token in abstract_docs[i] 
                                                          if not token.is_stop and not token.is_punct and not token.is_space and not token.like_num],
                                      "entities_abstract": list(abstract_docs[i].ents),
                                      
                                      # lead paragraph and its tokens and entities
                                      "lead_paragraph": df_big["lead_paragraph"],
                                      
                                      "lemmas_lead_para": [token.lemma_ for token in lead_para_docs[i] 
                                                           if not token.is_stop and not token.is_punct and not token.is_space and not token.like_num],
                                      "entities_lead_para": list(lead_para_docs[i].ents)}


## df2_big - turn `article_dict` into dataframe

In [21]:
df2_big = pd.DataFrame(article_dict).transpose()

# Make sure "pub_date" is a date
df2_big["pub_date"] = pd.to_datetime(df2_big["pub_date"])
df2_big["pub_date"] = df2_big["pub_date"].apply(lambda x: x.date())
                                     
df2_big.head()

Unnamed: 0,pub_date,headline,lemmas_headline,entities_headline,abstract,lemmas_abstract,entities_abstract,lead_paragraph,lemmas_lead_para,entities_lead_para
nyt://article/065d970c-0342-5066-a441-f59423263e3d,1990-01-01,Bridge,[Bridge],[],LEAD: One of the many sad bridge stories of 19...,"[LEAD, sad, bridge, story, concern, final, Aus...","[(One), (1989), (the, Australian, National, Te...",0 One of the many sad bridge stories of...,"[sad, bridge, story, concern, final, Australia...","[(One), (1989), (the, Australian, National, Te..."
nyt://article/0a0e2668-b979-56d1-b675-c7f521131236,1990-01-01,He Has Tyson On His Mind,"[Tyson, Mind]",[],LEAD: THOSE on Donovan Ruddock's Christmas-car...,"[LEAD, Donovan, Ruddock, Christmas, card, list...","[(Donovan, Ruddock, 's), (Christmas), (this, y...",0 One of the many sad bridge stories of...,"[Donovan, Ruddock, Christmas, card, list, get,...","[(Donovan, Ruddock, 's), (Christmas), (this, y..."
nyt://article/0a71f8ee-d649-5ae8-9524-e2f1d6d5a0aa,1990-01-01,"For Dinkins, Pomp, Ceremony, Triumph And a Dre...","[Dinkins, Pomp, Ceremony, Triumph, Dream, Real...","[(Dinkins), (Pomp)]",LEAD: The walls of David N. Dinkins's borough ...,"[LEAD, wall, David, N., Dinkins, borough, pres...","[(David, N., Dinkins, 's), (dozens), (City, Ha...",0 One of the many sad bridge stories of...,"[wall, David, N., Dinkins, borough, president,...","[(David, N., Dinkins, 's), (dozens), (City, Ha..."
nyt://article/0da7cd16-c122-51c0-b392-2484ac4b7de3,1990-01-01,Army Doesn't Have to Compete With Marines; W...,"[Army, Compete, Marines, Need, Navy]","[(Army), (Navy)]",LEAD: To the Editor:,"[LEAD, Editor]",[],0 One of the many sad bridge stories of...,[Editor],[]
nyt://article/0ecac4c0-3b20-5037-b395-475a416687ad,1990-01-01,Students Must Get to School and Back Safely,"[Students, School, Safely]",[],LEAD: To the Editor:,"[LEAD, Editor]",[],0 One of the many sad bridge stories of...,[Editor],[]


## `make_exploded_df(df, col_name)` - make df with row for each lemma or entity via df.explode()

In [22]:
def make_exploded_df(df, col_name):
    if col_name == "lemmas_headline" or "entities_headline":
        base_col = "headline"
    elif col_name == "lemmas_abstract" or "entities_abstract":
        base_col = "abstract"
    else:
        base_col = "lead_paragraph"
    
    df_exp = df.explode(column = col_name).loc[:, ["pub_date", base_col, col_name]]
    
    return df_exp

In [23]:
df_lems_headline = make_exploded_df(df2_big, "lemmas_headline")
df_lems_headline.head()

Unnamed: 0,pub_date,headline,lemmas_headline
nyt://article/065d970c-0342-5066-a441-f59423263e3d,1990-01-01,Bridge,Bridge
nyt://article/0a0e2668-b979-56d1-b675-c7f521131236,1990-01-01,He Has Tyson On His Mind,Tyson
nyt://article/0a0e2668-b979-56d1-b675-c7f521131236,1990-01-01,He Has Tyson On His Mind,Mind
nyt://article/0a71f8ee-d649-5ae8-9524-e2f1d6d5a0aa,1990-01-01,"For Dinkins, Pomp, Ceremony, Triumph And a Dre...",Dinkins
nyt://article/0a71f8ee-d649-5ae8-9524-e2f1d6d5a0aa,1990-01-01,"For Dinkins, Pomp, Ceremony, Triumph And a Dre...",Pomp


In [24]:
df_ents_abs = make_exploded_df(df2_big, "entities_abstract")
df_ents_abs.head()

Unnamed: 0,pub_date,headline,entities_abstract
nyt://article/065d970c-0342-5066-a441-f59423263e3d,1990-01-01,Bridge,(One)
nyt://article/065d970c-0342-5066-a441-f59423263e3d,1990-01-01,Bridge,(1989)
nyt://article/065d970c-0342-5066-a441-f59423263e3d,1990-01-01,Bridge,"(the, Australian, National, Team, Championship)"
nyt://article/065d970c-0342-5066-a441-f59423263e3d,1990-01-01,Bridge,(Canberra)
nyt://article/065d970c-0342-5066-a441-f59423263e3d,1990-01-01,Bridge,(February)


In [25]:
df_ents_abs["entities_abstract"][2]

the Australian National Team Championship

In [26]:
df_lems_lead_para = make_exploded_df(df2_big, "lemmas_lead_para")
df_lems_lead_para.head()

Unnamed: 0,pub_date,headline,lemmas_lead_para
nyt://article/065d970c-0342-5066-a441-f59423263e3d,1990-01-01,Bridge,sad
nyt://article/065d970c-0342-5066-a441-f59423263e3d,1990-01-01,Bridge,bridge
nyt://article/065d970c-0342-5066-a441-f59423263e3d,1990-01-01,Bridge,story
nyt://article/065d970c-0342-5066-a441-f59423263e3d,1990-01-01,Bridge,concern
nyt://article/065d970c-0342-5066-a441-f59423263e3d,1990-01-01,Bridge,final


In [27]:
df_big

Unnamed: 0,_id,pub_date,year,month,headline,abstract,lead_paragraph,byline,word_count,document_type,news_desk,section_name,type_of_material,source,web_url
0,nyt://article/065d970c-0342-5066-a441-f5942326...,1990-01-01,1990,1,Bridge,LEAD: One of the many sad bridge stories of 19...,One of the many sad bridge stories of 1989 con...,By Alan Truscott,411,article,Metropolitan Desk,New York,News,The New York Times,https://www.nytimes.com/1990/01/01/nyregion/br...
1,nyt://article/0a0e2668-b979-56d1-b675-c7f52113...,1990-01-01,1990,1,He Has Tyson On His Mind,LEAD: THOSE on Donovan Ruddock's Christmas-car...,THOSE on Donovan Ruddock's Christmas-card list...,By Phil Berger,199,article,Sports Desk,Sports,News,The New York Times,https://www.nytimes.com/1990/01/01/sports/spor...
2,nyt://article/0a71f8ee-d649-5ae8-9524-e2f1d6d5...,1990-01-01,1990,1,"For Dinkins, Pomp, Ceremony, Triumph And a Dre...",LEAD: The walls of David N. Dinkins's borough ...,The walls of David N. Dinkins's borough presid...,By Don Terry,1233,article,Metropolitan Desk,New York,News,The New York Times,https://www.nytimes.com/1990/01/01/nyregion/fo...
3,nyt://article/0da7cd16-c122-51c0-b392-2484ac4b...,1990-01-01,1990,1,Army Doesn't Have to Compete With Marines; W...,LEAD: To the Editor:,To the Editor:,,524,article,Editorial Desk,Opinion,Letter,The New York Times,https://www.nytimes.com/1990/01/01/opinion/l-a...
4,nyt://article/0ecac4c0-3b20-5037-b395-475a4166...,1990-01-01,1990,1,Students Must Get to School and Back Safely,LEAD: To the Editor:,To the Editor:,,363,article,Editorial Desk,Opinion,Letter,The New York Times,https://www.nytimes.com/1990/01/01/opinion/l-s...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98900,nyt://article/f4e78851-de2b-5d9e-9e08-c525ef53...,1990-12-31,1990,12,Canada-Korea A-Plant Pact,Canada's state-owned nuclear power company h...,Canada's state-owned nuclear power company has...,AP,204,article,Financial Desk,Business Day,News,The New York Times,https://www.nytimes.com/1990/12/31/business/th...
98901,nyt://article/f5d13149-6618-5d0e-855a-0b7fe93b...,1990-12-31,1990,12,Carlos Fuentes Trades His Pen for Television,Carlos Fuentes is an uncontrollable wanderer...,"Carlos Fuentes is an uncontrollable wanderer, ...","By Alan Riding, Special To the New York Times",1402,article,Cultural Desk,Arts,News,The New York Times,https://www.nytimes.com/1990/12/31/arts/carlos...
98902,nyt://article/f627c63e-c7f8-59a4-bf1f-986774ae...,1990-12-31,1990,12,World Markets Begin 1991 Pressed by Problems o...,World stock markets begin 1991 facing the th...,World stock markets begin 1991 facing the thre...,By Jonathan Fuerbringer,1843,article,Financial Desk,Business Day,News,The New York Times,https://www.nytimes.com/1990/12/31/business/in...
98903,nyt://article/fb863530-af68-54a8-a6b4-bb764a60...,1990-12-31,1990,12,Soviet Consumer Vigilantes Ferret Out Hoarded ...,The secrets of the Faton Vegetable and Fruit...,The secrets of the Faton Vegetable and Fruit S...,"By Esther B. Fein, Special To the New York Times",1355,article,Foreign Desk,World,News,The New York Times,https://www.nytimes.com/1990/12/31/world/sovie...
