# NLP Project Pt. 1: Web Scraping

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import os
from selenium import webdriver
import time

# Location of the ChromeDriver executable (machine-specific; adjust for your setup).
chromedriver = "/Applications/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
# Start the Chrome session that the scraping functions use through the global `driver`.
# NOTE(review): passing the driver path positionally looks like the older Selenium
# calling convention -- confirm it is still accepted by the installed Selenium version.
driver = webdriver.Chrome(chromedriver)

In [2]:
# Build the list of fiction index-page URLs to scrape.
# Index pages are numbered from 1. Starting at
# https://www.newyorker.com/magazine/fiction/page/100 the archive shows
# summaries instead of stories, so the crawl covers pages 1-99 only.
url_base = 'https://www.newyorker.com/magazine/fiction/page/'
first_summary_page = 100  # exclusive upper bound of the crawl
page_list = [url_base + str(page_number) for page_number in range(1, first_summary_page)]

print(page_list[0:5])

['https://www.newyorker.com/magazine/fiction/page/1', 'https://www.newyorker.com/magazine/fiction/page/2', 'https://www.newyorker.com/magazine/fiction/page/3', 'https://www.newyorker.com/magazine/fiction/page/4', 'https://www.newyorker.com/magazine/fiction/page/5']


In [3]:
# function to scrape each url for stories
from random import randint

def storylink_finder(page_list):
    """Collect the story URLs linked from each fiction index page.

    Every URL in ``page_list`` is loaded in the shared Selenium ``driver``.
    Each ``<h4 class="River__hed___re6RP">`` headline that sits inside an
    ``<a href=...>`` contributes one absolute story URL.

    Parameters
    ----------
    page_list : iterable of str
        URLs of the index pages to crawl.

    Returns
    -------
    list of str
        Story URLs in the order encountered; duplicates are not removed.
    """
    linklist = []
    story_url_base = 'https://www.newyorker.com'
    for url in page_list:
        # Kept from the original: drops non-ASCII characters and resolves
        # backslash escapes. For plain-ASCII URLs this leaves the string as is.
        url = url.encode('ascii', 'ignore').decode('unicode_escape')
        driver.get(url)
        # Pause *before* snapshotting the DOM. The randomized delay still spaces
        # out requests, and now also gives late-rendered content a chance to
        # appear; previously the snapshot was taken first, so the wait could
        # never help the page that had just been loaded.
        time.sleep(randint(2, 10))
        page_html = driver.page_source
        # Name the parser explicitly so results do not depend on which optional
        # parsers happen to be installed (and to avoid bs4's parser warning).
        soup = bs(page_html, 'html.parser')
        for link in soup.find_all('h4', class_="River__hed___re6RP"):
            anchor = link.find_parent('a')
            # A headline with no enclosing link (or a link without an href)
            # has no story URL to record; skip it instead of crashing the crawl.
            if anchor is None or not anchor.get('href'):
                continue
            linklist.append(story_url_base + str(anchor['href']))
    return linklist

# Crawl every index page in `page_list`. Each page load is paired with a random
# 2-10 second pause inside storylink_finder, so this cell takes a while to run.
storylinks = storylink_finder(page_list)

In [4]:
# Sanity check on the crawl size (the recorded run collected 990 links from the 99 index pages).
len(storylinks)

990

In [8]:
# function to scrape the story from each url
def story_scraper(storylinks):
    """Download each story page and extract its metadata and paragraphs.

    Every URL is loaded in the shared Selenium ``driver`` and parsed with
    BeautifulSoup.

    Parameters
    ----------
    storylinks : iterable of str
        Story page URLs.

    Returns
    -------
    list of dict
        One record per URL, in input order, with the keys ``'URL'``,
        ``'TITLE'``, ``'AUTHOR'``, ``'DATE'`` and ``'TEXT'``. The three
        metadata fields are ``None`` when the corresponding element is not
        found on the page. ``'TEXT'`` is a list of paragraph strings and is
        empty when no matching paragraph is found.
    """
    # Values of the class attribute that mark body paragraphs.
    paragraph_classes = [
        'has-dropcap has-dropcap__lead-standard-heading',
        'has-dropcap has-dropcap__lead-standard-heading paywall',
        'paywall',
    ]
    story_info = []
    for url in storylinks:
        driver.get(url)
        # Pause *before* reading the DOM so the randomized delay also gives
        # late-rendered content time to appear (the snapshot used to be taken
        # before the wait). NOTE(review): the recorded first pass returned six
        # completely empty pages that scraped fine on retry -- possibly related.
        time.sleep(randint(1, 20))
        page = driver.page_source
        # Explicit parser: keeps results independent of which optional parsers
        # are installed and avoids bs4's parser warning.
        soup = bs(page, 'html.parser')
        # Keep one list entry per paragraph (preserving paragraph level).
        # Paragraphs that contain any <a> element are left out entirely.
        story_text = [
            item.text
            for item in soup.find_all('p', class_=paragraph_classes)
            if item.find('a') is None
        ]
        story_info.append({
            'URL': url,
            'TITLE': _text_or_none(soup, 'h1', 'ContentHeaderHed'),
            'AUTHOR': _text_or_none(soup, 'span', 'BylineName'),
            'DATE': _text_or_none(soup, 'time', 'ContentHeaderPublishDate'),
            'TEXT': story_text
        })
    return story_info


def _text_or_none(soup, tag, testid):
    """Return the text of the first ``tag`` whose data-testid is ``testid``, or None if absent.

    Only the "element not found" case maps to None. Any other error (including
    a KeyboardInterrupt during a long crawl) propagates instead of being
    silently swallowed.
    """
    node = soup.find(tag, attrs={'data-testid': testid})
    return None if node is None else node.text

# Scrape every collected story URL. story_scraper pauses a random 1-20 seconds
# per page, so with several hundred links this is the slowest cell in the notebook.
corpus = story_scraper(storylinks)

In [9]:
import pandas as pd
# Show up to 200 characters per cell so the TEXT preview is readable,
# then tabulate the scraped records (one row per story URL).
pd.set_option('display.max_colwidth', 200)
corpus_df = pd.DataFrame(corpus)
corpus_df.head()

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT
0,https://www.newyorker.com/magazine/2022/02/14/annunciation,Annunciation,Lauren Groff,"February 7, 2022","[Some nights, in my dreams, I find myself running through those hills above Palo Alto again. It is always just before dawn, and as I run I smell the sun-crisped fields, the sage, the eucalyptus. T..."
1,https://www.newyorker.com/magazine/2022/02/07/once-removed,Once Removed,Alexander MacLeod,"January 31, 2022","[She did not want to visit the old lady., Amy studied the stroller, then the bags, then her boyfriend and the baby. She checked her phone: 11:26 a.m. It was time to go. Ninety degrees, ninety-per-..."
2,https://www.newyorker.com/magazine/2022/01/31/long-distance,Long Distance,Ayşegül Savaş,"January 24, 2022","[Lea changed the sheets when she got up. She’d bought flowers the previous day, tulips that she’d put on the dresser. There were carnations on the kitchen table, in a squat glass vase. She thought..."
3,https://www.newyorker.com/magazine/2022/01/24/whats-the-deal-hummingbird,"What’s the Deal, Hummingbird?",Arthur Krystal,"January 17, 2022","[On or around May 5th of 2020, he just stopped. He stopped exercising, stopped walking, stopped reading, stopped planning. He ate, drank, washed, and paid the bills, but that was it. He was sevent..."
4,https://www.newyorker.com/magazine/2022/01/17/fireworks,Fireworks,Graham Swift,"January 10, 2022","[It was late October, 1962. Russian missiles were being shipped to Cuba. Kennedy was having words with Khrushchev. The world might be coming to an end., It was a common remark: “Cheer up, it’s not..."


In [10]:
# Checkpoint the frame as scraped, before any cleaning (presumably so the slow
# crawl does not have to be repeated).
# NOTE(review): the cleaning done in later cells is written only to the CSV at the
# end of the notebook; this pickle still contains the uncleaned rows.
corpus_df.to_pickle("corpus_df3.pkl")

In [11]:
# Non-null counts per column show how many rows came back without a title, author or date.
corpus_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL     990 non-null    object
 1   TITLE   984 non-null    object
 2   AUTHOR  980 non-null    object
 3   DATE    984 non-null    object
 4   TEXT    990 non-null    object
dtypes: object(5)
memory usage: 38.8+ KB


In [12]:
# TEXT is never NaN at this point: a page that yielded no paragraphs is stored
# as an empty list, not as a missing value, so this selection comes back empty.
corpus_df[corpus_df.TEXT.isna()]

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT


In [13]:
# Rows with no byline: some have every field empty (nothing was scraped from the
# page), others have a title and text but no author element was found.
corpus_df[corpus_df.AUTHOR.isna()]

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT
165,https://www.newyorker.com/magazine/2018/09/24/poor-girl,,,,[]
170,https://www.newyorker.com/magazine/2018/08/06/displaced,,,,[]
301,https://www.newyorker.com/magazine/2015/12/14/jelly-and-jack,,,,[]
309,https://www.newyorker.com/magazine/2015/10/19/cold-little-bird,,,,[]
372,https://www.newyorker.com/magazine/2011/08/01/reverting-to-a-wild-state,,,,[]
442,https://www.newyorker.com/magazine/2013/04/15/the-night-of-the-satellite,,,,[]
848,https://www.newyorker.com/magazine/2004/09/27/hanwell-in-hell,Hanwell in Hell,,"September 19, 2004","[I spent just one night with your father, in Bristol, thirty-four years ago. He was down on his luck at the time, as was I. We had both suffered dramatic reversals of fortune and recognized immedi..."
922,https://www.newyorker.com/magazine/2003/02/10/deep-junior-press-conference,Deep Junior Press Conference,,"February 2, 2003","[I married an ice man. I first met him in a hotel at a ski resort, which is probably the perfect place to meet an ice man. The hotel lobby was crowded with animated young people, but the ice man w..."
943,https://www.newyorker.com/magazine/2002/06/17/the-wrestling-match,The Wrestling Match,,"June 9, 2002","[The New Yorker, June 17, 2002 P. 116, Short story about a father, La Jin, who takes his son Alex and his son’s two friends to a wrestling match. Alex, twelve, is half-Chinese, half-Jewish, sullen..."
956,https://www.newyorker.com/magazine/2002/02/18/lost-and-found-10,Lost and Found,,"February 10, 2002","[The New Yorker, February 18, 2002 P. 168, Short story about two women in England who have been friends since the age of twelve. Helly brings her new boyfriend to Clare’s for a weekend. Clare’s yo..."


In [14]:
# Rows missing both TITLE and AUTHOR, i.e. pages where the scrape returned nothing.
no_title = corpus_df.TITLE.isna()
no_author = corpus_df.AUTHOR.isna()
corpus_df.loc[no_title & no_author]

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT
165,https://www.newyorker.com/magazine/2018/09/24/poor-girl,,,,[]
170,https://www.newyorker.com/magazine/2018/08/06/displaced,,,,[]
301,https://www.newyorker.com/magazine/2015/12/14/jelly-and-jack,,,,[]
309,https://www.newyorker.com/magazine/2015/10/19/cold-little-bird,,,,[]
372,https://www.newyorker.com/magazine/2011/08/01/reverting-to-a-wild-state,,,,[]
442,https://www.newyorker.com/magazine/2013/04/15/the-night-of-the-satellite,,,,[]


In [15]:
# Replace the empty TEXT lists of the failed rows with None so that these rows
# register as missing and can be selected with isna().
failed_scrape = corpus_df.TITLE.isna() & corpus_df.AUTHOR.isna()
corpus_df.loc[failed_scrape, 'TEXT'] = None
corpus_df[corpus_df.TEXT.isna()]

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT
165,https://www.newyorker.com/magazine/2018/09/24/poor-girl,,,,
170,https://www.newyorker.com/magazine/2018/08/06/displaced,,,,
301,https://www.newyorker.com/magazine/2015/12/14/jelly-and-jack,,,,
309,https://www.newyorker.com/magazine/2015/10/19/cold-little-bird,,,,
372,https://www.newyorker.com/magazine/2011/08/01/reverting-to-a-wild-state,,,,
442,https://www.newyorker.com/magazine/2013/04/15/the-night-of-the-satellite,,,,


In [17]:
# URLs of the rows whose TEXT is missing, in frame order, as a plain list for re-scraping.
missing_urls = corpus_df.loc[corpus_df.TEXT.isna(), 'URL'].tolist()
missing_urls

['https://www.newyorker.com/magazine/2018/09/24/poor-girl',
 'https://www.newyorker.com/magazine/2018/08/06/displaced',
 'https://www.newyorker.com/magazine/2015/12/14/jelly-and-jack',
 'https://www.newyorker.com/magazine/2015/10/19/cold-little-bird',
 'https://www.newyorker.com/magazine/2011/08/01/reverting-to-a-wild-state',
 'https://www.newyorker.com/magazine/2013/04/15/the-night-of-the-satellite']

In [18]:
# Second pass: re-scrape only the pages that came back empty the first time.
corpus2 = story_scraper(missing_urls)

In [19]:
# Label the re-scraped records with the row labels of the rows they replace so
# that they line up with corpus_df when filling. The labels are derived from the
# same "TEXT is missing" selection that produced missing_urls (same rows, same
# order) rather than typed in by hand, so a re-run in which different pages fail
# still aligns correctly.
rescrape_index = corpus_df.index[corpus_df.TEXT.isna()]
corpus2_df = pd.DataFrame(corpus2, index=rescrape_index)
corpus2_df

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT
165,https://www.newyorker.com/magazine/2018/09/24/poor-girl,Poor Girl,Ludmilla Petrushevskaya,"September 17, 2018","[The wretched mother could easily have lost her sanity watching her husband love their daughter—the way he stroked the child when she was falling asleep or waking up, his blissful expression when ..."
170,https://www.newyorker.com/magazine/2018/08/06/displaced,Displaced,Richard Ford,"July 30, 2018","[When your father dies and you are only sixteen, many things change. School life changes. You are now the boy whose father is missing. People feel sorry for you, but they also devalue you, even re..."
301,https://www.newyorker.com/magazine/2015/12/14/jelly-and-jack,Jelly and Jack,Dana Spiotta,"December 6, 2015","[In the damp late spring of 1985, Jelly picked up the handset of her pink plastic Trimline phone and the dial tone hummed into her ear. She tilted the earpiece slightly away from her and heard the..."
309,https://www.newyorker.com/magazine/2015/10/19/cold-little-bird,Cold Little Bird,Ben Marcus,"October 12, 2015","[It started with bedtime. A coldness. A formality., Martin and Rachel tucked the boy in, as was their habit, then stooped to kiss him good night., “Please don’t do that,” he said, turning to face ..."
372,https://www.newyorker.com/magazine/2011/08/01/reverting-to-a-wild-state,Reverting to a Wild State,Justin Torres,"July 21, 2014","[I spotted a golden feather on the edge of the concrete platform, waiting for me, while I was waiting for the train. I thought of a joke, about rats devouring an entire golden pigeon—but there was..."
442,https://www.newyorker.com/magazine/2013/04/15/the-night-of-the-satellite,The Night of the Satellite,T. Coraghessan Boyle,"April 8, 2013","[What we were arguing about that night—and it was late, very late, 3:10 A.M. by my watch—was something that had happened nearly twelve hours earlier. A small thing, really, but by this time it had..."


In [20]:
# Fill the missing cells of the failed rows with the re-scraped values.
# fillna with a DataFrame aligns on row labels and column names, so only NaN
# cells in rows that also exist in corpus2_df are filled; everything else is
# left as it was. The display below lists the rows that still lack an author.
corpus_df = corpus_df.fillna(corpus2_df)
corpus_df[corpus_df.AUTHOR.isna()]

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT
848,https://www.newyorker.com/magazine/2004/09/27/hanwell-in-hell,Hanwell in Hell,,"September 19, 2004","[I spent just one night with your father, in Bristol, thirty-four years ago. He was down on his luck at the time, as was I. We had both suffered dramatic reversals of fortune and recognized immedi..."
922,https://www.newyorker.com/magazine/2003/02/10/deep-junior-press-conference,Deep Junior Press Conference,,"February 2, 2003","[I married an ice man. I first met him in a hotel at a ski resort, which is probably the perfect place to meet an ice man. The hotel lobby was crowded with animated young people, but the ice man w..."
943,https://www.newyorker.com/magazine/2002/06/17/the-wrestling-match,The Wrestling Match,,"June 9, 2002","[The New Yorker, June 17, 2002 P. 116, Short story about a father, La Jin, who takes his son Alex and his son’s two friends to a wrestling match. Alex, twelve, is half-Chinese, half-Jewish, sullen..."
956,https://www.newyorker.com/magazine/2002/02/18/lost-and-found-10,Lost and Found,,"February 10, 2002","[The New Yorker, February 18, 2002 P. 168, Short story about two women in England who have been friends since the age of twelve. Helly brings her new boyfriend to Clare’s for a weekend. Clare’s yo..."


In [21]:
# Fill in AUTHOR by hand for the two full stories whose byline was not scraped,
# matching rows by TITLE.
# NOTE(review): the TEXT preview for 'Deep Junior Press Conference' begins
# "I married an ice man" -- confirm that title, author and text belong together.
manual_authors = {
    'Deep Junior Press Conference': 'Richard L. Peterson',
    'Hanwell in Hell': 'Zadie Smith',
}
for story_title, story_author in manual_authors.items():
    corpus_df.loc[corpus_df['TITLE'] == story_title, 'AUTHOR'] = story_author
corpus_df[corpus_df.AUTHOR.isna()]

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT
943,https://www.newyorker.com/magazine/2002/06/17/the-wrestling-match,The Wrestling Match,,"June 9, 2002","[The New Yorker, June 17, 2002 P. 116, Short story about a father, La Jin, who takes his son Alex and his son’s two friends to a wrestling match. Alex, twelve, is half-Chinese, half-Jewish, sullen..."
956,https://www.newyorker.com/magazine/2002/02/18/lost-and-found-10,Lost and Found,,"February 10, 2002","[The New Yorker, February 18, 2002 P. 168, Short story about two women in England who have been friends since the age of twelve. Helly brings her new boyfriend to Clare’s for a weekend. Clare’s yo..."


In [22]:
# The rows that still have no author are synopses rather than full stories, so drop them.
corpus_df = corpus_df.dropna(subset=['AUTHOR'])
corpus_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 988 entries, 0 to 989
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL     988 non-null    object
 1   TITLE   988 non-null    object
 2   AUTHOR  988 non-null    object
 3   DATE    988 non-null    object
 4   TEXT    988 non-null    object
dtypes: object(5)
memory usage: 46.3+ KB


In [23]:
# Export the cleaned corpus without the row index.
# NOTE(review): TEXT holds Python lists, which CSV stores as their string form;
# confirm that whatever reads this file converts them back to lists.
corpus_df.to_csv('corpus_df3.csv', index=False)