In [80]:
import os
import re
from collections import OrderedDict

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

'''
Scrape speeches from the NPR website. NPR lists speeches under a series page.
Pass that series page's URL and the corresponding party to scrape_NPR.
'''

In [81]:
# Download the series listing page for the Republican convention speeches.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the real listing.
response = requests.get(
    'https://www.npr.org/series/94216845/speeches-from-the-republican-convention',
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In [83]:
# Collect the href of every anchor whose target matches the NPR story template URL.
story_href = re.compile("https://www.npr.org/templates/story")
links = [anchor.get('href') for anchor in soup.findAll('a', attrs={'href': story_href})]

# The scan above picks up duplicate URLs; keep only the first occurrence of
# each one while preserving the order in which they were found.
links = list(OrderedDict.fromkeys(links))
print(links)

['https://www.npr.org/templates/story/story.php?storyId=94302894', 'https://www.npr.org/templates/story/story.php?storyId=94301516', 'https://www.npr.org/templates/story/story.php?storyId=94303964', 'https://www.npr.org/templates/story/story.php?storyId=94258995', 'https://www.npr.org/templates/story/story.php?storyId=94254610', 'https://www.npr.org/templates/story/story.php?storyId=94256318', 'https://www.npr.org/templates/story/story.php?storyId=94254989', 'https://www.npr.org/templates/story/story.php?storyId=94215430', 'https://www.npr.org/templates/story/story.php?storyId=94215026', 'https://www.npr.org/templates/story/story.php?storyId=94213979']


In [109]:
# Get Speech titles
# One title per <h2 class="title"> element; the first 12 characters of the
# element's text are dropped before the title is stored.
titles = [heading.text[12:] for heading in soup.findAll('h2', class_='title')]
print(titles)

# Show the titles again, one per line.
for speech_title in titles:
    print(speech_title)

["John McCain's Speech", "Cindy McCain's Speech", 'South Carolina Sen. Lindsey Graham', 'Gov. Sarah Palin At The RNC', 'Former New York Mayor Rudy Giuliani', 'Mike Huckabee At The RNC', 'Mitt Romney At The RNC', 'President Bush At The RNC', "Former Sen. Fred Thompson's Speech", "Sen. Joseph Lieberman's Speech"]
John McCain's Speech
Cindy McCain's Speech
South Carolina Sen. Lindsey Graham
Gov. Sarah Palin At The RNC
Former New York Mayor Rudy Giuliani
Mike Huckabee At The RNC
Mitt Romney At The RNC
President Bush At The RNC
Former Sen. Fred Thompson's Speech
Sen. Joseph Lieberman's Speech


In [88]:
# Make dict with speech titles and links
# zip() pairs the two lists by position and stops at the shorter one, so this
# relies on titles and links being in the same order.
speech_dict = dict(zip(titles, links))
for title, speech in speech_dict.items():
    print(title, speech)

John McCain's Speech https://www.npr.org/templates/story/story.php?storyId=94302894
Cindy McCain's Speech https://www.npr.org/templates/story/story.php?storyId=94301516
South Carolina Sen. Lindsey Graham https://www.npr.org/templates/story/story.php?storyId=94303964
Gov. Sarah Palin At The RNC https://www.npr.org/templates/story/story.php?storyId=94258995
Former New York Mayor Rudy Giuliani https://www.npr.org/templates/story/story.php?storyId=94254610
Mike Huckabee At The RNC https://www.npr.org/templates/story/story.php?storyId=94256318
Mitt Romney At The RNC https://www.npr.org/templates/story/story.php?storyId=94254989
President Bush At The RNC https://www.npr.org/templates/story/story.php?storyId=94215430
Former Sen. Fred Thompson's Speech https://www.npr.org/templates/story/story.php?storyId=94215026
Sen. Joseph Lieberman's Speech https://www.npr.org/templates/story/story.php?storyId=94213979


In [111]:
# Output folder for the Republican speeches. This must be a raw string: in a
# normal literal '\U' starts a unicode escape, which makes the path a SyntaxError.
path = r'C:\Users\Aaron\Desktop\Projects\PoliticalParty\Speeches\Republican'

# Text of every paragraph written below, accumulated across all speeches.
paragraphs = []

for title, link in speech_dict.items():
    speech = requests.get(link).text
    # Parsed under its own name so the listing-page `soup` is not overwritten.
    speech_soup = BeautifulSoup(speech, 'lxml')

    # One text file per speech, named after its title, inside `path`.
    # `with` closes the file even if a write fails part-way through.
    with open(os.path.join(path, str(title) + '.txt'), 'w') as f:
        # [1:-2] skips the first <p> and the last two <p> elements of the page.
        for paragraph in speech_soup.findAll('p')[1:-2]:
            paragraphs.append(paragraph.text)
            f.write(paragraph.text)

In [147]:
def scrape_NPR(npr_url, party,
               base_dir="C:\\Users\\Aaron\\Desktop\\Projects\\PoliticalParty\\Speeches\\",
               timeout=30):
    '''
    Scrape an NPR speech series page and write each speech to a text file.

    npr_url: url of the NPR series page that lists the speeches
    party: Democrat or Republican; used as the sub-folder name under base_dir
    base_dir: folder that holds the per-party sub-folders
    timeout: seconds to wait on each HTTP request before giving up

    Prints every title/link pair found, writes one '<title>.txt' file per
    speech into base_dir/party (created if missing), prints 'Done!' and
    returns None.

    Raises requests.HTTPError if the series page or a speech page answers
    with an error status.
    '''
    listing = requests.get(npr_url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were the listing.
    listing.raise_for_status()
    listing_soup = BeautifulSoup(listing.text, 'lxml')

    # Links to the individual speech pages.
    story_href = re.compile("https://www.npr.org/templates/story")
    links = [anchor.get('href')
             for anchor in listing_soup.find_all('a', attrs={'href': story_href})]

    # The scan above picks up duplicate URLs; keep the first occurrence of
    # each one, in the order found.
    links = list(OrderedDict.fromkeys(links))

    # Get Speech titles (the first 12 characters of each heading are dropped).
    titles = [heading.text[12:]
              for heading in listing_soup.find_all('h2', class_='title')]

    # Make dict with speech titles and links. zip() pairs by position and
    # stops at the shorter list.
    speech_dict = dict(zip(titles, links))
    for title, link in speech_dict.items():
        print(title, link)

    out_dir = os.path.join(base_dir, str(party))
    os.makedirs(out_dir, exist_ok=True)

    for title, link in speech_dict.items():
        page = requests.get(link, timeout=timeout)
        page.raise_for_status()
        speech_soup = BeautifulSoup(page.text, 'lxml')

        # `with` closes the file even if a write fails part-way through.
        with open(os.path.join(out_dir, str(title) + '.txt'), 'w') as f:
            # [1:-2] skips the first <p> and the last two <p> elements.
            for paragraph in speech_soup.find_all('p')[1:-2]:
                f.write(paragraph.text)

    print('Done!', '\n')

    return None


In [148]:
# Scrape both conventions: Republican first, then Democratic.
for series_url, party_name in (
    ('https://www.npr.org/series/94216845/speeches-from-the-republican-convention', 'Republican'),
    ('https://www.npr.org/series/94070408/speeches-from-the-democratic-convention', 'Democrat'),
):
    scrape_NPR(series_url, party=party_name)

John McCain's Speech https://www.npr.org/templates/story/story.php?storyId=94302894
Cindy McCain's Speech https://www.npr.org/templates/story/story.php?storyId=94301516
South Carolina Sen. Lindsey Graham https://www.npr.org/templates/story/story.php?storyId=94303964
Gov. Sarah Palin At The RNC https://www.npr.org/templates/story/story.php?storyId=94258995
Former New York Mayor Rudy Giuliani https://www.npr.org/templates/story/story.php?storyId=94254610
Mike Huckabee At The RNC https://www.npr.org/templates/story/story.php?storyId=94256318
Mitt Romney At The RNC https://www.npr.org/templates/story/story.php?storyId=94254989
President Bush At The RNC https://www.npr.org/templates/story/story.php?storyId=94215430
Former Sen. Fred Thompson's Speech https://www.npr.org/templates/story/story.php?storyId=94215026
Sen. Joseph Lieberman's Speech https://www.npr.org/templates/story/story.php?storyId=94213979
Done! 

Barack Obama's Acceptance Speech https://www.npr.org/templates/story/story.php?s