In [11]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup

In [12]:
# Load spaCy's small English pipeline once at module level; the scraper's
# preprocess_text method calls this shared `nlp` object for every speech.
nlp = spacy.load('en_core_web_sm')

# We define a class, and name it PresidentialSpeechScraper.
class PresidentialSpeechScraper:
    """Scrape one president's speeches from the Miller Center archive and export them.

    Typical use is to call, in order, ``scrape_speech_links``,
    ``scrape_speech_details``, ``save_to_csv`` and
    ``export_individual_speeches_to_txt``.

    Attributes:
        base_url: Listing page of the presidential-speeches archive.
        president_id: Id placed in the listing's president filter.
        president_name: Label used as the prefix of every output file name.
        speech_data_list: One dict per speech, filled in by the scrape methods.
    """

    # Seconds to wait on the server before a request is abandoned.
    REQUEST_TIMEOUT = 30

    # Path separators, characters reserved in file names on common operating
    # systems, and control characters; none of them may end up in a file name.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    # Column order of the metadata CSV written by save_to_csv.
    _CSV_COLUMNS = ['Filename', 'Title', 'Speech', 'Tokens', 'Lemmas', 'Parts-of-speech',
                    'President', 'Date', 'Summary', 'Link']

    def __init__(self, president_id, president_name):
        """Store the target president and start with an empty speech list.

        Args:
            president_id: Id used in the archive's president filter.
            president_name: Name used as the prefix of the output files.
        """
        self.base_url = "https://millercenter.org/the-presidency/presidential-speeches"
        self.president_id = president_id
        self.president_name = president_name
        self.speech_data_list = []

    def scrape_speech_links(self):
        """Collect the title and absolute URL of every speech on the listing page.

        Appends one ``{'Title': ..., 'Link': ...}`` dict per speech to
        ``self.speech_data_list``.

        Raises:
            requests.RequestException: if the listing page cannot be fetched
                (connection error, timeout or HTTP error status).
        """
        # '?' starts the query string; the key/value pair after it tells the
        # server which president to filter the listing by.
        url = f"{self.base_url}?field_president_target_id[{self.president_id}]={self.president_id}"
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # NOTE(review): only the HTML returned by this single request is parsed;
        # if the archive paginates or lazy-loads results, later pages are not
        # fetched -- confirm against the live site.
        for title_block in soup.find_all('div', class_='views-field-title'):
            anchor = title_block.find('a')
            if anchor is None or not anchor.get('href'):
                continue  # a title block without a usable link cannot be scraped
            self.speech_data_list.append({
                'Title': anchor.text.strip(),
                # hrefs may be site-relative; resolve them so requests.get
                # receives a full URL (absolute hrefs pass through unchanged).
                'Link': urljoin(response.url, anchor['href']),
            })

    @staticmethod
    def _text_of(node):
        """Return the stripped text of a parsed HTML node, or '' when the node is missing."""
        return node.text.strip() if node is not None else ''

    def scrape_speech_details(self):
        """Download every collected speech page and add its details in place.

        Each dict in ``self.speech_data_list`` gains (or has refreshed) the
        keys 'Title', 'President', 'Date', 'Summary' and 'Speech'. An element
        missing from a page yields an empty string instead of aborting the run.

        Raises:
            requests.RequestException: if a speech page cannot be fetched
                (connection error, timeout or HTTP error status).
        """
        for speech_data in self.speech_data_list:
            response = requests.get(speech_data['Link'], timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            intro = soup.find('div', class_='about-sidebar--intro')
            summary_paragraph = intro.find('p') if intro is not None else None

            # Join the text of every transcript <div> and drop the
            # 'Transcript' heading line that may precede the speech.
            speech = '\n'.join(
                block.text.strip().replace("Transcript\n", "")
                for block in soup.find_all('div', class_='transcript-inner')
            )

            speech_data.update({
                # Keep the listing title when the page itself has no <h1>.
                'Title': self._text_of(soup.find('h1')) or speech_data.get('Title', ''),
                'President': self._text_of(soup.find('p', class_='president-name')),
                'Date': self._text_of(soup.find('p', class_='episode-date')),
                'Summary': self._text_of(summary_paragraph),
                'Speech': speech,
            })

    def clean_text(self, text):
        """Return ``text`` lower-cased, without punctuation, single-spaced.

        Punctuation is removed before whitespace is collapsed so that a
        stand-alone punctuation mark does not leave a double space behind.
        This helper is not called by the other methods of this class.
        """
        without_punctuation = re.sub(r'[^\w\s]', '', text)
        return ' '.join(without_punctuation.split()).lower()

    def preprocess_text(self, text):
        """Run the module-level spaCy pipeline ``nlp`` over ``text``.

        Returns:
            A ``(tokens, lemmas, pos_tags)`` tuple of three parallel lists
            with one entry per spaCy token.
        """
        doc = nlp(text)
        tokens = [token.text for token in doc]
        lemmas = [token.lemma_ for token in doc]
        pos_tags = [token.pos_ for token in doc]
        return tokens, lemmas, pos_tags

    def _speech_filename(self, index, title):
        """Build the .txt file name of the speech at 1-based position ``index``.

        Characters that cannot appear in a file name are replaced with '_'
        and runs of whitespace are collapsed to a single space.
        """
        safe_title = self._UNSAFE_FILENAME_CHARS.sub('_', title)
        safe_title = ' '.join(safe_title.split())
        return f"{self.president_name}_speech_{index}_{safe_title}.txt"

    def save_to_csv(self):
        """Write one CSV row per speech to ``<president_name>_presidential_speeches.csv``.

        Besides the scraped fields, each row carries the spaCy tokens, lemmas
        and part-of-speech tags of the speech, and a 'Filename' column naming
        the file that ``export_individual_speeches_to_txt`` writes for it.
        With no speeches collected, a header-only CSV is written.
        """
        filename = f"{self.president_name}_presidential_speeches.csv"

        rows = []
        for position, speech_data in enumerate(self.speech_data_list, start=1):
            speech = speech_data.get('Speech', '')
            tokens, lemmas, pos_tags = self.preprocess_text(speech)
            rows.append({
                'Filename': self._speech_filename(position, speech_data.get('Title', '')),
                'Title': speech_data.get('Title', ''),
                'Speech': speech,
                'Tokens': tokens,
                'Lemmas': lemmas,
                'Parts-of-speech': pos_tags,
                'President': speech_data.get('President', ''),
                'Date': speech_data.get('Date', ''),
                'Summary': speech_data.get('Summary', ''),
                'Link': speech_data.get('Link', ''),
            })

        # Passing the columns explicitly fixes their order and keeps the
        # header row even when there are no speeches.
        df = pd.DataFrame(rows, columns=self._CSV_COLUMNS)
        df.to_csv(filename, index=False)
        print(f"Speech data saved to {filename}")

    def export_individual_speeches_to_txt(self):
        """Write each speech transcript to its own UTF-8 .txt file.

        Files are created in the current working directory and are named by
        ``_speech_filename``, i.e. exactly as listed in the CSV 'Filename' column.
        """
        for position, speech_data in enumerate(self.speech_data_list, start=1):
            speech_filename = self._speech_filename(position, speech_data.get('Title', ''))
            with open(speech_filename, 'w', encoding='utf-8') as file:
                file.write(speech_data.get('Speech', ''))

In [13]:
# Build a scraper for Trump's speeches; 8396 is the id passed to the listing's president filter.
Trump_scraper = PresidentialSpeechScraper(president_id=8396, president_name='Trump')

# Scraping: collect the speech links first, then download the details of each speech.
Trump_scraper.scrape_speech_links()
Trump_scraper.scrape_speech_details()

# Export: write the metadata CSV, then one .txt file per speech.
Trump_scraper.save_to_csv()

Trump_scraper.export_individual_speeches_to_txt()

Speech data saved to Trump_presidential_speeches.csv
