In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import spacy

In [17]:
# We define a class, and name it PresidentialSpeechScraper.
class PresidentialSpeechScraper:
    # Initialize the class; then define the basic URL, create the president_id, president_name, and speech_data_list in order to transfer or receive data later.
    def __init__(self, president_id, president_name):
        self.base_url = "https://millercenter.org/the-presidency/presidential-speeches"
        self.president_id = president_id
        self.president_name = president_name
        self.speech_data_list = []

    # Now we begin to scrape links of speeches.
    def scrape_speech_links(self):
        # Create a complete link for each specific president, using '?' as a query string to combain self.base_url with target self.president_id. Besides, '=' here represents only the relationship between the key and the value, not the assignment operation, which means the string of this key-value pair is used to tell the server about a particular parameter and its value.
        url = f"{self.base_url}?field_president_target_id[{self.president_id}]={self.president_id}"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('div', class_='views-field-title')

        for link in links:
            title = link.find('a').text
            speech_link = link.find('a')['href']
            self.speech_data_list.append({'Title': title, 'Link': speech_link})
    
    # Process text
    def save_speech_to_text_file(self, title, speech):
        filename = f"{self.president_name}_{title.replace(' ', '_')}.txt"
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(speech)
        print(f"Speech text saved to {filename}")
    
    # Then we start to scrape more details in every speeches.
    def scrape_speech_details(self):
        nlp = spacy.load("en_core_web_sm")
        for speech_data in self.speech_data_list:
            link = speech_data['Link']
            response = requests.get(link)
            soup = BeautifulSoup(response.text, 'html.parser')

            title = soup.find('h1').text.strip()
            president = soup.find('p', class_='president-name').text.strip()
            date = soup.find('p', class_='episode-date').text.strip()
            summary = soup.find('div', class_='about-sidebar--intro').p.text.strip()
            speech_elements = soup.find_all('div', class_='transcript-inner')
            # Connect all the texts in <div> and delete the possible 'Transcript' in texts/
            speech = '\n'.join([element.text.strip().replace("Transcript\n", "") for element in speech_elements])
            
            self.save_speech_to_text_file(title, speech)
            
            # Now we can add the information extracted from the speech details to the dictionary separately.
            doc = nlp(speech)
            tokens = [token.text for token in doc]
            lemmas = [token.lemma_ for token in doc]
            pos_tags = [token.pos_ for token in doc]
            
            speech_data.update({
                'Title': title,
                'President': president,
                'Date': date,
                'Summary': summary,
                'Speech': speech,
                'Tokens': tokens,
                'Lemmas': lemmas,
                'Parts-of-speech': pos_tags
            })

    # Save the data above to csv. files.
    def save_to_csv(self):
        filename= f"{self.president_name}_presidential_speeches.csv"
        df = pd.DataFrame(self.speech_data_list)
        df.to_csv(filename, index=False)
        print(f"Speech data saved to {filename}")

In [18]:
# Create an instance of Biden for the class of PresidentialSpeechScraper.
Biden_scraper = PresidentialSpeechScraper(president_id=30721, president_name ='Biden')

Biden_scraper.scrape_speech_links()
Biden_scraper.scrape_speech_details()

Biden_scraper.save_to_csv()

Speech text saved to Biden_October_20,_2023:_Remarks_on_the_US_Response_in_Support_of_Israel_and_Ukraine.txt
Speech text saved to Biden_February_21,_2023:_Remarks_on_the_One-Year_Anniversary_of_the_Ukraine_War.txt
Speech text saved to Biden_February_7,_2023:_State_of_the_Union_Address.txt
Speech text saved to Biden_September_21,_2022:_Speech_before_the_77th_Session_of_the_United_Nations_General_Assembly.txt
Speech text saved to Biden_September_1,_2022:_Remarks_on_the_Continued_Battle_for_the_Soul_of_the_Nation.txt
Speech text saved to Biden_May_24,_2022:_Remarks_on_School_Shooting_in_Uvalde,_Texas.txt
Speech text saved to Biden_March_26,_2022:_Remarks_in_Support_of_the_People_of_Ukraine.txt
Speech text saved to Biden_March_1,_2022:_State_of_the_Union_Address.txt
Speech text saved to Biden_February_24,_2022:_Remarks_on_the_Russian_Invasion_of_Ukraine.txt
Speech text saved to Biden_January_6,_2022:_Remarks_on_the_Anniversary_of_the_January_6th_Assault_on_the_US_Capitol.txt
Speech text sa