In [1]:
import pandas as pd
import numpy as np

In [2]:
from tqdm import tqdm
import time

In [3]:
import requests
from bs4 import BeautifulSoup

def scrape_links(base_url, num_pages, timeout=30):
    """Collect every link found in the table body of a paginated listing.

    Pages ``1..num_pages`` are requested by adding a ``page`` query parameter
    to ``base_url``. On each page, the ``href`` of every ``<a>`` inside the
    first ``<tbody>`` is appended to the result in document order. Pages
    without a ``<tbody>`` contribute nothing.

    Args:
        base_url: Address of the listing (with or without an existing query string).
        num_pages: Number of pages to fetch, starting at page 1.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        list[str]: The raw ``href`` values, unmodified (they may be relative).

    Raises:
        requests.HTTPError: If any page answers with an error status.
    """
    all_links = []

    # One session for the whole crawl so the connection is reused between pages.
    with requests.Session() as session:
        for page in range(1, num_pages + 1):
            # Let requests build the query string; this stays correct even if
            # base_url already carries query parameters.
            response = session.get(base_url, params={"page": page}, timeout=timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            tbody = soup.find('tbody')
            if tbody is None:
                continue

            # href=True skips anchors that have no href attribute.
            all_links.extend(a['href'] for a in tbody.find_all('a', href=True))

    return all_links

In [5]:
# Listing whose paginated result tables are crawled for links
base_url = 'https://projects.propublica.org/represent/statements'

# How many result pages to walk through, starting at page 1
num_pages = 20

# Crawl those pages and keep every href found in their table bodies
links = scrape_links(base_url=base_url, num_pages=num_pages)

In [8]:
# Show the collected hrefs for a visual check.
# NOTE(review): this displays the whole list; a slice such as links[:10] would keep the output short.
links

['/represent/members/P000617-ayanna-pressley',
 'https://pressley.house.gov/2024/05/12/watch-in-powerful-mothers-day-speech-pressley-calls-for-policy-change-to-support-mothers-and-caregivers/',
 '/represent/members/S001172-adrian-smith',
 'http://adriansmith.house.gov/media/column/pushing-back-unworkable-executive-mandates',
 '/represent/members/L000566-robert-e-latta',
 'http://latta.house.gov/news/documentsingle.aspx?DocumentID=404392',
 '/represent/members/G000568-morgan-griffith',
 'http://morgangriffith.house.gov/news/documentsingle.aspx?DocumentID=403088',
 '/represent/members/C001068-steve-cohen',
 'http://cohen.house.gov/media-center/enewsletters/celebrating-grand-opening-orange-mound-library',
 '/represent/members/C001068-steve-cohen',
 'http://cohen.house.gov/media-center/enewsletters/honoring-civil-rights-activist-daisy-bates-statuary-hall',
 '/represent/members/L000569-blaine-luetkemeyer',
 'http://luetkemeyer.house.gov/news/documentsingle.aspx?DocumentID=401088',
 '/repres

In [9]:
# Keep every second entry, i.e. the ones at odd positions (1, 3, 5, ...).
# NOTE(review): this presumably relies on each table row yielding a member link
# followed by a statement link — confirm every row really has exactly both.
press_links = links[1::2]
press_links

['https://pressley.house.gov/2024/05/12/watch-in-powerful-mothers-day-speech-pressley-calls-for-policy-change-to-support-mothers-and-caregivers/',
 'http://adriansmith.house.gov/media/column/pushing-back-unworkable-executive-mandates',
 'http://latta.house.gov/news/documentsingle.aspx?DocumentID=404392',
 'http://morgangriffith.house.gov/news/documentsingle.aspx?DocumentID=403088',
 'http://cohen.house.gov/media-center/enewsletters/celebrating-grand-opening-orange-mound-library',
 'http://cohen.house.gov/media-center/enewsletters/honoring-civil-rights-activist-daisy-bates-statuary-hall',
 'http://luetkemeyer.house.gov/news/documentsingle.aspx?DocumentID=401088',
 'http://luetkemeyer.house.gov/news/documentsingle.aspx?DocumentID=401087',
 'https://barragan.house.gov/2024/05/10/affordable-housing-crisis-looms-large-in-election/',
 'http://cohen.house.gov/node/19531',
 'http://cohen.house.gov/node/19533',
 'http://cohen.house.gov/node/19535',
 'http://cohen.house.gov/node/19537',
 'http:/

In [10]:
def scrape_p_tags(url, timeout=30):
    """Return the text of every ``<p>`` element at ``url``, joined by single spaces.

    This function never raises. On any failure (connection problem, timeout,
    HTTP error status, parse error) it returns the message
    ``"Failed to scrape <url>: <error>"`` instead, so a long batch keeps
    running. Callers that need to separate failures from real content can
    check for the ``"Failed to scrape "`` prefix.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the response; without a limit a single
            unresponsive server would stall the whole batch.

    Returns:
        str: The joined paragraph text, or the failure message.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures rather than scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return ' '.join(tag.get_text() for tag in soup.find_all('p'))
    except Exception as e:
        # Deliberately broad: one bad page must not abort the surrounding loop.
        return f"Failed to scrape {url}: {e}"

In [11]:
# Download the paragraph text for each statement URL, in order; tqdm shows progress.
# scrape_p_tags returns a message string instead of raising on failure, so
# results ends up with exactly one entry per URL.
results = []
for url in tqdm(press_links):
    p_text = scrape_p_tags(url)
    results.append(p_text)
results

100%|███████████████████████████████████████| 1000/1000 [10:48<00:00,  1.54it/s]


['May 12, 2024 “Mr. Speaker, mothers across America don’t want a Hallmark card. They want policy change.” “…policies that see them, center them, and serve them, and they would prefer that over bouquets, verbal or otherwise.” Video (YouTube) BOSTON\xa0–\xa0Congresswoman Ayanna Pressley (MA-07)\xa0marked Mother’s Day with a powerful speech on the House floor in which she called for meaningful policy change to better support mothers and caregivers, including maternal health justice, affordable childcare, universal paid leave, reproductive freedom, home and community-based services, and more. Footage from the speech can be found\xa0here\xa0and a transcript is below. Transcript: In Powerful Mother’s Day Speech, Ayanna Pressley Calls for Policy Change to Support Mothers & Caregivers\xa0 March 8, 2024 U.S House of Representatives This time of year, Mr. Speaker, we wax poetic about the contributions of mothers. We call their work valued, their love endless, their role invaluable.\xa0 Mr. Speak

In [15]:
# Spot-check a single scraped text (position 900).
results[900:901]

["Click HERE to watch and HERE to download. Washington, D.C. – At a hearing to review the Fiscal Year 2025 budget request for the U.S. Department of Homeland Security, U.S. Senator Susan Collins, Vice Chair of the Appropriations Committee, called on Secretary of Homeland Security Alejandro Mayorkas to help find a solution to staffing issues found at Maine’s ports that are threatening the arrival of international cruise ships in Eastport this fall.\xa0 During the hearing, Senator Collins also highlighted her concern that the President’s budget request does not do enough to address the flood of illegal migrants and fentanyl entering the United States. U.S. Customs and Border Protection (CBP) officials recently informed the City of Eastport that the agency cannot accommodate the four planned international cruise ship arrivals this fall, even though CBP staffed a larger number of international cruise ship arrivals in Eastport last year, and Bar Harbor is seeing a reduction in international

In [21]:
# One-column frame holding the scraped statement texts, one row per URL.
PressDf = pd.DataFrame(results, columns=['Press Release'])

In [29]:
# Display the single-column frame of statement texts.
PressDf

Unnamed: 0,Press Release
0,"May 12, 2024 “Mr. Speaker, mothers across Amer..."
1,"502 Cannon HOB, Washington, DC 20515Email Me(2..."
2,Congressman Bob Latta (R-OH5) penned an op-ed ...
3,It has been roughly four and a half years sinc...
4,"May 3, 2024 Dear Friend, Last Friday, I atte..."
...,...
995,"[WASHINGTON, D.C.] – U.S. Senator Tammy Duck..."
996,"[WASHINGTON, DC] – Combat Veteran and U.S. S..."
997,WASHINGTON – U.S. Senate Majority Whip Dick Du...
998,WASHINGTON – U.S. Senate Majority Whip Dick Du...


In [26]:
def scrape_table(url, timeout=30):
    """Fetch ``url`` and return its first HTML ``<table>`` as a DataFrame.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the response before giving up.

    Returns:
        pandas.DataFrame: The first ``<table>`` element of the page, parsed
        by ``pandas.read_html``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no ``<table>`` element.
    """
    # Local import keeps this cell self-contained; StringIO is only needed here.
    from io import StringIO

    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the first table on the page is used.
    table = soup.find('table')
    if table is None:
        # Without this guard the literal text "None" would be handed to read_html.
        raise ValueError(f"No <table> element found at {url}")

    # read_html expects a file-like object; passing markup as a bare string
    # is deprecated in recent pandas releases.
    return pd.read_html(StringIO(str(table)))[0]

In [27]:
def scrape_all_pages(base_url, start, end):
    """Scrape the table of every page from ``start`` to ``end`` into one DataFrame.

    Each page is addressed as ``{base_url}?page={i}`` and parsed with
    ``scrape_table``. The per-page frames are stacked in page order with a
    fresh 0..n-1 index.

    Args:
        base_url: Address of the paginated listing, without the page parameter.
        start: First page number to scrape (inclusive).
        end: Last page number to scrape (inclusive).

    Returns:
        pandas.DataFrame: All rows from the requested pages; an empty
        DataFrame when the range contains no pages.
    """
    # Collect the frames first and concatenate once; concatenating inside the
    # loop re-copies all earlier rows on every iteration.
    frames = [scrape_table(f"{base_url}?page={page}") for page in range(start, end + 1)]

    # pd.concat rejects an empty list, so keep the empty-range result explicit.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Listing whose result tables are scraped; pages 1 through 20 are combined.
base_url = "https://projects.propublica.org/represent/statements"
all_data = scrape_all_pages(base_url, start=1, end=20)

In [28]:
# Display the combined rows from all scraped pages.
all_data

Unnamed: 0,Date,Member,Party,State / District,Title
0,May 12,Ayanna Pressley,D,Mass.-7,"WATCH: In Powerful Mothers Day Speech, Pressle..."
1,May 10,Adrian Smith,R,Neb.-3,Pushing Back on Unworkable Executive Mandates
2,May 9,Robert E. Latta,R,Ohio-5,Latta's Op-Ed: EPAs new power plant rule is ne...
3,May 10,Morgan Griffith,R,Va.-9,Congressman Griffiths Weekly E-Newsletter 5.10.24
4,May 3,Steve Cohen,D,Tenn.-9,Celebrating the Grand Opening of the Orange Mo...
...,...,...,...,...,...
995,April 10,Tammy Duckworth,D,Ill.,Duckworth Discusses Chicagoland Transportation...
996,April 10,Tammy Duckworth,D,Ill.,Duckworth Secures Support from Defense Secreta...
997,April 10,Richard J. Durbin,D,Ill.,Durbin Delivers Opening Statement During Senat...
998,April 10,Richard J. Durbin,D,Ill.,Durbin Questions COPS Office Director During S...


In [30]:
# Attach the scraped statement text to the table rows by position.
# A plain Series assignment aligns on index labels and silently pads with NaN
# or drops rows when the two frames differ in length, so fail loudly instead.
# NOTE(review): the links and the table rows come from separate crawls of the
# same pages — confirm the listing did not change between the two.
if len(PressDf) != len(all_data):
    raise ValueError(
        f"Cannot attach press releases: {len(PressDf)} texts for {len(all_data)} table rows"
    )
all_data['Press Release'] = PressDf['Press Release'].to_numpy()

In [31]:
# Display the table with the 'Press Release' column attached.
all_data

Unnamed: 0,Date,Member,Party,State / District,Title,Press Release
0,May 12,Ayanna Pressley,D,Mass.-7,"WATCH: In Powerful Mothers Day Speech, Pressle...","May 12, 2024 “Mr. Speaker, mothers across Amer..."
1,May 10,Adrian Smith,R,Neb.-3,Pushing Back on Unworkable Executive Mandates,"502 Cannon HOB, Washington, DC 20515Email Me(2..."
2,May 9,Robert E. Latta,R,Ohio-5,Latta's Op-Ed: EPAs new power plant rule is ne...,Congressman Bob Latta (R-OH5) penned an op-ed ...
3,May 10,Morgan Griffith,R,Va.-9,Congressman Griffiths Weekly E-Newsletter 5.10.24,It has been roughly four and a half years sinc...
4,May 3,Steve Cohen,D,Tenn.-9,Celebrating the Grand Opening of the Orange Mo...,"May 3, 2024 Dear Friend, Last Friday, I atte..."
...,...,...,...,...,...,...
995,April 10,Tammy Duckworth,D,Ill.,Duckworth Discusses Chicagoland Transportation...,"[WASHINGTON, D.C.] – U.S. Senator Tammy Duck..."
996,April 10,Tammy Duckworth,D,Ill.,Duckworth Secures Support from Defense Secreta...,"[WASHINGTON, DC] – Combat Veteran and U.S. S..."
997,April 10,Richard J. Durbin,D,Ill.,Durbin Delivers Opening Statement During Senat...,WASHINGTON – U.S. Senate Majority Whip Dick Du...
998,April 10,Richard J. Durbin,D,Ill.,Durbin Questions COPS Office Director During S...,WASHINGTON – U.S. Senate Majority Whip Dick Du...


In [33]:
# Write the merged table (rows plus statement text) to disk without the index column.
all_data.to_csv('RawData.csv',index=False)

In [34]:
# Member names that determine_caucus labels 'Freedom Party'.
# Matching is by exact string equality against the 'Member' column, so each
# entry must be a complete member name. The previous list also contained three
# state names ('North Carolina', 'South Carolina', 'West Virginia') and two
# truncated names ('Anna Paulina', 'Scott Des'); those are removed/completed here.
# NOTE(review): spellings are assumed to match how the listing prints each
# member — verify against the scraped 'Member' values.
FreedomP = [
    'Barry Moore', 'Gary Palmer', 'Eli Crane', 'Andy Biggs',
    'Debbie Lesko', 'Paul Gosar', 'Lauren Boebert', 'Bill Posey',
    'Anna Paulina Luna', 'Greg Steube', 'Byron Donalds', 'Andrew Clyde',
    'Mike Collins', 'Russ Fulcher', 'Mary Miller', 'Clay Higgins',
    'Andy Harris', 'Eric Burlison', 'Matt Rosendale', 'Greg Murphy',
    'Dan Bishop', 'Jim Jordan', 'Warren Davidson', 'Josh Brecheen',
    'Scott Perry', 'Jeff Duncan', 'Ralph Norman', 'Diana Harshbarger',
    'Scott DesJarlais', 'Andy Ogles', 'Mark Green', 'Keith Self',
    'Chip Roy', 'Troy Nehls', 'Michael Cloud', 'Bob Good',
    'Morgan Griffith', 'Alex Mooney', 'Tom Tiffany', 'Harriet Hageman',
]

In [35]:
# Member names that determine_caucus labels 'Congress Progressive'
# (compared by exact string equality).
CongProg = [
    "Raúl Grijalva", "Jared Huffman", "John Garamendi", "Mark DeSaulnier",
    "Barbara Lee", "Ro Khanna", "Judy Chu", "Grace Napolitano",
    "Brad Sherman", "Jimmy Gomez", "Ted Lieu", "Sydney Kamlager-Dove",
    "Linda Sánchez", "Mark Takano", "Robert Garcia", "Maxine Waters",
    "Nanette Barragán", "Katie Porter", "Mike Levin", "Sara Jacobs",
    "Diana DeGette", "Joe Neguse", "Rosa DeLauro", "Debbie Wasserman Schultz",
    "Lucy McBath", "Hank Johnson", "Jill Tokuda", "Jonathan Jackson",
    "Delia Ramirez", "Chuy García", "Danny Davis", "Jan Schakowsky",
    "Sharice Davids", "Chellie Pingree", "Kweisi Mfume", "Jamie Raskin",
    "Jim McGovern", "Lori Trahan", "Ayanna Pressley", "Debbie Dingell",
    "Rashida Tlaib", "Shri Thanedar", "Ilhan Omar", "Cori Bush",
    "Donald Norcross", "Andy Kim", "Frank Pallone", "Bonnie Watson Coleman",
    "Melanie Stansbury", "Teresa Leger Fernandez", "Grace Meng", "Nydia Velázquez",
    "Yvette Clarke", "Dan Goldman", "Jerry Nadler", "Adriano Espaillat",
    "Alexandria Ocasio-Cortez", "Jamaal Bowman", "Paul Tonko", "Jennifer McClellan",
    "Pramila Jayapal", "Mark Pocan", "Gwen Moore", "Eleanor Holmes Norton",
]

In [36]:
# Member names that determine_caucus labels 'Main Street'
# (compared by exact string equality against the 'Member' column).
MainStreet =[
    "Don Bacon",
    "Andy Barr",
    "Aaron Bean",
    "Stephanie Bice",
    "Larry Bucshon",
    "Ken Calvert",
    "Lori Chavez-DeRemer",
    "Juan Ciscomani",
    "Jenniffer González-Colón",
    "Dan Crenshaw",
    "Monica De La Cruz",
    "Mario Díaz-Balart",
    "John Duarte",
    "Anthony D'Esposito",
    "Chuck Edwards",
    "Jake Ellzey",
    "Randy Feenstra",
    "Brad Finstad",
    "Brian Fitzpatrick",
    "Mike Flood",
    "Mike Gallagher",
    "Andrew Garbarino",
    "Michael Guest",
    "Erin Houchin",
    "Bill Huizenga",
    "David Joyce",
    "Tom Kean Jr.",
    "Jen Kiggans",
    "Young Kim",
    "Nick LaLota",
    "Nick Langworthy",
    "Mike Lawler",
    "Laurel Lee",
    "Michael McCaul",
    "Lisa McClain",
    "Max Miller",
    "Carol Miller",
    "Marc Molinaro",
    "John Moolenaar",
    "Blake Moore",
    "Nathaniel Moran",
    "Dan Newhouse",
    "Jay Obernolte",
    "Guy Reschenthaler",
    "David Rouzer",
    "John Rutherford",
    "Peter Sessions",
    "Mike Simpson",
    "Michelle Steel",
    "Mike Turner",
    "David Valadao",
    "Derrick Van Orden",
    "Michael Waltz"
]


In [37]:
# Member names that determine_caucus labels 'New Democrat Coalition'
# (compared by exact string equality).
Coalition = [
    "Terri Sewell", "Greg Stanton", "Ami Bera", "Josh Harder",
    "Jim Costa", "Salud Carbajal", "Raul Ruiz", "Julia Brownley",
    "Adam Schiff", "Pete Aguilar", "Norma Torres", "Lou Correa",
    "Scott H. Peters", "Jason Crow", "Brittany Pettersen", "Yadira Caraveo",
    "Joe Courtney", "Jim Himes", "Jared Moskowitz", "David Scott",
    "Ed Case", "Mike Quigley", "Sean Casten", "Raja Krishnamoorthi",
    "Brad Schneider", "Bill Foster", "Nikki Budzinski", "Eric Sorensen",
    "Frank J. Mrvan", "Glenn Ivey", "David Trone", "Seth Moulton",
    "Bill Keating", "Hillary Scholten", "Elissa Slotkin", "Haley Stevens",
    "Angie Craig", "Dean Phillips", "Susie Lee", "Chris Pappas",
    "Ann McLane Kuster", "Josh Gottheimer", "Mikie Sherrill", "Gabe Vasquez",
    "Gregory Meeks", "Pat Ryan", "Joe Morelle", "Tom Suozzi",
    "Don Davis", "Deborah K. Ross", "Kathy Manning", "Wiley Nickel",
    "Jeff Jackson", "Greg Landsman", "Emilia Sykes", "Lizzie Fletcher",
    "Joaquin Castro", "Henry Cuellar", "Colin Allred", "Marc Veasey",
    "Vicente Gonzalez", "Abigail Spanberger", "Jennifer Wexton", "Gerry Connolly",
    "Suzan DelBene", "Rick Larsen", "Derek Kilmer", "Kim Schrier",
    "Marilyn Strickland", "Stacey Plaskett",
]

In [38]:
def determine_caucus(member):
    """Map a member name to the label of the first caucus list that contains it.

    The lists are consulted in a fixed order (FreedomP, CongProg, MainStreet,
    Coalition), so a name present in several lists gets the earliest label.
    Comparison is exact string equality.

    Args:
        member: Member name as it appears in the 'Member' column.

    Returns:
        str: The caucus label, or the literal string 'NaN' when no list
        contains the name.
    """
    # (label, roster) pairs in precedence order; looked up at call time so the
    # function always sees the current contents of the module-level lists.
    rosters = (
        ('Freedom Party', FreedomP),
        ('Congress Progressive', CongProg),
        ('Main Street', MainStreet),
        ('New Democrat Coalition', Coalition),
    )
    for label, names in rosters:
        if member in names:
            return label
    return 'NaN'


# Label every row with the caucus of its 'Member' value.
all_data['Caucus'] = all_data['Member'].map(determine_caucus)

In [40]:
# Preview the first rows, now including the 'Caucus' column.
all_data.head()

Unnamed: 0,Date,Member,Party,State / District,Title,Press Release,Caucus
0,May 12,Ayanna Pressley,D,Mass.-7,"WATCH: In Powerful Mothers Day Speech, Pressle...","May 12, 2024 “Mr. Speaker, mothers across Amer...",Congress Progressive
1,May 10,Adrian Smith,R,Neb.-3,Pushing Back on Unworkable Executive Mandates,"502 Cannon HOB, Washington, DC 20515Email Me(2...",
2,May 9,Robert E. Latta,R,Ohio-5,Latta's Op-Ed: EPAs new power plant rule is ne...,Congressman Bob Latta (R-OH5) penned an op-ed ...,
3,May 10,Morgan Griffith,R,Va.-9,Congressman Griffiths Weekly E-Newsletter 5.10.24,It has been roughly four and a half years sinc...,Freedom Party
4,May 3,Steve Cohen,D,Tenn.-9,Celebrating the Grand Opening of the Orange Mo...,"May 3, 2024 Dear Friend, Last Friday, I atte...",


In [42]:
# Keep only the rows whose member matched one of the caucus lists;
# unmatched rows carry the literal string 'NaN' in the 'Caucus' column.
Caucus = all_data.loc[all_data['Caucus'].ne('NaN')]
Caucus

Unnamed: 0,Date,Member,Party,State / District,Title,Press Release,Caucus
0,May 12,Ayanna Pressley,D,Mass.-7,"WATCH: In Powerful Mothers Day Speech, Pressle...","May 12, 2024 “Mr. Speaker, mothers across Amer...",Congress Progressive
3,May 10,Morgan Griffith,R,Va.-9,Congressman Griffiths Weekly E-Newsletter 5.10.24,It has been roughly four and a half years sinc...,Freedom Party
8,May 10,Nanette Barragán,D,Calif.-44,Affordable housing crisis looms large in election,"The Hill | Taylor Giorno May 1, 2024 “Congre...",Congress Progressive
21,May 8,John Moolenaar,R,Mich.-2,Congressman Moolenaar Named Chairman of the Ho...,Subscribe to Congressman Moolenaar's Newslette...,Main Street
23,May 9,Morgan Griffith,R,Va.-9,Griffith Celebrates Approximately $75 Million ...,The U.S. Department of the Interior’s National...,Freedom Party
...,...,...,...,...,...,...,...
984,April 11,Mike Levin,D,Calif.-49,SEEC Clean Energy Deployment Task Force Co-Cha...,"Washington, D.C. — Today, the Co-Chairs of the...",Congress Progressive
985,April 12,Mike Levin,D,Calif.-49,Rep. Levin and House Natural Resources Committ...,"Washington, D.C.- Rep. Mike Levin (D-Calif) jo...",Congress Progressive
986,April 12,Mike Levin,D,Calif.-49,Rep. Mike Levin and San Diego Congressional De...,"Washington, D.C. – Today, Rep. Mike Levin (CA-...",Congress Progressive
990,April 11,Jared Huffman,D,Calif.-2,Rep. Huffman Statement on 2024 Salmon Season C...,"Washington, D.C. – Today, U.S. Representative ...",Congress Progressive


In [43]:
# Write the caucus-only subset to disk without the index column.
Caucus.to_csv('RawCaucus.csv',index=False)