In [2]:
from io import StringIO

import numpy as np
import pandas as pd

In [3]:
from tqdm import tqdm
import time

## Web scraping ProPublica to get the data

In [4]:
import requests
from bs4 import BeautifulSoup

def scrape_links(base_url, num_pages, timeout=30):
    """Collect the links found in the results table of each listing page.

    Pages are requested as ``{base_url}?page=N`` for N = 1..num_pages.

    Args:
        base_url: Listing URL without a query string.
        num_pages: Number of pages to fetch; values below 1 fetch nothing.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        list[str]: The ``href`` of every ``<a href=...>`` inside the first
        ``<tbody>`` of each page, in page order. A page without a
        ``<tbody>`` contributes no links.

    Raises:
        requests.HTTPError: If a page responds with an HTTP error status.
        requests.RequestException: On connection failures or timeouts.
    """
    all_links = []

    for page in range(1, num_pages + 1):
        url = f"{base_url}?page={page}"

        # Without a timeout a single stalled connection would hang the whole scrape.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Only links inside the table body are wanted.
        tbody = soup.find('tbody')
        if tbody:
            all_links.extend(a['href'] for a in tbody.find_all('a', href=True))

    return all_links

In [5]:
# Listing endpoint whose paginated results table is walked
base_url = 'https://projects.propublica.org/represent/statements'

# How many result pages to request
num_pages = 40

# Gather every table link across those pages
links = scrape_links(base_url=base_url, num_pages=num_pages)

In [6]:
type(links)

list

In [7]:
len(links)

2000

In [8]:
# Keep the links at odd positions (1, 3, 5, ...) of the scraped list.
# NOTE(review): presumably every table row yields two links and the second one
# points at the press release — confirm against the page markup.
press_links = links[1::2]

In [9]:
def scrape_p_tags(url, timeout=30):
    """Return the text of every ``<p>`` element on a page, joined by spaces.

    This is a best-effort helper: it never raises. Any failure (connection
    error, timeout, HTTP error status, parse error) is reported through the
    return value instead.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        str: The joined paragraph text on success, otherwise a message of the
        form ``"Failed to scrape <url>: <error>"``. Callers that need clean
        data should filter on that prefix.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx responses as failures so that error pages are not
        # stored as if they were press-release text.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return ' '.join(tag.get_text() for tag in soup.find_all('p'))
    except Exception as e:
        # Deliberately broad: one bad page must not abort a long scraping loop.
        return f"Failed to scrape {url}: {str(e)}"

In [11]:
# Fetch the paragraph text behind each press-release link, in order;
# tqdm wraps the iterable to show progress.
results = [scrape_p_tags(link) for link in tqdm(press_links)]

100%|███████████████████████████████████████| 1000/1000 [05:46<00:00,  2.89it/s]


In [12]:
# One-column frame holding the scraped text, one row per press-release link
PressDf = pd.DataFrame(results, columns=['Press Release'])

In [13]:
PressDf

Unnamed: 0,Press Release
0,FOR IMMEDIATE RELEASE | Contact: Olivia Porcar...
1,"2306 Rayburn HOB Washington, DC 20004Email Me2..."
2,"2306 Rayburn HOB Washington, DC 20004Email Me2..."
3,"2306 Rayburn HOB Washington, DC 20004Email Me2..."
4,"2306 Rayburn HOB Washington, DC 20004Email Me2..."
...,...
995,"Today, U.S. Representatives Gerald E. Connoll..."
996,"Congressman Gerry Connolly (D-VA), the Ranking..."
997,Gil Navarro believed in service to others and ...
998,"WASHINGTON, D.C. – Congressman Mark Takano (CA..."


In [14]:
PressDf.iloc[0,0]

'FOR IMMEDIATE RELEASE |\xa0Contact: Olivia Porcaro 202-225-6165 Over the past few months, we have watched violent, antisemitic, un-American, and pro-terrorist mobs take over college campuses across the country, endangering Jewish students and other innocent bystanders. Yet, for months, many Democrats have refused to speak out against the protests.  Sadly, this is no surprise. Antisemitism is a disease that the Far-Left, along with liberal university leaders, has allowed to spread.  In fact, over the past year, we have watched Democrats spread anti-Israel rhetoric time and time again. Some have called for Israeli Prime Minister Netanyahu to be replaced, one said it was an “honor” to visit with the pro-Hamas protestors at Columbia University, one defended the phrase “from the river to the sea”, which implies the destruction of Israel, one denounced Israel as a “racist state,” and more.  In addition, Far-Left Democrats in the House of Representatives have repeatedly voted against legisla

In [15]:
PressDf[PressDf['Press Release'].str.contains('Permalink:')]

Unnamed: 0,Press Release
80,"WASHINGTON, DC – Rep. Donald Norcross (D-NJ), ..."
86,"CHERRY HILL, NJ – Today, Rep. Donald Norcross ..."
157,"CHERRY HILL, NJ – Today, Rep. Donald Norcross ..."
212,"CHERRY HILL, NJ – Today, Rep. Donald Norcross ..."
282,"WASHINGTON – Today, the House Committee on Vet..."
...,...
920,"WASHINGTON, DC – Today, U.S. Representatives A..."
921,"Washington, DC – Today, the U.S. Department of..."
979,"WASHINGTON, D.C. - Today, Congressman Carlos A..."
980,"MIAMI, FL - Today, Congressman Carlos A. Gimén..."


In [17]:
len(results)

1000

In [18]:
def scrape_table(url, timeout=30):
    """Fetch a page and return its first HTML ``<table>`` as a DataFrame.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        pandas.DataFrame: The first table on the page as parsed by
        ``pd.read_html``.

    Raises:
        requests.HTTPError: If the page responds with an HTTP error status.
        requests.RequestException: On connection failures or timeouts.
        ValueError: If the page contains no ``<table>`` element.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the first table is used; pass class_ or id to find() to be more specific.
    table = soup.find('table')
    if table is None:
        raise ValueError(f"No <table> element found at {url}")

    # read_html wants a file-like object; handing it literal HTML as a plain
    # string is deprecated in recent pandas releases.
    return pd.read_html(StringIO(str(table)))[0]


In [19]:
def scrape_all_pages(base_url, start, end):
    """Scrape the table from pages ``start``..``end`` (inclusive) into one frame.

    Each page is requested as ``{base_url}?page=N`` via ``scrape_table``.

    Args:
        base_url: Listing URL without a query string.
        start: First page number to fetch.
        end: Last page number to fetch (inclusive).

    Returns:
        pandas.DataFrame: All page tables stacked in page order with a fresh
        0..n-1 index. An empty DataFrame when ``start > end``.
    """
    # Collect the per-page frames first and concatenate once; growing a
    # DataFrame inside the loop re-copies all earlier rows on every iteration.
    frames = [scrape_table(f"{base_url}?page={page}") for page in range(start, end + 1)]

    # pd.concat rejects an empty list, so keep the "nothing scraped" result explicit.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Listing endpoint for the table scrape
base_url = "https://projects.propublica.org/represent/statements"

# Pull the table from pages 1 through 20 (both ends inclusive)
all_data = scrape_all_pages(base_url, start=1, end=20)


In [20]:
all_data

Unnamed: 0,Date,Member,Party,State / District,Title
0,May 8,Kevin McCarthy,R,Calif.-20,Speaker Johnson: Americans Should Decide Ameri...
1,May 8,Kevin McCarthy,R,Calif.-20,"Speaker Johnson, Leader McConnell: “Security A..."
2,May 8,Kevin McCarthy,R,Calif.-20,Speaker Johnson Remarks Following Failed Motio...
3,May 8,Kevin McCarthy,R,Calif.-20,Speaker Johnson Statement on House Passage of ...
4,May 9,Kevin McCarthy,R,Calif.-20,Speaker Johnson on Fox and Friends: “I Appreci...
...,...,...,...,...,...
995,April 10,Rick Scott,R,Fla.,"Senators Rick Scott, Ted Budd & Joni Ernst Int..."
996,April 11,Rick Scott,R,Fla.,Sen. Rick Scott: Bidenomics is Hammering Flori...
997,April 11,Rick Scott,R,Fla.,ICYMI... Sen. Rick Scott in Fox News: I Just C...
998,April 11,Rick Scott,R,Fla.,Sen. Rick Scott: Social Security Administratio...


In [21]:
# Attach the press-release text to the table metadata purely by row position.
# The two frames come from separate scraping passes, so nothing ties row i of
# one to row i of the other except their order.
# NOTE(review): the displayed outputs suggest the passes are not row-aligned
# (e.g. the release text at index 15 reads like the title shown at index 26) —
# confirm before relying on per-row pairings.
if len(all_data) != len(PressDf):
    # A mismatch would otherwise be padded with NaN silently and only surface
    # later as a TypeError when len() is applied to the missing text.
    raise ValueError(
        f"Cannot align metadata ({len(all_data)} rows) with press releases "
        f"({len(PressDf)} rows): row counts differ"
    )
data = pd.concat([all_data, PressDf], axis=1)

In [31]:
data.iloc[15,5]

'May 8, 2024 Lawmakers Continue Calls for Extending TPS for Haiti, Halting Deportations, Disrupting Arms Trafficking, Providing Humanitarian & Economic Assistance, and More Press Conference (Twitter) | Photos (Dropbox) WASHINGTON\xa0– Haiti Caucus Co-Chairs Congresswomen Ayanna Pressley (MA-07), Yvette D. Clarke (NY-09) and Sheila Cherfilus-McCormick (FL-20), along with Congresswoman Cori Bush (MO-01), Congresswoman Frederica Wilson (FL-24), and Congresswoman Barbara Lee (CA-13), held a Capitol Hill press conference with colleagues and advocates to renew their calls for urgent federal action to address the growing crisis in Haiti. The press conference comes at a time when the humanitarian, political, economic, and security crises in Haiti are becoming increasingly dire.\xa0 The policies called for by the lawmakers and advocates include: extending and redesignating TPS for Haiti; halting all deportations to Haiti; disrupting arms trafficking to the island; providing urgent humanitarian 

In [27]:
# Write the combined frame to PressRelease.csv, leaving out the index column.
data.to_csv('PressRelease.csv', index=False)

## Caucus membership lists and adding them to the dataframe

In [59]:
# Plain Python list of the 'Member' column, one entry per row
MemberList = data.loc[:, 'Member'].to_list()

In [175]:
# Names labelled 'Congress Progressive' by determine_caucus (exact string match).
CongProg = [
    "Raúl Grijalva", "Jared Huffman", "John Garamendi", "Mark DeSaulnier",
    "Barbara Lee", "Ro Khanna", "Judy Chu", "Grace Napolitano",
    "Brad Sherman", "Jimmy Gomez", "Ted Lieu", "Sydney Kamlager-Dove",
    "Linda Sánchez", "Mark Takano", "Robert Garcia", "Maxine Waters",
    "Nanette Barragán", "Katie Porter", "Mike Levin", "Sara Jacobs",
    "Diana DeGette", "Joe Neguse", "Rosa DeLauro", "Debbie Wasserman Schultz",
    "Lucy McBath", "Hank Johnson", "Jill Tokuda", "Jonathan Jackson",
    "Delia Ramirez", "Chuy García", "Danny Davis", "Jan Schakowsky",
    "Sharice Davids", "Chellie Pingree", "Kweisi Mfume", "Jamie Raskin",
    "Jim McGovern", "Lori Trahan", "Ayanna Pressley", "Debbie Dingell",
    "Rashida Tlaib", "Shri Thanedar", "Ilhan Omar", "Cori Bush",
    "Donald Norcross", "Andy Kim", "Frank Pallone", "Bonnie Watson Coleman",
    "Melanie Stansbury", "Teresa Leger Fernandez", "Grace Meng", "Nydia Velázquez",
    "Yvette Clarke", "Dan Goldman", "Jerry Nadler", "Adriano Espaillat",
    "Alexandria Ocasio-Cortez", "Jamaal Bowman", "Paul Tonko", "Jennifer McClellan",
    "Pramila Jayapal", "Mark Pocan", "Gwen Moore", "Eleanor Holmes Norton",
]

In [176]:
# Names labelled 'Freedom Party' by determine_caucus (exact string match).
# The previous list carried three state names ('North Carolina',
# 'South Carolina', 'West Virginia') that are not members, and two truncated
# names ('Anna Paulina', 'Scott Des') that could never match a full name.
# NOTE(review): verify the restored spellings against the 'Member' column.
FreedomP = [
    'Barry Moore', 'Gary Palmer', 'Eli Crane', 'Andy Biggs',
    'Debbie Lesko', 'Paul Gosar', 'Lauren Boebert', 'Bill Posey',
    'Anna Paulina Luna', 'Greg Steube', 'Byron Donalds', 'Andrew Clyde',
    'Mike Collins', 'Russ Fulcher', 'Mary Miller', 'Clay Higgins',
    'Andy Harris', 'Eric Burlison', 'Matt Rosendale', 'Greg Murphy',
    'Dan Bishop', 'Jim Jordan', 'Warren Davidson', 'Josh Brecheen',
    'Scott Perry', 'Jeff Duncan', 'Ralph Norman', 'Diana Harshbarger',
    'Scott DesJarlais', 'Andy Ogles', 'Mark Green', 'Keith Self',
    'Chip Roy', 'Troy Nehls', 'Michael Cloud', 'Bob Good',
    'Morgan Griffith', 'Alex Mooney', 'Tom Tiffany', 'Harriet Hageman',
]

In [177]:
# Names labelled 'Main Street' by determine_caucus (exact string match).
MainStreet = [
    "Don Bacon", "Andy Barr", "Aaron Bean", "Stephanie Bice",
    "Larry Bucshon", "Ken Calvert", "Lori Chavez-DeRemer", "Juan Ciscomani",
    "Jenniffer González-Colón", "Dan Crenshaw", "Monica De La Cruz", "Mario Díaz-Balart",
    "John Duarte", "Anthony D'Esposito", "Chuck Edwards", "Jake Ellzey",
    "Randy Feenstra", "Brad Finstad", "Brian Fitzpatrick", "Mike Flood",
    "Mike Gallagher", "Andrew Garbarino", "Michael Guest", "Erin Houchin",
    "Bill Huizenga", "David Joyce", "Tom Kean Jr.", "Jen Kiggans",
    "Young Kim", "Nick LaLota", "Nick Langworthy", "Mike Lawler",
    "Laurel Lee", "Michael McCaul", "Lisa McClain", "Max Miller",
    "Carol Miller", "Marc Molinaro", "John Moolenaar", "Blake Moore",
    "Nathaniel Moran", "Dan Newhouse", "Jay Obernolte", "Guy Reschenthaler",
    "David Rouzer", "John Rutherford", "Peter Sessions", "Mike Simpson",
    "Michelle Steel", "Mike Turner", "David Valadao", "Derrick Van Orden",
    "Michael Waltz",
]


In [178]:
# Names labelled 'New Democrat Coalition' by determine_caucus (exact string match).
Coalition = [
    "Terri Sewell", "Greg Stanton", "Ami Bera", "Josh Harder",
    "Jim Costa", "Salud Carbajal", "Raul Ruiz", "Julia Brownley",
    "Adam Schiff", "Pete Aguilar", "Norma Torres", "Lou Correa",
    "Scott H. Peters", "Jason Crow", "Brittany Pettersen", "Yadira Caraveo",
    "Joe Courtney", "Jim Himes", "Jared Moskowitz", "David Scott",
    "Ed Case", "Mike Quigley", "Sean Casten", "Raja Krishnamoorthi",
    "Brad Schneider", "Bill Foster", "Nikki Budzinski", "Eric Sorensen",
    "Frank J. Mrvan", "Glenn Ivey", "David Trone", "Seth Moulton",
    "Bill Keating", "Hillary Scholten", "Elissa Slotkin", "Haley Stevens",
    "Angie Craig", "Dean Phillips", "Susie Lee", "Chris Pappas",
    "Ann McLane Kuster", "Josh Gottheimer", "Mikie Sherrill", "Gabe Vasquez",
    "Gregory Meeks", "Pat Ryan", "Joe Morelle", "Tom Suozzi",
    "Don Davis", "Deborah K. Ross", "Kathy Manning", "Wiley Nickel",
    "Jeff Jackson", "Greg Landsman", "Emilia Sykes", "Lizzie Fletcher",
    "Joaquin Castro", "Henry Cuellar", "Colin Allred", "Marc Veasey",
    "Vicente Gonzalez", "Abigail Spanberger", "Jennifer Wexton", "Gerry Connolly",
    "Suzan DelBene", "Rick Larsen", "Derek Kilmer", "Kim Schrier",
    "Marilyn Strickland", "Stacey Plaskett",
]

In [179]:
# `test` is bound to the same DataFrame object as `data` (no copy is made),
# so any column assigned on `test` afterwards also appears on `data`.
test = data

In [201]:
def determine_caucus(member):
    """Return the caucus label for a member name, or the string 'NaN'.

    The name is compared by exact membership against the roster lists in a
    fixed order — FreedomP, CongProg, MainStreet, Coalition — and the first
    roster containing it decides the label. A name found in none of them
    gets the literal string 'NaN' (a plain string, not a missing value).
    """
    rosters = (
        ('Freedom Party', FreedomP),
        ('Congress Progressive', CongProg),
        ('Main Street', MainStreet),
        ('New Democrat Coalition', Coalition),
    )
    for label, roster in rosters:
        if member in roster:
            return label
    return 'NaN'


# Label every row from its 'Member' value
test['Caucus'] = test['Member'].apply(determine_caucus)

In [198]:
len(test['Member'].unique())

187

In [204]:
# Rows whose member matched a roster ('NaN' is the no-match label string)
Caucus = test.loc[test['Caucus'] != 'NaN']

In [207]:
Caucus

Unnamed: 0,Date,Member,Party,State / District,Title,Press Release,Caucus
6,May 9,Pramila Jayapal,D,Wash.-7,Jayapal Statement Commending Biden for Stateme...,"2306 Rayburn HOB Washington, DC 20004Email Me2...",Congress Progressive
23,May 8,Morgan Griffith,R,Va.-9,Griffith Statement on President Biden Pause on...,"Washington, DC – Today, Democratic Leader Hake...",Freedom Party
26,May 8,Ayanna Pressley,D,Mass.-7,"Pressley, Haiti Caucus, Colleagues, Advocates ...","Washington, DC – Today, Congresswoman Julia Br...",Congress Progressive
27,May 8,Pramila Jayapal,D,Wash.-7,"Jayapal, Padilla, Congressional Leaders, Advoc...","April 23, 2024 Text of Letter (PDF) WASHINGTON...",Congress Progressive
30,May 3,Andy Biggs,R,Ariz.-5,Opposing the Uniparty's Latest Attack on the C...,In advance of chairing a congressi...,Freedom Party
...,...,...,...,...,...,...,...
958,April 11,Mike Levin,D,Calif.-49,SEEC Clean Energy Deployment Task Force Co-Cha...,\nReference ID: 18.d83a2217.1715273339.7623682...,Congress Progressive
959,April 12,Mike Levin,D,Calif.-49,Rep. Levin and House Natural Resources Committ...,\nReference ID: 18.d83a2217.1715273339.7623c7a...,Congress Progressive
960,April 12,Mike Levin,D,Calif.-49,Rep. Mike Levin and San Diego Congressional De...,\nReference ID: 18.d83a2217.1715273339.762408d...,Congress Progressive
964,April 11,Jared Huffman,D,Calif.-2,Rep. Huffman Statement on 2024 Salmon Season C...,\nReference ID: 18.d83a2217.1715273340.7625986...,Congress Progressive


In [206]:
len(Caucus['Member'].unique())

54

Remove any rows where the press release is shorter than 150 characters, because these texts tend to be website boilerplate, addresses, etc. — text that is not useful to us.

In [216]:
import pandas as pd

# Rows whose scraped text is under 150 characters, i.e. the rows slated for removal
EmptyPressCaucus = Caucus.loc[Caucus['Press Release'].map(len) < 150]
EmptyPressAll = data.loc[data['Press Release'].map(len) < 150]

In [214]:
# Caucus.to_csv('Caucus.csv', index=False)

In [225]:
(EmptyPressAll)

Unnamed: 0,Date,Member,Party,State / District,Title,Press Release,Caucus
1,May 8,Kevin McCarthy,R,Calif.-20,"Speaker Johnson, Leader McConnell: “Security A...","2306 Rayburn HOB Washington, DC 20004Email Me2...",
2,May 8,Kevin McCarthy,R,Calif.-20,Speaker Johnson Remarks Following Failed Motio...,"2306 Rayburn HOB Washington, DC 20004Email Me2...",
3,May 8,Kevin McCarthy,R,Calif.-20,Speaker Johnson Statement on House Passage of ...,"2306 Rayburn HOB Washington, DC 20004Email Me2...",
4,May 9,Kevin McCarthy,R,Calif.-20,Speaker Johnson on Fox and Friends: “I Appreci...,"2306 Rayburn HOB Washington, DC 20004Email Me2...",
5,May 9,Kevin McCarthy,R,Calif.-20,Speaker Johnson on Biden Abandoning Israel: “1...,"2306 Rayburn HOB Washington, DC 20004Email Me2...",
6,May 9,Pramila Jayapal,D,Wash.-7,Jayapal Statement Commending Biden for Stateme...,"2306 Rayburn HOB Washington, DC 20004Email Me2...",Congress Progressive
7,May 7,Robert E. Latta,R,Ohio-5,Latta Announces 2024 Congressional Art Competi...,"2306 Rayburn HOB Washington, DC 20004Email Me2...",
8,May 8,Robert E. Latta,R,Ohio-5,Latta Honors Ohio Students Accepted to U.S. Mi...,"2306 Rayburn HOB Washington, DC 20004Email Me2...",
9,May 9,Blaine Luetkemeyer,R,Mo.-3,Rep. Luetkemeyer Calls for Chairman Gruenbergs...,"2306 Rayburn HOB Washington, DC 20004Email Me2...",
10,May 9,Michael C. Burgess,R,Tex.-26,Burgess Delivers Commencement Speech at UTSW M...,"2306 Rayburn HOB Washington, DC 20004Email Me2...",


In [220]:
len(EmptyPressCaucus)

9

In [223]:
# Keep rows with at least 150 characters of text. Using ">=" makes this the
# exact complement of the "< 150" short-text selection; a strict ">" would
# silently drop rows of exactly 150 characters from both sets.
CaucusFilter = Caucus[Caucus['Press Release'].apply(lambda x: len(x) >= 150)]
AllData = data[data['Press Release'].apply(lambda x: len(x) >= 150)]

In [226]:
AllData.head()

Unnamed: 0,Date,Member,Party,State / District,Title,Press Release,Caucus
0,May 8,Kevin McCarthy,R,Calif.-20,Speaker Johnson: Americans Should Decide Ameri...,FOR IMMEDIATE RELEASE | Contact: Olivia Porcar...,
11,May 6,Tom Cole,R,Okla.-4,The Far-Lefts Serious Anti-Israel Problem,"May is Asian American, Native Hawaiian, and Pa...",
12,May 6,Gus Bilirakis,R,Fla.-12,Financial Disclosure Pasco County Utilities Pr...,"""It is unconscionable that President Biden is ...",
13,May 6,Gus Bilirakis,R,Fla.-12,Hernando Sheriff Project Financial Disclosure,"Washington, DC – House Democratic Leader Hakee...",
14,May 6,Gus Bilirakis,R,Fla.-12,Hernando Schools Financial Disclosure,"May 8, 2024 Contact: Taylor Haulsee “Never A...",


In [234]:
# Save both filtered frames as CSV, leaving out the index column
for frame, filename in ((AllData, 'AllData.csv'), (CaucusFilter, 'CaucusFilter.csv')):
    frame.to_csv(filename, index=False)