In [None]:
# # https://www.house.gov/representatives
# # I want to scrape the names of all the representatives from New York (state).

# import requests
# from bs4 import BeautifulSoup

# url = 'https://www.house.gov/representatives'
# response = requests.get(url)
# print(response.status_code)
# soup = BeautifulSoup(response.text, 'html.parser')
# # print(soup.prettify())

# #by-state > div > div > div.view-content (contains tables of representatives by state)
# # <caption id="state-alabama">Alabama </caption> (state looks like this)
# #housegov_reps_by_state-block_default-883106430 > tbody > tr:nth-child(1) is the first row of the table (a single representative from the first state)

# # I want to find the table that contains the representatives from New York

# # Locate the caption for New York using a text match
# ny_caption = soup.find('caption', string=lambda text: 'New York' in text)
# if ny_caption:
#     # Get the parent table of this caption
#     ny_table = ny_caption.find_parent('table')
#     if ny_table:
#         representatives = ny_table.find_all('tr')
        
#         # Print every cell of each representative's row as a list of stripped strings
#         for rep in representatives[1:]:  # skip the header row if present
#             # for all td elements in the row, put all td elements in a list
#             tds = rep.find_all('td')
#             # strip and put into a list
#             print([td.text.strip() for td in tds])
#     else:
#         print("No table found for New York")
# else:
#     print("Couldn't find the New York section on the page")

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

"""
GET DATA
"""
url = 'https://www.house.gov/representatives'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# print(soup.prettify())


"""
PARSE DATA
"""
state_section = soup.find('a', {'href': '#by-state'})
if state_section is None:
    print("Couldn't find the section for states")
    exit()

# Find all caption elements which typically contain state names
captions = soup.find_all('caption')

# Prepare a list to hold all representative data
all_reps = []

# Iterate over each state's caption
for caption in captions:
    state = caption.get_text(strip=True)
    table = caption.find_parent('table')
    if table:
        headers = [th.get_text(strip=True) for th in table.find('tr').find_all('th')]
        rows = table.find_all('tr')[1:]  # Skipping the header row

        for row in rows:
            cols = row.find_all('td')
            if len(cols) == len(headers):  # Ensuring each row has the correct number of columns
                rep_data = {'State': state}
                rep_data.update({headers[i]: cols[i].get_text(strip=True) for i in range(len(cols))})
                all_reps.append(rep_data)

"""
CONVERT DATA TO DATAFRAME
"""
# Convert the list of dictionaries into a DataFrame
df_reps = pd.DataFrame(all_reps)

# df should only contain first half of the rows
df_reps = df_reps.head(int(len(df_reps)/2))

"""
ALTER DATAFRAME FOR ANALYSIS
"""
# Split the Name column into separate First Name and Last Name columns
df_reps[['first_name', 'last_name']] = df_reps['Name'].str.split(', ', expand=True)

# Drop the original Name column
df_reps.drop(columns=['Name'], inplace=True)

# Create a full_name column by concatenating first_name and last_name
df_reps['full_name'] = df_reps['first_name'] + ' ' + df_reps['last_name']

# Split Committee assignments into separate columns; find max number of committees for any representative
max_cols = df_reps['Committee Assignment'].str.split('|').apply(len).max()

# Create new columns for each committee
for i in range(max_cols):
    df_reps[f'committee_{i+1}'] = df_reps['Committee Assignment'].str.split('|').str[i]

# Drop the original Committee Assignment column
df_reps.drop(columns=['Committee Assignment'], inplace=True)

"""
SAVE DATA TO CSV
"""
# Display the DataFrame
# print(df_reps)

# check if inputs folder exists; if not, create it
DATA_PATH = '../../data'

if not os.path.exists(DATA_PATH+'/inputs'):
    os.makedirs(DATA_PATH+'/inputs')

# Save the DataFrame to a CSV file
df_reps.to_csv(DATA_PATH+'/inputs/house_committees.csv', index=False)


In [None]:
# url = "https://senate.gov/general/committee_assignments/assignments.htm"
# response = requests.get(url)
# print(response.status_code)