In [72]:
import requests
from bs4 import BeautifulSoup


def clean_value(value, convert_func=None):
    """Return the stripped text of a table cell, optionally converted.

    Args:
        value: A BeautifulSoup tag (anything exposing ``.text``).
        convert_func: Optional callable (e.g. ``int`` or ``float``) applied to
            the cleaned text after thousands separators are removed.

    Returns:
        ``None`` when the cell holds only a dash placeholder, or when a
        conversion is requested on an empty cell; the converted value when
        ``convert_func`` is given; otherwise the cleaned string (which may
        be empty).
    """
    cleaned = value.text.strip().strip('%')
    if cleaned in ("—", "–", "-"):
        return None
    if not convert_func:
        return cleaned
    if not cleaned:
        # int("") / float("") would raise; an empty cell carries no number.
        return None
    return convert_func(cleaned.replace(',', ''))


# Maps each election year to the list of per-party records scraped for it.
ALL_DATA = {}

# Every per-row field starts out as None so that a branch which does not fill
# all of them (2008 has no change / seats_prev / strength columns) can never
# hit an undefined name or pick up a value from an unrelated row.
party = votes = percentage = change = None
seats_prev = seats_curr = seat_change = strength = None

for year in range(2022, 2006, -2):
    print(year)
    url = f"https://en.wikipedia.org/wiki/{year}_United_States_House_of_Representatives_elections"
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    federal_header = soup.find(['span', 'h2', 'h3', 'h4'], {'id': 'Federal'})
    table = (
        federal_header.find_next('table', {'class': 'wikitable'})
        if federal_header else None
    )
    if table is None:
        # Without this guard `rows` would be undefined on the first year, or
        # silently reuse the previous year's rows on later ones.
        print(f"  no 'Federal' wikitable found for {year}; storing no rows")
        rows = []
    else:
        # The 2012 table skips two leading rows, every other year one.
        leading_rows = 2 if year == 2012 else 1
        rows = table.find_all('tr')[leading_rows:]

    data = []
    for row in rows:
        cols = row.find_all('td')
        if len(cols) <= 4:  # Too few data cells to be a party row
            continue

        # Some rows begin with an empty cell; the party name then sits one
        # column further right.
        starting_col = 0
        if clean_value(cols[starting_col]) == "":
            starting_col = 1

        if year > 2018:
            # Column order: party, votes, %, change, seats before, seats now,
            # seat change, strength.
            party = cols[starting_col].text.strip()
            votes = clean_value(cols[starting_col + 1], int)
            percentage = clean_value(cols[starting_col + 2], float)
            change = clean_value(cols[starting_col + 3])
            seats_prev = clean_value(cols[starting_col + 4])
            seats_curr = clean_value(cols[starting_col + 5])
            seat_change = clean_value(cols[starting_col + 6])
            strength = clean_value(cols[starting_col + 7])
        elif year == 2008:
            # Column order: party, votes, %, seats now, seat change.
            party = cols[starting_col].text.strip()
            # Collapse any label containing the party name to a fixed form.
            if "Democratic" in party:
                party = "Democratic Party"
            elif "Republican" in party:
                party = "Republican Party"
            votes = clean_value(cols[starting_col + 1], int)
            percentage = clean_value(cols[starting_col + 2], float)
            seats_curr = clean_value(cols[starting_col + 3])
            seat_change = clean_value(cols[starting_col + 4])
        else:
            # Column order: party, seats before, seats now, seat change,
            # strength, votes, %, change.
            if year == 2014:
                # For 2014 the party name is read from the row's <th> cell and
                # the offset is always reset to 0.
                starting_col = 0
                party = row.find('th').text.strip()
            else:
                party = cols[starting_col].text.strip()
            seats_prev = clean_value(cols[starting_col + 1])
            seats_curr = clean_value(cols[starting_col + 2])
            seat_change = clean_value(cols[starting_col + 3])
            strength = clean_value(cols[starting_col + 4])
            votes = clean_value(cols[starting_col + 5], int)
            percentage = clean_value(cols[starting_col + 6], float)
            change = clean_value(cols[starting_col + 7])

        data.append({
            'party': party,
            'votes': votes,
            'percentage': percentage,
            # Normalise both the en dash (U+2013) and the minus sign (U+2212)
            # to an ASCII hyphen.
            'change': change.replace('\u2013', '-').replace('\u2212', '-') if change else None,
            'seats_prev': seats_prev,
            'seats_curr': seats_curr,
            'seat_change': seat_change,
            'strength': strength
        })

        # Reset so the next row (possibly with fewer columns) starts clean.
        party = votes = percentage = change = None
        seats_prev = seats_curr = seat_change = strength = None

    ALL_DATA[year] = data

2022
2020
2018
2016
2014
2012
2010
2008


In [76]:
# Scratch check: build a record from the current per-row variables, mapping
# both the en dash (U+2013) and the minus sign (U+2212) in `change` to '-'.
dict(
    party=party,
    votes=votes,
    percentage=percentage,
    change=(
        None if not change
        else change.replace('\u2013', '-').replace('\u2212', '-')
    ),
    seats_prev=seats_prev,
    seats_curr=seats_curr,
    seat_change=seat_change,
    strength=strength,
)

{'party': None,
 'votes': None,
 'percentage': None,
 'change': None,
 'seats_prev': None,
 'seats_curr': None,
 'seat_change': None,
 'strength': None}

In [74]:
# Scratch check: display the value currently bound to `year` (the loop
# variable of the scrape cell).
year

2008

In [77]:
import json

# Write every scraped year to disk as JSON indented by four spaces.
with open('wiki_scrape_nhe.json', 'w') as json_file:
    json_file.write(json.dumps(ALL_DATA, indent=4))