In [23]:
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm


# Scraped results, keyed by election year and then by the state's section
# header text. Each state maps to a list of per-district record dicts.
ALL_DATA = {}

# Seconds to wait on the HTTP request before giving up, so a stalled
# connection cannot hang the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# The state list does not depend on the year, so it is built once instead of
# on every iteration of the year loop.
state_names = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming'
]


def _parse_district_row(cols):
    """Convert the cells of one table row into a district record.

    Cells are read positionally: 0 = district, 1 = PVI, 2 = incumbent,
    3 = party, 4 = first elected, 5 = status, 6 = candidates (``<li>`` items).

    Args:
        cols: The ``th``/``td`` elements of a single ``tr``.

    Returns:
        A dict with the keys ``district``, ``pvi``, ``incumbent``, ``party``,
        ``first_elected``, ``status`` and ``candidates``, or ``None`` when the
        row should be skipped (too few cells, a repeated ``Location`` header
        row, or a second cell that does not look like a PVI value).
    """
    # Rows with fewer than two cells cannot hold a district and a PVI;
    # indexing them would raise IndexError and abort the whole scrape.
    if len(cols) < 2:
        return None

    district = cols[0].text.strip()
    if district == 'Location':
        return None

    pvi = cols[1].text.strip()
    # A PVI cell is expected to contain "+" or the word "even"; anything else
    # means this row is not a district row.
    if "+" not in pvi and "even" not in pvi.lower():
        return None

    record = {
        'district': district,
        'pvi': pvi,
        'incumbent': None,
        'party': None,
        'first_elected': None,
        'status': None,
        'candidates': None,
    }

    # NOTE(review): cells are taken by position, so a row whose leading cells
    # are supplied by a rowspan in an earlier row would be misaligned --
    # confirm against the live tables.
    if len(cols) >= 6:
        record['incumbent'] = cols[2].text.strip()
        record['party'] = cols[3].text.strip()
        record['first_elected'] = cols[4].text.strip()
        record['status'] = cols[5].text.strip()
        # With exactly six cells there is no candidates column: keep an
        # empty list rather than None.
        record['candidates'] = (
            [item.text.strip() for item in cols[6].find_all('li')]
            if len(cols) > 6
            else []
        )

    return record


def _scrape_state(soup, state_name):
    """Extract the district records for one state from a parsed page.

    Args:
        soup: The parsed election page.
        state_name: State name; spaces are replaced by underscores to form
            the element id that is searched for.

    Returns:
        ``(header_text, records)`` where ``header_text`` is the stripped text
        of the matched header element and ``records`` is the list of district
        dicts, or ``None`` when either the header or a following
        ``wikitable`` cannot be found.
    """
    state_header = soup.find(['span', 'h2', 'h3', 'h4'], {'id': state_name.replace(' ', '_')})
    if not state_header:
        return None

    state_table = state_header.find_next('table', {'class': 'wikitable'})
    if not state_table:
        # Returning None here (instead of falling through) prevents the
        # previous state's rows from being stored under this state's name.
        return None

    records = []
    for row in state_table.find_all('tr')[1:]:  # Skip header row
        record = _parse_district_row(row.find_all(['th', 'td']))
        if record is not None:
            records.append(record)

    return state_header.text.strip(), records


for year in tqdm(range(2022, 2008, -2), desc="Processing years"):
    url = f"https://en.wikipedia.org/wiki/{year}_United_States_House_of_Representatives_elections"
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently recording an empty year.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    data = {}
    for state_name in tqdm(state_names, desc=f"Processing states for {year}", leave=False):
        scraped = _scrape_state(soup, state_name)
        if scraped is None:
            continue
        header_text, state_data = scraped
        data[header_text] = state_data

    ALL_DATA[year] = data

Processing years:   0%|          | 0/7 [00:00<?, ?it/s]

Processing states for 2022:   0%|          | 0/50 [00:00<?, ?it/s]

Processing states for 2020:   0%|          | 0/50 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [22]:
import json

# Serialize the scraped results with 4-space indentation and write them to
# disk in a single call.
with open('wiki_scrape_pvi.json', 'w') as out_file:
    serialized = json.dumps(ALL_DATA, indent=4)
    out_file.write(serialized)
