# How this code works

* We first create a function that parses the HTML of the page for a single state and a single district. It returns the political data found on that one page.
    * Returns: e.g. AZ District 01
* We then call that function inside a for loop to collect the political data for every district, still for a single state.
    * Returns: e.g. AZ Districts 01-07
* After that, we take THAT function, call it inside another for loop (three layers in total at this point), and go through every state to collect all of the data.
    * Returns: all states and all districts


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Browser-like User-Agent sent with every request made by the scraper.
# Adjacent string literals are concatenated by the parser, so the header value
# is one single-spaced string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
        ' Chrome/56.0.2924.76 Safari/537.36'
    ),
}

def _state_and_district_from_title(title_text):
    """Split a page title into ``(state_name, district_label)``.

    NOTE(review): assumes titles look like "<State name> District <NN> ..."
    (the sample output shows "Arizona" / "01"); confirm against live pages.
    Anchoring on the word "District" keeps multi-word state names such as
    "New York" intact. When the title does not match that pattern, the
    positional parse (first word / third word) is used instead.
    """
    words = title_text.split(" ")
    try:
        # Start at 1 so a state name that itself begins with "District" is
        # not mistaken for the separator.
        marker = words.index("District", 1)
    except ValueError:
        marker = None
    if marker is not None and marker + 1 < len(words) and words[marker + 1].isdigit():
        return " ".join(words[:marker]), words[marker + 1]
    return words[0], words[2]


def get_candidates(state, district, year):
    """Scrape the OpenSecrets candidate list for one district in one cycle.

    Args:
        state: State code used to build the race id in the URL (e.g. "AZ").
        district: District number; zero-padded to two digits in the URL.
        year: Election cycle placed in the ``cycle`` query parameter.

    Returns:
        A ``(df, URL)`` tuple. ``df`` has one row per candidate found on the
        page, with the columns Name, Party, State, District, Incumbant,
        Winner, % Votes, $ Raised and $ Spent; it has no rows when the page
        lists no candidates. State and District are read from the page title,
        not from the arguments. ``% Votes`` is ``None`` for a candidate whose
        heading has no vote-percentage element. ``URL`` is the address that
        was requested.

    Raises:
        requests.exceptions.RequestException: if the request fails or times
            out.
        IndexError: if a candidate heading, the money cells, or the page
            title do not have the expected layout.
    """
    URL = "https://www.opensecrets.org/races/candidates?cycle={}&id={}{}&spec=N".format(
        year, state, str(district).zfill(2)
    )

    # Pause before every request (presumably to avoid hammering the site).
    time.sleep(2)

    # The context manager releases the session's pooled connections; the
    # timeout keeps one stalled request from hanging the whole crawl.
    with requests.Session() as session:
        page = session.get(URL, headers=headers, timeout=30)

    # A Response is falsy for 4xx/5xx statuses; report it but still parse
    # whatever body came back.
    if not page:
        print(state, district, "request returned NONE:", URL)
        print(page)

    soup = BeautifulSoup(page.content, "html.parser")

    candidates = []

    for item in soup.find_all("div", {"class": "Members--list-item"}):
        # The money cells hang off the list item rather than the heading, so
        # look them up once per item.
        number_cells = item.find_all("td", {"class": "Members--number"})

        for bio in item.find_all('div', {'class': 'Members--bio u-richtext'}):
            for heading in bio.find_all("h2"):
                for strong in heading.find_all("strong"):
                    raw_text = strong.text
                    is_incumbant = "Incumbent" in raw_text
                    is_winner = "Winner" in raw_text

                    # The second line of the heading text carries
                    # "Name (Party)", possibly followed by the incumbent tag.
                    label = (
                        raw_text.split("\n")[1]
                        .replace("\t", "")
                        .replace("• Incumbent", "")
                        .strip()
                    )
                    name_and_party = label.split(" (")
                    name = name_and_party[0]
                    # Drop the last character (the closing parenthesis).
                    party = name_and_party[1][:-1]

                    title_text = soup.find_all('title', limit=1)[0].text
                    state_name, district_label = _state_and_district_from_title(title_text)

                    # Reset for every candidate so a missing percentage never
                    # inherits the previous candidate's value.
                    pct_won = None
                    for pct_span in strong.find_all("span", {"class": "Members--vote-pct"}):
                        # Text before "%" with its first character removed.
                        pct_won = pct_span.text.split("%")[0][1:]

                    raised = number_cells[0].text.replace("$", "").replace(",", "")
                    spent = number_cells[1].text.replace("$", "").replace(",", "")

                    candidates.append(
                        [name, party, state_name, district_label, is_incumbant,
                         is_winner, pct_won, raised, spent]
                    )

    df = pd.DataFrame(candidates, columns =["Name", "Party", "State", "District", "Incumbant", "Winner", "% Votes", "$ Raised", "$ Spent"])
    return df, URL

# Smoke test: scrape a single page (Arizona, district 1, 2008 cycle) and show
# the (DataFrame, URL) tuple that comes back.
print("Testing: Evaluating for AZ D01")
a = get_candidates(state="AZ", district=1, year=2008)
print(a)

def get_all_districts_for_state(state, year, max_districts=54):
    """Collect the candidate tables for every district of one state.

    Districts are requested in order starting at 1. The walk stops at the
    first district whose page yields no candidates, at the first district
    whose scrape raises, or after ``max_districts`` districts.

    Args:
        state: State code passed through to ``get_candidates``.
        year: Election cycle passed through to ``get_candidates``; also
            stored in the ``Year`` column of the result.
        max_districts: Highest district number to try (default 54, the
            original hard-coded limit).

    Returns:
        One DataFrame with the rows of every district that was collected,
        plus a ``Year`` column. The per-district row index is kept, so index
        values restart at 0 for each district.

    Raises:
        ValueError: raised by ``pd.concat`` when not a single district
            produced any rows.
    """
    all_state_districts = []

    for district in range(1, max_districts + 1):
        try:
            df, _ = get_candidates(state, district, year)
        except Exception as e:
            # Best effort: keep what was gathered so far, but say why the
            # walk ended early instead of truncating the state silently.
            print(state, district, "FAILED", e)
            break

        # An empty table is taken to mean we ran past the last district.
        if df.empty:
            break
        all_state_districts.append(df)

    asd_df = pd.concat(all_state_districts)
    asd_df["Year"] = year
    return asd_df

# Second check: gather every Colorado district for the 2020 cycle and show
# the combined table.
print("Evaluating for CO All Districts")
a = get_all_districts_for_state(state="CO", year=2020)
print(a)

# State codes to crawl, in the order they are visited (CA comes last).
states = [
    "AL", "AK", "AZ", "AR", "CO", "CT", "DE", "FL", "GA", "HI",
    "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA",
    "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM",
    "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD",
    "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY", "CA",
]

# One DataFrame per state that was scraped successfully.
list_of_state_dfs = []

for state in states:
    print("Evaluating", state)
    try:
        result = get_all_districts_for_state(state, 2020)
    except Exception as e:
        # A state that fails is reported and skipped; the crawl carries on.
        print(state, "FAILED", e)
    else:
        list_of_state_dfs.append(result)

# Stack the per-state tables into one table covering every state collected.
all_states = pd.concat(list_of_state_dfs)

print(len(list_of_state_dfs))
print(all_states)
print(all_states.shape)

Testing: Evaluating for AZ D01
(                Name Party    State District  Incumbant  Winner % Votes  \
0    Ann Kirkpatrick     D  Arizona       01      False    True    55.9   
1         Sydney Hay     R  Arizona       01      False   False    39.4   
2  Brent Geer Maupin     I  Arizona       01      False   False     3.3   

  $ Raised  $ Spent  
0  1996232  1976392  
1   634271   633171  
2    26706    26682  , 'https://www.opensecrets.org/races/candidates?cycle=2008&id=AZ01&spec=N')
Evaluating for CO All Districts
                Name Party     State District  Incumbant  Winner % Votes  \
0      Diana DeGette     D  Colorado       01       True    True    73.7   
0      Joseph Neguse     D  Colorado       02       True    True    61.5   
1       Charles Winn     R  Colorado       02      False   False    35.4   
2       Alex Johnson     I  Colorado       02      False   False     0.0   
0  Diane Mitsch Bush     D  Colorado       03      False   False    45.2   
1     Lauren Boe