In [1]:
from io import StringIO

import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup

#insecure request warning from "verify=false" for each requests.get got annoying
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
# Single race page used to prototype the scraper; the query string asks for
# cycle=2020 and id=TN07.
district_7_url = 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN07&spec=N'

# verify=False turns off TLS certificate verification for this request (the
# resulting InsecureRequestWarning is silenced in the import cell).
district_7_url_response = requests.get(district_7_url, verify=False)
# Last expression of the cell, so the HTTP status code is displayed.
district_7_url_response.status_code

200

In [3]:
# Parse the downloaded HTML into a searchable tree.
# NOTE(review): no parser is named, so bs4 picks one from whatever is installed
# and results may vary between environments -- consider pinning a parser.
district_7_soup = BeautifulSoup(district_7_url_response.text)

In [4]:
# Prototype of the scrape on the single page held in district_7_soup.
# The same steps are packaged as pull_candidates() further down the notebook.
candidate_info_df = pd.DataFrame()
party_dict={'R':'Republican','D':'Democrat','I':'Independent', 'L':'Libertarian', '3':'3rd Party'}

# The heading is read as "<state> District <nn> ...": everything before
# " District" is the state, the two characters after "District " the number.
page_title = district_7_soup.find('h1',attrs={'class':'Hero-title'}).text
district_pos = page_title.index('District')
state = page_title[0:district_pos-1]
district_num = int(page_title[district_pos+9:district_pos+11])

candidate_name_list = []
candidate_party_list = []
is_incumbent_list = []
is_winner_list = []
vote_pct_list = []

candidate_blocks = district_7_soup.find_all('div',attrs={'class':'Members--bio u-richtext'})
#print(candidate_blocks) #-----> great reference for the complicated objects that results here
for block in candidate_blocks:

    #plain-text fragments of the block, whitespace-stripped, in document order
    candidate_info_list = [str(fragment) for fragment in block.stripped_strings]
    #print(candidate_info_list) #-----> great reference for the complicated list that results here

    #first fragment looks like "<name> (<party letter>...": split on the first "("
    header = candidate_info_list[0]
    paren_pos = header.index('(')
    candidate_name = header[:paren_pos].rstrip()

    #single character after "(" is the party code; unknown codes are kept as-is
    candidate_party = header[paren_pos+1]
    candidate_party = party_dict.get(candidate_party,candidate_party)

    #last fragment is expected to look like "(<pct>% ..."; skip the leading
    #character and read up to the "%". Stored as float for ease of use later.
    last_fragment = candidate_info_list[-1]
    try:
        vote_pct = float(last_fragment[1:last_fragment.index('%')])
    except ValueError:
        #no "%" or no number in front of it (same fallback as pull_candidates)
        vote_pct = None

    #flags are present as stand-alone fragments
    is_incumbent = 'Yes' if 'Incumbent' in candidate_info_list else 'No'
    is_winner = 'Yes' if 'Winner' in candidate_info_list else 'No'

    #collect cleaned relevant information on each candidate
    candidate_name_list.append(candidate_name)
    candidate_party_list.append(candidate_party)
    is_incumbent_list.append(is_incumbent)
    is_winner_list.append(is_winner)
    vote_pct_list.append(vote_pct)

#Find money raised and spent
candidate_bigger_blocks = district_7_soup.find_all('div',attrs={'class':'Members--list-item'})
money_raised_list = []
money_spent_list = []
for bigger_block in candidate_bigger_blocks:
    #read_html wants a file-like object; passing literal HTML is deprecated
    money_html = str(bigger_block.find('table',attrs={'class':'Members--table'}))
    money_table = pd.read_html(StringIO(money_html))[0]
    #row 0 / row 1 of the second column hold raised / spent
    money_raised = money_table.loc[0,1]
    money_spent = money_table.loc[1,1]
    #Put money raised and spent in lists as int values
    money_raised_list.append(int(money_raised.replace(',','').replace('$','')))
    money_spent_list.append(int(money_spent.replace(',','').replace('$','')))


#add cleaned info lists to the dataframe
#(the bio-derived and money-derived lists must have equal length, otherwise
# the column assignment raises ValueError)
candidate_info_df['Candidate_Name']=candidate_name_list
candidate_info_df['Candidate_Party']=candidate_party_list
candidate_info_df['State'] = state
candidate_info_df['District'] = district_num
candidate_info_df['Is_Incumbent?'] = is_incumbent_list
candidate_info_df['Is_Winner'] = is_winner_list
candidate_info_df['Percent_of_Vote_(%)'] = vote_pct_list
candidate_info_df['Money_Raised_($)'] = money_raised_list
candidate_info_df['Money_Spent_($)'] = money_spent_list

candidate_info_df

Unnamed: 0,Candidate_Name,Candidate_Party,State,District,Is_Incumbent?,Is_Winner,Percent_of_Vote_(%),Money_Raised_($),Money_Spent_($)
0,Mark Green,Republican,Tennessee,7,Yes,Yes,69.9,1194960,935487
1,Kiran Sreepada,Democrat,Tennessee,7,No,No,27.3,206644,207191
2,Ronald Brown,Independent,Tennessee,7,No,No,2.2,1750,0
3,Scott Vieira Jr,Independent,Tennessee,7,No,No,0.6,655,1049


In [5]:
#Pull Number of state reps and state abbreviations
# NOTE(review): the seat counts in the saved output (e.g. CA 52, TX 38) look
# like the post-2020-census apportionment, while the scrape further down asks
# for cycle=2020 -- confirm the district numbering matches that cycle.
state_abbrev_url = 'https://www.ssa.gov/international/coc-docs/states.html'
state_abbrev_url_response = requests.get(state_abbrev_url,verify=False)
state_abbrev_soup = BeautifulSoup(state_abbrev_url_response.text)
# Only the first <table> on the page is used. read_html wants a file-like
# object; passing literal HTML is deprecated.
state_abbrev_df = pd.read_html(StringIO(str(state_abbrev_soup.find('table'))))[0]


state_rep_num_url = 'https://www.britannica.com/topic/United-States-House-of-Representatives-Seats-by-State-1787120'
state_rep_num_url_response = requests.get(state_rep_num_url,verify=False)
state_rep_num_soup = BeautifulSoup(state_rep_num_url_response.text)
state_rep_num_df = pd.read_html(StringIO(str(state_rep_num_soup.find('table'))))[0]

# The abbreviation table is parsed with positional column labels 0 and 1;
# rename() already returns a new frame, so no extra copy is needed.
state_abbrev_df = state_abbrev_df.rename(columns = {0:'state', 1:'abbreviation'})
# Upper-case the seat table's names so they can match the abbreviation table
# (whose names are upper-case in the saved output).
state_rep_num_df['state'] = state_rep_num_df['state'].str.upper()
# Inner join: only names present in both tables survive.
state_info_df = state_abbrev_df.merge(state_rep_num_df,on='state',how='inner')
state_info_df


Unnamed: 0,state,abbreviation,representatives
0,ALABAMA,AL,7
1,ALASKA,AK,1
2,ARIZONA,AZ,9
3,ARKANSAS,AR,4
4,CALIFORNIA,CA,52
5,COLORADO,CO,8
6,CONNECTICUT,CT,5
7,DELAWARE,DE,1
8,FLORIDA,FL,28
9,GEORGIA,GA,14


In [6]:
def pull_candidates(url, timeout=None):
    """Scrape one race page into a DataFrame with one row per candidate.

    The page is expected to follow the layout of the ``races/candidates``
    pages requested in this notebook: an ``h1`` with class ``Hero-title``
    whose text contains "<state> District <nn>", ``div`` elements with class
    ``Members--bio u-richtext`` holding each candidate's text, and ``div``
    elements with class ``Members--list-item`` each containing a
    ``Members--table`` table whose second column holds money raised (row 0)
    and money spent (row 1).

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. The default ``None`` applies no
        timeout, which is what the function did before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns Candidate_Name, Candidate_Party, State, District,
        Is_Incumbent?, Is_Winner, Percent_of_Vote_(%), Money_Raised_($) and
        Money_Spent_($). Percent_of_Vote_(%) is None for a candidate whose
        last text fragment holds no parsable "<number>%".

    Raises
    ------
    AttributeError
        If the page has no ``h1`` with class ``Hero-title``.
    ValueError
        If the heading or a candidate header does not have the expected
        shape, or if the number of bio blocks differs from the number of
        money tables.
    """
    # verify=False disables TLS certificate verification, as elsewhere in
    # this notebook.
    url_response = requests.get(url, verify=False, timeout=timeout)
    # NOTE(review): no parser is named, so bs4 picks one from whatever is
    # installed -- consider pinning it for reproducibility.
    url_soup = BeautifulSoup(url_response.text)

    candidate_info_df = pd.DataFrame()
    party_dict={'R':'Republican','D':'Democrat','I':'Independent', 'L':'Libertarian', '3':'3rd Party'}

    # Everything before " District" is the state; the two characters after
    # "District " are the district number.
    page_title = url_soup.find('h1',attrs={'class':'Hero-title'}).text
    district_pos = page_title.index('District')
    state = page_title[0:district_pos-1]
    district_num = int(page_title[district_pos+9:district_pos+11])

    candidate_name_list = []
    candidate_party_list = []
    is_incumbent_list = []
    is_winner_list = []
    vote_pct_list = []

    candidate_blocks = url_soup.find_all('div',attrs={'class':'Members--bio u-richtext'})
    for block in candidate_blocks:

        # Plain-text fragments of the block, whitespace-stripped, in
        # document order. Working on the raw strings (rather than their
        # repr) keeps names with escapes or quotes intact.
        candidate_info_list = [str(fragment) for fragment in block.stripped_strings]

        # First fragment looks like "<name> (<party letter>...".
        header = candidate_info_list[0]
        paren_pos = header.index('(')
        candidate_name = header[:paren_pos].rstrip()

        # Single character after "(" is the party code; unknown codes are
        # passed through unchanged.
        candidate_party = header[paren_pos+1]
        candidate_party = party_dict.get(candidate_party,candidate_party)

        # Last fragment is expected to look like "(<pct>% ..."; skip the
        # leading character and read up to the "%".
        last_fragment = candidate_info_list[-1]
        try:
            vote_pct = float(last_fragment[1:last_fragment.index('%')])
        except ValueError:
            # No "%" present, or the text in front of it is not a number.
            vote_pct = None

        # Both flags appear as stand-alone fragments.
        is_incumbent = 'Yes' if 'Incumbent' in candidate_info_list else 'No'
        is_winner = 'Yes' if 'Winner' in candidate_info_list else 'No'

        candidate_name_list.append(candidate_name)
        candidate_party_list.append(candidate_party)
        is_incumbent_list.append(is_incumbent)
        is_winner_list.append(is_winner)
        vote_pct_list.append(vote_pct)

    #Find money raised and spent
    candidate_bigger_blocks = url_soup.find_all('div',attrs={'class':'Members--list-item'})
    money_raised_list = []
    money_spent_list = []
    for bigger_block in candidate_bigger_blocks:
        # read_html wants a file-like object; passing literal HTML is
        # deprecated.
        money_html = str(bigger_block.find('table',attrs={'class':'Members--table'}))
        money_table = pd.read_html(StringIO(money_html))[0]
        money_raised = money_table.loc[0,1]
        money_spent = money_table.loc[1,1]
        #Put money raised and spent in lists as int values
        money_raised_list.append(int(money_raised.replace(',','').replace('$','')))
        money_spent_list.append(int(money_spent.replace(',','').replace('$','')))

    # Assemble the frame. The bio-derived and money-derived lists must be
    # the same length or the column assignment raises ValueError.
    candidate_info_df['Candidate_Name']=candidate_name_list
    candidate_info_df['Candidate_Party']=candidate_party_list
    candidate_info_df['State'] = state
    candidate_info_df['District'] = district_num
    candidate_info_df['Is_Incumbent?'] = is_incumbent_list
    candidate_info_df['Is_Winner'] = is_winner_list
    candidate_info_df['Percent_of_Vote_(%)'] = vote_pct_list
    candidate_info_df['Money_Raised_($)'] = money_raised_list
    candidate_info_df['Money_Spent_($)'] = money_spent_list

    return candidate_info_df

In [7]:
# Build one race-page URL per district of every state in state_info_df, then
# scrape them all.
# NOTE(review): district counts come from state_info_df while the URLs ask
# for cycle=2020; if the counts reflect a later apportionment, some requested
# districts may not exist for that cycle and others may be missed -- confirm.
urls = []
for state, rep_num in zip(state_info_df['abbreviation'],state_info_df['representatives']):
    # progress line: state abbreviation and its number of districts
    print(state+" "+"{:02d}".format(rep_num))
    # districts are numbered 01..rep_num, zero-padded to two digits
    urls.extend(
        f'https://www.opensecrets.org/races/candidates?cycle=2020&id={state}{num:02d}&spec=N'
        for num in range(1,rep_num+1)
    )

# Scrape every page first and concatenate once; growing a DataFrame with
# concat inside the loop re-copies all accumulated rows on each iteration.
district_frames = [pull_candidates(url) for url in urls]
# Pages that yielded no candidates contribute no rows; dropping their empty
# frames keeps them from influencing the combined column dtypes.
district_frames = [frame for frame in district_frames if not frame.empty]
if district_frames:
    full_candidate_info_df = pd.concat(district_frames, ignore_index=True)
else:
    full_candidate_info_df = pd.DataFrame()

AL 07
AK 01
AZ 09
AR 04
CA 52
CO 08
CT 05
DE 01
FL 28
GA 14
HI 02
ID 02
IL 17
IN 09
IA 04
KS 04
KY 06
LA 06
ME 02
MD 08
MA 09
MI 13
MN 08
MS 04
MO 08
MT 02
NE 03
NV 04
NH 02
NJ 12
NM 03
NY 26
NC 14
ND 01
OH 15
OK 05
OR 06
PA 17
RI 02
SC 07
SD 01
TN 09
TX 38
UT 04
VT 01
VA 11
WA 10
WV 02
WI 08
WY 01


In [8]:
# Last 50 candidates whose party label is none of the three listed below.
major_labels = ['Republican', 'Democrat', 'Independent']
full_candidate_info_df[~full_candidate_info_df['Candidate_Party'].isin(major_labels)].tail(50)

Unnamed: 0,Candidate_Name,Candidate_Party,State,District,Is_Incumbent?,Is_Winner,Percent_of_Vote_(%),Money_Raised_($),Money_Spent_($)
92,Angelica Duenas,3rd Party,California,29,No,No,43.4,78083,76627
141,Chris Milton,3rd Party,Colorado,3,No,No,1.0,12012,12012
146,Rebecca Keltie,3rd Party,Colorado,5,No,No,0.8,2552,1495
149,Jaimie Kulikowski,3rd Party,Colorado,6,No,No,0.9,72293,110645
160,Justin Paglino,3rd Party,Connecticut,3,No,No,1.5,29200,28670
242,Chase Oliver,Libertarian,Georgia,5,No,No,,9225,5220
250,Jimmy Cooper,3rd Party,Georgia,8,No,No,,8542,8528
262,Martin Cowen,Libertarian,Georgia,13,No,No,,13460,13464
268,Jonathan Hoomanawanui,3rd Party,Hawaii,2,No,No,2.2,1883,1522
283,Bill Redpath,Libertarian,Illinois,6,No,No,1.7,9837,9837


In [15]:
# First 50 rows whose State column equals 'Texas'.
full_candidate_info_df.loc[full_candidate_info_df['State'].eq('Texas')].head(50)

Unnamed: 0,Candidate_Name,Candidate_Party,State,District,Is_Incumbent?,Is_Winner,Percent_of_Vote_(%),Money_Raised_($),Money_Spent_($)
722,Louie Gohmert,Republican,Texas,1,Yes,Yes,72.6,552915,580441
723,Hank Gilbert,Democrat,Texas,1,No,No,27.4,968155,734411
724,Dan Crenshaw,Republican,Texas,2,Yes,Yes,55.6,19427865,17596329
725,Sima Ladjevardian,Democrat,Texas,2,No,No,42.8,3832697,3825287
726,Elliott Scheirman,Libertarian,Texas,2,No,No,1.6,15374,14296
727,Van Taylor,Republican,Texas,3,Yes,Yes,55.1,2655327,2825378
728,Helane Seikaly,Democrat,Texas,3,No,No,42.9,1635185,1634843
729,Patrick Fallon,Republican,Texas,4,No,Yes,75.1,257174,95065
730,Tracy Shawn Jones,Independent,Texas,4,No,No,0.4,13002,10595
731,Lance Gooden,Republican,Texas,5,Yes,Yes,62.0,1610327,1311983
