In [1]:
import asyncio
import time
from io import StringIO

import aiohttp
import numpy as np
import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup

#insecure request warning from "verify=False" for each requests.get got annoying
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
# Prototype target: the 2020 candidates page for race id TN07.
district_7_url = 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN07&spec=N'

# NOTE: verify=False turns off TLS certificate verification for this request.
district_7_url_response = requests.get(url=district_7_url, verify=False)

# Show the HTTP status code as the cell output (200 means the page loaded).
district_7_url_response.status_code

200

In [3]:
# Parse the fetched HTML into a navigable tree for the scraping cell below.
# No parser is named here, so BeautifulSoup picks whichever parser it finds installed.
district_7_soup = BeautifulSoup(district_7_url_response.text)

In [4]:
# Prototype: scrape a single race page (district_7_soup) into a tidy DataFrame
# with one row per candidate.
candidate_info_df = pd.DataFrame()
party_dict = {'R': 'Republican', 'D': 'Democrat', 'I': 'Independent',
              'L': 'Libertarian', '3': '3rd Party'}

# The page title is sliced by fixed offsets around the word "District":
# everything before it is the state, the two characters after it are the number.
page_title = district_7_soup.find('h1', attrs={'class': 'Hero-title'}).text
district_idx = page_title.index('District')
state = page_title[0:district_idx - 1]
district_num = int(page_title[district_idx + 9:district_idx + 11])

candidate_name_list = []
candidate_party_list = []
is_incumbent_list = []
is_winner_list = []
vote_pct_list = []

candidate_blocks = district_7_soup.find_all('div', attrs={'class': 'Members--bio u-richtext'})
for block in candidate_blocks:
    # All whitespace-stripped text fragments of this candidate's bio block.
    # Work on the raw strings (not their repr) so escaped characters cannot
    # shift the slice offsets or leak into the parsed values.
    candidate_info_list = list(block.stripped_strings)

    # The first fragment holds the name followed by "(<party letter>".
    headline = candidate_info_list[0]
    paren_idx = headline.index('(')
    candidate_name = headline[:paren_idx].rstrip()
    party_code = headline[paren_idx + 1]
    # Unknown party letters are kept as-is.
    candidate_party = party_dict.get(party_code, party_code)

    # The last fragment holds the vote share: skip its first character and
    # read up to the '%' sign. Stored as float for ease of use later.
    vote_text = candidate_info_list[-1]
    vote_pct = float(vote_text[1:vote_text.index('%')])

    # Incumbent / winner status appear as stand-alone fragments.
    is_incumbent = 'Yes' if 'Incumbent' in candidate_info_list else 'No'
    is_winner = 'Yes' if 'Winner' in candidate_info_list else 'No'

    candidate_name_list.append(candidate_name)
    candidate_party_list.append(candidate_party)
    is_incumbent_list.append(is_incumbent)
    is_winner_list.append(is_winner)
    vote_pct_list.append(vote_pct)

# Find money raised and spent: each list item carries a small two-column table
# whose first two rows are read as raised / spent.
candidate_bigger_blocks = district_7_soup.find_all('div', attrs={'class': 'Members--list-item'})
money_raised_list = []
money_spent_list = []
for bigger_block in candidate_bigger_blocks:
    table_html = str(bigger_block.find('table', attrs={'class': 'Members--table'}))
    # read_html expects a file-like object; literal HTML strings are deprecated.
    money_table = pd.read_html(StringIO(table_html))[0]
    money_raised = money_table.loc[0, 1]
    money_spent = money_table.loc[1, 1]
    # Strip "$" and thousands separators, then store as int.
    money_raised_list.append(int(money_raised.replace(',', '').replace('$', '')))
    money_spent_list.append(int(money_spent.replace(',', '').replace('$', '')))

# Add the cleaned lists to the dataframe (scalars are broadcast to every row).
candidate_info_df['Candidate_Name'] = candidate_name_list
candidate_info_df['Candidate_Party'] = candidate_party_list
candidate_info_df['State'] = state
candidate_info_df['District'] = district_num
candidate_info_df['Is_Incumbent?'] = is_incumbent_list
candidate_info_df['Is_Winner'] = is_winner_list
candidate_info_df['Percent_of_Vote_(%)'] = vote_pct_list
candidate_info_df['Money_Raised_($)'] = money_raised_list
candidate_info_df['Money_Spent_($)'] = money_spent_list

candidate_info_df

Unnamed: 0,Candidate_Name,Candidate_Party,State,District,Is_Incumbent?,Is_Winner,Percent_of_Vote_(%),Money_Raised_($),Money_Spent_($)
0,Mark Green,Republican,Tennessee,7,Yes,Yes,69.9,1194960,935487
1,Kiran Sreepada,Democrat,Tennessee,7,No,No,27.3,206644,207191
2,Ronald Brown,Independent,Tennessee,7,No,No,2.2,1750,0
3,Scott Vieira Jr,Independent,Tennessee,7,No,No,0.6,655,1049


In [5]:
#Pull Number of state reps and state abbreviations
state_abbrev_url = 'https://www.ssa.gov/international/coc-docs/states.html'
state_abbrev_url_response = requests.get(state_abbrev_url,verify=False)
#state_abbrev_url_response.status_code
state_abbrev_soup = BeautifulSoup(state_abbrev_url_response.text)
state_abbrev_df=pd.read_html(str(state_abbrev_soup.find('table')))[0]


state_rep_num_url = 'https://www.britannica.com/topic/United-States-House-of-Representatives-Seats-by-State-1787120'
state_rep_num_url_response = requests.get(state_rep_num_url,verify=False)
#state_rep_num_url_response.status_code
state_rep_num_soup = BeautifulSoup(state_rep_num_url_response.text)
state_rep_num_df = pd.read_html(str(state_rep_num_soup.find('table')))[0]

state_abbrev_df = state_abbrev_df.rename(columns = {0:'state', 1:'abbreviation'}).copy()
state_rep_num_df['state'] = state_rep_num_df['state'].str.upper()
#state_abbrev_df
#state_rep_num_df
state_info_df = state_abbrev_df.merge(state_rep_num_df,on='state',how='inner')
state_info_df


Unnamed: 0,state,abbreviation,representatives
0,ALABAMA,AL,7
1,ALASKA,AK,1
2,ARIZONA,AZ,9
3,ARKANSAS,AR,4
4,CALIFORNIA,CA,52
5,COLORADO,CO,8
6,CONNECTICUT,CT,5
7,DELAWARE,DE,1
8,FLORIDA,FL,28
9,GEORGIA,GA,14


In [6]:
async def _fetch_race_page(url, max_retries, retry_delay):
    """Download `url` and return `(soup, page_title)`, retrying on failure."""
    timeout = aiohttp.ClientTimeout(total=600)
    connector = aiohttp.TCPConnector(limit=10)
    failed_attempts = 0
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        while True:
            try:
                async with session.get(url) as url_response:
                    url_text = await url_response.text()
                url_soup = BeautifulSoup(url_text)
                # .text raises AttributeError when the title element is missing,
                # which is treated like any other failed load and retried.
                page_title = url_soup.find('h1', attrs={'class': 'Hero-title'}).text
                return url_soup, page_title
            except Exception:
                # Deliberately broad (best-effort retry), but unlike a bare
                # `except:` this lets KeyboardInterrupt / task cancellation through.
                failed_attempts += 1
                if max_retries is not None and failed_attempts > max_retries:
                    raise
                print('Connection error loading url. Trying again...')
                # Non-blocking pause: time.sleep() here would stall the event loop.
                await asyncio.sleep(retry_delay)


def _parse_candidate_block(block, party_dict):
    """Return `(name, party, vote_pct, is_incumbent, is_winner)` for one bio block."""
    # Work on the raw stripped strings (not their repr) so escaped characters
    # cannot shift the slice offsets or leak into the parsed values.
    info = list(block.stripped_strings)

    # The first fragment holds the name followed by "(<party letter>".
    headline = info[0]
    paren_idx = headline.index('(')
    name = headline[:paren_idx].rstrip()
    party_code = headline[paren_idx + 1]
    party = party_dict.get(party_code, party_code)  # unknown letters kept as-is

    # The last fragment holds the vote share: skip its first character and read
    # up to '%'. A missing '%' or non-numeric text yields None.
    vote_text = info[-1]
    try:
        vote_pct = float(vote_text[1:vote_text.index('%')])
    except ValueError:
        vote_pct = None

    is_incumbent = 'Yes' if 'Incumbent' in info else 'No'
    is_winner = 'Yes' if 'Winner' in info else 'No'
    return name, party, vote_pct, is_incumbent, is_winner


def _parse_dollars(amount):
    """Convert a string such as '$1,194,960' to the int 1194960."""
    return int(amount.replace(',', '').replace('$', ''))


async def pull_candidates(url, max_retries=None, retry_delay=5):
    """Scrape one race page into a DataFrame with one row per candidate.

    Parameters
    ----------
    url : str
        Address of the race page to download.
    max_retries : int or None, optional
        Number of retries allowed after a failed load before the last error is
        re-raised. ``None`` (the default) retries forever.
    retry_delay : float, optional
        Seconds to wait between attempts (default 5).

    Returns
    -------
    pandas.DataFrame or None
        Columns: Candidate_Name, Candidate_Party, State, District,
        Is_Incumbent?, Is_Winner, Percent_of_Vote_(%), Money_Raised_($),
        Money_Spent_($). ``None`` is returned (after printing a notice) when
        the year parsed from the page title is not 2020.

    Notes
    -----
    If no candidate is flagged as winner, the candidate with the highest vote
    share is marked as winner; when that cannot be determined (e.g. vote shares
    are missing) the whole Is_Winner column is set to NaN.
    """
    candidate_info_df = pd.DataFrame()
    party_dict = {'R': 'Republican', 'D': 'Democrat', 'I': 'Independent',
                  'L': 'Libertarian', '3': '3rd Party'}

    url_soup, page_title = await _fetch_race_page(url, max_retries, retry_delay)

    # The title is sliced by fixed offsets around the word "District": state
    # before it, two-character district number after it, then a four-digit year.
    district_idx = page_title.index('District')
    state = page_title[0:district_idx - 1]
    district_num = int(page_title[district_idx + 9:district_idx + 11])

    if int(page_title[district_idx + 12:district_idx + 16]) != 2020:
        print(f'{state} District {district_num} does not have a 2020 page!')
        return None

    candidate_name_list = []
    candidate_party_list = []
    is_incumbent_list = []
    is_winner_list = []
    vote_pct_list = []

    for block in url_soup.find_all('div', attrs={'class': 'Members--bio u-richtext'}):
        name, party, vote_pct, is_incumbent, is_winner = _parse_candidate_block(block, party_dict)
        candidate_name_list.append(name)
        candidate_party_list.append(party)
        is_incumbent_list.append(is_incumbent)
        is_winner_list.append(is_winner)
        vote_pct_list.append(vote_pct)

    if 'Yes' not in is_winner_list:
        print(f'{state} District {district_num} does not list a winner! Trying to determine winner by vote %...')
        try:
            is_winner_list[vote_pct_list.index(max(vote_pct_list))] = 'Yes'
            print(f'Successful!  {state} District {district_num} will list a winner determined by vote %.')
        except (TypeError, ValueError):
            # TypeError: vote shares contain None; ValueError: no candidates at all.
            print(f'Not successful. {state} District {district_num} will list NaN values for the "Is_Winner" column.')
            is_winner_list = [np.nan] * len(is_winner_list)

    # Find money raised and spent: each list item carries a small two-column
    # table whose first two rows are read as raised / spent.
    money_raised_list = []
    money_spent_list = []
    for bigger_block in url_soup.find_all('div', attrs={'class': 'Members--list-item'}):
        table_html = str(bigger_block.find('table', attrs={'class': 'Members--table'}))
        # read_html expects a file-like object; literal HTML strings are deprecated.
        money_table = pd.read_html(StringIO(table_html))[0]
        money_raised_list.append(_parse_dollars(money_table.loc[0, 1]))
        money_spent_list.append(_parse_dollars(money_table.loc[1, 1]))

    # Add the cleaned lists to the dataframe (scalars are broadcast to every row).
    candidate_info_df['Candidate_Name'] = candidate_name_list
    candidate_info_df['Candidate_Party'] = candidate_party_list
    candidate_info_df['State'] = state
    candidate_info_df['District'] = district_num
    candidate_info_df['Is_Incumbent?'] = is_incumbent_list
    candidate_info_df['Is_Winner'] = is_winner_list
    candidate_info_df['Percent_of_Vote_(%)'] = vote_pct_list
    candidate_info_df['Money_Raised_($)'] = money_raised_list
    candidate_info_df['Money_Spent_($)'] = money_spent_list

    return candidate_info_df

In [7]:
full_candidate_info_df = pd.DataFrame()

urls = []
for state, rep_num in zip(state_info_df['abbreviation'],state_info_df['representatives']):
    #print(state+" "+"{:02d}".format(rep_num))
    rep_num_list = [*range(1,rep_num+1)]
    rep_num_list_str = []
    for num in rep_num_list:
        num_string ="{:02d}".format(num)
        urls.append(f'https://www.opensecrets.org/races/candidates?cycle=2020&id={state}{num_string}&spec=N')

start_time = time.time()
print('Please wait. Compiling the full candidate info dataframe...')
for url in urls:
    result = await pull_candidates(url)
    full_candidate_info_df = pd.concat([full_candidate_info_df,result], ignore_index=True)
print(f"Complete! Compliling the full candidate info dataframe took: {time.time()-start_time:.2f} seconds")

Please wait. Compiling the full candidate info dataframe...
Colorado District 8 does not have a 2020 page!
Florida District 28 does not have a 2020 page!
Hawaii District 1 does not list a winner! Trying to determine winner by vote %...
Successful!  Hawaii District 1 will list a winner determine by vote %.
Louisiana District 5 does not list a winner! Trying to determine winner by vote %...
Not successful. Louisiana District 5 will list NaN values for the "Is_Winner" column.
Montana District 2 does not have a 2020 page!
New York District 22 does not list a winner! Trying to determine winner by vote %...
Not successful. New York District 22 will list NaN values for the "Is_Winner" column.
North Carolina District 14 does not have a 2020 page!
Oregon District 6 does not have a 2020 page!
Pennsylvania District 10 does not list a winner! Trying to determine winner by vote %...
Successful!  Pennsylvania District 10 will list a winner determine by vote %.
Texas District 37 does not have a 2020 

In [8]:
# Last (up to) 50 candidates whose party label is none of the three listed below.
full_candidate_info_df[
    ~full_candidate_info_df['Candidate_Party'].isin(['Republican', 'Democrat', 'Independent'])
].tail(50)

Unnamed: 0,Candidate_Name,Candidate_Party,State,District,Is_Incumbent?,Is_Winner,Percent_of_Vote_(%),Money_Raised_($),Money_Spent_($)
92,Angelica Duenas,3rd Party,California,29,No,No,43.4,78083,76627
141,Chris Milton,3rd Party,Colorado,3,No,No,1.0,12012,12012
146,Rebecca Keltie,3rd Party,Colorado,5,No,No,0.8,2552,1495
149,Jaimie Kulikowski,3rd Party,Colorado,6,No,No,0.9,72293,110645
159,Justin Paglino,3rd Party,Connecticut,3,No,No,1.5,29200,28670
240,Chase Oliver,Libertarian,Georgia,5,No,No,,9225,5220
248,Jimmy Cooper,3rd Party,Georgia,8,No,No,,8542,8528
260,Martin Cowen,Libertarian,Georgia,13,No,No,,13460,13464
266,Jonathan Hoomanawanui,3rd Party,Hawaii,2,No,No,2.2,1883,1522
281,Bill Redpath,Libertarian,Illinois,6,No,No,1.7,9837,9837


In [9]:
# Sanity check: show the final rows of the combined frame.
full_candidate_info_df.tail()

Unnamed: 0,Candidate_Name,Candidate_Party,State,District,Is_Incumbent?,Is_Winner,Percent_of_Vote_(%),Money_Raised_($),Money_Spent_($)
872,Tricia Zunker,Democrat,Wisconsin,7,No,No,39.2,1261957,1232690
873,Mike Gallagher,Republican,Wisconsin,8,Yes,Yes,64.0,3202905,2841801
874,Amanda Stuck,Democrat,Wisconsin,8,No,No,36.0,416978,399916
875,Liz Cheney,Republican,Wyoming,1,Yes,Yes,68.6,3003883,3060167
876,Lynnette Grey Bull,Democrat,Wyoming,1,No,No,24.6,134597,132235


In [10]:
# Spot-check: last (up to) 50 scraped rows for Texas.
full_candidate_info_df.loc[full_candidate_info_df['State'].eq('Texas')].tail(50)

Unnamed: 0,Candidate_Name,Candidate_Party,State,District,Is_Incumbent?,Is_Winner,Percent_of_Vote_(%),Money_Raised_($),Money_Spent_($)
750,Monica De La Cruz,Republican,Texas,15,No,No,47.6,344893,338832
751,Veronica Escobar,Democrat,Texas,16,Yes,Yes,64.7,1135549,1087693
752,Irene Armendariz-Jackson,Republican,Texas,16,No,No,35.3,168997,153671
753,Pete Sessions,Republican,Texas,17,No,Yes,55.9,1447958,1658637
754,Rick Kennedy,Democrat,Texas,17,No,No,40.9,200668,202096
755,Ted Brown,Libertarian,Texas,17,No,No,3.2,3041,3041
756,Sheila Jackson Lee,Democrat,Texas,18,Yes,Yes,73.3,924480,838414
757,Wendell Champion,Republican,Texas,18,No,No,23.5,305645,271278
758,Jodey Arrington,Republican,Texas,19,Yes,Yes,74.8,2497184,1932700
759,Tom Watson,Democrat,Texas,19,No,No,22.9,49405,60806


In [11]:
# Spot-check: last (up to) 50 scraped rows for Louisiana.
full_candidate_info_df.loc[full_candidate_info_df['State'].eq('Louisiana')].tail(50)

Unnamed: 0,Candidate_Name,Candidate_Party,State,District,Is_Incumbent?,Is_Winner,Percent_of_Vote_(%),Money_Raised_($),Money_Spent_($)
349,Steve Scalise,Republican,Louisiana,1,Yes,Yes,72.2,37262827,32830607
350,Cedric Richmond,Democrat,Louisiana,2,Yes,Yes,63.6,1628289,1302079
351,Clay Higgins,Republican,Louisiana,3,Yes,Yes,67.8,751498,720424
352,Robert Jon Anderson,Democrat,Louisiana,3,No,No,11.6,63617,48015
353,Braylon Harris,Democrat,Louisiana,3,No,No,17.9,23006,15130
354,Mike Johnson,Republican,Louisiana,4,Yes,Yes,60.4,1299787,1039393
355,Kenny Houston,Democrat,Louisiana,4,No,No,25.5,33677,33407
356,Luke Letlow,Republican,Louisiana,5,No,,,1392822,1303757
357,Lance Harris,Republican,Louisiana,5,No,,,696805,687833
358,Martin Lemelle,Democrat,Louisiana,5,No,,,173796,162747


In [12]:
# Spot-check: last (up to) 50 scraped rows for Pennsylvania.
full_candidate_info_df.loc[full_candidate_info_df['State'].eq('Pennsylvania')].tail(50)

Unnamed: 0,Candidate_Name,Candidate_Party,State,District,Is_Incumbent?,Is_Winner,Percent_of_Vote_(%),Money_Raised_($),Money_Spent_($)
646,Brian Fitzpatrick,Republican,Pennsylvania,1,Yes,Yes,58.7,4203825,4272135
647,Christina Finello,Democrat,Pennsylvania,1,No,No,41.3,2336814,2329748
648,Brendan Boyle,Democrat,Pennsylvania,2,Yes,Yes,65.1,1556650,756317
649,Dwight Evans,Democrat,Pennsylvania,3,Yes,Yes,87.5,942386,781265
650,Madeleine Dean,Democrat,Pennsylvania,4,Yes,Yes,56.0,1435995,1005928
651,Kathy Barnette,Republican,Pennsylvania,4,No,No,44.0,1000389,884640
652,Joe Tarshish,Independent,Pennsylvania,4,No,No,,24038,23738
653,Mary Gay Scanlon,Democrat,Pennsylvania,5,Yes,Yes,60.3,1455042,1350276
654,Dasha Pruett,Republican,Pennsylvania,5,No,No,39.7,105146,82437
655,Chrissy Houlahan,Democrat,Pennsylvania,6,Yes,Yes,54.7,3484484,1327283


In [13]:
# Save the combined frame as one CSV file stored inside a zip archive.
compress_df_to_csv = {
    'method': 'zip',
    'archive_name': 'full_candidate_info.csv',
}
full_candidate_info_df.to_csv(
    'full_candidate_info.zip',
    index=False,
    compression=compress_df_to_csv,
)