In [4]:
#def clean_data(dataframe,year, type):

import re
from bs4 import BeautifulSoup 
import numpy as np
import pandas as pd
from itertools import zip_longest

def clean_data(dataframe,year, type):

    def clean_r_and_dvalue(x):
        var = re.sub(r'[^\d.]', '', str(x))
        try:
            return float(var)
        except:
            return np.nan

    dataframe['dvalue'] = dataframe['dvalue'].apply(clean_r_and_dvalue)
    dataframe['rvalue'] = dataframe['rvalue'].apply(clean_r_and_dvalue)

    def clean_sample_size(x):
        if 'LV' in x:
            var = x.replace('LV', '').strip()
            try:
                var = float(var)
            except:
                var = np.nan
            return var
        else:
            return np.nan
        
    dataframe['sampleSize'] = dataframe['sampleSize'].apply(clean_sample_size)

    dataframe['Year'] = year

    dataframe['date'] = dataframe['date'].astype(str).str.replace(r'^.*-\s*','', regex=True)+'/' + str(year)
    dataframe['date'] = pd.to_datetime(dataframe['date'], errors='coerce')

    def margin_to_float(x):
        try:
            return float(x)
        except:
            return np.nan
        
    try:
        dataframe['marginError'] = dataframe['marginError'].apply(margin_to_float)
    except:
        pass
    
    dataframe['Type'] = type

    return dataframe

In [3]:
# 2014 General Congressional
url = 'https://www.realclearpolitics.com/epolls/other/generic_congressional_vote-2170.html#polls'

import requests
completed = False

while not completed:
    page = requests.get(
    url='https://proxy.scrapeops.io/v1/',
    params={
        'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
        'url': url,
    },
    )

    if page.status_code == 200:
        url_content = page.text
        print("Success")
        completed = True
    else:
        print("Failed to retrieve. Status Code was", page.status_code)
        print("Trying again")
soup = BeautifulSoup(page.content, "html.parser")
results = soup.find(id="container")

pollster_data = []
date_data =[]
sample_data =[]
dvalue_data =[]
rvalue_data  = []

isinrcpavg = results.find_all("tr", class_="isInRcpAvg")
for poll in isinrcpavg:
    pollster = poll.find('a', class_='normal_pollster_name').text.strip()
    date = poll.find_all('td')[1].text.strip()
    sample = poll.find("td", class_="sample").text.strip()
    td_elements = poll.find_all('td')
    dvalue = td_elements[3].text.strip()
    rvalue = td_elements[4].text.strip()
    
    pollster_data.append(pollster)
    date_data.append(date)
    sample_data.append(sample)
    dvalue_data.append(dvalue)
    rvalue_data.append(rvalue)

alt = results.find_all("tr", class_="alt")
for poll in alt:
    pollster = poll.find('a', class_='normal_pollster_name').text.strip()
    date = poll.find_all('td')[1].text.strip()
    sample = poll.find("td", class_="sample").text.strip()
    td_elements = poll.find_all('td')
    dvalue = td_elements[3].text.strip()
    rvalue = td_elements[4].text.strip()
    
    pollster_data.append(pollster)
    date_data.append(date)
    sample_data.append(sample)
    dvalue_data.append(dvalue)
    rvalue_data.append(rvalue)

blank = results.select("tr[class='']")
for poll in blank:
    pollster = poll.find('a', class_='normal_pollster_name').text.strip()
    date = poll.find_all('td')[1].text.strip()
    sample = poll.find("td", class_="sample").text.strip()
    td_elements = poll.find_all('td')
    dvalue = td_elements[3].text.strip()
    rvalue = td_elements[4].text.strip()
    
    pollster_data.append(pollster)
    date_data.append(date)
    sample_data.append(sample)
    dvalue_data.append(dvalue)
    rvalue_data.append(rvalue)


data_rows = list(zip_longest(pollster_data, date_data, sample_data, dvalue_data, rvalue_data, fillvalue=None))
df = pd.DataFrame(data_rows, columns =['pollster', 'date', 'sampleSize', 'dvalue', 'rvalue'])

df = df.drop_duplicates().sort_values('pollster')
general_congressional_2014_df = df

general_congressional_2014 = clean_data(general_congressional_2014_df, 2014, 'General Congressional')


Success


In [4]:
#status check
print(general_congressional_2014.head(5))

               pollster       date  sampleSize  dvalue  rvalue  Year  \
70   ABC News/Wash Post 2014-01-23         NaN    46.0    45.0  2014   
198  ABC News/Wash Post 2014-05-19         NaN    40.0    48.0  2014   
151  ABC News/Wash Post 2014-04-27         NaN    44.0    45.0  2014   
146  ABC News/Wash Post 2014-06-01         NaN    45.0    47.0  2014   
65   ABC News/Wash Post 2014-03-02         NaN    45.0    46.0  2014   

                      Type  
70   General Congressional  
198  General Congressional  
151  General Congressional  
146  General Congressional  
65   General Congressional  


In [5]:
#2014 Governor

import requests
import json
import re
from bs4 import BeautifulSoup 
import numpy as np
import pandas as pd
from itertools import zip_longest
import time

def get_governor2016_data(url, state):

    page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )

    if page.status_code == 200:
        url_content = page.text
        print("Success",state)
        
        soup = BeautifulSoup(page.content, "html.parser") 

        script_tags = soup.find_all('script')

        str = ""
        for script in script_tags:
            if script.string and 'finalData' in script.string:
                str += script.string

        x = str.replace("\\","")     
        
        pollster_pattern = r'"pollster":\s*"([^"]*)"'
        date_pattern = r'"date":\s*"([^"]*)"'
        sample_size_pattern = r'"sampleSize":\s*"([^"]*)"'
        margin_error_pattern = r'"marginError":\s*"([^"]*)"'
        
        link_pattern = r'"link":\s*"([^"]*)"' 

        dvalue_pattern = r'"candidate":\[\{.*?"name":"([^"]*)","affiliation":"Democrat","value":"([^"]*)".*?\}'
        rvalue_pattern = r'"candidate":\[\{.*?"name":"([^"]*)","affiliation":"Republican","value":"([^"]*)".*?\}'


        dvalue_data = re.findall(dvalue_pattern, x) #or #re.findall(dvalue_pattern2, x)
        rvalue_data = re.findall(rvalue_pattern, x) #or #re.findall(rvalue_pattern2, x)

        pollster_data = re.findall(pollster_pattern, x)
        date_data = re.findall(date_pattern, x)
        sample_size_data = re.findall(sample_size_pattern, x)
        margin_error_data = re.findall(margin_error_pattern, x)
        
        link_data = re.findall(link_pattern, x)

        data_rows = []
        for row in zip_longest(pollster_data, date_data, sample_size_data, margin_error_data, dvalue_data, rvalue_data, link_data, fillvalue=None):
            data_rows.append({
                
                "pollster": row[0],
                "date": row[1],
                "sampleSize": row[2],
                "marginError": row[3],
                "dvalue": row[4],
                "rvalue": row[5],
                "state": state
                #"race": race_value
                
            })
        
        return data_rows
        
    else:
        print("Failed to retrieve. Status Code was", page.status_code)
        return[]
        

state_urls = {
                
    "Alabama": "https://www.realclearpolling.com/polls/governor/general/2014/alabama/bentley-vs-griffith#polls",
    "Alaska": "https://www.realclearpolling.com/polls/governor/general/2014/alaska/parnell-vs-walker#polls", #independent won
    "Arizona": "https://www.realclearpolling.com/polls/governor/general/2014/arizona/ducey-vs-duval#polls",
    "Arkansas": "https://www.realclearpolling.com/polls/governor/general/2014/arkansas/hutchinson-vs-ross#polls",
    "California": "https://www.realclearpolling.com/polls/governor/general/2014/california/kashkari-vs-brown#polls",
    "Colorado": "https://www.realclearpolling.com/polls/governor/general/2014/colorado/beauprez-vs-hickenlooper#polls",
    "Connecticut": "https://www.realclearpolling.com/polls/governor/general/2014/connecticut/foley-vs-malloy-vs-visconti#polls",
    #"Delaware": "https://www.realclearpolitics.com/epolls/2016/governor/de/delaware_governor_bonini_vs_carney-6096.html#polls", 
    "Florida": "https://www.realclearpolling.com/polls/governor/general/2014/florida/scott-vs-crist-vs-wyllie#polls",
    "Georgia": "https://www.realclearpolling.com/polls/governor/general/2014/georgia/deal-vs-carter-vs-hunt#polls",
    "Hawaii": "https://www.realclearpolling.com/polls/governor/general/2014/hawaii/aiona-vs-Ige-vs-hannemann#polls",
    "Idaho": "https://www.realclearpolling.com/polls/governor/general/2014/idaho/otter-vs-balukoff#polls",
    "Illinois": "https://www.realclearpolling.com/polls/governor/general/2014/illinois/grimm-vs-quinn#polls",
    #"Indiana": "https://www.realclearpolling.com/polls/governor/general/2016/indiana/holcomb-vs-gregg#polls",
    "Iowa": "https://www.realclearpolling.com/polls/governor/general/2014/iowa/branstad-vs-hatch#polls",
    "Kansas": "https://www.realclearpolling.com/polls/governor/general/2014/kansas/brownback-vs-davis#polls",
    #"Kentucky": "https://www.realclearpolling.com/polls/senate/general/2014/kentucky/mcconnell-vs-grimes#polls",
    #"Louisiana": "https://www.realclearpolitics.com/epolls/2014/senate/louisiana_senate_race.html#polls",
    "Maine": "https://www.realclearpolling.com/polls/governor/general/2014/maine/lepage-vs-michaud-vs-cutler#polls",
    "Maryland": "https://www.realclearpolling.com/polls/governor/general/2014/maryland/hogan-vs-brown#polls",
    "Massachusetts": "https://www.realclearpolling.com/polls/governor/general/2014/massachusetts/baker-vs-coakley#polls",
    "Michigan": "https://www.realclearpolling.com/polls/governor/general/2014/michigan/snyder-vs-schauer#polls",
    "Minnesota": "https://www.realclearpolling.com/polls/governor/general/2014/minnesota/johnson-vs-dayton#polls",
    #"Mississippi": "https://www.realclearpolling.com/polls/senate/general/2014/mississippi/cochran-vs-childers#polls",
    #"Missouri": "https://www.realclearpolling.com/polls/governor/general/2016/missouri/greitens-vs-koster#polls",
    #"Montana": "https://www.realclearpolling.com/polls/governor/general/2016/montana/gianforte-vs-bullock#polls",
    "Nebraksa": "https://www.realclearpolling.com/polls/governor/general/2014/nebraska/ricketts-vs-hassebrook#polls",
    "Nevada": "https://www.realclearpolling.com/polls/governor/general/2014/nevada/sandoval-vs-goodman#polls",
    "New Hampshire": "https://www.realclearpolling.com/polls/governor/general/2014/new-hampshire/havenstein-vs-hassan#polls",
    #"New Jersey": "https://www.realclearpolling.com/polls/senate/general/2014/new-jersey/bell-vs-booker#polls",
    "New Mexico": "https://www.realclearpolling.com/polls/governor/general/2014/new-mexico/martinez-vs-king#polls",
    "New York": "https://www.realclearpolling.com/polls/governor/general/2014/new-york/astorino-vs-cuomo#polls",
    #"North Carolina": "https://www.realclearpolling.com/polls/governor/general/2016/north-carolina/mccrory-vs-cooper#polls",
    #"North Dakota": "https://www.realclearpolitics.com/epolls/2016/governor/nd/north_dakota_governor_burgum_vs_nelson-6099.html#polls",
    "Ohio": "https://www.realclearpolling.com/polls/governor/general/2014/ohio/kasich-vs-fitzgerald#polls",
    "Oklahoma": "https://www.realclearpolling.com/polls/governor/general/2014/oklahoma/fallin-vs-dorman#polls",
    "Oregon": "https://www.realclearpolling.com/polls/governor/general/2014/oregon/richardson-vs-kitzhaber#polls",
    "Pennsylvania": "https://www.realclearpolling.com/polls/governor/general/2014/pennsylvania/corbett-vs-wolf#polls",
    "Rhode Island": "https://www.realclearpolling.com/polls/governor/general/2014/rhode-island/fung-vs-raimondo#polls",  
    "South Carolina":"https://www.realclearpolling.com/polls/governor/general/2014/south-carolina/haley-vs-sheheen#polls",
    "South Dakota": "https://www.realclearpolling.com/polls/governor/general/2014/south-dakota/daugaard-vs-wismer#polls",
    "Tennessee": "https://www.realclearpolling.com/polls/governor/general/2014/tennessee/haslam-vs-brown#polls",
    "Texas": "https://www.realclearpolling.com/polls/governor/general/2014/texas/abbott-vs-davis#polls",
    #"Utah": "https://www.realclearpolling.com/polls/governor/general/2016/utah/herbert-vs-weinholtz#polls",
    "Vermont": "https://www.realclearpolling.com/polls/governor/general/2014/vermont/milne-vs-shumlin#polls",
    #"Virginia": "https://www.realclearpolling.com/polls/senate/general/2014/virginia/gillespie-vs-warner#polls",
    #"Washington": "https://www.realclearpolling.com/polls/governor/general/2016/washington/bryant-vs-inslee#polls",
    #"West Virginia": "https://www.realclearpolling.com/polls/governor/general/2016/west-virginia/cole-vs-justice#polls",
    "Wisconsin": "https://www.realclearpolling.com/polls/governor/general/2014/wisconsin/walker-vs-burke#polls",
    "Wyoming": "https://www.realclearpolling.com/polls/governor/general/2014/wyoming/mead-vs-gosar#polls"
       
}


all_state_dataframe = []

for state, url in state_urls.items():
    time.sleep(120)
    state_df = get_governor2016_data(url, state)
    df = pd.DataFrame(state_df)
    if len(df)>0:
        df = df.drop_duplicates().dropna(subset=['pollster']).dropna(subset=['dvalue'])
        all_state_dataframe.append(df)
    else:
        continue
    
all_state_df = pd.concat(all_state_dataframe, ignore_index=True)
governor_2014 = all_state_df

governor_2014 = clean_data(governor_2014, 2014, 'Governor')
print(governor_2014.head(5))

Success Alabama
Success Alaska
Success Arizona
Success Arkansas
Success California
Success Colorado
Success Connecticut
Success Florida
Success Georgia
Success Hawaii
Success Idaho
Success Illinois
Success Iowa
Success Kansas
Success Maine
Success Maryland
Success Massachusetts
Success Michigan
Success Minnesota
Success Nebraksa
Success Nevada
Success New Hampshire
Success New Mexico
Success New York
Success Ohio
Success Oklahoma
Success Oregon
Success Pennsylvania
Success Rhode Island
Success South Carolina
Success South Dakota
Success Tennessee
Success Texas
Success Vermont
Success Wisconsin
Success Wyoming
              pollster       date  sampleSize  marginError  dvalue  rvalue  \
0          rcp_average 2014-10-23         NaN          NaN    28.5    59.0   
1  CBS News/NYT/YouGov 2014-10-23       661.0          6.0    25.0    63.0   
2    Rasmussen Reports 2014-06-10       750.0          4.0    32.0    55.0   
3  CBS News/NYT/YouGov 2014-10-01       692.0          4.0    28.0    6

In [6]:
#2014 Senate

import requests
import json
import re
from bs4 import BeautifulSoup 
import numpy
import pandas as pd
from itertools import zip_longest

def get_senate2014_data(url, state):

    page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )

    if page.status_code == 200:
        url_content = page.text
        print("Success", state)
        
        soup = BeautifulSoup(page.content, "html.parser") 

        script_tags = soup.find_all('script')

        str = ""
        for script in script_tags:
            if script.string and 'finalData' in script.string:
                str += script.string

        x = str.replace("\\","")     
        
        pollster_pattern = r'"pollster":\s*"([^"]*)"'
        date_pattern = r'"date":\s*"([^"]*)"'
        sample_size_pattern = r'"sampleSize":\s*"([^"]*)"'
        margin_error_pattern = r'"marginError":\s*"([^"]*)"'
        
        link_pattern = r'"link":\s*"([^"]*)"'

        dvalue_pattern1 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'
        dvalue_pattern2 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'

        rvalue_pattern1 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'
        rvalue_pattern2 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'


        dvalue_data = re.findall(dvalue_pattern1, x) or re.findall(dvalue_pattern2, x)
        rvalue_data = re.findall(rvalue_pattern1, x) or re.findall(rvalue_pattern2, x)

        pollster_data = re.findall(pollster_pattern, x)
        date_data = re.findall(date_pattern, x)
        sample_size_data = re.findall(sample_size_pattern, x)
        margin_error_data = re.findall(margin_error_pattern, x)
        
        link_data = re.findall(link_pattern, x)

        data_rows = []
        for row in zip_longest(pollster_data, date_data, sample_size_data, margin_error_data, dvalue_data, rvalue_data, link_data, fillvalue=None):
            data_rows.append({
                
                "pollster": row[0],
                "date": row[1],
                "sampleSize": row[2],
                "marginError": row[3],
                "dvalue": row[4],
                "rvalue": row[5],
                "state": state
                #"race": race_value
                
            })
        
        return data_rows
        
    else:
        print("Failed to retrieve. Status Code was", page.status_code)
        return[]
        

state_urls = {
                
    #"Alabama": no data
    "Alaska": "https://www.realclearpolling.com/polls/senate/general/2014/alaska/sullivan-vs-begich#polls",
    #"Arizona": "https://www.realclearpolling.com/polls/senate/general/2016/arizona/mccain-vs-kirkpatrick#polls",
    "Arkansas": "https://www.realclearpolling.com/polls/senate/general/2014/arkansas/cotton-vs-pryor#polls",
    #"California": "https://www.realclearpolling.com/polls/senate/general/2016/california/harris-vs-sanchez#polls",
    "Colorado": "https://www.realclearpolling.com/polls/senate/general/2014/colorado/gardner-vs-udall#polls",
    #"Connecticut": "https://www.realclearpolling.com/polls/senate/general/2016/connecticut/carter-vs-blumenthal#polls",
    "Delaware": "https://www.realclearpolling.com/polls/senate/general/2014/delaware/wade-vs-coons#polls", 
    #"Florida": "https://www.realclearpolling.com/polls/senate/general/2016/florida/rubio-vs-murphy#polls",
    "Georgia": "https://www.realclearpolling.com/polls/senate/general/2014/georgia/perdue-vs-nunn-vs-swafford#polls",
    "Hawaii": "https://www.realclearpolling.com/polls/senate/general/2014/hawaii/cavasso-vs-schatz#polls",
    "Idaho": "https://www.realclearpolling.com/polls/senate/general/2014/idaho/risch-vs-mitchell#polls",
    "Illinois": "https://www.realclearpolling.com/polls/senate/general/2014/illinois/oberweis-vs-durbin#polls",
    #"Indiana": "https://www.realclearpolling.com/polls/senate/general/2016/indiana/young-vs-bayh#polls",
    "Iowa": "https://www.realclearpolling.com/polls/senate/general/2014/iowa/ernst-vs-braley#polls",
    "Kansas": "https://www.realclearpolling.com/polls/senate/general/2014/kansas/roberts-vs-orman#polls",
    "Kentucky": "https://www.realclearpolling.com/polls/senate/general/2014/kentucky/mcconnell-vs-grimes#polls",
    "Louisiana": "https://www.realclearpolitics.com/epolls/2014/senate/louisiana_senate_race.html#polls",
    "Maine": "https://www.realclearpolling.com/polls/senate/general/2014/maine/collins-vs-bellows#polls",
    #"Maine CD1": "https://www.realclearpolling.com/polls/president/general/2024/maine/trump-vs-biden-cd1#polls",
    #"Maine CD2": "https://www.realclearpolling.com/polls/president/general/2024/maine/trump-vs-biden-cd2#polls",
    #"Maryland": "https://www.realclearpolling.com/polls/senate/general/2016/maryland/vanhollen-vs-szeliga#polls",
    "Massachusetts": "https://www.realclearpolling.com/polls/senate/general/2014/massachusetts/herr-vs-markey#polls",
    "Michigan": "https://www.realclearpolling.com/polls/senate/general/2014/michigan/land-vs-peters#polls",
    "Minnesota": "https://www.realclearpolling.com/polls/senate/general/2014/minnesota/mcfadden-vs-franken#polls",
    #"Minnesota_SPECIAL": "https://www.realclearpolling.com/polls/senate/general/2018/minnesota/housley-vs-smith",
    "Mississippi": "https://www.realclearpolling.com/polls/senate/general/2014/mississippi/cochran-vs-childers#polls",
    #"Mississippi_RUNOFF":"https://www.realclearpolling.com/polls/senate/general/2018/mississippi-runoff-election",
    #"Missouri": "https://www.realclearpolling.com/polls/senate/general/2016/missouri/blunt-vs-kander#polls",
    "Montana": "https://www.realclearpolling.com/polls/senate/general/2014/montana/daines-vs-curtis#polls",
    #"Nebraksa": "https://www.realclearpolitics.com/epolls/2018/senate/ne/nebraska_senate_fischer_vs_raybould_-6315.html#polls",
    #"Nebraska CD2": "https://www.realclearpolling.com/polls/president/general/2024/nebraska-cd2/trump-vs-biden#polls",
    #"Nevada": "https://www.realclearpolling.com/polls/senate/general/2016/nevada/cortezmasto-vs-heck#polls",
    "New Hampshire": "https://www.realclearpolling.com/polls/senate/general/2014/new-hampshire/brown-vs-shaheen#polls",
    "New Jersey": "https://www.realclearpolling.com/polls/senate/general/2014/new-jersey/bell-vs-booker#polls",
    "New Mexico": "https://www.realclearpolling.com/polls/senate/general/2014/new-mexico/weh-vs-udall#polls",
    #"New York": "https://www.realclearpolling.com/polls/senate/general/2016/new-york/long-vs-schumer#polls",
    "North Carolina": "https://www.realclearpolling.com/polls/senate/general/2014/north-carolina/tillis-vs-hagan#polls",
    #"North Dakota": "https://www.realclearpolitics.com/epolls/2016/senate/nd/north_dakota_senate_hoeven_vs_glassheim-5996.html#polls",
    #"Ohio": "https://www.realclearpolling.com/polls/senate/general/2016/ohio/portman-vs-strickland#polls",
    "Oklahoma": "https://www.realclearpolling.com/polls/senate/general/2014/oklahoma/lankford-vs-johnson#polls",
    "Oregon": "https://www.realclearpolling.com/polls/senate/general/2014/oregon/wehby-vs-merkley#polls",
    #"Pennsylvania": "https://www.realclearpolling.com/polls/senate/general/2016/pennsylvania/toomey-vs-mcginty#polls",
    "Rhode Island": "https://www.realclearpolling.com/polls/senate/general/2014/rhode-island/zaccaria-vs-reed#polls",  
    "South Carolina":"https://www.realclearpolling.com/polls/senate/general/2014/south-carolina/graham-vs-hutto-vs-ravenel#polls",
    "South Dakota": "https://www.realclearpolling.com/polls/senate/general/2014/south-dakota/rounds-vs-weiland-vs-pressler#polls",
    "Tennessee": "https://www.realclearpolling.com/polls/senate/general/2014/tennessee/alexander-vs-ball#polls",
    "Texas": "https://www.realclearpolling.com/polls/senate/general/2014/texas/cornyn-vs-alameel#polls",
    #"Utah": "https://www.realclearpolling.com/polls/senate/general/2016/utah/lee-vs-snow#polls",
    #"Vermont": "https://www.realclearpolling.com/polls/senate/general/2016/vermont/milne-vs-leahy#polls",
    "Virginia": "https://www.realclearpolling.com/polls/senate/general/2014/virginia/gillespie-vs-warner#polls",
    #"Washington": "https://www.realclearpolling.com/polls/senate/general/2016/washington/vance-vs-murray#polls",
    "West Virginia": "https://www.realclearpolling.com/polls/senate/general/2014/west-virginia/capito-vs-tennant#polls",
    #"Wisconsin": "https://www.realclearpolling.com/polls/senate/general/2016/wisconsin/johnson-vs-feingold#polls",
    "Wyoming": "https://www.realclearpolling.com/polls/senate/general/2014/wyoming/enzi-vs-hardy#polls"
       
}


all_state_dataframe = []

for state, url in state_urls.items():
    state_df = get_senate2014_data(url, state)
    time.sleep(120.22)
    df = pd.DataFrame(state_df)
    if len(df)>0:
        df = df.drop_duplicates().dropna(subset=['pollster']).dropna(subset=['dvalue'])
        all_state_dataframe.append(df)
    else:
        continue
    
all_state_df = pd.concat(all_state_dataframe, ignore_index=True)

senate_2014 = all_state_df

senate_2014 = clean_data(senate_2014, 2014, 'Senate')
print(senate_2014.head(5))

Success Alaska
Success Arkansas
Success Colorado
Success Delaware
Success Georgia
Success Hawaii
Success Idaho
Success Illinois
Success Iowa
Success Kansas
Success Kentucky
Success Louisiana
Success Maine
Success Massachusetts
Success Michigan
Success Minnesota
Success Mississippi
Success Montana
Success New Hampshire
Success New Jersey
Success New Mexico
Success North Carolina
Success Oklahoma
Success Oregon
Success Rhode Island
Success South Carolina
Success South Dakota
Success Tennessee
Success Texas
Success Virginia
Success West Virginia
Success Wyoming
              pollster       date  sampleSize  marginError  dvalue  rvalue  \
0          rcp_average 2014-11-02         NaN          NaN    43.8    46.2   
1             PPP (D)* 2014-11-02      1052.0          3.0    45.0    46.0   
2    Rasmussen Reports 2014-10-30       887.0          4.0    42.0    47.0   
3  Ivan Moore Research 2014-10-26       544.0          NaN    48.0    42.0   
4  CBS News/NYT/YouGov 2014-10-23       561.0

In [7]:
senate_2014.to_csv('senate_2014.csv', index=False)
governor_2014.to_csv('governor_2014.csv', index=False)
general_congressional_2014.to_csv('general_congressional_2014.csv', index=False)

In [5]:
#2016 General Congressional

import requests
import re
from bs4 import BeautifulSoup 
import numpy
import pandas as pd
from itertools import zip_longest
url = 'https://www.realclearpolitics.com/epolls/other/2016_generic_congressional_vote-5279.html'
 
page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )
if page.status_code == 200:
    url_content = page.text
    print("Success")
else:
    print("Failed to retrieve. Status Code was", page.status_code)
    exit()
soup = BeautifulSoup(page.content, "html.parser")
results = soup.find(id="container")


#rcpAvg = results.find("tr", class_="rcpAvg2")
#print(rcpAvg.text.strip())

pollster_data = []
date_data =[]
sample_data =[]
dvalue_data =[]
rvalue_data  = []

isinrcpavg = results.find_all("tr", class_="isInRcpAvg")
for poll in isinrcpavg:
    pollster = poll.find('a', class_='normal_pollster_name').text.strip()
    date = poll.find_all('td')[1].text.strip()
    sample = poll.find("td", class_="sample").text.strip()
    td_elements = poll.find_all('td')
    dvalue = td_elements[3].text.strip()
    rvalue = td_elements[4].text.strip()
    
    pollster_data.append(pollster)
    date_data.append(date)
    sample_data.append(sample)
    dvalue_data.append(dvalue)
    rvalue_data.append(rvalue)

alt = results.find_all("tr", class_="alt")
for poll in alt:
    pollster = poll.find('a', class_='normal_pollster_name').text.strip()
    date = poll.find_all('td')[1].text.strip()
    sample = poll.find("td", class_="sample").text.strip()
    td_elements = poll.find_all('td')
    dvalue = td_elements[3].text.strip()
    rvalue = td_elements[4].text.strip()
    
    pollster_data.append(pollster)
    date_data.append(date)
    sample_data.append(sample)
    dvalue_data.append(dvalue)
    rvalue_data.append(rvalue)

blank = results.select("tr[class='']")
for poll in blank:
    pollster = poll.find('a', class_='normal_pollster_name').text.strip()
    date = poll.find_all('td')[1].text.strip()
    sample = poll.find("td", class_="sample").text.strip()
    td_elements = poll.find_all('td')
    dvalue = td_elements[3].text.strip()
    rvalue = td_elements[4].text.strip()
    
    pollster_data.append(pollster)
    date_data.append(date)
    sample_data.append(sample)
    dvalue_data.append(dvalue)
    rvalue_data.append(rvalue)


data_rows = list(zip_longest(pollster_data, date_data, sample_data, dvalue_data, rvalue_data, fillvalue=None))
df = pd.DataFrame(data_rows, columns =['pollster', 'date', 'sampleSize', 'dvalue', 'rvalue'])

df = df.drop_duplicates().sort_values('pollster')

general_congressional_2016 = df

general_congressional_2016 = clean_data(general_congressional_2016, 2016, 'General Congressional')

Success


In [6]:
import requests
import json
import re
from bs4 import BeautifulSoup 
import numpy
import pandas as pd
from itertools import zip_longest
import time

def get_governor2016_data(url, state):

    page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )

    if page.status_code == 200:
        url_content = page.text
        print("Success")
        
        soup = BeautifulSoup(page.content, "html.parser") 

        script_tags = soup.find_all('script')

        str = ""
        for script in script_tags:
            if script.string and 'finalData' in script.string:
                str += script.string

        x = str.replace("\\","")     

        #str2 = str.split('self.__next_f.push(')
        #str3 = str2[1][:-1]
        #jsonx = json.loads(str3)
        #json_str = jsonx[1] 
        #print(json_str)
        
        pollster_pattern = r'"pollster":\s*"([^"]*)"'
        date_pattern = r'"date":\s*"([^"]*)"'
        sample_size_pattern = r'"sampleSize":\s*"([^"]*)"'
        margin_error_pattern = r'"marginError":\s*"([^"]*)"'
        
        link_pattern = r'"link":\s*"([^"]*)"'

        dvalue_pattern1 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'
        dvalue_pattern2 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'

        rvalue_pattern1 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'
        rvalue_pattern2 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'


        dvalue_data = re.findall(dvalue_pattern1, x) or re.findall(dvalue_pattern2, x)
        rvalue_data = re.findall(rvalue_pattern1, x) or re.findall(rvalue_pattern2, x)

        pollster_data = re.findall(pollster_pattern, x)
        date_data = re.findall(date_pattern, x)
        sample_size_data = re.findall(sample_size_pattern, x)
        margin_error_data = re.findall(margin_error_pattern, x)
        
        link_data = re.findall(link_pattern, x)

        data_rows = []
        for row in zip_longest(pollster_data, date_data, sample_size_data, margin_error_data, dvalue_data, rvalue_data, link_data, fillvalue=None):
            data_rows.append({
                
                "pollster": row[0],
                "date": row[1],
                "sampleSize": row[2],
                "marginError": row[3],
                "dvalue": row[4],
                "rvalue": row[5],
                "state": state
                #"race": race_value
                
            })
        
        return data_rows
        
    else:
        print("Failed to retrieve. Status Code was", page.status_code)
        return[]
        

state_urls = {
                
    #"Alabama": "https://www.realclearpolitics.com/epolls/2018/governor/al/alabama_governor_ivey_vs_maddox-6405.html#polls",
    #"Alaska": "https://www.realclearpolling.com/polls/governor/general/2018/alaska/dunleavy-vs-begich#polls",
    #"Arizona": "https://www.realclearpolling.com/polls/governor/general/2018/arizona/ducey-vs-garcia#polls",
    #"Arkansas": "https://www.realclearpolling.com/polls/governor/general/2018/arkansas/hutchinson-vs-henderson#polls",
    #"California": "https://www.realclearpolling.com/polls/governor/general/2018/california/cox-vs-newsom#polls",
    #"Colorado": "https://www.realclearpolling.com/polls/governor/general/2018/colorado/stapleton-vs-polis#polls",
    #"Connecticut": "https://www.realclearpolling.com/polls/governor/general/2018/connecticut/stefanowski-vs-lamont#polls",
    "Delaware": "https://www.realclearpolitics.com/epolls/2016/governor/de/delaware_governor_bonini_vs_carney-6096.html#polls", 
    #"Florida": "https://www.realclearpolling.com/polls/governor/general/2018/florida/desantis-vs-gillum#polls",
    #"Georgia": "https://www.realclearpolling.com/polls/governor/general/2018/georgia/kemp-vs-abrams#polls",
    #"Hawaii": "https://www.realclearpolling.com/polls/governor/general/2018/hawaii/tupola-vs-ige#polls",
    #"Idaho": "https://www.realclearpolitics.com/epolls/2018/governor/id/idaho_governor_little_vs_jordan-6413.html#polls",
    #"Illinois": "https://www.realclearpolling.com/polls/governor/general/2018/illinois/rauner-vs-pritzker#polls",
    "Indiana": "https://www.realclearpolling.com/polls/governor/general/2016/indiana/holcomb-vs-gregg#polls",
    #"Iowa": "https://www.realclearpolling.com/polls/governor/general/2018/iowa/reynolds-vs-hubbell#polls",
    #"Kansas": "https://www.realclearpolling.com/polls/governor/general/2018/kansas/kobach-vs-kelly-vs-orman#polls",
    #"Kentucky": "https://www.realclearpolling.com/polls/senate/general/2014/kentucky/mcconnell-vs-grimes#polls",
    #"Louisiana": "https://www.realclearpolitics.com/epolls/2014/senate/louisiana_senate_race.html#polls",
    #"Maine": "https://www.realclearpolling.com/polls/governor/general/2018/maine/moody-vs-mills#polls",
    #"Maryland": "https://www.realclearpolling.com/polls/governor/general/2018/maryland/hogan-vs-jealous#polls",
    #"Massachusetts": "https://www.realclearpolling.com/polls/governor/general/2018/massachusetts/baker-vs-gonzalez#polls",
    #"Michigan": "https://www.realclearpolling.com/polls/governor/general/2018/michigan/schuette-vs-whitmer#polls",
    #"Minnesota": "https://www.realclearpolling.com/polls/governor/general/2018/minnesota/johnson-vs-walz#polls",
    #"Mississippi": "https://www.realclearpolling.com/polls/senate/general/2014/mississippi/cochran-vs-childers#polls",
    "Missouri": "https://www.realclearpolling.com/polls/governor/general/2016/missouri/greitens-vs-koster#polls",
    "Montana": "https://www.realclearpolling.com/polls/governor/general/2016/montana/gianforte-vs-bullock#polls",
    #"Nebraksa": "https://www.realclearpolitics.com/epolls/2018/governor/ne/nebraska_governor_ricketts_vs_krist-6421.html#polls",
    #"Nevada": "https://www.realclearpolling.com/polls/governor/general/2018/nevada/laxalt-vs-sisolak#polls",
    "New Hampshire": "https://www.realclearpolling.com/polls/governor/general/2016/new-hampshire/sununu-vs-ostern#polls",
    #"New Jersey": "https://www.realclearpolling.com/polls/senate/general/2014/new-jersey/bell-vs-booker#polls",
    #"New Mexico": "https://www.realclearpolling.com/polls/governor/general/2018/new-mexico/pearce-vs-grisham#polls",
    #"New York": "https://www.realclearpolling.com/polls/governor/general/2018/new-york/molinaro-vs-cuomo#polls",
    "North Carolina": "https://www.realclearpolling.com/polls/governor/general/2016/north-carolina/mccrory-vs-cooper#polls",
    "North Dakota": "https://www.realclearpolitics.com/epolls/2016/governor/nd/north_dakota_governor_burgum_vs_nelson-6099.html#polls",
    #"Ohio": "https://www.realclearpolling.com/polls/governor/general/2018/ohio/dewine-vs-cordray#polls",
    #"Oklahoma": "https://www.realclearpolling.com/polls/governor/general/2018/oklahoma/stitt-vs-edmondson#polls",
    "Oregon": "https://www.realclearpolling.com/polls/governor/general/2016/oregon/pierce-vs-brown#polls",
    #"Pennsylvania": "https://www.realclearpolling.com/polls/governor/general/2018/pennsylvania/wagner-vs-wolf#polls",
    #"Rhode Island": "https://www.realclearpolling.com/polls/governor/general/2018/rhode-island/fung-vs-raimondo#polls",  
    #"South Carolina":"https://www.realclearpolling.com/polls/governor/general/2018/south-carolina/mcmaster-vs-smith#polls",
    #"South Dakota": "https://www.realclearpolling.com/polls/governor/general/2018/south-dakota/noem-vs-sutton#polls",
    #"Tennessee": "https://www.realclearpolling.com/polls/governor/general/2018/tennessee/lee-vs-dean#polls",
    #"Texas": "https://www.realclearpolling.com/polls/governor/general/2018/texas/abbott-vs-valdez#polls",
    "Utah": "https://www.realclearpolling.com/polls/governor/general/2016/utah/herbert-vs-weinholtz#polls",
    "Vermont": "https://www.realclearpolling.com/polls/governor/general/2016/vermont/scott-vs-minter#polls",
    #"Virginia": "https://www.realclearpolling.com/polls/senate/general/2014/virginia/gillespie-vs-warner#polls",
    "Washington": "https://www.realclearpolling.com/polls/governor/general/2016/washington/bryant-vs-inslee#polls",
    "West Virginia": "https://www.realclearpolling.com/polls/governor/general/2016/west-virginia/cole-vs-justice#polls",
    #"Wisconsin": "https://www.realclearpolling.com/polls/governor/general/2018/wisconsin/walker-vs-evers-vs-anderson#polls",
    #"Wyoming": "https://www.realclearpolitics.com/epolls/2018/governor/wy/wyoming_governor_gordon_vs_throne-6666.html#polls"
       
}


all_state_dataframe = []

for state, url in state_urls.items():
    time.sleep(120.234)
    state_df = get_governor2016_data(url, state)
    df = pd.DataFrame(state_df)
    if len(df)>0:
        df = df.drop_duplicates().dropna(subset=['pollster']).dropna(subset=['dvalue'])
        all_state_dataframe.append(df)
    else:
        continue
    
all_state_df = pd.concat(all_state_dataframe, ignore_index=True)

governor_2016 = all_state_df

governor_2016 = clean_data(governor_2016, 2016, 'Governor')

Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success


In [7]:
print(governor_2016.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   pollster     138 non-null    object        
 1   date         138 non-null    datetime64[ns]
 2   sampleSize   81 non-null     float64       
 3   marginError  124 non-null    float64       
 4   dvalue       138 non-null    float64       
 5   rvalue       138 non-null    float64       
 6   state        138 non-null    object        
 7   Year         138 non-null    int64         
 8   Type         138 non-null    object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(3)
memory usage: 9.8+ KB
None


In [8]:
#2016 Presidential State Polls

import requests
import json
import re
from bs4 import BeautifulSoup 
import numpy
import pandas as pd
from itertools import zip_longest

def get_states2016_data(url, state):

    page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )

    if page.status_code == 200:
        url_content = page.text
        print("Success")
        
        soup = BeautifulSoup(page.content, "html.parser") 

        script_tags = soup.find_all('script')

        for script in script_tags:
            if script.string and 'finalData' in script.string:
                string = script.string
                break
        try:
            str2 = string.split('self.__next_f.push(')
            str3 = str2[1][:-1]
            jsonx = json.loads(str3)
            json_str = jsonx[1] 
            
            pollster_pattern = r'"pollster":\s*"([^"]*)"'
            date_pattern = r'"date":\s*"([^"]*)"'
            sample_size_pattern = r'"sampleSize":\s*"([^"]*)"'
            margin_error_pattern = r'"marginError":\s*"([^"]*)"'
            
            link_pattern = r'"link":\s*"([^"]*)"'

            dvalue_pattern1 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'
            dvalue_pattern2 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'

            rvalue_pattern1 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'
            rvalue_pattern2 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'


            dvalue_data = re.findall(dvalue_pattern1, json_str) or re.findall(dvalue_pattern2, json_str)
            rvalue_data = re.findall(rvalue_pattern1, json_str) or re.findall(rvalue_pattern2, json_str)

            pollster_data = re.findall(pollster_pattern, json_str)
            date_data = re.findall(date_pattern, json_str)
            sample_size_data = re.findall(sample_size_pattern, json_str)
            margin_error_data = re.findall(margin_error_pattern, json_str)
            
            link_data = re.findall(link_pattern, json_str)

            data_rows = []
            for row in zip_longest(pollster_data, date_data, sample_size_data, margin_error_data, dvalue_data, rvalue_data, link_data, fillvalue=None):
                data_rows.append({
                    
                    "pollster": row[0],
                    "date": row[1],
                    "sampleSize": row[2],
                    "marginError": row[3],
                    "dvalue": row[4],
                    "rvalue": row[5],
                    "state": state
                    #"race": race_value
                    
                })
            
            return data_rows
        
        except:
            print("Failed to retrieve. Status Code was", page.status_code)
            return[]
        
    else:
        print("Failed to retrieve. Status Code was", page.status_code)
        return[]
        

state_urls = {
                
    "Alabama": "https://www.realclearpolitics.com/epolls/2016/president/al/alabama_trump_vs_clinton-5898.html",
    "Alaska": "https://www.realclearpolling.com/polls/president/general/2016/alaska/trump-vs-clinton#polls",
    "Arizona": "https://www.realclearpolling.com/polls/president/general/2016/arizona/trump-vs-clinton#polls",
    "Arkansas": "https://www.realclearpolling.com/polls/president/general/2016/arkansas/trump-vs-clinton#polls",
    "California": "https://www.realclearpolling.com/polls/president/general/2016/california/trump-vs-clinton#polls",
    "Colorado": "https://www.realclearpolling.com/polls/president/general/2016/colorado/trump-vs-clinton#polls",
    "Connecticut": "https://www.realclearpolling.com/polls/president/general/2016/connecticut/trump-vs-clinton#polls",
    "Delaware": "https://www.realclearpolling.com/polls/president/general/2016/delaware/trump-vs-clinton#polls", 
    "Florida": "https://www.realclearpolling.com/polls/president/general/2016/florida/trump-vs-clinton#polls",
    "Georgia": "https://www.realclearpolling.com/polls/president/general/2016/georgia/trump-vs-clinton#polls",
    "Hawaii": "https://www.realclearpolitics.com/epolls/2016/president/hi/hawaii_trump_vs_clinton-5902.html#polls",
    "Idaho": "https://www.realclearpolling.com/polls/president/general/2016/idaho/trump-vs-clinton#polls",
    "Illinois": "https://www.realclearpolling.com/polls/president/general/2016/illinois/trump-vs-clinton#polls",
    "Indiana": "https://www.realclearpolling.com/polls/president/general/2016/indiana/trump-vs-clinton#polls",
    "Iowa": "https://www.realclearpolling.com/polls/president/general/2016/iowa/trump-vs-clinton#polls",
    "Kansas": "https://www.realclearpolling.com/polls/president/general/2016/kansas/trump-vs-clinton#polls",
    "Kentucky": "https://www.realclearpolling.com/polls/president/general/2016/kentucky/trump-vs-clinton#polls",
    "Louisiana": "https://www.realclearpolling.com/polls/president/general/2016/louisiana/trump-vs-clinton#polls",
    #"Maine": "", no data
    "Maine CD1": "https://www.realclearpolling.com/polls/president/general/2016/maine/cd1-trump-vs-clinton#polls",
    "Maine CD2": "https://www.realclearpolling.com/polls/president/general/2016/maine/cd2-trump-vs-clinton#polls",
    "Maryland": "https://www.realclearpolling.com/polls/president/general/2016/maryland/trump-vs-clinton#polls",
    "Massachusetts": "https://www.realclearpolling.com/polls/president/general/2016/massachusetts/trump-vs-clinton#polls",
    "Michigan": "https://www.realclearpolling.com/polls/president/general/2016/michigan/trump-vs-clinton#polls",
    "Minnesota": "https://www.realclearpolling.com/polls/president/general/2016/minnesota/trump-vs-clinton#polls",
    "Mississippi": "https://www.realclearpolling.com/polls/president/general/2016/mississippi/trump-vs-clinton#polls",
    "Missouri": "https://www.realclearpolling.com/polls/president/general/2016/missouri/trump-vs-clinton#polls",
    "Montana": "https://www.realclearpolling.com/polls/president/general/2016/montana/trump-vs-clinton#polls",
    "Nebraksa": "https://www.realclearpolling.com/polls/president/general/2016/nebraska/trump-vs-clinton#polls",
    "Nebraska CD2": "https://www.realclearpolling.com/polls/president/general/2016/nebraska/cd2-trump-vs-clinton#polls",
    "Nevada": "https://www.realclearpolling.com/polls/president/general/2016/nevada/trump-vs-clinton-vs-johnson#polls",
    "New Hampshire": "https://www.realclearpolling.com/polls/president/general/2016/new-hampshire/trump-vs-clinton#polls",
    "New Jersey": "https://www.realclearpolling.com/polls/president/general/2016/new-jersey/trump-vs-clinton#polls",
    "New Mexico": "https://www.realclearpolling.com/polls/president/general/2016/new-mexico/trump-vs-clinton#polls",
    "New York": "https://www.realclearpolling.com/polls/president/general/2016/new-york/trump-vs-clinton#polls",
    "North Carolina": "https://www.realclearpolling.com/polls/president/general/2016/north-carolina/trump-vs-clinton#polls",
    "North Dakota": "https://www.realclearpolitics.com/epolls/2016/president/nd/north_dakota_trump_vs_clinton-5907.html#polls",
    "Ohio": "https://www.realclearpolling.com/polls/president/general/2016/ohio/trump-vs-clinton#polls",
    "Oklahoma": "https://www.realclearpolling.com/polls/president/general/2016/oklahoma/trump-vs-clinton#polls",
    "Oregon": "https://www.realclearpolling.com/polls/president/general/2016/oregon/trump-vs-clinton#polls",
    "Pennsylvania": "https://www.realclearpolling.com/polls/president/general/2016/pennsylvania/trump-vs-clinton#polls",
    "Rhode Island": "https://www.realclearpolling.com/polls/president/general/2016/rhode-island/trump-vs-clinton#polls",  
    "South Carolina": "https://www.realclearpolling.com/polls/president/general/2016/south-carolina/trump-vs-clinton#polls",
    "South Dakota": "https://www.realclearpolling.com/polls/president/general/2016/south-dakota/trump-vs-clinton#polls",
    "Tennessee": "https://www.realclearpolling.com/polls/president/general/2016/tennessee/trump-vs-clinton#polls",
    "Texas": "https://www.realclearpolling.com/polls/president/general/2016/texas/trump-vs-clinton#polls",
    "Utah": "https://www.realclearpolling.com/polls/president/general/2016/utah/trump-vs-clinton#polls",
    "Vermont": "https://www.realclearpolling.com/polls/president/general/2016/vermont/trump-vs-clinton#polls",
    "Virginia": "https://www.realclearpolling.com/polls/president/general/2016/virginia/trump-vs-clinton#polls",
    "Washington": "https://www.realclearpolling.com/polls/president/general/2016/washington/trump-vs-clinton#polls",
    "West Virginia": "https://www.realclearpolling.com/polls/president/general/2016/west-virginia/trump-vs-clinton#polls",
    "Wisconsin": "https://www.realclearpolling.com/polls/president/general/2016/wisconsin/trump-vs-clinton#polls",
    "Wyoming": "https://www.realclearpolitics.com/epolls/2016/president/wy/wyoming_trump_vs_clinton-5913.html#polls"
       
}

all_state_dataframe = []

for state, url in state_urls.items():
    time.sleep(121.5235235)
    state_df = get_states2016_data(url, state)
    df = pd.DataFrame(state_df)
    if len(df)>0:
        df = df.drop_duplicates().dropna(subset=['pollster'])
        all_state_dataframe.append(df)
    else:
        continue
    
all_state_df = pd.concat(all_state_dataframe, ignore_index=True)

presidential_state_2016 = all_state_df

presidential_state_2016 = clean_data(presidential_state_2016, 2016, 'President')

Failed to retrieve. Status Code was 500
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Failed to retrieve. Status Code was 200
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Failed to retrieve. Status Code was 500
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Failed to retrieve. Status Code was 200


In [9]:
#2016 Senate 

import requests
import json
import re
from bs4 import BeautifulSoup 
import numpy
import pandas as pd
from itertools import zip_longest

def get_senate2016_data(url, state):

    page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )

    if page.status_code == 200:
        url_content = page.text
        print("Success")
        
        soup = BeautifulSoup(page.content, "html.parser") 

        script_tags = soup.find_all('script')

        str = ""
        for script in script_tags:
            if script.string and 'finalData' in script.string:
                str += script.string

        x = str.replace("\\","")     
        
        pollster_pattern = r'"pollster":\s*"([^"]*)"'
        date_pattern = r'"date":\s*"([^"]*)"'
        sample_size_pattern = r'"sampleSize":\s*"([^"]*)"'
        margin_error_pattern = r'"marginError":\s*"([^"]*)"'
        
        link_pattern = r'"link":\s*"([^"]*)"'

        dvalue_pattern1 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'
        dvalue_pattern2 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'

        rvalue_pattern1 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'
        rvalue_pattern2 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'


        dvalue_data = re.findall(dvalue_pattern1, x) or re.findall(dvalue_pattern2, x)
        rvalue_data = re.findall(rvalue_pattern1, x) or re.findall(rvalue_pattern2, x)

        pollster_data = re.findall(pollster_pattern, x)
        date_data = re.findall(date_pattern, x)
        sample_size_data = re.findall(sample_size_pattern, x)
        margin_error_data = re.findall(margin_error_pattern, x)
        
        link_data = re.findall(link_pattern, x)

        data_rows = []
        for row in zip_longest(pollster_data, date_data, sample_size_data, margin_error_data, dvalue_data, rvalue_data, link_data, fillvalue=None):
            data_rows.append({
                
                "pollster": row[0],
                "date": row[1],
                "sampleSize": row[2],
                "marginError": row[3],
                "dvalue": row[4],
                "rvalue": row[5],
                "state": state
                #"race": race_value
                
            })
        
        return data_rows
        
    else:
        print("Failed to retrieve. Status Code was", page.status_code)
        return[]
        

state_urls = {
                
    "Alabama": "https://www.realclearpolitics.com/epolls/2016/senate/al/alabama_senate_shelby_vs_crumpton-5989.html#polls",
    "Alaska": "https://www.realclearpolling.com/polls/senate/general/2016/alaska/murkowski-vs-metcalfe-vs-miller-vs-stock#polls",
    "Arizona": "https://www.realclearpolling.com/polls/senate/general/2016/arizona/mccain-vs-kirkpatrick#polls",
    "Arkansas": "https://www.realclearpolling.com/polls/senate/general/2016/arkansas/boozman-vs-eldridge#polls",
    "California": "https://www.realclearpolling.com/polls/senate/general/2016/california/harris-vs-sanchez#polls",
    "Colorado": "https://www.realclearpolling.com/polls/senate/general/2016/colorado/glenn-vs-bennet#polls",
    "Connecticut": "https://www.realclearpolling.com/polls/senate/general/2016/connecticut/carter-vs-blumenthal#polls",
    #"Delaware": "https://www.realclearpolling.com/polls/senate/general/2018/delaware/arlett-vs-carper#polls", 
    "Florida": "https://www.realclearpolling.com/polls/senate/general/2016/florida/rubio-vs-murphy#polls",
    "Georgia": "https://www.realclearpolling.com/polls/senate/general/2016/georgia/isakson-vs-barksdale#polls",
    "Hawaii": "https://www.realclearpolitics.com/epolls/2016/senate/hi/hawaii_senate_carroll_vs_schatz-5992.html#polls",
    "Idaho": "https://www.realclearpolling.com/polls/senate/general/2016/idaho/crapo-vs-sturgill#polls",
    "Illinois": "https://www.realclearpolling.com/polls/senate/general/2016/illinois/kirk-vs-duckworth#polls",
    "Indiana": "https://www.realclearpolling.com/polls/senate/general/2016/indiana/young-vs-bayh#polls",
    "Iowa": "https://www.realclearpolling.com/polls/senate/general/2016/iowa/grassley-vs-judge#polls",
    "Kansas": "https://www.realclearpolling.com/polls/senate/general/2016/kansas/moran-vs-wiesner#polls",
    "Kentucky": "https://www.realclearpolling.com/polls/senate/general/2016/kentucky/paul-vs-gray#polls",
    "Louisiana": "https://www.realclearpolling.com/polls/senate/open-primary/2016/louisiana#polls",
    #"Maine": "https://www.realclearpolling.com/polls/senate/general/2018/maine/brakey-vs-king#polls",
    #"Maine CD1": "https://www.realclearpolling.com/polls/president/general/2024/maine/trump-vs-biden-cd1#polls",
    #"Maine CD2": "https://www.realclearpolling.com/polls/president/general/2024/maine/trump-vs-biden-cd2#polls",
    "Maryland": "https://www.realclearpolling.com/polls/senate/general/2016/maryland/vanhollen-vs-szeliga#polls",
    #"Massachusetts": "https://www.realclearpolling.com/polls/senate/general/2018/massachusetts/diehl-vs-warren#polls",
    #"Michigan": "https://www.realclearpolling.com/polls/senate/general/2018/michigan/james-vs-stabenow#polls",
    #"Minnesota": "https://www.realclearpolling.com/polls/senate/general/2018/minnesota/newberger-vs-klobuchar#polls",
    #"Minnesota_SPECIAL": "https://www.realclearpolling.com/polls/senate/general/2018/minnesota/housley-vs-smith",
    #"Mississippi": "https://www.realclearpolling.com/polls/senate/general/2018/mississippi/wicker-vs-baria#polls",
    #"Mississippi_RUNOFF":"https://www.realclearpolling.com/polls/senate/general/2018/mississippi-runoff-election",
    "Missouri": "https://www.realclearpolling.com/polls/senate/general/2016/missouri/blunt-vs-kander#polls",
    #"Montana": "https://www.realclearpolling.com/polls/senate/general/2018/montana/rosendale-vs-tester#polls",
    #"Nebraksa": "https://www.realclearpolitics.com/epolls/2018/senate/ne/nebraska_senate_fischer_vs_raybould_-6315.html#polls",
    #"Nebraska CD2": "https://www.realclearpolling.com/polls/president/general/2024/nebraska-cd2/trump-vs-biden#polls",
    "Nevada": "https://www.realclearpolling.com/polls/senate/general/2016/nevada/cortezmasto-vs-heck#polls",
    "New Hampshire": "https://www.realclearpolling.com/polls/senate/general/2016/new-hampshire/ayotte-vs-hassan#polls",
    #"New Jersey": "https://www.realclearpolling.com/polls/senate/general/2018/new-jersey/hugin-vs-menendez#polls",
    #"New Mexico": "https://www.realclearpolling.com/polls/senate/general/2018/new-mexico/rich-vs-heinrich-vs-johnson#polls",
    "New York": "https://www.realclearpolling.com/polls/senate/general/2016/new-york/long-vs-schumer#polls",
    "North Carolina": "https://www.realclearpolling.com/polls/senate/general/2016/north-carolina/burr-vs-ross#polls",
    "North Dakota": "https://www.realclearpolitics.com/epolls/2016/senate/nd/north_dakota_senate_hoeven_vs_glassheim-5996.html#polls",
    "Ohio": "https://www.realclearpolling.com/polls/senate/general/2016/ohio/portman-vs-strickland#polls",
    "Oklahoma": "https://www.realclearpolitics.com/epolls/2016/senate/ok/oklahoma_senate_lankford_vs_workman-5997.html#polls",
    "Oregon": "https://www.realclearpolling.com/polls/senate/general/2016/oregon/wyden-vs-callahan#polls",
    "Pennsylvania": "https://www.realclearpolling.com/polls/senate/general/2016/pennsylvania/toomey-vs-mcginty#polls",
    #"Rhode Island": "https://www.realclearpolling.com/polls/senate/general/2018/rhode-island/flanders-vs-whitehouse#polls",  
    "South Carolina":"https://www.realclearpolling.com/polls/senate/general/2016/south-carolina/scott-vs-dixon#polls",
    "South Dakota": "https://www.realclearpolling.com/polls/senate/general/2016/south-dakota/thune-vs-williams#polls",
    #"Tennessee": "https://www.realclearpolling.com/polls/senate/general/2018/tennessee/blackburn-vs-bredesen#polls",
    #"Texas": "https://www.realclearpolling.com/polls/senate/general/2018/texas/cruz-vs-o'rourke#polls",
    "Utah": "https://www.realclearpolling.com/polls/senate/general/2016/utah/lee-vs-snow#polls",
    "Vermont": "https://www.realclearpolling.com/polls/senate/general/2016/vermont/milne-vs-leahy#polls",
    #"Virginia": "https://www.realclearpolling.com/polls/senate/general/2018/virginia/stewart-vs-kaine#polls",
    "Washington": "https://www.realclearpolling.com/polls/senate/general/2016/washington/vance-vs-murray#polls",
    #"West Virginia": "https://www.realclearpolling.com/polls/senate/general/2018/west-virginia/morrisey-vs-manchin#polls",
    "Wisconsin": "https://www.realclearpolling.com/polls/senate/general/2016/wisconsin/johnson-vs-feingold#polls",
    #"Wyoming": "https://www.realclearpolitics.com/epolls/2018/senate/wy/wyoming_senate_barrasso_vs_trauner-6320.html#polls"
       
}


all_state_dataframe = []

for state, url in state_urls.items():
    time.sleep(120.235)
    state_df = get_senate2016_data(url, state)
    df = pd.DataFrame(state_df)
    if len(df)>0:
        df = df.drop_duplicates().dropna(subset=['pollster']).dropna(subset=['dvalue'])
        all_state_dataframe.append(df)
    else:
        continue
    
all_state_df = pd.concat(all_state_dataframe, ignore_index=True)

senate_2016 = all_state_df

senate_2016 = clean_data(senate_2016, 2016, 'Senate')

Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Failed to retrieve. Status Code was 500
Success
Success
Success
Success
Success
Success
Success
Success


In [10]:
#2018 Generic Congressional Polls

import requests
import re
from bs4 import BeautifulSoup 
import numpy
import pandas as pd
from itertools import zip_longest
url = 'https://www.realclearpolitics.com/epolls/other/2018_generic_congressional_vote-6185.html'

page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )
if page.status_code == 200:
    url_content = page.text
    print("Success")
else:
    print("Failed to retrieve. Status Code was", page.status_code)
    exit()
soup = BeautifulSoup(page.content, "html.parser")
results = soup.find(id="container")

pollster_data = []
date_data =[]
sample_data =[]
dvalue_data =[]
rvalue_data  = []

isinrcpavg = results.find_all("tr", class_="isInRcpAvg")
for poll in isinrcpavg:
    pollster = poll.find('a', class_='normal_pollster_name').text.strip()
    date = poll.find_all('td')[1].text.strip()
    sample = poll.find("td", class_="sample").text.strip()
    td_elements = poll.find_all('td')
    dvalue = td_elements[3].text.strip()
    rvalue = td_elements[4].text.strip()
    
    pollster_data.append(pollster)
    date_data.append(date)
    sample_data.append(sample)
    dvalue_data.append(dvalue)
    rvalue_data.append(rvalue)

alt = results.find_all("tr", class_="alt")
for poll in alt:
    pollster = poll.find('a', class_='normal_pollster_name').text.strip()
    date = poll.find_all('td')[1].text.strip()
    sample = poll.find("td", class_="sample").text.strip()
    td_elements = poll.find_all('td')
    dvalue = td_elements[3].text.strip()
    rvalue = td_elements[4].text.strip()
    
    pollster_data.append(pollster)
    date_data.append(date)
    sample_data.append(sample)
    dvalue_data.append(dvalue)
    rvalue_data.append(rvalue)

blank = results.select("tr[class='']")
for poll in blank:
    pollster = poll.find('a', class_='normal_pollster_name').text.strip()
    date = poll.find_all('td')[1].text.strip()
    sample = poll.find("td", class_="sample").text.strip()
    td_elements = poll.find_all('td')
    dvalue = td_elements[3].text.strip()
    rvalue = td_elements[4].text.strip()
    
    pollster_data.append(pollster)
    date_data.append(date)
    sample_data.append(sample)
    dvalue_data.append(dvalue)
    rvalue_data.append(rvalue)


data_rows = list(zip_longest(pollster_data, date_data, sample_data, dvalue_data, rvalue_data, fillvalue=None))
df = pd.DataFrame(data_rows, columns =['pollster', 'date', 'sampleSize', 'dvalue', 'rvalue'])

df = df.drop_duplicates().sort_values('pollster')

generic_congressional_2018 = df

general_congressional_2018 = clean_data(generic_congressional_2018, 2018, 'Generic Congressional')


Success


In [11]:
#2018 governor polls

import requests
import json
import re
from bs4 import BeautifulSoup 
import numpy
import pandas as pd
from itertools import zip_longest

def get_governor2018_data(url, state):

    page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )

    if page.status_code == 200:
        url_content = page.text
        print("Success")
        
        soup = BeautifulSoup(page.content, "html.parser") 

        script_tags = soup.find_all('script')

        str = ""
        for script in script_tags:
            if script.string and 'finalData' in script.string:
                str += script.string

        x = str.replace("\\","")     

        #str2 = str.split('self.__next_f.push(')
        #str3 = str2[1][:-1]
        #jsonx = json.loads(str3)
        #json_str = jsonx[1] 
        #print(json_str)
        
        pollster_pattern = r'"pollster":\s*"([^"]*)"'
        date_pattern = r'"date":\s*"([^"]*)"'
        sample_size_pattern = r'"sampleSize":\s*"([^"]*)"'
        margin_error_pattern = r'"marginError":\s*"([^"]*)"'
        
        link_pattern = r'"link":\s*"([^"]*)"'

        dvalue_pattern1 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'
        dvalue_pattern2 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'

        rvalue_pattern1 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'
        rvalue_pattern2 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'


        dvalue_data = re.findall(dvalue_pattern1, x) or re.findall(dvalue_pattern2, x)
        rvalue_data = re.findall(rvalue_pattern1, x) or re.findall(rvalue_pattern2, x)

        pollster_data = re.findall(pollster_pattern, x)
        date_data = re.findall(date_pattern, x)
        sample_size_data = re.findall(sample_size_pattern, x)
        margin_error_data = re.findall(margin_error_pattern, x)
        
        link_data = re.findall(link_pattern, x)

        data_rows = []
        for row in zip_longest(pollster_data, date_data, sample_size_data, margin_error_data, dvalue_data, rvalue_data, link_data, fillvalue=None):
            data_rows.append({
                
                "pollster": row[0],
                "date": row[1],
                "sampleSize": row[2],
                "marginError": row[3],
                "dvalue": row[4],
                "rvalue": row[5],
                "state": state
                #"race": race_value
                
            })
        
        return data_rows
        
    else:
        print("Failed to retrieve. Status Code was", page.status_code)
        return[]
        

state_urls = {
                
    "Alabama": "https://www.realclearpolitics.com/epolls/2018/governor/al/alabama_governor_ivey_vs_maddox-6405.html#polls",
    "Alaska": "https://www.realclearpolling.com/polls/governor/general/2018/alaska/dunleavy-vs-begich#polls",
    "Arizona": "https://www.realclearpolling.com/polls/governor/general/2018/arizona/ducey-vs-garcia#polls",
    "Arkansas": "https://www.realclearpolling.com/polls/governor/general/2018/arkansas/hutchinson-vs-henderson#polls",
    "California": "https://www.realclearpolling.com/polls/governor/general/2018/california/cox-vs-newsom#polls",
    "Colorado": "https://www.realclearpolling.com/polls/governor/general/2018/colorado/stapleton-vs-polis#polls",
    "Connecticut": "https://www.realclearpolling.com/polls/governor/general/2018/connecticut/stefanowski-vs-lamont#polls",
    #"Delaware": "https://www.realclearpolling.com/polls/governor/general/2020/delaware/murray-vs-carney#polls", 
    "Florida": "https://www.realclearpolling.com/polls/governor/general/2018/florida/desantis-vs-gillum#polls",
    "Georgia": "https://www.realclearpolling.com/polls/governor/general/2018/georgia/kemp-vs-abrams#polls",
    "Hawaii": "https://www.realclearpolling.com/polls/governor/general/2018/hawaii/tupola-vs-ige#polls",
    "Idaho": "https://www.realclearpolitics.com/epolls/2018/governor/id/idaho_governor_little_vs_jordan-6413.html#polls",
    "Illinois": "https://www.realclearpolling.com/polls/governor/general/2018/illinois/rauner-vs-pritzker#polls",
    #"Indiana": "https://www.realclearpolling.com/polls/governor/general/2020/indiana/holcomb-vs-myers-#polls",
    "Iowa": "https://www.realclearpolling.com/polls/governor/general/2018/iowa/reynolds-vs-hubbell#polls",
    "Kansas": "https://www.realclearpolling.com/polls/governor/general/2018/kansas/kobach-vs-kelly-vs-orman#polls",
    #"Kentucky": "https://www.realclearpolling.com/polls/senate/general/2014/kentucky/mcconnell-vs-grimes#polls",
    #"Louisiana": "https://www.realclearpolitics.com/epolls/2014/senate/louisiana_senate_race.html#polls",
    "Maine": "https://www.realclearpolling.com/polls/governor/general/2018/maine/moody-vs-mills#polls",
    "Maryland": "https://www.realclearpolling.com/polls/governor/general/2018/maryland/hogan-vs-jealous#polls",
    "Massachusetts": "https://www.realclearpolling.com/polls/governor/general/2018/massachusetts/baker-vs-gonzalez#polls",
    "Michigan": "https://www.realclearpolling.com/polls/governor/general/2018/michigan/schuette-vs-whitmer#polls",
    "Minnesota": "https://www.realclearpolling.com/polls/governor/general/2018/minnesota/johnson-vs-walz#polls",
    #"Mississippi": "https://www.realclearpolling.com/polls/senate/general/2014/mississippi/cochran-vs-childers#polls",
    #"Missouri": "https://www.realclearpolling.com/polls/governor/general/2020/missouri/parson-vs-galloway#polls",
    #"Montana": "https://www.realclearpolling.com/polls/governor/general/2020/montana/gianforte-vs-cooney#polls",
    "Nebraksa": "https://www.realclearpolitics.com/epolls/2018/governor/ne/nebraska_governor_ricketts_vs_krist-6421.html#polls",
    "Nevada": "https://www.realclearpolling.com/polls/governor/general/2018/nevada/laxalt-vs-sisolak#polls",
    "New Hampshire": "https://www.realclearpolling.com/polls/governor/general/2018/new-hampshire/sununu-vs-kelly#polls",
    #"New Jersey": "https://www.realclearpolling.com/polls/senate/general/2014/new-jersey/bell-vs-booker#polls",
    "New Mexico": "https://www.realclearpolling.com/polls/governor/general/2018/new-mexico/pearce-vs-grisham#polls",
    "New York": "https://www.realclearpolling.com/polls/governor/general/2018/new-york/molinaro-vs-cuomo#polls",
    #"North Carolina": "https://www.realclearpolling.com/polls/governor/general/2020/north-carolina/forest-vs-cooper#polls",
    #"North Dakota": "https://www.realclearpolitics.com/epolls/2020/governor/nd/north_dakota_governor_burgum_vs_lenz-7200.html#polls",
    "Ohio": "https://www.realclearpolling.com/polls/governor/general/2018/ohio/dewine-vs-cordray#polls",
    "Oklahoma": "https://www.realclearpolling.com/polls/governor/general/2018/oklahoma/stitt-vs-edmondson#polls",
    "Oregon": "https://www.realclearpolling.com/polls/governor/general/2018/oregon/buehler-vs-brown#polls",
    "Pennsylvania": "https://www.realclearpolling.com/polls/governor/general/2018/pennsylvania/wagner-vs-wolf#polls",
    "Rhode Island": "https://www.realclearpolling.com/polls/governor/general/2018/rhode-island/fung-vs-raimondo#polls",  
    "South Carolina":"https://www.realclearpolling.com/polls/governor/general/2018/south-carolina/mcmaster-vs-smith#polls",
    "South Dakota": "https://www.realclearpolling.com/polls/governor/general/2018/south-dakota/noem-vs-sutton#polls",
    "Tennessee": "https://www.realclearpolling.com/polls/governor/general/2018/tennessee/lee-vs-dean#polls",
    "Texas": "https://www.realclearpolling.com/polls/governor/general/2018/texas/abbott-vs-valdez#polls",
    #"Utah": "https://www.realclearpolling.com/polls/governor/general/2020/utah/cox-vs-peterson#polls",
    "Vermont": "https://www.realclearpolling.com/polls/governor/general/2018/vermont/scott-vs-hallquist#polls",
    #"Virginia": "https://www.realclearpolling.com/polls/senate/general/2014/virginia/gillespie-vs-warner#polls",
    #"Washington": "https://www.realclearpolling.com/polls/governor/general/2020/washington/culp-vs-inslee#polls",
    #"West Virginia": "https://www.realclearpolling.com/polls/governor/general/2020/west-virginia/justice-vs-salango#polls",
    "Wisconsin": "https://www.realclearpolling.com/polls/governor/general/2018/wisconsin/walker-vs-evers-vs-anderson#polls",
    "Wyoming": "https://www.realclearpolitics.com/epolls/2018/governor/wy/wyoming_governor_gordon_vs_throne-6666.html#polls"
       
}


all_state_dataframe = []

for state, url in state_urls.items():
    time.sleep(120.3153125)
    state_df = get_governor2018_data(url, state)
    df = pd.DataFrame(state_df)
    if len(df)>0:
        df = df.drop_duplicates().dropna(subset=['pollster']).dropna(subset=['dvalue'])
        all_state_dataframe.append(df)
    else:
        continue
    
all_state_df = pd.concat(all_state_dataframe, ignore_index=True)

governor_2018 = all_state_df

governor_2018 = clean_data(governor_2018, 2018, 'Governor')



Failed to retrieve. Status Code was 500
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Failed to retrieve. Status Code was 500
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success


In [12]:
#2018 Senate
import requests
import json
import re
from bs4 import BeautifulSoup 
import numpy
import pandas as pd
from itertools import zip_longest

def get_senate2020_data(url, state):

    page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )

    if page.status_code == 200:
        url_content = page.text
        print("Success")
        
        soup = BeautifulSoup(page.content, "html.parser") 

        script_tags = soup.find_all('script')

        str = ""
        for script in script_tags:
            if script.string and 'finalData' in script.string:
                str += script.string

        x = str.replace("\\","")     

        #str2 = str.split('self.__next_f.push(')
        #str3 = str2[1][:-1]
        #jsonx = json.loads(str3)
        #json_str = jsonx[1] 
        #print(json_str)
        
        pollster_pattern = r'"pollster":\s*"([^"]*)"'
        date_pattern = r'"date":\s*"([^"]*)"'
        sample_size_pattern = r'"sampleSize":\s*"([^"]*)"'
        margin_error_pattern = r'"marginError":\s*"([^"]*)"'
        
        link_pattern = r'"link":\s*"([^"]*)"'

        dvalue_pattern1 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'
        dvalue_pattern2 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'

        rvalue_pattern1 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'
        rvalue_pattern2 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'


        dvalue_data = re.findall(dvalue_pattern1, x) or re.findall(dvalue_pattern2, x)
        rvalue_data = re.findall(rvalue_pattern1, x) or re.findall(rvalue_pattern2, x)

        pollster_data = re.findall(pollster_pattern, x)
        date_data = re.findall(date_pattern, x)
        sample_size_data = re.findall(sample_size_pattern, x)
        margin_error_data = re.findall(margin_error_pattern, x)
        
        link_data = re.findall(link_pattern, x)

        data_rows = []
        for row in zip_longest(pollster_data, date_data, sample_size_data, margin_error_data, dvalue_data, rvalue_data, link_data, fillvalue=None):
            data_rows.append({
                
                "pollster": row[0],
                "date": row[1],
                "sampleSize": row[2],
                "marginError": row[3],
                "dvalue": row[4],
                "rvalue": row[5],
                "state": state
                #"race": race_value
                
            })
        
        return data_rows
        
    else:
        print("Failed to retrieve. Status Code was", page.status_code)
        return[]
        

state_urls = {
                
    #"Alabama": "https://www.realclearpolling.com/polls/senate/general/2020/alabama/tuberville-vs-jones#polls",
    #"Alaska": "https://www.realclearpolling.com/polls/senate/general/2020/alaska/sullivan-vs-gross#polls",
    "Arizona": "https://www.realclearpolling.com/polls/senate/general/2018/arizona/mcsally-vs-sinema#polls",
    #"Arkansas": "https://www.realclearpolling.com/polls/senate/general/2020/arkansas/cotton-vs-harrington#polls",
    "California": "https://www.realclearpolling.com/polls/senate/general/2018/california/feinstein-vs-leon#polls",
    #"Colorado": "https://www.realclearpolling.com/polls/senate/general/2020/colorado/gardner-vs-hickenlooper#polls",
    "Connecticut": "https://www.realclearpolling.com/polls/senate/general/2018/connecticut/corey-vs-murphy#polls",
    "Delaware": "https://www.realclearpolling.com/polls/senate/general/2018/delaware/arlett-vs-carper#polls", 
    "Florida": "https://www.realclearpolling.com/polls/senate/general/2018/florida/scott-vs-nelson#polls",
    #"Georgia": "https://www.realclearpolling.com/polls/senate/general/2020/georgia/perdue-vs-ossoff#polls",
    "Hawaii": "https://www.realclearpolitics.com/epolls/2018/senate/hi/hawaii_senate_curtis_vs_hirono-6264.html#polls",
    #"Idaho": "https://www.realclearpolitics.com/epolls/2020/senate/id/idaho_senate_risch_vs_jordan-7070.html#polls",
    #"Illinois": "https://www.realclearpolitics.com/epolls/2020/senate/il/illinois_senate_curran_vs_durbin-7071.html#polls",
    "Indiana": "https://www.realclearpolling.com/polls/senate/general/2018/indiana/braun-vs-donnelly-vs-brenton#polls",
    #"Iowa": "https://www.realclearpolling.com/polls/senate/general/2020/iowa/ernst-vs-greenfield#polls",
    #"Kansas": "https://www.realclearpolling.com/polls/senate/general/2020/kansas/marshall-vs-bollier#polls",
    #"Kentucky": "https://www.realclearpolling.com/polls/senate/general/2020/kentucky/mcconnell-vs-mcgrath#polls",
    #"Louisiana": "https://www.realclearpolitics.com/epolls/2020/senate/la/louisiana_senate_open_primary-7074.html#polls",
    "Maine": "https://www.realclearpolling.com/polls/senate/general/2018/maine/brakey-vs-king#polls",
    #"Maine CD1": "https://www.realclearpolling.com/polls/president/general/2024/maine/trump-vs-biden-cd1#polls",
    #"Maine CD2": "https://www.realclearpolling.com/polls/president/general/2024/maine/trump-vs-biden-cd2#polls",
    "Maryland": "https://www.realclearpolling.com/polls/senate/general/2018/maryland/campbell-vs-cardin-vs-simon#polls",
    "Massachusetts": "https://www.realclearpolling.com/polls/senate/general/2018/massachusetts/diehl-vs-warren#polls",
    "Michigan": "https://www.realclearpolling.com/polls/senate/general/2018/michigan/james-vs-stabenow#polls",
    "Minnesota": "https://www.realclearpolling.com/polls/senate/general/2018/minnesota/newberger-vs-klobuchar#polls",
    "Minnesota_SPECIAL": "https://www.realclearpolling.com/polls/senate/general/2018/minnesota/housley-vs-smith",
    "Mississippi": "https://www.realclearpolling.com/polls/senate/general/2018/mississippi/wicker-vs-baria#polls",
    "Mississippi_RUNOFF":"https://www.realclearpolling.com/polls/senate/general/2018/mississippi-runoff-election",
    "Missouri": "https://www.realclearpolling.com/polls/senate/general/2018/missouri/hawley-vs-mccaskill#polls",
    "Montana": "https://www.realclearpolling.com/polls/senate/general/2018/montana/rosendale-vs-tester#polls",
    "Nebraksa": "https://www.realclearpolitics.com/epolls/2018/senate/ne/nebraska_senate_fischer_vs_raybould_-6315.html#polls",
    #"Nebraska CD2": "https://www.realclearpolling.com/polls/president/general/2024/nebraska-cd2/trump-vs-biden#polls",
    "Nevada": "https://www.realclearpolling.com/polls/senate/general/2018/nevada/heller-vs-rosen#polls",
    #"New Hampshire": "https://www.realclearpolling.com/polls/senate/general/2020/new-hampshire/messner-vs-shaheen#polls",
    "New Jersey": "https://www.realclearpolling.com/polls/senate/general/2018/new-jersey/hugin-vs-menendez#polls",
    "New Mexico": "https://www.realclearpolling.com/polls/senate/general/2018/new-mexico/rich-vs-heinrich-vs-johnson#polls",
    "New York": "https://www.realclearpolling.com/polls/senate/general/2018/new-york/farley-vs-gillibrand#polls",
    #"North Carolina": "https://www.realclearpolling.com/polls/senate/general/2020/north-carolina/tillis-vs-cunningham#polls",
    "North Dakota": "https://www.realclearpolling.com/polls/senate/general/2018/north-dakota/cramer-vs-heitkamp#polls",
    "Ohio": "https://www.realclearpolling.com/polls/senate/general/2018/ohio/renacci-vs-brown#polls",
    #"Oklahoma": "https://www.realclearpolling.com/polls/senate/general/2020/oklahoma/inhofe-vs-broyles#polls",
    #"Oregon": "https://www.realclearpolitics.com/epolls/2020/senate/or/oregon_senate_perkins_vs_merkley-7081.html#polls",
    "Pennsylvania": "https://www.realclearpolling.com/polls/senate/general/2018/pennsylvania/barletta-vs-casey#polls",
    "Rhode Island": "https://www.realclearpolling.com/polls/senate/general/2018/rhode-island/flanders-vs-whitehouse#polls",  
    #"South Carolina":"https://www.realclearpolling.com/polls/senate/general/2020/south-carolina/graham-vs-harrison#polls",
    #"South Dakota": "https://www.realclearpolling.com/polls/senate/general/2020/south-dakota/rounds-vs-ahlers#polls",
    "Tennessee": "https://www.realclearpolling.com/polls/senate/general/2018/tennessee/blackburn-vs-bredesen#polls",
    "Texas": "https://www.realclearpolling.com/polls/senate/general/2018/texas/cruz-vs-o'rourke#polls",
    "Utah": "https://www.realclearpolling.com/polls/senate/general/2018/utah/romney-vs-wilson#polls",
    "Vermont": "https://www.realclearpolling.com/polls/senate/general/2018/vermont/zupan-vs-sanders#polls",
    "Virginia": "https://www.realclearpolling.com/polls/senate/general/2018/virginia/stewart-vs-kaine#polls",
    "Washington": "https://www.realclearpolling.com/polls/senate/general/2018/washington/hutchison-vs-cantwell#polls",
    "West Virginia": "https://www.realclearpolling.com/polls/senate/general/2018/west-virginia/morrisey-vs-manchin#polls",
    "Wisconsin": "https://www.realclearpolling.com/polls/senate/general/2018/wisconsin/vukmir-vs-baldwin#polls",
    "Wyoming": "https://www.realclearpolitics.com/epolls/2018/senate/wy/wyoming_senate_barrasso_vs_trauner-6320.html#polls"
       
}


all_state_dataframe = []

for state, url in state_urls.items():
    time.sleep(120.235)
    state_df = get_senate2020_data(url, state)
    df = pd.DataFrame(state_df)
    if len(df)>0:
        df = df.drop_duplicates().dropna(subset=['pollster']).dropna(subset=['dvalue'])
        all_state_dataframe.append(df)
    else:
        continue
    
all_state_df = pd.concat(all_state_dataframe, ignore_index=True)

senate_2018 = all_state_df

senate_2018 = clean_data(senate_2018, 2018, 'Senate')

Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Failed to retrieve. Status Code was 500
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success


In [13]:
#2020 Generic Congressional Polls

import requests
import re
from bs4 import BeautifulSoup 
import numpy
import pandas as pd
from itertools import zip_longest
url = 'https://www.realclearpolitics.com/epolls/other/2020_generic_congressional_vote-6722.html'

page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )
if page.status_code == 200:
    url_content = page.text
    print("Success")
else:
    print("Failed to retrieve. Status Code was", page.status_code)
    exit()
soup = BeautifulSoup(page.content, "html.parser")
results = soup.find(id="container")


#rcpAvg = results.find("tr", class_="rcpAvg2")
#print(rcpAvg.text.strip())

pollster_data = []
date_data =[]
sample_data =[]
dvalue_data =[]
rvalue_data  = []

isinrcpavg = results.find_all("tr", class_="isInRcpAvg")
for poll in isinrcpavg:
    pollster = poll.find('a', class_='normal_pollster_name').text.strip()
    date = poll.find_all('td')[1].text.strip()
    sample = poll.find("td", class_="sample").text.strip()
    td_elements = poll.find_all('td')
    dvalue = td_elements[3].text.strip()
    rvalue = td_elements[4].text.strip()
    
    pollster_data.append(pollster)
    date_data.append(date)
    sample_data.append(sample)
    dvalue_data.append(dvalue)
    rvalue_data.append(rvalue)

alt = results.find_all("tr", class_="alt")
for poll in alt:
    pollster = poll.find('a', class_='normal_pollster_name').text.strip()
    date = poll.find_all('td')[1].text.strip()
    sample = poll.find("td", class_="sample").text.strip()
    td_elements = poll.find_all('td')
    dvalue = td_elements[3].text.strip()
    rvalue = td_elements[4].text.strip()
    
    pollster_data.append(pollster)
    date_data.append(date)
    sample_data.append(sample)
    dvalue_data.append(dvalue)
    rvalue_data.append(rvalue)

blank = results.select("tr[class='']")
for poll in blank:
    pollster = poll.find('a', class_='normal_pollster_name').text.strip()
    date = poll.find_all('td')[1].text.strip()
    sample = poll.find("td", class_="sample").text.strip()
    td_elements = poll.find_all('td')
    dvalue = td_elements[3].text.strip()
    rvalue = td_elements[4].text.strip()
    
    pollster_data.append(pollster)
    date_data.append(date)
    sample_data.append(sample)
    dvalue_data.append(dvalue)
    rvalue_data.append(rvalue)


data_rows = list(zip_longest(pollster_data, date_data, sample_data, dvalue_data, rvalue_data, fillvalue=None))
df = pd.DataFrame(data_rows, columns =['pollster', 'date', 'sampleSize', 'dvalue', 'rvalue'])

df = df.drop_duplicates().sort_values('pollster')

generic_congressional_2020 = df

general_congressional_2020 = clean_data(generic_congressional_2020, 2020, 'Generic Congressional')

Success


In [14]:
#2020 Governor polls

import requests
import json
import re
from bs4 import BeautifulSoup 
import numpy
import pandas as pd
from itertools import zip_longest

def get_governor2020_data(url, state):

    page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )

    if page.status_code == 200:
        url_content = page.text
        print("Success")
        
        soup = BeautifulSoup(page.content, "html.parser") 

        script_tags = soup.find_all('script')

        str = ""
        for script in script_tags:
            if script.string and 'finalData' in script.string:
                str += script.string

        x = str.replace("\\","")     

        #str2 = str.split('self.__next_f.push(')
        #str3 = str2[1][:-1]
        #jsonx = json.loads(str3)
        #json_str = jsonx[1] 
        #print(json_str)
        
        pollster_pattern = r'"pollster":\s*"([^"]*)"'
        date_pattern = r'"date":\s*"([^"]*)"'
        sample_size_pattern = r'"sampleSize":\s*"([^"]*)"'
        margin_error_pattern = r'"marginError":\s*"([^"]*)"'
        
        link_pattern = r'"link":\s*"([^"]*)"'

        dvalue_pattern1 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'
        dvalue_pattern2 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'

        rvalue_pattern1 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'
        rvalue_pattern2 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'


        dvalue_data = re.findall(dvalue_pattern1, x) or re.findall(dvalue_pattern2, x)
        rvalue_data = re.findall(rvalue_pattern1, x) or re.findall(rvalue_pattern2, x)

        pollster_data = re.findall(pollster_pattern, x)
        date_data = re.findall(date_pattern, x)
        sample_size_data = re.findall(sample_size_pattern, x)
        margin_error_data = re.findall(margin_error_pattern, x)
        
        link_data = re.findall(link_pattern, x)

        data_rows = []
        for row in zip_longest(pollster_data, date_data, sample_size_data, margin_error_data, dvalue_data, rvalue_data, link_data, fillvalue=None):
            data_rows.append({
                
                "pollster": row[0],
                "date": row[1],
                "sampleSize": row[2],
                "marginError": row[3],
                "dvalue": row[4],
                "rvalue": row[5],
                "state": state
                #"race": race_value
                
            })
        
        return data_rows
        
    else:
        print("Failed to retrieve. Status Code was", page.status_code)
        return[]
        

state_urls = {
                
   # "Alabama": "https://www.realclearpolling.com/polls/governor/general/2022/alabama/ivey-vs-flowers#polls",
    #"Alaska": "https://www.realclearpolling.com/polls/governor/general/2022/alaska/dunleavy-vs-gara-final-round#polls",
    #"Arizona": "https://www.realclearpolling.com/polls/governor/general/2022/arizona/lake-vs-hobbs#polls",
    #"Arkansas": "https://www.realclearpolling.com/polls/governor/general/2022/arkansas/huckabeesanders-vs-jones#polls",
    #"California": "https://www.realclearpolling.com/polls/governor/general/2022/california/dahle-vs-newsom#polls",
    #"Colorado": "https://www.realclearpolling.com/polls/governor/general/2022/colorado/ganahl-vs-polis#polls",
    #"Connecticut": "https://www.realclearpolling.com/polls/governor/general/2022/connecticut/stefanowski-vs-lamont#polls",
    "Delaware": "https://www.realclearpolling.com/polls/governor/general/2020/delaware/murray-vs-carney#polls", 
    #"Florida": "https://www.realclearpolling.com/polls/governor/general/2022/florida/desantis-vs-crist#polls",
    #"Georgia": "https://www.realclearpolling.com/polls/governor/general/2022/georgia/kemp-vs-abrams#polls",
    #"Hawaii": "https://www.realclearpolitics.com/epolls/2022/governor/hi/hawaii_governor_aiona_vs_green-7928.html#polls",
    #"Idaho": "https://www.realclearpolitics.com/epolls/2022/governor/id/idaho_governor_little_vs_heidt-7743.html#polls",
    #"Illinois": "https://www.realclearpolling.com/polls/governor/general/2022/illinois/bailey-vs-pritzker#polls",
    "Indiana": "https://www.realclearpolling.com/polls/governor/general/2020/indiana/holcomb-vs-myers-#polls",
    #"Iowa": "https://www.realclearpolling.com/polls/governor/general/2022/iowa/reynolds-vs-dejear#polls",
    #"Kansas": "https://www.realclearpolling.com/polls/governor/general/2022/kansas/kelly-vs-schmidt#polls",
    #"Kentucky": "https://www.realclearpolling.com/polls/senate/general/2014/kentucky/mcconnell-vs-grimes#polls",
    #"Louisiana": "https://www.realclearpolitics.com/epolls/2014/senate/louisiana_senate_race.html#polls",
    #"Maine": "https://www.realclearpolling.com/polls/governor/general/2022/maine/lepage-vs-mills#polls",
    #"Maryland": "https://www.realclearpolling.com/polls/governor/general/2022/massachusetts/diehl-vs-healey#polls",
    #"Massachusetts": "https://www.realclearpolling.com/polls/governor/general/2022/massachusetts/diehl-vs-healey#polls",
    #"Michigan": "https://www.realclearpolling.com/polls/governor/general/2022/michigan/dixon-vs-whitmer#polls",
    #"Minnesota": "https://www.realclearpolling.com/polls/governor/general/2022/minnesota/jensen-vs-walz#polls",
    #"Mississippi": "https://www.realclearpolling.com/polls/senate/general/2014/mississippi/cochran-vs-childers#polls",
    "Missouri": "https://www.realclearpolling.com/polls/governor/general/2020/missouri/parson-vs-galloway#polls",
    "Montana": "https://www.realclearpolling.com/polls/governor/general/2020/montana/gianforte-vs-cooney#polls",
    #"Nebraksa": "https://www.realclearpolitics.com/epolls/2022/governor/ne/nebraska_governor_pillen_vs_blood-7897.html#polls",
    #"Nebraska CD2": "https://www.realclearpolling.com/polls/president/general/2024/nebraska-cd2/trump-vs-biden#polls",
    #"Nevada": "https://www.realclearpolling.com/polls/governor/general/2022/nevada/lombardo-vs-sisolak#polls",
    "New Hampshire": "https://www.realclearpolling.com/polls/governor/general/2020/new-hampshire/sununu-vs-feltes#polls",
    #"New Jersey": "https://www.realclearpolling.com/polls/senate/general/2014/new-jersey/bell-vs-booker#polls",
    #"New Mexico": "https://www.realclearpolling.com/polls/governor/general/2022/new-mexico/ronchetti-vs-grisham#polls",
    #"New York": "https://www.realclearpolling.com/polls/governor/general/2022/new-york/zeldin-vs-hochul#polls",
    "North Carolina": "https://www.realclearpolling.com/polls/governor/general/2020/north-carolina/forest-vs-cooper#polls",
    "North Dakota": "https://www.realclearpolitics.com/epolls/2020/governor/nd/north_dakota_governor_burgum_vs_lenz-7200.html#polls",
    #"Ohio": "https://www.realclearpolling.com/polls/governor/general/2022/ohio/dewine-vs-whaley#polls",
    #"Oklahoma": "https://www.realclearpolling.com/polls/governor/general/2022/oklahoma/stitt-vs-hofmeister#polls",
    #"Oregon": "https://www.realclearpolling.com/polls/governor/general/2022/oregon/drazan-vs-kotek-vs-johnson#polls",
    #"Pennsylvania": "https://www.realclearpolling.com/polls/governor/general/2022/pennsylvania/mastriano-vs-shapiro#polls",
    #"Rhode Island": "https://www.realclearpolling.com/polls/governor/general/2022/rhode-island/kalus-vs-mckee#polls",  
    #"South Carolina":"https://www.realclearpolling.com/polls/governor/general/2022/south-carolina/mcmaster-vs-cunningham#polls",
    #"South Dakota": "https://www.realclearpolling.com/polls/governor/general/2022/south-dakota/noem-vs-smith#polls",
    #"Tennessee": "https://www.realclearpolitics.com/epolls/2022/governor/tn/tennessee_governor_lee_vs_martin-7925.html#polls",
    #"Texas": "https://www.realclearpolling.com/polls/governor/general/2022/texas/abbott-vs-o'rourke#polls",
    "Utah": "https://www.realclearpolling.com/polls/governor/general/2020/utah/cox-vs-peterson#polls",
    "Vermont": "https://www.realclearpolling.com/polls/governor/general/2020/vermont/scott-vs-zuckerman#polls",
    #"Virginia": "https://www.realclearpolling.com/polls/senate/general/2014/virginia/gillespie-vs-warner#polls",
    "Washington": "https://www.realclearpolling.com/polls/governor/general/2020/washington/culp-vs-inslee#polls",
    "West Virginia": "https://www.realclearpolling.com/polls/governor/general/2020/west-virginia/justice-vs-salango#polls",
    #"Wisconsin": "https://www.realclearpolling.com/polls/governor/general/2022/wisconsin/michels-vs-evers#polls",
    #"Wyoming": "https://www.realclearpolitics.com/epolls/2022/governor/wy/wyoming_governor_gordon_vs_livingston-7904.html#polls"
       
}


all_state_dataframe = []

for state, url in state_urls.items():
    time.sleep(120.51325)
    state_df = get_governor2020_data(url, state)
    df = pd.DataFrame(state_df)
    if len(df)>0:
        df = df.drop_duplicates().dropna(subset=['pollster']).dropna(subset=['dvalue'])
        all_state_dataframe.append(df)
    else:
        continue
    
all_state_df = pd.concat(all_state_dataframe, ignore_index=True)

governor_2020 = all_state_df

governor_2020 = clean_data(governor_2020, 2020, 'Governor')

Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success


In [15]:
#2020 National Presidential Polls

import requests
import re
from bs4 import BeautifulSoup 
import numpy
import pandas as pd
from itertools import zip_longest

url = 'https://www.realclearpolling.com/polls/president/general/2020/trump-vs-biden'

def get_national2020_data(url):
    import json
    page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )

    #check and return code
    if page.status_code == 200:
        url_content = page.text
        print("Success")
    else:
        print("Failed to retrieve. Status Code was", page.status_code)
        exit()

    soup = BeautifulSoup(page.content, "html.parser") 

    script_tags = soup.find_all('script')


    for script in script_tags:
        if script.string and 'finalData' in script.string:
            
            str = script.string
            break

    str2 = str.split('self.__next_f.push(')
    str3 = str2[1][:-1]
    json = json.loads(str3)

    json_str = json[1] 

    #search pattern index
    pollster_pattern = r'"pollster":\s*"([^"]*)"'
    date_pattern = r'"date":\s*"([^"]*)"'
    sample_size_pattern = r'"sampleSize":\s*"([^"]*)"'
    margin_error_pattern = r'"marginError":\s*"([^"]*)"'
    link_pattern = r'"link":\s*"([^"]*)"'

    pollster_data = re.findall(pollster_pattern, json_str)
    date_data = re.findall(date_pattern, json_str)
    sample_size_data = re.findall(sample_size_pattern, json_str)
    margin_error_data = re.findall(margin_error_pattern, json_str)
    link_data = re.findall(link_pattern, json_str)

    dvalue_pattern1 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'
    dvalue_pattern2 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'

    rvalue_pattern1 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'
    rvalue_pattern2 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'

    dvalue_data = re.findall(dvalue_pattern1, json_str) or re.findall(dvalue_pattern2, json_str)
    rvalue_data = re.findall(rvalue_pattern1, json_str) or re.findall(rvalue_pattern2, json_str)

    data_rows = []
    for row in zip_longest(pollster_data, date_data, sample_size_data, margin_error_data, dvalue_data, rvalue_data, link_data, fillvalue=None):
        data_rows.append({
            "pollster": row[0],
            "date": row[1],
            "sampleSize": row[2],
            "marginError": row[3],
            "dvalue": row[4],
            "rvalue": row[5],


        })

    df = pd.DataFrame(data_rows)

    df = df.drop_duplicates().dropna(subset=['pollster'])

    return df

df_national = get_national2020_data(url)

national_presidential_2020 = df_national

presidential_2020 = clean_data(national_presidential_2020, 2020, 'National Presidential')

Success


In [17]:
#2020 Presidential Polls by State

import requests
import json
import re
from bs4 import BeautifulSoup 
import numpy
import pandas as pd
from itertools import zip_longest

def get_2020state_data(url, state):

    page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )

    if page.status_code == 200:
        url_content = page.text
        print("Success")
        
        soup = BeautifulSoup(page.content, "html.parser") 

        script_tags = soup.find_all('script')

        for script in script_tags:
            if script.string and 'finalData' in script.string:
                str = script.string
                break

        str2 = str.split('self.__next_f.push(')
        str3 = str2[1][:-1]
        jsonx = json.loads(str3)
        json_str = jsonx[1] 

        pollster_pattern = r'"pollster":\s*"([^"]*)"'
        date_pattern = r'"date":\s*"([^"]*)"'
        sample_size_pattern = r'"sampleSize":\s*"([^"]*)"'
        margin_error_pattern = r'"marginError":\s*"([^"]*)"'
        
        link_pattern = r'"link":\s*"([^"]*)"'

        dvalue_pattern1 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'
        dvalue_pattern2 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'

        rvalue_pattern1 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'
        rvalue_pattern2 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'

        dvalue_data = re.findall(dvalue_pattern1, json_str) or re.findall(dvalue_pattern2, json_str)
        rvalue_data = re.findall(rvalue_pattern1, json_str) or re.findall(rvalue_pattern2, json_str)

        pollster_data = re.findall(pollster_pattern, json_str)
        date_data = re.findall(date_pattern, json_str)
        sample_size_data = re.findall(sample_size_pattern, json_str)
        margin_error_data = re.findall(margin_error_pattern, json_str)
        
        link_data = re.findall(link_pattern, json_str)

        data_rows = []
        for row in zip_longest(pollster_data, date_data, sample_size_data, margin_error_data, dvalue_data, rvalue_data, link_data, fillvalue=None):
            data_rows.append({
                
                "pollster": row[0],
                "date": row[1],
                "sampleSize": row[2],
                "marginError": row[3],
                "dvalue": row[4],
                "rvalue": row[5],
                "state": state,
                
            })
        
        return data_rows
    else:
        print("Failed to retrieve. Status Code was", page.status_code)
        return[]


state_urls = {
    
    "Alabama": "https://www.realclearpolling.com/polls/president/general/2020/alabama/trump-vs-biden#polls",
    "Alaska": "https://www.realclearpolling.com/polls/president/general/2020/alaska/trump-vs-biden#polls",
    "Arizona": "https://www.realclearpolling.com/polls/president/general/2020/arizona/trump-vs-biden#polls",
    "Arkansas": "https://www.realclearpolling.com/polls/president/general/2020/arkansas/trump-vs-biden#polls",
    "California": "https://www.realclearpolling.com/polls/president/general/2020/california/trump-vs-biden#polls",
    "Colorado": "https://www.realclearpolling.com/polls/president/general/2020/colorado/trump-vs-biden#polls",
    "Connecticut": "https://www.realclearpolling.com/polls/president/general/2020/connecticut/trump-vs-biden#polls",
    "Delaware": "https://www.realclearpolling.com/polls/president/general/2020/delaware/trump-vs-biden#polls",
    "Florida": "https://www.realclearpolling.com/polls/president/general/2020/florida/trump-vs-biden#polls",
    "Georgia": "https://www.realclearpolling.com/polls/president/general/2020/georgia/trump-vs-biden#polls",
    "Hawaii": "https://www.realclearpolling.com/polls/president/general/2020/hawaii/trump-vs-biden#polls",
    #"Idaho": "",  # Missing URL
    #"Illinois": "", #Missing URL
    "Indiana": "https://www.realclearpolling.com/polls/president/general/2020/indiana/trump-vs-biden#polls",
    "Iowa": "https://www.realclearpolling.com/polls/president/general/2020/iowa/trump-vs-biden#polls",
    "Kansas": "https://www.realclearpolling.com/polls/president/general/2020/kansas/trump-vs-biden#polls",
    "Kentucky": "https://www.realclearpolling.com/polls/president/general/2020/kentucky/trump-vs-biden#polls",
    "Louisiana": "https://www.realclearpolling.com/polls/president/general/2020/louisiana/trump-vs-biden#polls",
    "Maine": "https://www.realclearpolling.com/polls/president/general/maine/2020/trump-vs-biden#polls",
    "Maine CD1": "https://www.realclearpolling.com/polls/president/general/2020/maine-cd1/trump-vs-biden#polls",
    "Maine CD2": "https://www.realclearpolling.com/polls/president/general/2020/maine/cd2-trump-vs-biden#polls",
    "Maryland": "https://www.realclearpolling.com/polls/president/general/2020/maryland/trump-vs-biden#polls",
    "Massachusetts": "https://www.realclearpolling.com/polls/president/general/2020/massachusetts/trump-vs-biden#polls",
    "Michigan": "https://www.realclearpolling.com/polls/president/general/2020/michigan/trump-vs-biden#polls",
    "Minnesota": "https://www.realclearpolling.com/polls/president/general/2020/minnesota/trump-vs-biden#polls",
    "Mississippi": "https://www.realclearpolling.com/polls/president/general/2020/mississippi/trump-vs-biden#polls",
    "Missouri": "https://www.realclearpolling.com/polls/president/general/2020/missouri/trump-vs-biden#polls",
    "Montana": "https://www.realclearpolling.com/polls/president/general/2020/montana/trump-vs-biden#polls",
    "Nebraska CD2": "https://www.realclearpolling.com/polls/president/general/2020/nebraska-cd2/trump-vs-biden#polls",
    "Nevada": "https://www.realclearpolling.com/polls/president/general/2020/nevada/trump-vs-biden#polls",
    "New Hampshire": "https://www.realclearpolling.com/polls/president/general/2020/new-hampshire/trump-vs-biden#polls",
    "New Jersey": "https://www.realclearpolling.com/polls/president/general/2020/new-jersey/trump-vs-biden#polls",
    "New Mexico": "https://www.realclearpolling.com/polls/president/general/2020/new-mexico/trump-vs-biden#polls",
    "New York": "https://www.realclearpolling.com/polls/president/general/2020/new-york/trump-vs-biden#polls",
    "North Carolina": "https://www.realclearpolling.com/polls/president/general/2020/north-carolina/trump-vs-biden#polls",
    #"North Dakota": "",  # Missing URL
    "Ohio": "https://www.realclearpolling.com/polls/president/general/2020/ohio/trump-vs-biden#polls",
    "Oklahoma": "https://www.realclearpolling.com/polls/president/general/2020/oklahoma/trump-vs-biden#polls",
    "Oregon": "https://www.realclearpolling.com/polls/president/general/2020/oregon/trump-vs-biden#polls",
    "Pennsylvania": "https://www.realclearpolling.com/polls/president/general/2020/pennsylvania/trump-vs-biden#polls",
    #"Rhode Island": "",  # Missing URL
    "South Carolina": "https://www.realclearpolling.com/polls/president/general/2020/south-carolina/trump-vs-biden#polls",
    "South Dakota": "https://www.realclearpolling.com/polls/president/general/2020/south-dakota/trump-vs-biden#polls",
    "Tennessee": "https://www.realclearpolling.com/polls/president/general/2020/tennessee/trump-vs-biden#polls",
    "Texas": "https://www.realclearpolling.com/polls/president/general/2020/texas/trump-vs-biden#polls",
    "Utah": "https://www.realclearpolling.com/polls/president/general/2020/utah/trump-vs-biden#polls",
    "Vermont": "https://www.realclearpolling.com/polls/president/general/2020/vermont/trump-vs-biden#polls",
    "Virginia": "https://www.realclearpolling.com/polls/president/general/2020/virginia/trump-vs-biden#polls",
    "Washington": "https://www.realclearpolling.com/polls/president/general/2020/washington/trump-vs-biden#polls",
    "West Virginia": "https://www.realclearpolling.com/polls/president/general/2020/west-virginia/trump-vs-biden#polls",
    "Wisconsin": "https://www.realclearpolling.com/polls/president/general/2020/wisconsin/trump-vs-biden#polls",
    "Wyoming": "https://www.realclearpolling.com/polls/president/general/2020/wyoming/trump-vs-biden#polls",
                 
}

all_state_dataframe = []

for state, url in state_urls.items():
    time.sleep(120.235)
    state_df = get_2020state_data(url, state)
    df = pd.DataFrame(state_df)
    if len(df)>0:
        df = df.drop_duplicates().dropna(subset=['pollster'])
        all_state_dataframe.append(df)
    else:
        continue
    
all_state_df = pd.concat(all_state_dataframe, ignore_index=True)

presidential_2020_states = all_state_df

presidential_state_20202 = clean_data(presidential_2020_states, 2020, 'Presidential')

Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success


In [16]:
#2020 Senate Polls by State

import requests
import json
import re
from bs4 import BeautifulSoup 
import numpy
import pandas as pd
from itertools import zip_longest

def get_senate2020_data(url, state):

    page = requests.get(
        url='https://proxy.scrapeops.io/v1/',
         params={
            'api_key': 'f2fec3c0-d71f-4279-9341-ba073c464e22',
            'url': url
        },
    )

    if page.status_code == 200:
        url_content = page.text
        print("Success")
        
        soup = BeautifulSoup(page.content, "html.parser") 

        script_tags = soup.find_all('script')

        str = ""
        for script in script_tags:
            if script.string and 'finalData' in script.string:
                str += script.string

        x = str.replace("\\","")     

        #str2 = str.split('self.__next_f.push(')
        #str3 = str2[1][:-1]
        #jsonx = json.loads(str3)
        #json_str = jsonx[1] 
        #print(json_str)
        
        pollster_pattern = r'"pollster":\s*"([^"]*)"'
        date_pattern = r'"date":\s*"([^"]*)"'
        sample_size_pattern = r'"sampleSize":\s*"([^"]*)"'
        margin_error_pattern = r'"marginError":\s*"([^"]*)"'
        
        link_pattern = r'"link":\s*"([^"]*)"'

        dvalue_pattern1 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'
        dvalue_pattern2 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Democrat","value":"([^"]*)"'

        rvalue_pattern1 = r'"candidate":\[{[^}]*},{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'
        rvalue_pattern2 = r'"candidate":\[{"name":"([^"]*?)","affiliation":"Republican","value":"([^"]*)"'


        dvalue_data = re.findall(dvalue_pattern1, x) or re.findall(dvalue_pattern2, x)
        rvalue_data = re.findall(rvalue_pattern1, x) or re.findall(rvalue_pattern2, x)

        pollster_data = re.findall(pollster_pattern, x)
        date_data = re.findall(date_pattern, x)
        sample_size_data = re.findall(sample_size_pattern, x)
        margin_error_data = re.findall(margin_error_pattern, x)
        
        link_data = re.findall(link_pattern, x)

        data_rows = []
        for row in zip_longest(pollster_data, date_data, sample_size_data, margin_error_data, dvalue_data, rvalue_data, link_data, fillvalue=None):
            data_rows.append({
                
                "pollster": row[0],
                "date": row[1],
                "sampleSize": row[2],
                "marginError": row[3],
                "dvalue": row[4],
                "rvalue": row[5],
                "state": state
                #"race": race_value
                
            })
        
        return data_rows
        
    else:
        print("Failed to retrieve. Status Code was", page.status_code)
        return[]
        

state_urls = {
                
    "Alabama": "https://www.realclearpolling.com/polls/senate/general/2020/alabama/tuberville-vs-jones#polls",
    "Alaska": "https://www.realclearpolling.com/polls/senate/general/2020/alaska/sullivan-vs-gross#polls",
    "Arizona": "https://www.realclearpolling.com/polls/senate/general/2020/arizona/kelly-vs-mcsally#polls",
    "Arkansas": "https://www.realclearpolling.com/polls/senate/general/2020/arkansas/cotton-vs-harrington#polls",
    #"California": "https://www.realclearpolling.com/polls/senate/general/2022/california/meuser-vs-padilla#polls",
    "Colorado": "https://www.realclearpolling.com/polls/senate/general/2020/colorado/gardner-vs-hickenlooper#polls",
    #"Connecticut": "https://www.realclearpolling.com/polls/senate/general/2022/connecticut/levy-vs-blumenthal#polls",
    "Delaware": "https://www.realclearpolling.com/polls/senate/general/2020/delaware/witzke-vs-coons", 
    #"Florida": "https://www.realclearpolling.com/polls/senate/general/2022/florida/rubio-vs-demings#polls",
    "Georgia": "https://www.realclearpolling.com/polls/senate/general/2020/georgia/perdue-vs-ossoff#polls",
    #"Hawaii": "https://www.realclearpolitics.com/epolls/2022/senate/hi/hawaii_senate_mcdermott_vs_schatz-7929.html#polls",
    "Idaho": "https://www.realclearpolitics.com/epolls/2020/senate/id/idaho_senate_risch_vs_jordan-7070.html#polls",
    "Illinois": "https://www.realclearpolitics.com/epolls/2020/senate/il/illinois_senate_curran_vs_durbin-7071.html#polls",
    #"Indiana": "https://www.realclearpolitics.com/epolls/2022/senate/in/indiana_senate_young_vs_mcdermott-7746.html#polls",
    "Iowa": "https://www.realclearpolling.com/polls/senate/general/2020/iowa/ernst-vs-greenfield#polls",
    "Kansas": "https://www.realclearpolling.com/polls/senate/general/2020/kansas/marshall-vs-bollier#polls",
    "Kentucky": "https://www.realclearpolling.com/polls/senate/general/2020/kentucky/mcconnell-vs-mcgrath#polls",
    "Louisiana": "https://www.realclearpolitics.com/epolls/2020/senate/la/louisiana_senate_open_primary-7074.html#polls",
    "Maine": "https://www.realclearpolling.com/polls/senate/general/2020/maine/collins-vs-gideon#polls",
    #"Maine CD1": "https://www.realclearpolling.com/polls/president/general/2024/maine/trump-vs-biden-cd1#polls",
    #"Maine CD2": "https://www.realclearpolling.com/polls/president/general/2024/maine/trump-vs-biden-cd2#polls",
    #"Maryland": "https://www.realclearpolling.com/polls/senate/general/2022/maryland/vanhollen-vs-chaffee#polls",
    "Massachusetts": "https://www.realclearpolling.com/polls/senate/general/2020/massachusetts/markey-vs-oconnor#polls",
    "Michigan": "https://www.realclearpolling.com/polls/senate/general/2020/michigan/james-vs-peters#polls",
    "Minnesota": "https://www.realclearpolling.com/polls/senate/general/2020/minnesota/lewis-vs-smith#polls",
    "Mississippi": "https://www.realclearpolling.com/polls/senate/general/2020/mississippi/hyde-smith-vs-espy#polls",
    #"Missouri": "https://www.realclearpolling.com/polls/senate/general/2022/missouri/schmitt-vs-valentine#polls",
    "Montana": "https://www.realclearpolling.com/polls/senate/general/2020/montana/daines-vs-bullock#polls",
    "Nebraksa": "https://www.realclearpolitics.com/epolls/2020/senate/ne/nebraska_senate_sasse_vs_democrat-7076.html#polls",
    #"Nebraska CD2": "https://www.realclearpolling.com/polls/president/general/2024/nebraska-cd2/trump-vs-biden#polls",
    #"Nevada": "https://www.realclearpolling.com/polls/senate/general/2022/nevada/laxalt-vs-cortezmasto#polls",
    "New Hampshire": "https://www.realclearpolling.com/polls/senate/general/2020/new-hampshire/messner-vs-shaheen#polls",
    "New Jersey": "https://www.realclearpolling.com/polls/senate/general/2020/new-jersey/mehta-vs-booker#polls",
    "New Mexico": "https://www.realclearpolling.com/polls/senate/general/2020/new-mexico/ronchetti-vs-lujan#polls",
    #"New York": "https://www.realclearpolling.com/polls/senate/general/2022/new-york/schumer-vs-pinion#polls",
    "North Carolina": "https://www.realclearpolling.com/polls/senate/general/2020/north-carolina/tillis-vs-cunningham#polls",
    #"North Dakota": "https://www.realclearpolitics.com/epolls/2022/senate/nd/north_dakota_senate_hoeven_vs_christiansen-7527.html#polls",
    #"Ohio": "https://www.realclearpolling.com/polls/senate/general/2022/ohio/vance-vs-ryan#polls",
    "Oklahoma": "https://www.realclearpolling.com/polls/senate/general/2020/oklahoma/inhofe-vs-broyles#polls",
    "Oregon": "https://www.realclearpolitics.com/epolls/2020/senate/or/oregon_senate_perkins_vs_merkley-7081.html#polls",
    #"Pennsylvania": "https://www.realclearpolling.com/polls/senate/general/2022/pennsylvania/oz-vs-fetterman#polls",
    "Rhode Island": "https://www.realclearpolitics.com/epolls/2020/senate/ri/rhode_island_senate_waters_vs_reed-7082.html#polls",  
    "South Carolina":"https://www.realclearpolling.com/polls/senate/general/2020/south-carolina/graham-vs-harrison#polls",
    "South Dakota": "https://www.realclearpolling.com/polls/senate/general/2020/south-dakota/rounds-vs-ahlers#polls",
    "Tennessee": "https://www.realclearpolitics.com/epolls/2020/senate/tn/tennessee_senate_hagerty_vs_bradshaw-7239.html#polls",
    "Texas": "https://www.realclearpolling.com/polls/senate/general/2020/texas/cornyn-vs-hegar#polls",
    #"Utah": "https://www.realclearpolling.com/polls/senate/general/2022/utah/lee-vs-mcmullin#polls",
    #"Vermont": "https://www.realclearpolling.com/polls/senate/general/2022/vermont/malloy-vs-welch#polls",
    "Virginia": "https://www.realclearpolling.com/polls/senate/general/2020/virginia/gade-vs-warner#polls",
    #"Washington": "https://www.realclearpolling.com/polls/senate/general/2022/washington/smiley-vs-murray#polls",
    "West Virginia": "https://www.realclearpolitics.com/epolls/2020/senate/wv/west_virginia_senate_moore_capito_vs_swearengin-7087.html#polls",
    #"Wisconsin": "https://www.realclearpolling.com/polls/senate/general/2022/wisconsin/johnson-vs-barnes#polls",
    "Wyoming": "https://www.realclearpolling.com/polls/senate/general/2020/wyoming/lummis-vs-ben-david#polls"
       
}


all_state_dataframe = []

for state, url in state_urls.items():
    time.sleep(120.235)
    state_df = get_senate2020_data(url, state)
    df = pd.DataFrame(state_df)
    if len(df)>0:
        df = df.drop_duplicates().dropna(subset=['pollster']).dropna(subset=['dvalue'])
        all_state_dataframe.append(df)
    else:
        continue
    
all_state_df = pd.concat(all_state_dataframe, ignore_index=True)

senate_2020 = all_state_df

senate_2020 = clean_data(senate_2020, 2020, 'Senate')

Success
Success
Success
Success
Success
Success
Success
Failed to retrieve. Status Code was 500
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Failed to retrieve. Status Code was 500
Success
Success
Failed to retrieve. Status Code was 500
Success
Success
Success
Success


In [18]:
#concatenate all dataframes that are currently in memory

all_data = pd.concat([presidential_2020, presidential_state_20202, senate_2020, governor_2020, general_congressional_2020, 
                      senate_2018, governor_2018, general_congressional_2018, senate_2016, presidential_state_2016, governor_2016,
                      general_congressional_2016], ignore_index=True)

#open saved csvs and concatenate to all_data

senate_2014 = pd.read_csv('senate_2014.csv')
governor_2014 = pd.read_csv('governor_2014.csv')
general_congressional_2014 = pd.read_csv('general_congressional_2014.csv')


total_data = pd.concat([all_data, senate_2014, governor_2014, general_congressional_2014], ignore_index=True)


In [22]:
(total_data['Year'].astype(str)+total_data['Type']).value_counts()

print(total_data.shape[0])

total_data.to_csv('total_data.csv', index=False)

5749
