In [3]:
import pandas as pd
import math
import random
from collections import Counter
import json
import plotly.graph_objects as go
from datetime import datetime
import requests
import os 
import pickle

In [2]:
def getLatestPollOfStateAndCandidate(df, state, candidateA, candidateB, verbose =True ):
    # Filter the dataframe for the given state and candidate
    state_candidate_polls = df[(df['state'] == state) & ((df['answer'] == candidateA) | (df['answer'] == candidateB)  )]
    
    if state_candidate_polls.empty:
        if verbose:
            print(f"No polls found for {candidateA} or {candidateB} in {state}")
        return None
    
    # Sort by end_date in descending order and get the first row
    latest_poll = state_candidate_polls.sort_values('end_date', ascending=False).iloc[0]
    
    return {
        'state': state,
        'candidate': candidateA,
        'percentage': latest_poll['pct'],
        'poll_date': latest_poll['end_date']
    }

In [4]:
current_date = datetime.now()
today_formatted_date = current_date.strftime("%d_%m_%Y")
filename = f"data/president_polls_{today_formatted_date}.csv"

df = pd.read_csv(filename)
df.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,stage,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct
0,88739,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,49.0
1,88739,770,TIPP,,,TIPP Insights,144,TIPP Insights,1.8,-0.4,...,general,False,False,,False,REP,Trump,16651,Donald Trump,46.0
2,88762,568,YouGov,133.0,CBS News,YouGov,391,YouGov,3.0,-1.1,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,48.0
3,88762,568,YouGov,133.0,CBS News,YouGov,391,YouGov,3.0,-1.1,...,general,False,False,,False,REP,Trump,16651,Donald Trump,51.0
4,88756,1554,RMG Research,,,RMG Research,555,RMG Research,2.3,-0.4,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,49.0


In [6]:
df.columns

Index(['poll_id', 'pollster_id', 'pollster', 'sponsor_ids', 'sponsors',
       'display_name', 'pollster_rating_id', 'pollster_rating_name',
       'numeric_grade', 'pollscore', 'methodology', 'transparency_score',
       'state', 'start_date', 'end_date', 'sponsor_candidate_id',
       'sponsor_candidate', 'sponsor_candidate_party', 'endorsed_candidate_id',
       'endorsed_candidate_name', 'endorsed_candidate_party', 'question_id',
       'sample_size', 'population', 'subpopulation', 'population_full',
       'tracking', 'created_at', 'notes', 'url', 'url_article', 'url_topline',
       'url_crosstab', 'source', 'internal', 'partisan', 'race_id', 'cycle',
       'office_type', 'seat_number', 'seat_name', 'election_date', 'stage',
       'nationwide_batch', 'ranked_choice_reallocated', 'ranked_choice_round',
       'hypothetical', 'party', 'answer', 'candidate_id', 'candidate_name',
       'pct'],
      dtype='object')

In [13]:
df.iloc[2]

poll_id                                                                  88762
pollster_id                                                                568
pollster                                                                YouGov
sponsor_ids                                                                133
sponsors                                                              CBS News
display_name                                                            YouGov
pollster_rating_id                                                         391
pollster_rating_name                                                    YouGov
numeric_grade                                                              3.0
pollscore                                                                 -1.1
methodology                                                       Online Panel
transparency_score                                                         9.0
state                                               

In [21]:
from datetime import datetime, date

def getLatestPollOfStateAndCandidateAtTime(df, state, candidateA, candidateB, date_before=None, verbose=True):
    # Set date_before to today if not provided
    if date_before is None:
        date_before = date.today()
    elif isinstance(date_before, str):
        # Try to parse date_before flexibly
        date_before = pd.to_datetime(date_before, dayfirst=False, yearfirst=False).date()
    
    # Convert end_date to datetime, letting pandas infer the format
    df['end_date'] = pd.to_datetime(df['end_date'], dayfirst=False, yearfirst=False)
    
    # Filter the dataframe for the given state, candidates, and date
    state_candidate_polls = df[
        (df['state'] == state) & 
        ((df['answer'] == candidateA) | (df['answer'] == candidateB)) &
        (df['end_date'].dt.date <= date_before)
    ]
    
    if state_candidate_polls.empty:
        if verbose:
            print(f"No polls found for {candidateA} or {candidateB} in {state} before {date_before}")
        return None
    
    # Sort by end_date in descending order and get the first row
    latest_poll = state_candidate_polls.sort_values('end_date', ascending=False).iloc[0]
    
    return {
        'state': state,
        'candidate': candidateA,
        'percentage': latest_poll['pct'],
        'poll_date': latest_poll['end_date'].strftime('%m/%d/%Y')
    }

In [22]:
getLatestPollOfStateAndCandidateAtTime(df, "Arizona", "Trump", "Trump", date_before=None, verbose=True)

  df['end_date'] = pd.to_datetime(df['end_date'], dayfirst=False, yearfirst=False)


{'state': 'Arizona',
 'candidate': 'Trump',
 'percentage': np.float64(51.0),
 'poll_date': '10/16/2024'}

In [27]:
getLatestPollOfStateAndCandidateAtTime(df, "Arizona", "Trump", "Trump", date_before="07/31/2024", verbose=True)

{'state': 'Arizona',
 'candidate': 'Trump',
 'percentage': np.float64(49.0),
 'poll_date': '07/30/2024'}

In [28]:
from datetime import datetime, timedelta

def generate_date_list(start_date, end_date):
    date_list = []
    current_date = start_date
    while current_date <= end_date:
        date_list.append(current_date.strftime('%m/%d/%Y'))
        current_date += timedelta(days=1)
    return date_list

# Set the start date to June 1, 2024
start_date = datetime(2024, 6, 1)

# Set the end date to today
end_date = datetime.now()

# Generate the list of dates
dates = generate_date_list(start_date, end_date)

# Print the first few and last few dates to verify
print("First few dates:")
print(dates[:5])
print("\nLast few dates:")
print(dates[-5:])
print(f"\nTotal number of dates: {len(dates)}")

First few dates:
['06/01/2024', '06/02/2024', '06/03/2024', '06/04/2024', '06/05/2024']

Last few dates:
['10/14/2024', '10/15/2024', '10/16/2024', '10/17/2024', '10/18/2024']

Total number of dates: 140


In [35]:
def election_probability(pa, pb, moe, uv=0):
    # Adjust for undecided voters
    total = pa + pb
    adj_pa = pa / total * (100 - uv)
    adj_pb = pb / total * (100 - uv)
    
    # Calculate spread
    spread = adj_pa - adj_pb
    
    # Calculate standard error
    se = moe / 1.96
    
    # Calculate z-score
    z = spread / (se * math.sqrt(2))
    
    # Use error function to calculate probability
    probability = 0.5 * (1 + math.erf(z / math.sqrt(2)))
    
    return probability * 100 
election_probability(50,49,5,0)

61.02557678743099

In [36]:
df['end_date']

0       2024-10-16
1       2024-10-16
2       2024-10-16
3       2024-10-16
4       2024-10-16
           ...    
15752   2021-04-16
15753   2021-04-16
15754   2021-04-16
15755   2021-04-07
15756   2021-04-07
Name: end_date, Length: 15757, dtype: datetime64[ns]

In [38]:
df['end_date'][100:115]

100   2024-10-14
101   2024-10-14
102   2024-10-14
103   2024-10-14
104   2024-10-14
105   2024-10-14
106   2024-10-14
107   2024-10-14
108   2024-10-14
109   2024-10-14
110   2024-10-14
111   2024-10-14
112   2024-10-14
113   2024-10-14
114   2024-10-14
Name: end_date, dtype: datetime64[ns]

In [39]:
df = pd.read_csv(filename)

In [40]:
df['end_date'][100:115]

100    10/14/24
101    10/14/24
102    10/14/24
103    10/14/24
104    10/14/24
105    10/14/24
106    10/14/24
107    10/14/24
108    10/14/24
109    10/14/24
110    10/14/24
111    10/14/24
112    10/14/24
113    10/14/24
114    10/14/24
Name: end_date, dtype: object