## Election Data Scrape Wikipedia

In [6]:
# pip install requests beautifulsoup4 pandas
!pip install lxml

Collecting lxml
  Downloading lxml-5.3.1-cp311-cp311-win_amd64.whl (3.8 MB)
                                              0.0/3.8 MB ? eta -:--:--
     --                                       0.2/3.8 MB 3.6 MB/s eta 0:00:02
     ------                                   0.6/3.8 MB 6.0 MB/s eta 0:00:01
     ----------                               1.0/3.8 MB 6.8 MB/s eta 0:00:01
     --------------                           1.4/3.8 MB 7.5 MB/s eta 0:00:01
     --------------------                     2.0/3.8 MB 8.4 MB/s eta 0:00:01
     ---------------------------              2.7/3.8 MB 9.4 MB/s eta 0:00:01
     --------------------------------         3.1/3.8 MB 10.0 MB/s eta 0:00:01
     ---------------------------------------  3.8/3.8 MB 10.5 MB/s eta 0:00:01
     ---------------------------------------- 3.8/3.8 MB 10.1 MB/s eta 0:00:00
Installing collected packages: lxml
Successfully installed lxml-5.3.1



[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [100]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [101]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Wikipedia URL
url = "https://en.wikipedia.org/wiki/Opinion_polling_for_the_2025_Australian_federal_election"

# Request the page
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Find all wikitables
wikitable_list = soup.find_all("table", {"class": "wikitable"})

# Store extracted tables
filtered_dataframes = []

# Define target column keywords
target_columns = {"Sample", "Sample size", "Firm", "Brand"}

# Loop through each wikitable
for table in wikitable_list:
    try:
        # Convert HTML table to DataFrame
        df = pd.read_html(str(table))[0]  # Extract first table from parsed HTML
        
        # Check if the DataFrame contains at least one target column
        if any(col in df.columns for col in target_columns):
            filtered_dataframes.append(df)  # Store only relevant tables
            
    except Exception as e:
        print(f"Skipping a table due to an error: {e}")

# Check results
print(f"Extracted {len(filtered_dataframes)} relevant wikitables.")

# Optional: Preview first few rows of each filtered table
# for i, df in enumerate(filtered_dataframes):
#     print(f"\nTable {i} Preview:")
#     print(df.head())

filtered_dataframes[11].head()

Extracted 16 relevant wikitables.


Unnamed: 0_level_0,Date,Firm,Sample size,Primary vote,Primary vote,Primary vote,Primary vote,Primary vote,Primary vote,Primary vote,2pp vote,2pp vote
Unnamed: 0_level_1,Date,Firm,Sample size,ALP,L/NP,GRN,ONP,UAP,OTH,UND,ALP,L/NP
0,1 Oct – 8 Dec 2024,Resolve Strategic[291],460,30%,37%,12%,5%,—,16%,—,50%,50%
1,7 Oct – 6 Dec 2024,Newspoll[291],376,38%,37%,11%,5%,—,9%,—,54%,46%
2,30 Oct – 4 Nov 2024,DemosAU[293],948,34%,38%,14%,6%,—,8%,—,52%,48%
3,1–10 Oct 2024,Redbridge[294],1514,35%,34%,—,—,—,—,—,54.5%,45.5%
4,15 Jul – 20 Sep 2024,Newspoll[291],562,36%,39%,11%,4%,—,10%,—,52%,48%


In [7]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import warnings
import re

# Suppress FutureWarnings from pandas
warnings.simplefilter(action="ignore", category=FutureWarning)

# Wikipedia URL
url = "https://en.wikipedia.org/wiki/Opinion_polling_for_the_2025_Australian_federal_election"

# Request the page
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Define possible top and bottom headers (case-insensitive search)
top_headers = ["Voting Intention", "National Polling"]
bottom_headers = ["Preferred Prime Minister", "Leadership Polling"]

# Find all header tags (h2, h3, h4)
headers = soup.find_all(['h2', 'h3', 'h4'])

# Locate polling tables
polling_tables = []
found_top = False
for header in headers:
    header_text = header.get_text(strip=True).lower()
    
    if any(top.lower() in header_text for top in top_headers):
        found_top = True
        continue
    
    if found_top and any(bottom.lower() in header_text for bottom in bottom_headers):
        break  # Stop at first bottom header occurrence
    
    if found_top:
        table = header.find_next("table", class_="wikitable")
        if table:
            polling_tables.append(table)

# Convert tables to DataFrames
all_dataframes = [pd.read_html(str(table))[0] for table in polling_tables]

# Clean column headers
for i, df in enumerate(all_dataframes):
    if isinstance(df.columns, pd.MultiIndex):  # Check for MultiIndex headers
        df.columns = [
            col1 if col1 == col2 else f"{col1}: {col2}"
            for col1, col2 in zip(df.columns.get_level_values(0), df.columns.get_level_values(1))
        ]
        df = df.iloc[0:].reset_index(drop=True)  # Reset index after cleaning
        all_dataframes[i] = df  # Save cleaned DataFrame

# Extract Mapping Table (Events During Polling Period)
event_mapping = []

def is_event_text(value):
    """Determine if a value is an event description rather than a percentage or missing data."""
    if isinstance(value, str):
        return not re.match(r"^\d+%?$", value)  # Check if it is NOT a percentage or numeric data
    return False

for i, df in enumerate(all_dataframes):
    if any("Primary vote" in col for col in df.columns):
        rows_to_remove = []
        for index, row in df.iterrows():
            if any(is_event_text(row[col]) for col in df.columns if "Primary vote" in col):
                event_mapping.append({
                    "Date": row[0],  # Assuming first column is the date
                    "Event": next(row[col] for col in df.columns if "Primary vote" in col and is_event_text(row[col]))
                })
                rows_to_remove.append(index)
        
        # Drop event mapping rows from the original table
        all_dataframes[i] = df.drop(rows_to_remove).reset_index(drop=True)

# Convert to DataFrame
event_mapping_df = pd.DataFrame(event_mapping)

# Debugging Output
print(f"Extracted {len(all_dataframes)} national polling tables.")
print("\nEvent Mapping Table:")
print(event_mapping_df)

for i, df in enumerate(all_dataframes):
    print(f"\nCleaned National Polling Table {i}:")
    print(df)


Extracted 4 national polling tables.

Event Mapping Table:
               Date                                              Event
0       28 Mar 2025  The 2025 Australian federal election is called...
1    13–24 Mar 2025                                                  —
2    17–23 Mar 2025                                              35.5%
3    14–19 Mar 2025                                                  —
4    12–16 Mar 2025                                              1%[c]
..              ...                                                ...
211  17–21 Aug 2022                                                  —
212  27–30 Jul 2022                                                  —
213  13–19 Jun 2022                                               0.5%
214     29 May 2022  Peter Dutton elected unopposed as Leader of th...
215     21 May 2022                                              35.7%

[216 rows x 2 columns]

Cleaned National Polling Table 0:
Empty DataFrame
Columns: [Date

In [10]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import warnings
import re

# Suppress FutureWarnings from pandas
warnings.simplefilter(action="ignore", category=FutureWarning)

# Wikipedia URL
url = "https://en.wikipedia.org/wiki/Opinion_polling_for_the_2025_Australian_federal_election"

# Request the page
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Define possible top and bottom headers (case-insensitive search)
top_headers = ["Voting Intention", "National Polling"]
bottom_headers = ["Preferred Prime Minister", "Leadership Polling"]

# Find all header tags (h2, h3, h4)
headers = soup.find_all(['h2', 'h3', 'h4'])

# Locate polling tables
polling_tables = []
found_top = False
for header in headers:
    header_text = header.get_text(strip=True).lower()
    
    if any(top.lower() in header_text for top in top_headers):
        found_top = True
        continue
    
    if found_top and any(bottom.lower() in header_text for bottom in bottom_headers):
        break  # Stop at first bottom header occurrence
    
    if found_top:
        table = header.find_next("table", class_="wikitable")
        if table:
            polling_tables.append(table)

# Convert tables to DataFrames
all_dataframes = [pd.read_html(str(table))[0] for table in polling_tables]

# Clean column headers
for i, df in enumerate(all_dataframes):
    if isinstance(df.columns, pd.MultiIndex):  # Check for MultiIndex headers
        df.columns = [
            col1 if col1 == col2 else f"{col1}: {col2}"
            for col1, col2 in zip(df.columns.get_level_values(0), df.columns.get_level_values(1))
        ]
        df = df.iloc[0:].reset_index(drop=True)  # Reset index after cleaning
        all_dataframes[i] = df  # Save cleaned DataFrame

# Extract Mapping Table (Events During Polling Period)
event_mapping = []

def is_event_text(value):
    """Determine if a value is an event description rather than a percentage, missing data, or placeholders."""
    if isinstance(value, str):
        value = re.sub(r"\[.*?\]", "", value).strip()  # Remove citation markers (e.g., [a], [b])
        return not re.match(r"^\d+(\.\d+)?%?$", value) and value.strip() not in ["—", "-"]  # Exclude percentages and dashes
    return False

for i, df in enumerate(all_dataframes):
    if any("Primary vote" in col for col in df.columns):
        rows_to_remove = []
        for index, row in df.iterrows():
            if any(is_event_text(row[col]) for col in df.columns if "Primary vote" in col):
                event_mapping.append({
                    "Date": row[0],  # Assuming first column is the date
                    "Event": next(row[col] for col in df.columns if "Primary vote" in col and is_event_text(row[col]))
                })
                rows_to_remove.append(index)
        
        # Drop event mapping rows from the original table
        all_dataframes[i] = df.drop(rows_to_remove).reset_index(drop=True)

# Convert to DataFrame
event_mapping_df = pd.DataFrame(event_mapping)

# Debugging Output
print(f"Extracted {len(all_dataframes)} national polling tables.")
print("\nEvent Mapping Table:")
print(event_mapping_df)

for i, df in enumerate(all_dataframes):
    print(f"\nCleaned National Polling Table {i}:")
    print(df)

Extracted 4 national polling tables.

Event Mapping Table:
          Date                                              Event
0  28 Mar 2025  The 2025 Australian federal election is called...
1  13 Apr 2024  The Liberals are re-elected in the 2024 Cook b...
2   2 Mar 2024  Labor is re-elected in the 2024 Dunkley by-ele...
3  14 Oct 2023  The 2023 Australian Indigenous Voice referendu...
4  15 Jul 2023   LNP is re-elected in the 2023 Fadden by-election
5   1 Apr 2023              Labor wins the 2023 Aston by-election
6  23 Dec 2022  Andrew Gee leaves the Nationals to become an I...
7  29 May 2022  Peter Dutton elected unopposed as Leader of th...

Cleaned National Polling Table 0:
                    Date                    Brand Interview mode Sample size  \
0         13–24 Mar 2025      Redbridge/Accent[5]         Online        2039   
1         17–23 Mar 2025            Roy Morgan[6]         Online        1683   
2         14–19 Mar 2025                YouGov[7]         Online        

In [36]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import warnings
import re
from datetime import datetime

# Suppress FutureWarnings from pandas
warnings.simplefilter(action="ignore", category=FutureWarning)

# Wikipedia URL
url = "https://en.wikipedia.org/wiki/Opinion_polling_for_the_2025_Australian_federal_election"

# Request the page
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Define possible top and bottom headers (case-insensitive search)
top_headers = ["Voting Intention", "National Polling"]
bottom_headers = ["Preferred Prime Minister", "Leadership Polling"]

# Find all header tags (h2, h3, h4)
headers = soup.find_all(['h2', 'h3', 'h4'])

# Locate polling tables
polling_tables = []
found_top = False
for header in headers:
    header_text = header.get_text(strip=True).lower()
    
    if any(top.lower() in header_text for top in top_headers):
        found_top = True
        continue
    
    if found_top and any(bottom.lower() in header_text for bottom in bottom_headers):
        break  # Stop at first bottom header occurrence
    
    if found_top:
        table = header.find_next("table", class_="wikitable")
        if table:
            polling_tables.append(table)

# Convert tables to DataFrames
all_dataframes = [pd.read_html(str(table))[0] for table in polling_tables]

# Clean column headers
for i, df in enumerate(all_dataframes):
    if isinstance(df.columns, pd.MultiIndex):  # Check for MultiIndex headers
        df.columns = [
            col1 if col1 == col2 else f"{col1}: {col2}"
            for col1, col2 in zip(df.columns.get_level_values(0), df.columns.get_level_values(1))
        ]
        df = df.iloc[0:].reset_index(drop=True)  # Reset index after cleaning
        all_dataframes[i] = df  # Save cleaned DataFrame

# Extract Mapping Table (Events During Polling Period)
event_mapping = []

def is_event_text(value):
    """Determine if a value is an event description rather than a percentage, missing data, or placeholders."""
    if isinstance(value, str):
        value = re.sub(r"\[.*?\]", "", value).strip()  # Remove citation markers (e.g., [a], [b])
        return not re.match(r"^\d+(\.\d+)?%?$", value) and value.strip() not in ["—", "-"]  # Exclude percentages and dashes
    return False

def parse_date_range(date_str):
    """Extract start and end dates from date strings."""
    
    # Normalize dashes (replace en dash or em dash with a regular dash)
    date_str = date_str.replace("–", "-").replace("—", "-")
    # Split the string by dash
    date_parts = date_str.split("-")

    try:
        if len(date_parts) == 2:  
            # Case: '16-18 Dec 2022' or '30 Nov - 4 Dec 2022'
            
            # Extract the year from the second part
            year = date_parts[1].strip()[-4:]

            # Check if the first part contains a month (e.g., '30 Nov')
            if any(char.isalpha() for char in date_parts[0]):
                # Format: '30 Nov - 4 Dec 2022'
                start_date = datetime.strptime(date_parts[0].strip() + " " + year, "%d %b %Y").strftime("%Y-%m-%d")
            else:
                # Format: '16-18 Dec 2022'
                month_year = " ".join(date_parts[1].strip().split()[-2:])  # Extract month and year
                start_date = datetime.strptime(date_parts[0].strip() + " " + month_year, "%d %b %Y").strftime("%Y-%m-%d")
            
            # Parse end date
            end_date = datetime.strptime(date_parts[1].strip(), "%d %b %Y").strftime("%Y-%m-%d")
        
        else:  
            # Case: Single date format '21 May 2022'
            start_date = end_date = datetime.strptime(date_str.strip(), "%d %b %Y").strftime("%Y-%m-%d")

        return start_date, end_date

    except ValueError:
        return None, None  # Return None if parsing fails

for i, df in enumerate(all_dataframes):
    if any("Primary vote" in col for col in df.columns):
        rows_to_remove = []
        for index, row in df.iterrows():
            if any(is_event_text(row[col]) for col in df.columns if "Primary vote" in col):
                start_date, end_date = parse_date_range(row[0])  # Parse date
                event_mapping.append({
                    "Start Date": start_date,
                    "End Date": end_date,
                    "Event": next(row[col] for col in df.columns if "Primary vote" in col and is_event_text(row[col]))
                })
                rows_to_remove.append(index)
        
        # Drop event mapping rows from the original table
        all_dataframes[i] = df.drop(rows_to_remove).reset_index(drop=True)

# Convert to DataFrame
event_mapping_df = pd.DataFrame(event_mapping)

# Clean date columns in polling tables
for i, df in enumerate(all_dataframes):
    if df.shape[0] > 0 and df.shape[1] > 0:  # Ensure DataFrame is not empty
        start_dates = []
        end_dates = []
        for date in df.iloc[:, 0]:  # Assume first column is the date column
            start_date, end_date = parse_date_range(str(date))
            start_dates.append(start_date)
            end_dates.append(end_date)
        
        df.insert(1, "Start Date", start_dates)
        df.insert(2, "End Date", end_dates)

# Debugging Output
print(f"Extracted {len(all_dataframes)} national polling tables.")
print("\nEvent Mapping Table:")
print(event_mapping_df)

for i, df in enumerate(all_dataframes):
    print(f"\nCleaned National Polling Table {i}:")
    print(df)

Extracted 4 national polling tables.

Event Mapping Table:
   Start Date    End Date                                              Event
0  2025-03-28  2025-03-28  The 2025 Australian federal election is called...
1  2024-04-13  2024-04-13  The Liberals are re-elected in the 2024 Cook b...
2  2024-03-02  2024-03-02  Labor is re-elected in the 2024 Dunkley by-ele...
3  2023-10-14  2023-10-14  The 2023 Australian Indigenous Voice referendu...
4  2023-07-15  2023-07-15   LNP is re-elected in the 2023 Fadden by-election
5  2023-04-01  2023-04-01              Labor wins the 2023 Aston by-election
6  2022-12-23  2022-12-23  Andrew Gee leaves the Nationals to become an I...
7  2022-05-29  2022-05-29  Peter Dutton elected unopposed as Leader of th...

Cleaned National Polling Table 0:
                    Date  Start Date    End Date                    Brand  \
0         13–24 Mar 2025  2025-03-13  2025-03-24      Redbridge/Accent[5]   
1         17–23 Mar 2025  2025-03-17  2025-03-23           

In [39]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import warnings
import re
from datetime import datetime

# Suppress FutureWarnings from pandas
warnings.simplefilter(action="ignore", category=FutureWarning)

# Wikipedia URL
url = "https://en.wikipedia.org/wiki/Opinion_polling_for_the_2025_Australian_federal_election"

# Request the page
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Define possible top and bottom headers (case-insensitive search)
top_headers = ["Voting Intention", "National Polling"]
bottom_headers = ["Preferred Prime Minister", "Leadership Polling"]

# Find all header tags (h2, h3, h4)
headers = soup.find_all(["h2", "h3", "h4"])

# Locate polling tables
polling_tables = []
found_top = False
for header in headers:
    header_text = header.get_text(strip=True).lower()
    if any(top.lower() in header_text for top in top_headers):
        found_top = True
        continue
    if found_top and any(bottom.lower() in header_text for bottom in bottom_headers):
        break  
    if found_top:
        table = header.find_next("table", class_="wikitable")
        if table:
            polling_tables.append(table)

# Convert tables to DataFrames
all_dataframes = [pd.read_html(str(table))[0] for table in polling_tables]

# Function to remove footnotes (bracketed text)
def remove_footnotes(text):
    """Remove bracketed text and surrounding brackets from strings."""
    return re.sub(r"\[.*?\]", "", str(text)).strip()

# Clean column headers and remove footnotes across all DataFrames
for i, df in enumerate(all_dataframes):
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            col1 if col1 == col2 else f"{col1}: {col2}"
            for col1, col2 in zip(df.columns.get_level_values(0), df.columns.get_level_values(1))
        ]
        df = df.iloc[0:].reset_index(drop=True)
        all_dataframes[i] = df  

    df.columns = [remove_footnotes(col) for col in df.columns]
    all_dataframes[i] = df.applymap(remove_footnotes)

# Function to convert percentages to decimals and handle missing values
def percentage_to_decimal(value):
    """Convert percentage values to decimals and replace dashes with 0."""
    if isinstance(value, str):
        value = value.strip()
        if "%" in value:
            try:
                return round(float(value.replace("%", "")) / 100, 4)
            except ValueError:
                return 0.0
        elif value in ["—", "-", ""]:
            return 0.0
    return value

# Extract Mapping Table (Events During Polling Period)
event_mapping = []

def is_event_text(value):
    """Determine if a value is an event description rather than a percentage or missing data."""
    if isinstance(value, str):
        return not re.match(r"^\d+(\.\d+)?%?$", value) and value.strip() not in ["—", "-"]
    return False

def parse_date_range(date_str):
    """Extract start and end dates from date strings."""
    date_str = date_str.replace("–", "-").replace("—", "-")
    date_parts = date_str.split("-")
    try:
        if len(date_parts) == 2:  
            year = date_parts[1].strip()[-4:]
            if any(char.isalpha() for char in date_parts[0]):
                start_date = datetime.strptime(date_parts[0].strip() + " " + year, "%d %b %Y").strftime("%Y-%m-%d")
            else:
                month_year = " ".join(date_parts[1].strip().split()[-2:])
                start_date = datetime.strptime(date_parts[0].strip() + " " + month_year, "%d %b %Y").strftime("%Y-%m-%d")
            end_date = datetime.strptime(date_parts[1].strip(), "%d %b %Y").strftime("%Y-%m-%d")
        else:  
            start_date = end_date = datetime.strptime(date_str.strip(), "%d %b %Y").strftime("%Y-%m-%d")
        return start_date, end_date
    except ValueError:
        return None, None  

for i, df in enumerate(all_dataframes):
    if any("Primary vote" in col for col in df.columns):
        rows_to_remove = []
        for index, row in df.iterrows():
            if any(is_event_text(row[col]) for col in df.columns if "Primary vote" in col):
                start_date, end_date = parse_date_range(row[0])  
                event_mapping.append({
                    "Start Date": start_date,
                    "End Date": end_date,
                    "Event": next(row[col] for col in df.columns if "Primary vote" in col and is_event_text(row[col]))
                })
                rows_to_remove.append(index)
        all_dataframes[i] = df.drop(rows_to_remove).reset_index(drop=True)

# Convert to DataFrame
event_mapping_df = pd.DataFrame(event_mapping)

# Clean and format data in polling tables
for i, df in enumerate(all_dataframes):
    if df.shape[0] > 0 and df.shape[1] > 0:  
        start_dates = []
        end_dates = []
        for date in df.iloc[:, 0]:  
            start_date, end_date = parse_date_range(str(date))
            start_dates.append(start_date)
            end_dates.append(end_date)
        
        df.insert(1, "Start Date", start_dates)
        df.insert(2, "End Date", end_dates)

        df = df.applymap(percentage_to_decimal)

        all_dataframes[i] = df  

# Debugging Output
print(f"Extracted {len(all_dataframes)} national polling tables.")
print("\nEvent Mapping Table:")
print(event_mapping_df)

for i, df in enumerate(all_dataframes):
    print(f"\nCleaned National Polling Table {i}:")
    print(df)

Extracted 4 national polling tables.

Event Mapping Table:
   Start Date    End Date                                              Event
0  2025-03-28  2025-03-28  The 2025 Australian federal election is called...
1  2024-04-13  2024-04-13  The Liberals are re-elected in the 2024 Cook b...
2  2024-03-02  2024-03-02  Labor is re-elected in the 2024 Dunkley by-ele...
3  2023-10-14  2023-10-14  The 2023 Australian Indigenous Voice referendu...
4  2023-07-15  2023-07-15   LNP is re-elected in the 2023 Fadden by-election
5  2023-04-01  2023-04-01              Labor wins the 2023 Aston by-election
6  2022-12-23  2022-12-23  Andrew Gee leaves the Nationals to become an I...
7  2022-05-29  2022-05-29  Peter Dutton elected unopposed as Leader of th...

Cleaned National Polling Table 0:
                    Date  Start Date    End Date                Brand  \
0         13–24 Mar 2025  2025-03-13  2025-03-24     Redbridge/Accent   
1         17–23 Mar 2025  2025-03-17  2025-03-23           Roy Morg