<a href="https://colab.research.google.com/github/bryaanabraham/Election-Data-Analysis/blob/main/Election_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def exctract_data(url, name):
    response = requests.get(url)
    html_content = response.content

    # Parse HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Locate the table
    table = soup.find('table', class_='table table-striped table-bordered')

    if table is None:
        print(f"Table not found for URL: {url}")
        return

    # Extract the header
    headers = [header.text.strip() for header in table.find_all('th')]

    # Keep only the first 7 headers that match the data rows
    headers = headers[:7]
    print(f"Headers for {name}: {headers}")

    # Extract the rows
    rows = []
    for row in table.find('tbody').find_all('tr'):
        cells = row.find_all('td')
        rows.append([cell.text.strip() for cell in cells])
    print(f"Rows for {name}: {rows[:3]}")

    # Check the length of headers and rows to ensure they match
    for row in rows:
        if len(row) != len(headers):
            print(f"Row length {len(row)} does not match header length {len(headers)}")

    df = pd.DataFrame(rows, columns=headers)
    csv_file_path = f"csv_files/{name}.csv"
    df.to_csv(csv_file_path, index=False)

    print(f"Data has been saved to {csv_file_path}")

In [3]:
# Fetch the webpage content
url = "https://results.eci.gov.in/AcResultByeJune2024/index.htm"
response = requests.get(url)
html_content = response.content

# Parse HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Extract all links from buttons
buttons = soup.find_all('a')
links = [button['href'] for button in buttons if 'href' in button.attrs]

# Convert relative URLs to absolute URLs
base_url = "https://results.eci.gov.in/AcResultByeJune2024/"
absolute_links = [base_url + link for link in links]

for link in absolute_links:
    print(link)

https://results.eci.gov.in/AcResultByeJune2024/index.htm
https://results.eci.gov.in/AcResultByeJune2024/./hi/index.htm
https://results.eci.gov.in/AcResultByeJune2024/#
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S04195.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0626.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0683.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0685.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S06108.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S06136.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0721.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0818.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0821.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0837.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0839.htm
https://results.eci.gov.in/AcResultBy

In [4]:
print(len(absolute_links))

30


In [5]:
# All URLs are not useful
filtered_links = [item for item in absolute_links if 'candidateswise'in item]
for link in filtered_links:
    print(link)

https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S04195.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0626.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0683.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0685.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S06108.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S06136.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0721.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0818.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0821.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0837.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0839.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0842.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise-S0845.htm
https://results.eci.gov.in/AcResultByeJune2024/candidateswise

In [6]:
print(len(filtered_links))

25


In [7]:
for url in filtered_links:
  name = url.split('-')[-1].split('.')[0]
  # the website contains tabulated data under 'Constituencywise' whereas 'candidatewise' has it in the form of ccards
  url = url.replace('candidateswise-', 'Constituencywise')
  exctract_data(url,name)

Headers for S04195: ['S.N.', 'Candidate', 'Party', 'EVM Votes', 'Postal Votes', 'Total Votes', '% of Votes']
Rows for S04195: [['1', 'PRABHUNATH PRASAD', 'Janata Dal  (United)', '43334', '291', '43625', '31.66'], ['2', 'SHIV PRAKASH RANJAN', 'Communist Party of India  (Marxist-Leninist)  (Liberation)', '73191', '269', '73460', '53.31'], ['3', 'UPENDRA KUMAR S/O - CHANDESHWAR RAM', 'Independent', '6414', '4', '6418', '4.66']]
Data has been saved to csv_files/S04195.csv
Headers for S0626: ['S.N.', 'Candidate', 'Party', 'EVM Votes', 'Postal Votes', 'Total Votes', '% of Votes']
Rows for S0626: [['1', 'DR. C. J. CHAVDA', 'Bharatiya Janata Party', '99083', '1558', '100641', '67.49'], ['2', 'DINESHBHAI TULSIBHAI PATEL', 'Indian National Congress', '43393', '1020', '44413', '29.78'], ['3', 'ANKITKUMAR HARSHADBHAI GOHIL', 'Independent', '960', '4', '964', '0.65']]
Data has been saved to csv_files/S0626.csv
Headers for S0683: ['S.N.', 'Candidate', 'Party', 'EVM Votes', 'Postal Votes', 'Total Vot

In [8]:
import shutil

directory_to_compress = '/content/csv_files'
output_zip_file = '/content/csv_files.zip'
shutil.make_archive(output_zip_file.replace('.zip', ''), 'zip', directory_to_compress)

'/content/csv_files.zip'