In [61]:
import csv
from urllib.parse import parse_qs, urlencode, urljoin, urlparse, urlunparse

import requests
from bs4 import BeautifulSoup

# LEFT FOR TESTING PURPOSES: minimal fetch-and-parse check of the incident reports landing page (kept commented out).

# def scrape():
#     url = 'https://www.cityofmadison.com/police/newsroom/incidentreports/'
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     return soup

# if __name__ == '__main__':
#     scrape()

In [62]:
# LEFT FOR TESTING PURPOSES: prints the text of every "div.span2" cell; needs a `soup` object parsed from the listing page.

# data_rows = soup.select("div.span2") # left for testing purposes

# for row in data_rows:
#     print(row.get_text(strip=True))

In [63]:
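# LEFT FOR TESTING PURPOSES: earlier scraper that collects only the five listing columns (no "Time of Incident" lookup) and writes police_reports.csv.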

# base_url = "https://www.cityofmadison.com/police/newsroom/incidentreports/index.cfm?page={}&a=71&radius=0.5&startdate=01/01/2024&enddate=01/01/2025"
# keys = ["Released", "Incident", "Case Number", "Address", "Released By"]
# all_reports = []
# page = 1
# max_pages = 38

# while page <= max_pages:
#     url = base_url.format(page)
#     print(f"Fetching: {url}")
    
#     response = requests.get(url)
    
#     if response.status_code != 200:
#         print(f"Error fetching page {page}. Stopping.")
#         break

#     soup = BeautifulSoup(response.text, "html.parser")

#     data_rows = soup.select("div.row.incident-reports div.span2")

#     if not data_rows:
#         no_results = soup.find("p", string=lambda text: text and "No results were found" in text)
#         if no_results:
#             print("No more data found. Stopping...")
#             break
#         else:
#             print(f"No data found on page {page}. This may indicate an issue.")
#             break

#     for i in range(0, len(data_rows), len(keys)):
#         values = [row.get_text(strip=True) for row in data_rows[i:i + len(keys)]]

#         if len(values) == len(keys):
#             date_value = values[0]
#             if date_value.endswith("2024"):
#                 report_data = dict(zip(keys, values))
#                 all_reports.append(report_data)

#     print(f"Scraped page {page}: {len(data_rows) // len(keys)} reports found.")

#     page += 1  

# print(f"Total reports collected: {len(all_reports)}")

# csv_filename = "police_reports.csv"
# with open(csv_filename, mode='w', newline='', encoding='utf-8') as csv_file:
#     writer = csv.DictWriter(csv_file, fieldnames=keys)
#     writer.writeheader()  
#     for report in all_reports:
#         writer.writerow(report)

# print(f"Data has been written to {csv_filename}.")

In [60]:
# Listing endpoint template; the "{}" placeholder is filled with the page
# number via str.format(). The remaining query parameters are fixed.
base_url = (
    "https://www.cityofmadison.com/police/newsroom/incidentreports/index.cfm"
    "?page={}&a=71&radius=0.5&startdate=01/01/2024&enddate=01/01/2025"
)

# Prefix that hrefs found in listing rows are combined with to reach a detail page.
incident_base_url = "https://www.cityofmadison.com/police/newsroom/incidentreports/"

# CSV column names. All but the last are read from the listing row; the last
# one is filled from the incident's detail page.
keys = [
    "Released",
    "Incident",
    "Case Number",
    "Address",
    "Released By",
    "Time of Incident",
]

# Accumulates one dict per collected report.
all_reports = []

def get_incident_time(case_url, timeout=30):
    """Return the time portion of an incident's "Incident Date" field.

    Downloads the detail page at ``case_url``, locates the ``<h3 class="span2">``
    header whose text is "Incident Date", and reads the next
    ``<span class="span5">`` after it. The value is split on " - " and the
    piece immediately after the first separator is returned.

    Args:
        case_url: URL of the incident detail page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block forever on a stalled
            connection, which would freeze the whole scrape.

    Returns:
        The extracted time text, or "N/A" when the page cannot be fetched
        (non-200 status or any exception, including a timeout), the header or
        value element is missing, or the value contains no " - " separator.
    """
    try:
        response = requests.get(case_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error fetching incident time from {case_url}. Status code: {response.status_code}")
            return "N/A"

        soup = BeautifulSoup(response.text, "html.parser")

        incident_date_header = soup.find("h3", class_="span2", string="Incident Date")
        if not incident_date_header:
            return "N/A"

        incident_date_value = incident_date_header.find_next("span", class_="span5")
        if incident_date_value:
            incident_text = incident_date_value.get_text(strip=True)
            if " - " in incident_text:
                return incident_text.split(" - ")[1]
        return "N/A"

    except Exception as e:
        # Deliberately best-effort: one bad detail page must not abort the
        # scrape, so every failure is reported and mapped to "N/A".
        print(f"Error fetching incident time from {case_url}: {e}")
        return "N/A"

# Walk the paginated incident listing, keep the reports whose "Released" text
# ends in "2024", look up each one's incident time, then write everything to CSV.
page = 1
max_pages = 38  # upper bound on the number of listing pages requested
request_timeout = 30  # seconds; prevents a stalled connection from hanging the cell

while page <= max_pages:
    url = base_url.format(page)
    print(f"Fetching: {url}")

    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as e:
        # Stop paging but still fall through to the CSV write below so the
        # reports collected so far are not lost.
        print(f"Error fetching page {page}: {e}. Stopping.")
        break

    if response.status_code != 200:
        print(f"Error fetching page {page}. Stopping.")
        break

    soup = BeautifulSoup(response.text, "html.parser")

    incident_rows = soup.select("div.row.incident-reports")

    if not incident_rows:
        print("No more data found. Stopping...")
        break

    for row in incident_rows:
        data_cells = row.select("div.span2")
        # Skip rows that do not carry a cell for every listing column
        # (all keys except the trailing "Time of Incident").
        if len(data_cells) < len(keys) - 1:
            continue

        values = [cell.get_text(strip=True) for cell in data_cells[:len(keys) - 1]]

        # Only rows whose first cell ("Released") ends with "2024" are kept.
        if values[0].endswith("2024"):
            report_data = dict(zip(keys[:-1], values))

            # The first link in the row is taken as the detail-page link.
            case_link = row.find("a", href=True)
            if case_link:
                # urljoin resolves relative, root-relative and absolute hrefs
                # correctly, unlike plain string concatenation.
                case_url = urljoin(incident_base_url, case_link["href"])
                report_data["Time of Incident"] = get_incident_time(case_url)
            else:
                report_data["Time of Incident"] = "N/A"

            all_reports.append(report_data)

    print(f"Scraped page {page}: {len(incident_rows)} reports found.")
    page += 1

print(f"Total reports collected: {len(all_reports)}")

csv_filename = "police_reports_with_time.csv"
with open(csv_filename, mode="w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=keys)
    writer.writeheader()
    writer.writerows(all_reports)

print(f"Data has been written to {csv_filename}.")

# Written with the help of ChatGPT

Fetching: https://www.cityofmadison.com/police/newsroom/incidentreports/index.cfm?page=1&a=71&radius=0.5&startdate=01/01/2024&enddate=01/01/2025
Scraped page 1: 17 reports found.
Fetching: https://www.cityofmadison.com/police/newsroom/incidentreports/index.cfm?page=2&a=71&radius=0.5&startdate=01/01/2024&enddate=01/01/2025
Scraped page 2: 17 reports found.
Fetching: https://www.cityofmadison.com/police/newsroom/incidentreports/index.cfm?page=3&a=71&radius=0.5&startdate=01/01/2024&enddate=01/01/2025
Scraped page 3: 17 reports found.
Fetching: https://www.cityofmadison.com/police/newsroom/incidentreports/index.cfm?page=4&a=71&radius=0.5&startdate=01/01/2024&enddate=01/01/2025
Scraped page 4: 17 reports found.
Fetching: https://www.cityofmadison.com/police/newsroom/incidentreports/index.cfm?page=5&a=71&radius=0.5&startdate=01/01/2024&enddate=01/01/2025
Scraped page 5: 17 reports found.
Fetching: https://www.cityofmadison.com/police/newsroom/incidentreports/index.cfm?page=6&a=71&radius=0.5&