# Webscraping and Statistical Testing

### Webscraping Dallas Fort Worth Airport (DFW) for flight delays on 2025-10-07 and comparing with historical data from 2024-10-09

Find the city with the most departure delays, then scrape its airport's website for delay information:

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime, timedelta

In [None]:
# Scrape departure rows from the DFW flights page into `data`, one dict per
# flight whose status is "Departed" or "Delayed".
#
# NOTE(review): the browser setup below is commented out, yet the loop uses
# `driver` and `wait`. This cell therefore only runs if both names already
# exist in the session (presumably from an earlier execution) - confirm, or
# re-enable the setup before running from a fresh kernel.
# driver = webdriver.Chrome()
# driver.get("https://www.dfwairport.com/flights/")
# wait = WebDriverWait(driver, 10)

# input("Please click 'Departures' and 'Show Earlier Flights' until ready, then press Enter to start scraping...")

# Accumulates {"expected_dep": str, "dep_delay": str} dicts across all passes.
data = []

while True:
    # Presumably one element per flight row; the generated CSS class names are
    # tied to the site's current build and may change - verify before reuse.
    rows = driver.find_elements(By.CSS_SELECTOR, 'div.css-16mp7cl.e1tlh5vk0')
    for row in rows:
        try:
            status_div = row.find_element(By.CSS_SELECTOR, 'div.css-1tdn8t0.e1xvvp7d0')
            status_span = status_div.find_element(By.CSS_SELECTOR, 'span.e1y53rij0')
            status_text = status_span.text.strip()
            times_div = row.find_element(By.CSS_SELECTOR, 'div.css-1c5a36w.e8j8o0n0')
            # Presumably <s> holds the struck-through scheduled time and <em>
            # the revised time for a delayed flight - TODO confirm against the page.
            s_tags = times_div.find_elements(By.TAG_NAME, 's')
            em_tags = times_div.find_elements(By.TAG_NAME, 'em')
            span_tags = times_div.find_elements(By.TAG_NAME, 'span')

            # Rows with any other status are ignored.
            if status_text in ["Departed", "Delayed"]:
                if s_tags and em_tags:
                    # Both tags present: record the <s> text and the <em> text.
                    expected_dep = s_tags[0].text.strip()
                    dep_delay = em_tags[0].text.strip()
                elif span_tags:
                    # Only a <span>: record its text with an empty dep_delay.
                    expected_dep = span_tags[0].text.strip()
                    dep_delay = ""
                else:
                    # Fallback: use the whole text of the times container.
                    expected_dep = times_div.text.strip()
                    dep_delay = ""
                data.append({
                    "expected_dep": expected_dep,
                    "dep_delay": dep_delay
                })
        except Exception:
            # Best effort: any failure while reading a row (missing element,
            # stale reference, ...) silently skips that row.
            continue

    # Try to load earlier flights; the loop ends when the button is missing,
    # disabled, labelled differently, or anything in this step raises.
    try:
        earlier_btn = driver.find_element(By.CSS_SELECTOR, 'button.css-1ug2c5u.e1owxzay2')
        if earlier_btn.is_enabled() and 'Show earlier flights' in earlier_btn.text:
            earlier_btn.click()
            # Wait for the first previously-read row to go stale, then for rows
            # to be present again. If `rows` is empty, rows[0] raises IndexError,
            # which the handler below turns into a loop exit.
            # NOTE(review): every pass re-reads all rows currently on the page;
            # if the click adds rows rather than replacing them, flights already
            # collected would be appended again - verify for duplicates.
            wait.until(EC.staleness_of(rows[0]))
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.css-16mp7cl.e1tlh5vk0')))
        else:
            break
    except Exception:
        # Any error here (including a wait timeout) stops the scrape without
        # reporting the cause.
        break

# Close the browser session once scraping has stopped.
driver.quit()
print("Scraping complete.")

# Build the raw table from the scraped records; naming the columns explicitly
# keeps both of them present even when nothing was scraped.
scraped_columns = ["expected_dep", "dep_delay"]
df_scraped = pd.DataFrame(data, columns=scraped_columns)

# Assign date column: a time of exactly 12:00 AM maps to 2025-10-08, every
# other parseable time maps to 2025-10-07.
def assign_date(time_str):
    """Return the calendar date (as a string) for a scraped clock time.

    Args:
        time_str: Clock time formatted like "01:30 PM" (``%I:%M %p``).

    Returns:
        ``"2025-10-08"`` when the time is exactly midnight (12:00 AM),
        ``"2025-10-07"`` for any other parseable time, and ``pd.NaT`` when
        the input is missing, blank, or cannot be parsed.

    NOTE(review): only exactly 12:00 AM is treated as the next day; a time
    such as 12:05 AM is assigned 2025-10-07 - confirm this is intended.
    """
    is_blank = pd.isna(time_str) or not str(time_str).strip()
    if is_blank:
        return pd.NaT

    parsed = pd.to_datetime(time_str, format='%I:%M %p', errors='coerce')
    if pd.isna(parsed):
        return pd.NaT

    at_midnight = (parsed.hour, parsed.minute) == (0, 0)
    return "2025-10-08" if at_midnight else "2025-10-07"

# Tag each row with the date derived from its scheduled departure time.
scheduled_times = df_scraped['expected_dep']
df_scraped['date'] = scheduled_times.apply(assign_date)

# Convert a scraped clock-time string (used for both expected_dep and
# dep_delay) to minutes since midnight.
def time_to_minutes(timestr):
    """Convert a clock time such as "01:30 PM" to minutes since midnight.

    Args:
        timestr: Time string in ``%I:%M %p`` format.

    Returns:
        ``hour * 60 + minute`` as an int, or ``None`` when the input is
        missing, blank, unparseable, or parsing raises.
    """
    if pd.isna(timestr) or not str(timestr).strip():
        return None

    try:
        parsed = pd.to_datetime(timestr, format='%I:%M %p', errors='coerce')
        minutes = None if pd.isna(parsed) else parsed.hour * 60 + parsed.minute
    except Exception:
        # Best effort: any failure while parsing is reported as "no value".
        return None
    return minutes

# Derive a "<column>_min" minutes-since-midnight column for each scraped
# clock-time column. Note that dep_delay holds a clock time, so dep_delay_min
# is a time of day rather than a delay duration.
for time_col in ("expected_dep", "dep_delay"):
    df_scraped[f"{time_col}_min"] = df_scraped[time_col].apply(time_to_minutes)

# Final output: ONLY expected_dep_min, dep_delay_min, and date
output_columns = ["expected_dep_min", "dep_delay_min", "date"]
df_scraped = df_scraped[output_columns]

# Persist the cleaned table without the index column.
df_scraped.to_csv('scraped_dfw_flights.csv', index=False)

print(df_scraped.head())
print("Scraping complete.")