# Weather Data (scraping from Weather Underground)

In [1]:
import time
from datetime import datetime, timedelta
import pandas as pd
import requests
import aiohttp
import asyncio
import nest_asyncio
import ssl
import certifi

# Allow nested event loops (needed for Jupyter notebooks, presumably because
# the kernel already runs an event loop of its own).
# NOTE(review): the scraping cells below use top-level `await` rather than
# asyncio.run()/run_until_complete(); confirm this patch is still required.
nest_asyncio.apply()

In [2]:
# %pip install aiohttp nest_asyncio certifi  # asyncio is part of the standard library (no install needed)

From `eda.ipynb`, we found that the top 20 most popular airports by flight traffic in the United States are:

ORD, ATL, DFW, LAX, PHX, DEN, DTW, IAH, MSP, SFO, STL, EWR, LAS, CLT, LGA, BOS, PHL, PIT, SLC, SEA

In [3]:
# IATA codes of the 20 busiest US airports (taken from eda.ipynb)
iata_codes_top_20 = [
    'ORD', 'ATL', 'DFW', 'LAX',
    'PHX', 'DEN', 'DTW', 'IAH',
    'MSP', 'SFO', 'STL', 'EWR',
    'LAS', 'CLT', 'LGA', 'BOS',
    'PHL', 'PIT', 'SLC', 'SEA',
]

# ICAO station codes for these US airports: the upper-cased IATA code prefixed with 'K'
icao_codes_top_20 = [f"K{iata.upper()}" for iata in iata_codes_top_20]

# Five batches of four stations each; every batch is scraped by its own cell
list1, list2, list3, list4, list5 = (
    icao_codes_top_20[start:start + 4] for start in range(0, 20, 4)
)

In [4]:
async def scrape_wunderground_async(session, station, date, semaphore, progress_counter):
    """Fetch one day of historical observations for one station.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``json()`` (an
            ``aiohttp.ClientSession`` in this notebook).
        station: Station code interpolated into the request URL (e.g. ``'KORD'``).
        date: Day to fetch as ``'YYYY-MM-DD'``; the dashes are stripped to build
            the ``startDate`` query parameter.
        semaphore: ``asyncio.Semaphore`` shared by all tasks; it is held for the
            whole request so that concurrency stays bounded.
        progress_counter: Mutable dict with ``'completed'``, ``'total'`` and
            ``'rows'`` keys, shared across tasks and updated in place.
            ``'completed'`` counts every finished request (successful or not).

    Returns:
        A DataFrame holding exactly the columns ``obs_id, valid_time_gmt,
        wx_phrase, temp, precip_hrly, snow_hrly, wspd, clds, rh, vis`` (in that
        order; fields absent from the payload are filled with NaN), or an empty
        DataFrame when the payload has no observations or the request failed.
        Errors are reported with ``print`` and never raised to the caller.
    """
    import os  # local import: keeps this cell independent of the import cell

    columns = ["obs_id", "valid_time_gmt", "wx_phrase", "temp", "precip_hrly",
               "snow_hrly", "wspd", "clds", "rh", "vis"]

    # The key can be overridden without editing the notebook; the default keeps
    # the previous behaviour when the environment variable is not set.
    api_key = os.environ.get("WUNDERGROUND_API_KEY", "e1f10a1e78da46f5b10a1e78da96f525")

    async with semaphore:  # Limit concurrent requests
        url = (
            f"https://api.weather.com/v1/location/{station}:9:US/observations/historical.json"
            f"?apiKey={api_key}&units=e&startDate={date.replace('-', '')}"
        )
        df = pd.DataFrame()
        try:
            async with session.get(url) as response:
                data = await response.json()

            observations = data.get("observations") if isinstance(data, dict) else None
            if observations:
                # reindex (rather than df[[...]]) so a payload that lacks one of
                # the fields keeps the day's data instead of raising KeyError.
                df = pd.DataFrame(observations).reindex(columns=columns)
        except Exception as e:
            # SSL errors are only printed for dates ending in "10-01" (the first
            # day of the range and each anniversary) to avoid flooding the output.
            if "SSL" not in str(e) or date.endswith("10-01"):
                # Include the exception type: some errors (e.g. timeouts) have an empty message.
                print(f"Error scraping {station} on {date}: {type(e).__name__}: {e}")
            df = pd.DataFrame()

        # Count every finished day so that completed/total reflects real progress,
        # including days that returned nothing or failed.
        progress_counter['completed'] += 1
        progress_counter['rows'] += len(df)

        # Print progress every 100 days
        if progress_counter['completed'] % 100 == 0:
            print(f"Progress: {progress_counter['completed']}/{progress_counter['total']} days | {progress_counter['rows']:,} rows scraped")

        return df

async def scrape_multidate_async(station, start_date, end_date, max_concurrent=20):
    """Download every day between two dates (inclusive) for a single station.

    Args:
        station: Station code forwarded to ``scrape_wunderground_async``.
        start_date: First day, formatted ``'YYYY-MM-DD'``.
        end_date: Last day (included), formatted ``'YYYY-MM-DD'``.
        max_concurrent: Size of the semaphore bounding in-flight requests.

    Returns:
        One DataFrame with the rows of all non-empty daily results concatenated
        (index reset), or an empty DataFrame when no day produced data.
    """
    day_format = '%Y-%m-%d'
    first_day = datetime.strptime(start_date, day_format).date()
    last_day = datetime.strptime(end_date, day_format).date()

    # One date string per calendar day, both endpoints included
    day_strings = []
    for offset in range((last_day - first_day).days + 1):
        day_strings.append((first_day + timedelta(days=offset)).strftime(day_format))
    total_days = len(day_strings)

    # Shared mutable state: every task updates this same dict
    progress_counter = dict(completed=0, total=total_days, rows=0)

    # Bounds the number of simultaneous requests
    semaphore = asyncio.Semaphore(max_concurrent)

    # TLS verification against the certifi CA bundle
    ssl_context = ssl.create_default_context(cafile=certifi.where())

    # Session settings: 30 s total timeout, connector using the SSL context above
    timeout = aiohttp.ClientTimeout(total=30)
    connector = aiohttp.TCPConnector(ssl=ssl_context)

    print(f"Starting to scrape {total_days} days for {station}...")

    async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
        # One coroutine per day; the semaphore decides how many actually run at once
        pending = []
        for day in day_strings:
            pending.append(
                scrape_wunderground_async(session, station, day, semaphore, progress_counter)
            )

        # return_exceptions=True: a failing task yields its exception object instead of aborting the batch
        results = await asyncio.gather(*pending, return_exceptions=True)

    # Keep only real, non-empty DataFrames (drops exceptions and empty days)
    frames = []
    for outcome in results:
        if not isinstance(outcome, pd.DataFrame):
            continue
        if outcome.empty:
            continue
        frames.append(outcome)

    if not frames:
        print(f"✗ No data scraped for {station}")
        return pd.DataFrame()

    combined_df = pd.concat(frames, ignore_index=True)
    print(f"✓ Scraped {len(frames)}/{total_days} days for {station} | Total rows: {len(combined_df):,}")
    return combined_df

async def scrape_all_stations_async(station_list, start_date, end_date):
    """Scrape a list of stations sequentially and collect their DataFrames.

    Stations are processed one after another (each station's days are fetched
    concurrently inside ``scrape_multidate_async``), with a one-second pause
    after every station.

    Args:
        station_list: Station codes to scrape, in order.
        start_date: First day, formatted ``'YYYY-MM-DD'``.
        end_date: Last day (included), formatted ``'YYYY-MM-DD'``.

    Returns:
        List with one DataFrame per station that produced data; stations with
        no data are omitted.
    """
    collected = []
    row_tally = 0
    rule = '=' * 60

    for position, station in enumerate(station_list, start=1):
        print(f"\n{rule}")
        print(f"Station {position}/{len(station_list)}: {station}")
        print(rule)

        station_df = await scrape_multidate_async(station, start_date, end_date, max_concurrent=20)

        if not station_df.empty:
            collected.append(station_df)
            row_tally += len(station_df)
            print(f"Running total: {row_tally:,} rows across {len(collected)} stations")

        # Pause briefly before the next station to go easy on the API
        await asyncio.sleep(1)

    return collected

In [5]:
# --- Scrape weather for all ICAO codes (list 1) - OPTIMIZED ASYNC VERSION ---
print("Starting async scraping for list 1...")
print(f"Stations to scrape: {list1}")

# Run the async scraping
all_weather_1 = await scrape_all_stations_async(list1, '1987-10-01', '2008-04-30')

# Combine all stations into one DataFrame
if all_weather_1:
    df_all_weather_1 = pd.concat(all_weather_1, ignore_index=True)
    print("\n" + "="*50)
    print("✓ Scraping complete!")
    print("="*50)
    print(f"\nTotal rows scraped: {len(df_all_weather_1):,}")
    print(f"\nFirst 5 rows:")
    print(df_all_weather_1.head())

    # Save to CSV
    output_file = 'weather_data_list1.csv'
    df_all_weather_1.to_csv(output_file, index=False)
    print(f"\n✓ Data saved to: {output_file}")
else:
    print("\n✗ No data was scraped.")

Starting async scraping for list 1...
Stations to scrape: ['KORD', 'KATL', 'KDFW', 'KLAX']

Station 1/4: KORD
Starting to scrape 7518 days for KORD...
Progress: 100/7518 days | 2,707 rows scraped
Progress: 200/7518 days | 5,460 rows scraped
Progress: 300/7518 days | 7,967 rows scraped
Progress: 400/7518 days | 10,583 rows scraped
Progress: 500/7518 days | 13,359 rows scraped
Progress: 600/7518 days | 16,072 rows scraped
Progress: 700/7518 days | 18,729 rows scraped
Progress: 800/7518 days | 21,317 rows scraped
Progress: 900/7518 days | 24,124 rows scraped
Progress: 1000/7518 days | 26,843 rows scraped
Progress: 1100/7518 days | 29,604 rows scraped
Progress: 1200/7518 days | 32,427 rows scraped
Progress: 1300/7518 days | 35,255 rows scraped
Progress: 1400/7518 days | 37,857 rows scraped
Progress: 1500/7518 days | 40,577 rows scraped
Progress: 1600/7518 days | 43,497 rows scraped
Progress: 1700/7518 days | 46,313 rows scraped
Progress: 1800/7518 days | 49,056 rows scraped
Progress: 1900/

  combined_df = pd.concat(df_list, ignore_index=True)


✓ Scraped 7487/7518 days for KORD | Total rows: 205,796
Running total: 205,796 rows across 1 stations

Station 2/4: KATL
Starting to scrape 7518 days for KATL...
Progress: 100/7518 days | 2,683 rows scraped
Progress: 200/7518 days | 5,341 rows scraped
Progress: 300/7518 days | 7,923 rows scraped
Progress: 400/7518 days | 10,662 rows scraped
Progress: 500/7518 days | 13,413 rows scraped
Progress: 600/7518 days | 16,156 rows scraped
Progress: 700/7518 days | 18,960 rows scraped
Progress: 800/7518 days | 21,667 rows scraped
Progress: 900/7518 days | 24,414 rows scraped
Progress: 1000/7518 days | 27,015 rows scraped
Progress: 1100/7518 days | 29,632 rows scraped
Progress: 1200/7518 days | 32,369 rows scraped
Progress: 1300/7518 days | 35,148 rows scraped
Progress: 1400/7518 days | 37,964 rows scraped
Progress: 1500/7518 days | 40,602 rows scraped
Progress: 1600/7518 days | 43,291 rows scraped
Progress: 1700/7518 days | 45,942 rows scraped
Progress: 1800/7518 days | 48,725 rows scraped
Prog

  combined_df = pd.concat(df_list, ignore_index=True)


✓ Scraped 7486/7518 days for KATL | Total rows: 207,266
Running total: 413,062 rows across 2 stations

Station 3/4: KDFW
Starting to scrape 7518 days for KDFW...
Progress: 100/7518 days | 2,790 rows scraped
Progress: 200/7518 days | 5,549 rows scraped
Progress: 300/7518 days | 8,144 rows scraped
Progress: 400/7518 days | 10,729 rows scraped
Progress: 500/7518 days | 13,573 rows scraped
Progress: 600/7518 days | 16,393 rows scraped
Progress: 700/7518 days | 19,141 rows scraped
Progress: 800/7518 days | 21,724 rows scraped
Progress: 900/7518 days | 24,573 rows scraped
Progress: 1000/7518 days | 27,468 rows scraped
Progress: 1100/7518 days | 30,080 rows scraped
Progress: 1200/7518 days | 33,034 rows scraped
Progress: 1300/7518 days | 35,970 rows scraped
Progress: 1400/7518 days | 38,734 rows scraped
Progress: 1500/7518 days | 41,512 rows scraped
Progress: 1600/7518 days | 44,593 rows scraped
Progress: 1700/7518 days | 47,475 rows scraped
Progress: 1800/7518 days | 50,193 rows scraped
Prog

  combined_df = pd.concat(df_list, ignore_index=True)


✓ Scraped 7486/7518 days for KDFW | Total rows: 207,091
Running total: 620,153 rows across 3 stations

Station 4/4: KLAX
Starting to scrape 7518 days for KLAX...
Progress: 100/7518 days | 2,664 rows scraped
Progress: 200/7518 days | 5,356 rows scraped
Progress: 300/7518 days | 8,072 rows scraped
Progress: 400/7518 days | 10,862 rows scraped
Progress: 500/7518 days | 13,500 rows scraped
Progress: 600/7518 days | 16,193 rows scraped
Progress: 700/7518 days | 18,936 rows scraped
Progress: 800/7518 days | 21,654 rows scraped
Progress: 900/7518 days | 24,319 rows scraped
Progress: 1000/7518 days | 27,012 rows scraped
Progress: 1100/7518 days | 29,678 rows scraped
Progress: 1200/7518 days | 32,308 rows scraped
Progress: 1300/7518 days | 35,062 rows scraped
Progress: 1400/7518 days | 37,875 rows scraped
Progress: 1500/7518 days | 40,979 rows scraped
Progress: 1600/7518 days | 43,657 rows scraped
Progress: 1700/7518 days | 46,371 rows scraped
Progress: 1800/7518 days | 49,198 rows scraped
Prog

  combined_df = pd.concat(df_list, ignore_index=True)


✓ Scraped 7485/7518 days for KLAX | Total rows: 202,057
Running total: 822,210 rows across 4 stations


  df_all_weather_1 = pd.concat(all_weather_1, ignore_index=True)



✓ Scraping complete!

Total rows scraped: 822,210

First 5 rows:
  obs_id  valid_time_gmt      wx_phrase  temp  precip_hrly  snow_hrly  wspd  \
0   KORD       560062800           Fair  44.0          NaN        NaN   NaN   
1   KORD       560066400           Fair  44.0          NaN        NaN   5.0   
2   KORD       560070000           Fair  42.0          NaN        NaN   5.0   
3   KORD       560073600  Partly Cloudy  40.0          NaN        NaN   NaN   
4   KORD       560077200           Fair  39.0          NaN        NaN   5.0   

  clds    rh   vis  
0  CLR  79.0  15.0  
1  CLR  79.0  15.0  
2  CLR  82.0  15.0  
3  SCT  89.0  15.0  
4  FEW  93.0  15.0  

✓ Data saved to: weather_data_list1.csv


In [None]:
# --- Scrape weather for all ICAO codes (list 2) - OPTIMIZED ASYNC VERSION ---
print("Starting async scraping for list 2...")
print(f"Stations to scrape: {list2}")

# Run the async scraping
all_weather_2 = await scrape_all_stations_async(list2, '1987-10-01', '2008-04-30')

# Combine all stations into one DataFrame
if all_weather_2:
    df_all_weather_2 = pd.concat(all_weather_2, ignore_index=True)
    print("\n" + "="*50)
    print("✓ Scraping complete!")
    print("="*50)
    print(f"\nTotal rows scraped: {len(df_all_weather_2):,}")
    print(f"\nFirst 5 rows:")
    print(df_all_weather_2.head())

    # Save to CSV
    output_file = 'weather_data_list2.csv'
    df_all_weather_2.to_csv(output_file, index=False)
    print(f"\n✓ Data saved to: {output_file}")
else:
    print("\n✗ No data was scraped.")

In [None]:
# --- Scrape weather for all ICAO codes (list 3) - OPTIMIZED ASYNC VERSION ---
print("Starting async scraping for list 3...")
print(f"Stations to scrape: {list3}")

# Run the async scraping
all_weather_3 = await scrape_all_stations_async(list3, '1987-10-01', '2008-04-30')

# Combine all stations into one DataFrame
if all_weather_3:
    df_all_weather_3 = pd.concat(all_weather_3, ignore_index=True)
    print("\n" + "="*50)
    print("✓ Scraping complete!")
    print("="*50)
    print(f"\nTotal rows scraped: {len(df_all_weather_3):,}")
    print(f"\nFirst 5 rows:")
    print(df_all_weather_3.head())

    # Save to CSV
    output_file = 'weather_data_list3.csv'
    df_all_weather_3.to_csv(output_file, index=False)
    print(f"\n✓ Data saved to: {output_file}")
else:
    print("\n✗ No data was scraped.")

Starting async scraping for list 3...
Stations to scrape: ['KMSP', 'KSFO', 'KSTL', 'KEWR']

Station 1/4: KMSP
Starting to scrape 7518 days for KMSP...
Progress: 100/7518 days | 2,688 rows scraped
Progress: 100/7518 days | 2,688 rows scraped
Progress: 200/7518 days | 5,381 rows scraped
Progress: 200/7518 days | 5,381 rows scraped
Progress: 300/7518 days | 7,846 rows scraped
Progress: 300/7518 days | 7,846 rows scraped
Progress: 400/7518 days | 10,419 rows scraped
Progress: 400/7518 days | 10,419 rows scraped
Progress: 500/7518 days | 13,111 rows scraped
Progress: 500/7518 days | 13,111 rows scraped
Progress: 600/7518 days | 15,792 rows scraped
Progress: 600/7518 days | 15,792 rows scraped
Progress: 700/7518 days | 18,340 rows scraped
Progress: 700/7518 days | 18,340 rows scraped
Progress: 800/7518 days | 20,889 rows scraped
Progress: 800/7518 days | 20,889 rows scraped
Progress: 900/7518 days | 23,567 rows scraped
Progress: 900/7518 days | 23,567 rows scraped
Progress: 1000/7518 days | 

  combined_df = pd.concat(df_list, ignore_index=True)


✓ Scraped 7484/7518 days for KMSP | Total rows: 207,237
Running total: 207,237 rows across 1 stations

Station 2/4: KSFO
Starting to scrape 7518 days for KSFO...

Station 2/4: KSFO
Starting to scrape 7518 days for KSFO...
Progress: 100/7518 days | 2,910 rows scraped
Progress: 100/7518 days | 2,910 rows scraped
Progress: 200/7518 days | 5,584 rows scraped
Progress: 200/7518 days | 5,584 rows scraped
Progress: 300/7518 days | 8,267 rows scraped
Progress: 300/7518 days | 8,267 rows scraped
Progress: 400/7518 days | 11,043 rows scraped
Progress: 400/7518 days | 11,043 rows scraped
Progress: 500/7518 days | 13,938 rows scraped
Progress: 500/7518 days | 13,938 rows scraped
Progress: 600/7518 days | 16,834 rows scraped
Progress: 600/7518 days | 16,834 rows scraped
Progress: 700/7518 days | 19,486 rows scraped
Progress: 700/7518 days | 19,486 rows scraped
Progress: 800/7518 days | 22,218 rows scraped
Progress: 800/7518 days | 22,218 rows scraped
Progress: 900/7518 days | 24,955 rows scraped
Pr

  combined_df = pd.concat(df_list, ignore_index=True)


✓ Scraped 7487/7518 days for KSFO | Total rows: 199,134
Running total: 406,371 rows across 2 stations

Station 3/4: KSTL
Starting to scrape 7518 days for KSTL...

Station 3/4: KSTL
Starting to scrape 7518 days for KSTL...
Progress: 100/7518 days | 3,072 rows scraped
Progress: 100/7518 days | 3,072 rows scraped
Progress: 200/7518 days | 6,172 rows scraped
Progress: 200/7518 days | 6,172 rows scraped
Progress: 300/7518 days | 8,813 rows scraped
Progress: 300/7518 days | 8,813 rows scraped
Progress: 400/7518 days | 11,505 rows scraped
Progress: 400/7518 days | 11,505 rows scraped
Progress: 500/7518 days | 14,602 rows scraped
Progress: 500/7518 days | 14,602 rows scraped
Progress: 600/7518 days | 17,580 rows scraped
Progress: 600/7518 days | 17,580 rows scraped
Progress: 700/7518 days | 20,484 rows scraped
Progress: 700/7518 days | 20,484 rows scraped
Progress: 800/7518 days | 23,311 rows scraped
Progress: 800/7518 days | 23,311 rows scraped
Progress: 900/7518 days | 26,226 rows scraped
Pr

  combined_df = pd.concat(df_list, ignore_index=True)


✓ Scraped 7487/7518 days for KSTL | Total rows: 216,551
Running total: 622,922 rows across 3 stations

Station 4/4: KEWR
Starting to scrape 7518 days for KEWR...

Station 4/4: KEWR
Starting to scrape 7518 days for KEWR...
Progress: 100/7518 days | 2,619 rows scraped
Progress: 100/7518 days | 2,619 rows scraped
Progress: 200/7518 days | 5,212 rows scraped
Progress: 200/7518 days | 5,212 rows scraped
Progress: 300/7518 days | 7,867 rows scraped
Progress: 300/7518 days | 7,867 rows scraped
Progress: 400/7518 days | 10,392 rows scraped
Progress: 400/7518 days | 10,392 rows scraped
Progress: 500/7518 days | 12,957 rows scraped
Progress: 500/7518 days | 12,957 rows scraped
Progress: 600/7518 days | 15,535 rows scraped
Progress: 600/7518 days | 15,535 rows scraped
Progress: 700/7518 days | 18,173 rows scraped
Progress: 700/7518 days | 18,173 rows scraped
Progress: 800/7518 days | 20,763 rows scraped
Progress: 800/7518 days | 20,763 rows scraped
Progress: 900/7518 days | 23,330 rows scraped
Pr

  combined_df = pd.concat(df_list, ignore_index=True)


✓ Scraped 7486/7518 days for KEWR | Total rows: 198,102
Running total: 821,024 rows across 4 stations


  df_all_weather_3 = pd.concat(all_weather_3, ignore_index=True)



✓ Scraping complete!

Total rows scraped: 821,024

First 5 rows:
  obs_id  valid_time_gmt      wx_phrase  temp  precip_hrly  snow_hrly  wspd  \
0   KMSP       560062800  Mostly Cloudy  49.0          NaN        NaN   3.0   
1   KMSP       560066400         Cloudy  52.0          NaN        NaN   8.0   
2   KMSP       560070000         Cloudy  53.0          NaN        NaN  12.0   
3   KMSP       560073600           Fair  52.0          NaN        NaN  12.0   
4   KMSP       560077200           Fair  50.0          NaN        NaN   9.0   

  clds    rh   vis  
0  BKN  69.0  15.0  
1  OVC  54.0  10.0  
2  OVC  48.0  10.0  
3  FEW  47.0  10.0  
4  FEW  50.0  10.0  

✓ Data saved to: weather_data_list3.csv

✓ Data saved to: weather_data_list3.csv


In [None]:
# --- Scrape weather for all ICAO codes (list 4) - OPTIMIZED ASYNC VERSION ---
print("Starting async scraping for list 4...")
print(f"Stations to scrape: {list4}")

# Run the async scraping
all_weather_4 = await scrape_all_stations_async(list4, '1987-10-01', '2008-04-30')

# Combine all stations into one DataFrame
if all_weather_4:
    df_all_weather_4 = pd.concat(all_weather_4, ignore_index=True)
    print("\n" + "="*50)
    print("✓ Scraping complete!")
    print("="*50)
    print(f"\nTotal rows scraped: {len(df_all_weather_4):,}")
    print(f"\nFirst 5 rows:")
    print(df_all_weather_4.head())

    # Save to CSV
    output_file = 'weather_data_list4.csv'
    df_all_weather_4.to_csv(output_file, index=False)
    print(f"\n✓ Data saved to: {output_file}")
else:
    print("\n✗ No data was scraped.")

Scraping station: KLAS
Scraped data for KLAS on 1987-10-01
Scraped data for KLAS on 1987-10-02
Scraped data for KLAS on 1987-10-03
Scraped data for KLAS on 1987-10-04
Scraped data for KLAS on 1987-10-05
Scraped data for KLAS on 1987-10-06
Scraped data for KLAS on 1987-10-07
Scraped data for KLAS on 1987-10-08
Scraped data for KLAS on 1987-10-09
Scraped data for KLAS on 1987-10-10
Scraped data for KLAS on 1987-10-11
Scraped data for KLAS on 1987-10-12
Scraped data for KLAS on 1987-10-13
Scraped data for KLAS on 1987-10-14
Scraped data for KLAS on 1987-10-15
Scraped data for KLAS on 1987-10-16
Scraped data for KLAS on 1987-10-17
Scraped data for KLAS on 1987-10-18
Scraped data for KLAS on 1987-10-19
Scraped data for KLAS on 1987-10-20
Scraped data for KLAS on 1987-10-21
Scraped data for KLAS on 1987-10-22
Scraped data for KLAS on 1987-10-23
Scraped data for KLAS on 1987-10-24
Scraped data for KLAS on 1987-10-25
Scraped data for KLAS on 1987-10-26
Scraped data for KLAS on 1987-10-27
Scrap

KeyboardInterrupt: 

In [None]:
# --- Scrape weather for all ICAO codes (list 5) - OPTIMIZED ASYNC VERSION ---
print("Starting async scraping for list 5...")
print(f"Stations to scrape: {list5}")

# Run the async scraping
all_weather_5 = await scrape_all_stations_async(list5, '1987-10-01', '2008-04-30')

# Combine all stations into one DataFrame
if all_weather_5:
    df_all_weather_5 = pd.concat(all_weather_5, ignore_index=True)
    print("\n" + "="*50)
    print("✓ Scraping complete!")
    print("="*50)
    print(f"\nTotal rows scraped: {len(df_all_weather_5):,}")
    print(f"\nFirst 5 rows:")
    print(df_all_weather_5.head())
    
    # Save to CSV
    output_file = 'weather_data_list5.csv'
    df_all_weather_5.to_csv(output_file, index=False)
    print(f"\n✓ Data saved to: {output_file}")
else:
    print("\n✗ No data was scraped.")

Starting async scraping for list 5...
Stations to scrape: ['KPHL', 'KPIT', 'KSLC', 'KSEA']

Station 1/4: KPHL
Starting to scrape 7518 days for KPHL...
Progress: 100/7518 days | 2,662 rows scraped
Progress: 100/7518 days | 2,662 rows scraped
Progress: 200/7518 days | 5,341 rows scraped
Progress: 200/7518 days | 5,341 rows scraped
Progress: 300/7518 days | 8,029 rows scraped
Progress: 300/7518 days | 8,029 rows scraped
Progress: 400/7518 days | 10,659 rows scraped
Progress: 400/7518 days | 10,659 rows scraped
Progress: 500/7518 days | 13,427 rows scraped
Progress: 500/7518 days | 13,427 rows scraped
Progress: 600/7518 days | 16,215 rows scraped
Progress: 600/7518 days | 16,215 rows scraped
Progress: 700/7518 days | 19,054 rows scraped
Progress: 700/7518 days | 19,054 rows scraped
Progress: 800/7518 days | 21,865 rows scraped
Progress: 800/7518 days | 21,865 rows scraped
Progress: 900/7518 days | 24,572 rows scraped
Progress: 900/7518 days | 24,572 rows scraped
Progress: 1000/7518 days | 

  combined_df = pd.concat(df_list, ignore_index=True)


✓ Scraped 7487/7518 days for KPHL | Total rows: 203,783
Running total: 203,783 rows across 1 stations

Station 2/4: KPIT
Starting to scrape 7518 days for KPIT...

Station 2/4: KPIT
Starting to scrape 7518 days for KPIT...
Progress: 100/7518 days | 2,958 rows scraped
Progress: 100/7518 days | 2,958 rows scraped
Progress: 200/7518 days | 5,980 rows scraped
Progress: 200/7518 days | 5,980 rows scraped
Progress: 300/7518 days | 8,672 rows scraped
Progress: 300/7518 days | 8,672 rows scraped
Progress: 400/7518 days | 11,509 rows scraped
Progress: 400/7518 days | 11,509 rows scraped
Progress: 500/7518 days | 14,374 rows scraped
Progress: 500/7518 days | 14,374 rows scraped
Progress: 600/7518 days | 17,310 rows scraped
Progress: 600/7518 days | 17,310 rows scraped
Progress: 700/7518 days | 20,151 rows scraped
Progress: 700/7518 days | 20,151 rows scraped
Progress: 800/7518 days | 22,976 rows scraped
Progress: 800/7518 days | 22,976 rows scraped
Progress: 900/7518 days | 25,828 rows scraped
Pr

  combined_df = pd.concat(df_list, ignore_index=True)


✓ Scraped 7485/7518 days for KPIT | Total rows: 209,444
Running total: 413,227 rows across 2 stations

Station 3/4: KSLC
Starting to scrape 7518 days for KSLC...

Station 3/4: KSLC
Starting to scrape 7518 days for KSLC...
Progress: 100/7518 days | 2,573 rows scraped
Progress: 100/7518 days | 2,573 rows scraped
Progress: 200/7518 days | 5,207 rows scraped
Progress: 200/7518 days | 5,207 rows scraped
Progress: 300/7518 days | 7,702 rows scraped
Progress: 300/7518 days | 7,702 rows scraped
Progress: 400/7518 days | 10,160 rows scraped
Progress: 400/7518 days | 10,160 rows scraped
Progress: 500/7518 days | 13,061 rows scraped
Progress: 500/7518 days | 13,061 rows scraped
Progress: 600/7518 days | 15,641 rows scraped
Progress: 600/7518 days | 15,641 rows scraped
Progress: 700/7518 days | 18,131 rows scraped
Progress: 700/7518 days | 18,131 rows scraped
Progress: 800/7518 days | 20,686 rows scraped
Progress: 800/7518 days | 20,686 rows scraped
Progress: 900/7518 days | 23,308 rows scraped
Pr

  combined_df = pd.concat(df_list, ignore_index=True)


✓ Scraped 7488/7518 days for KSLC | Total rows: 195,345
Running total: 608,572 rows across 3 stations

Station 4/4: KSEA
Starting to scrape 7518 days for KSEA...

Station 4/4: KSEA
Starting to scrape 7518 days for KSEA...
Progress: 100/7518 days | 2,903 rows scraped
Progress: 100/7518 days | 2,903 rows scraped
Progress: 200/7518 days | 5,913 rows scraped
Progress: 200/7518 days | 5,913 rows scraped
Progress: 300/7518 days | 8,634 rows scraped
Progress: 300/7518 days | 8,634 rows scraped
Progress: 400/7518 days | 11,564 rows scraped
Progress: 400/7518 days | 11,564 rows scraped
Progress: 500/7518 days | 14,660 rows scraped
Progress: 500/7518 days | 14,660 rows scraped
Progress: 600/7518 days | 17,589 rows scraped
Progress: 600/7518 days | 17,589 rows scraped
Progress: 700/7518 days | 20,253 rows scraped
Progress: 700/7518 days | 20,253 rows scraped
Progress: 800/7518 days | 23,187 rows scraped
Progress: 800/7518 days | 23,187 rows scraped
Progress: 900/7518 days | 26,381 rows scraped
Pr

  combined_df = pd.concat(df_list, ignore_index=True)


✓ Scraped 7489/7518 days for KSEA | Total rows: 215,893
Running total: 824,465 rows across 4 stations

✓ Scraping complete!

Total rows scraped: 824,465

First 5 rows:
  obs_id  valid_time_gmt      wx_phrase  temp  precip_hrly  snow_hrly  wspd  \
0   KPHL       560059200  Partly Cloudy  58.0          NaN        NaN   7.0   
1   KPHL       560062800  Partly Cloudy  55.0          NaN        NaN   9.0   
2   KPHL       560066400           Fair  57.0          NaN        NaN  10.0   
3   KPHL       560070000           Fair  57.0          NaN        NaN  10.0   
4   KPHL       560073600  Partly Cloudy  55.0          NaN        NaN  13.0   

  clds    rh   vis  
0  SCT  78.0  10.0  
1  SCT  86.0  10.0  
2  CLR  81.0  15.0  
3  CLR  74.0  15.0  
4  SCT  74.0  15.0  

✓ Scraping complete!

Total rows scraped: 824,465

First 5 rows:
  obs_id  valid_time_gmt      wx_phrase  temp  precip_hrly  snow_hrly  wspd  \
0   KPHL       560059200  Partly Cloudy  58.0          NaN        NaN   7.0   
1   KPH