In [1]:
from pathlib import Path
import hashlib
import pandas as pd
import requests
import json

### DATA ACQUISITION & CHECKSUMS FOR EPA AIR QUALITY DATA

In [2]:
# Update this to your actual project root folder
PROJECT_ROOT = Path(r"C:\Users\ycjia\OneDrive\Desktop\IS-477-Project")

# EPA CSV path
epa_csv_path = PROJECT_ROOT / "data" / "raw" / "epa" / "daily_88101_2023" / "daily_88101_2023.csv"

# Checksums folder
checksums_dir = PROJECT_ROOT / "data" / "checksums"
checksums_dir.mkdir(parents=True, exist_ok=True)

epa_csv_path, checksums_dir


(WindowsPath('C:/Users/ycjia/OneDrive/Desktop/IS-477-Project/data/raw/epa/daily_88101_2023/daily_88101_2023.csv'),
 WindowsPath('C:/Users/ycjia/OneDrive/Desktop/IS-477-Project/data/checksums'))

In [3]:
# Function to compute SHA-256 checksum for JSON file
def compute_sha256(file_path):
    hasher = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

epa_hash = compute_sha256(epa_csv_path)
epa_hash

'd705eac34cf86ddd7703c14ca6e14d41c9e46998e3af5bbbb4187d9876da2add'

In [4]:
# Write checksum to file
checksum_file = checksums_dir / "epa_daily_88101_2023.sha256"

with checksum_file.open("w", encoding="utf-8") as f:
    f.write(f"{epa_hash}  data/raw/epa/daily_88101_2023/daily_88101_2023.csv\n")

checksum_file, checksum_file.read_text()


(WindowsPath('C:/Users/ycjia/OneDrive/Desktop/IS-477-Project/data/checksums/epa_daily_88101_2023.sha256'),
 'd705eac34cf86ddd7703c14ca6e14d41c9e46998e3af5bbbb4187d9876da2add  data/raw/epa/daily_88101_2023/daily_88101_2023.csv\n')

In [5]:
# Load the EPA CSV to check
epa_df = pd.read_csv(epa_csv_path)
epa_df.head()

Unnamed: 0,State Code,County Code,Site Num,Parameter Code,POC,Latitude,Longitude,Datum,Parameter Name,Sample Duration,...,AQI,Method Code,Method Name,Local Site Name,Address,State Name,County Name,City Name,CBSA Name,Date of Last Change
0,1,3,10,88101,3,30.497478,-87.880258,NAD83,PM2.5 - Local Conditions,1 HOUR,...,,209,Met One BAM-1022 Mass Monitor w/ VSCC or TE-PM...,"FAIRHOPE, Alabama","FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE...",Alabama,Baldwin,Fairhope,"Daphne-Fairhope-Foley, AL",2024-08-06
1,1,3,10,88101,3,30.497478,-87.880258,NAD83,PM2.5 - Local Conditions,1 HOUR,...,,209,Met One BAM-1022 Mass Monitor w/ VSCC or TE-PM...,"FAIRHOPE, Alabama","FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE...",Alabama,Baldwin,Fairhope,"Daphne-Fairhope-Foley, AL",2024-08-06
2,1,3,10,88101,3,30.497478,-87.880258,NAD83,PM2.5 - Local Conditions,1 HOUR,...,,209,Met One BAM-1022 Mass Monitor w/ VSCC or TE-PM...,"FAIRHOPE, Alabama","FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE...",Alabama,Baldwin,Fairhope,"Daphne-Fairhope-Foley, AL",2024-08-06
3,1,3,10,88101,3,30.497478,-87.880258,NAD83,PM2.5 - Local Conditions,1 HOUR,...,,209,Met One BAM-1022 Mass Monitor w/ VSCC or TE-PM...,"FAIRHOPE, Alabama","FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE...",Alabama,Baldwin,Fairhope,"Daphne-Fairhope-Foley, AL",2024-08-06
4,1,3,10,88101,3,30.497478,-87.880258,NAD83,PM2.5 - Local Conditions,1 HOUR,...,,209,Met One BAM-1022 Mass Monitor w/ VSCC or TE-PM...,"FAIRHOPE, Alabama","FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE...",Alabama,Baldwin,Fairhope,"Daphne-Fairhope-Foley, AL",2024-08-06


### DATA ACQUISITION & CHECKSUMS FOR NOAA JSON WEATHER DATA

In [6]:
# Update this to your actual project root folder
PROJECT_ROOT = Path(r"C:\Users\ycjia\OneDrive\Desktop\IS-477-Project")

# NOAA Weather Data Paths
weather_raw_dir = PROJECT_ROOT / "data" / "raw" / "noaa"
weather_raw_dir.mkdir(parents=True, exist_ok=True)

# Paths for JSON and CSV files
weather_json_path = weather_raw_dir / "open_meteo_chicago_2023.json"
weather_csv_path = weather_raw_dir / "open_meteo_chicago_2023.csv"

weather_raw_dir, weather_json_path, weather_csv_path


(WindowsPath('C:/Users/ycjia/OneDrive/Desktop/IS-477-Project/data/raw/noaa'),
 WindowsPath('C:/Users/ycjia/OneDrive/Desktop/IS-477-Project/data/raw/noaa/open_meteo_chicago_2023.json'),
 WindowsPath('C:/Users/ycjia/OneDrive/Desktop/IS-477-Project/data/raw/noaa/open_meteo_chicago_2023.csv'))

In [7]:
# NOAA API request for Chicago 2023 daily data
url = "https://archive-api.open-meteo.com/v1/archive"

params = {
    "latitude": 41.85,
    "longitude": -87.65,
    "start_date": "2023-01-01",
    "end_date": "2023-12-31",
    "daily": [
        "temperature_2m_max",
        "temperature_2m_min",
        "wind_direction_10m_dominant",
        "precipitation_sum",
        "shortwave_radiation_sum",
        "relative_humidity_2m_mean",
        "wind_speed_10m_max",
    ],
    "timezone": "America/Chicago",
}

response = requests.get(url, params=params)
response.status_code


200

In [8]:
data = response.json()

with weather_json_path.open("w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

weather_json_path


WindowsPath('C:/Users/ycjia/OneDrive/Desktop/IS-477-Project/data/raw/noaa/open_meteo_chicago_2023.json')

In [9]:
data = response.json()
print(data.keys())
daily = data["daily"]
print(daily.keys())
print(len(daily["time"]), len(daily["temperature_2m_max"]), len(daily["wind_speed_10m_max"]))

dict_keys(['latitude', 'longitude', 'generationtime_ms', 'utc_offset_seconds', 'timezone', 'timezone_abbreviation', 'elevation', 'daily_units', 'daily'])
dict_keys(['time', 'temperature_2m_max', 'temperature_2m_min', 'wind_direction_10m_dominant', 'precipitation_sum', 'shortwave_radiation_sum', 'relative_humidity_2m_mean', 'wind_speed_10m_max'])
365 365 365


In [10]:
weather_sha = compute_sha256(weather_json_path)
weather_sha

'ed65708a63f1292fdc15b656713cbfaaa5fee4a19d5c239ec5b3a57637ec4f5b'

In [11]:
# Write checksum to file
checksum_file = checksums_dir / "noaa_open_meteo_2023_json.sha256"

with checksum_file.open("w", encoding="utf-8") as f:
    f.write(f"{weather_sha}  data/raw/noaa/open_meteo_chicago_2023.json\n")

checksum_file, checksum_file.read_text()


(WindowsPath('C:/Users/ycjia/OneDrive/Desktop/IS-477-Project/data/checksums/noaa_open_meteo_2023_json.sha256'),
 'ed65708a63f1292fdc15b656713cbfaaa5fee4a19d5c239ec5b3a57637ec4f5b  data/raw/noaa/open_meteo_chicago_2023.json\n')

### CONVERT JSON to CSV

In [12]:
PROJECT_ROOT = Path(r"C:\Users\ycjia\OneDrive\Desktop\IS-477-Project")

weather_json_path = PROJECT_ROOT / "data" / "raw" / "noaa" / "open_meteo_chicago_2023.json"

with weather_json_path.open("r", encoding="utf-8") as f:
    data = json.load(f)

# Quick check
data.keys()

dict_keys(['latitude', 'longitude', 'generationtime_ms', 'utc_offset_seconds', 'timezone', 'timezone_abbreviation', 'elevation', 'daily_units', 'daily'])

In [13]:
daily = data["daily"]

# Turn daily dict into a DataFrame
weather_df = pd.DataFrame(daily)

# Rename the "time" column to "date"
weather_df = weather_df.rename(columns={"time": "date"})

# convert date to real datetime type
weather_df["date"] = pd.to_datetime(weather_df["date"])

weather_df.head()


Unnamed: 0,date,temperature_2m_max,temperature_2m_min,wind_direction_10m_dominant,precipitation_sum,shortwave_radiation_sum,relative_humidity_2m_mean,wind_speed_10m_max
0,2023-01-01,8.5,0.6,227,3.3,7.02,92,17.1
1,2023-01-02,4.6,-0.4,35,0.3,4.73,97,18.1
2,2023-01-03,12.7,4.2,112,13.1,2.13,98,23.6
3,2023-01-04,7.2,0.2,232,0.0,3.29,86,23.0
4,2023-01-05,0.9,-0.4,240,1.3,2.61,84,21.3


In [14]:
weather_csv_path = PROJECT_ROOT / "data" / "raw" / "noaa" / "open_meteo_chicago_2023_daily.csv"
weather_df.to_csv(weather_csv_path, index=False)
weather_csv_path

WindowsPath('C:/Users/ycjia/OneDrive/Desktop/IS-477-Project/data/raw/noaa/open_meteo_chicago_2023_daily.csv')

In [15]:
checksums_dir = PROJECT_ROOT / "data" / "checksums"
checksums_dir.mkdir(parents=True, exist_ok=True)

weather_csv_hash = compute_sha256(weather_csv_path)

weather_csv_checksum_file = checksums_dir / "open_meteo_chicago_2023_daily_csv.sha256"

with weather_csv_checksum_file.open("w", encoding="utf-8") as f:
    f.write(f"{weather_csv_hash}  data/raw/noaa/open_meteo_chicago_2023_daily.csv\n")

weather_csv_checksum_file.read_text()


'e775c7d27564ffc91e850b61ed8af09dc8be933151fdb1253ba6b2178da44e26  data/raw/noaa/open_meteo_chicago_2023_daily.csv\n'