<a href="https://colab.research.google.com/github/aa23amd/NOAA-DATASET-CSV/blob/main/BIG_RAW_DATA_NOAA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os
import time
from datetime import datetime, timedelta

import pandas as pd
import requests

# NOAA CDO API token and query parameters.
# The token is a credential, so it is read from the NOAA_API_TOKEN environment
# variable rather than being stored in the notebook. Set it before running this
# cell, e.g. `%env NOAA_API_TOKEN=<your token>` in a cell that is not committed.
# NOTE(review): a token was previously hardcoded on this line in a shared
# notebook; treat that token as leaked and request a new one.
API_TOKEN = os.environ.get('NOAA_API_TOKEN', '')
if not API_TOKEN:
    # Warn instead of raising so the remaining configuration is still defined.
    print("Warning: NOAA_API_TOKEN is not set; NOAA API requests are expected to be rejected.")
DATASET_ID = 'GHCND'          # Daily Summaries dataset
LOCATION_ID = 'FIPS:37'       # Example: North Carolina (adjust as needed)

# Define the overall date range (January 1 through November 30, 2021 - 11 months)
overall_start = datetime(2021, 1, 1)
overall_end = datetime(2021, 11, 30)

# The overall period is split into consecutive windows of this many days,
# and each window is fetched separately.
INTERVAL_DAYS = 30
LIMIT = 1000  # Maximum records per request (pagination limit)


In [6]:
def fetch_noaa_data(start_date, end_date, dataset_id, location_id, token, limit=LIMIT,
                    max_retries=4, backoff_seconds=1.0, timeout=30):
    """
    Fetch NOAA data for a given date range and handle pagination.

    Pages through the CDO v2 `data` endpoint, `limit` records at a time, until a
    page comes back empty or shorter than `limit`. Transient failures (HTTP 429
    and 5xx responses, plus network errors raised by `requests`) are retried with
    exponential backoff before giving up on the page.

    Parameters:
        start_date, end_date: datetime-like objects; sent as 'YYYY-MM-DD' strings.
        dataset_id: value for the `datasetid` query parameter.
        location_id: value for the `locationid` query parameter.
        token: API token sent in the `token` request header.
        limit: page size; also the amount the offset advances per page.
        max_retries: extra attempts made for a page after a transient failure.
        backoff_seconds: delay before the first retry; doubled for each further retry.
        timeout: per-request timeout in seconds passed to `requests.get`.

    Returns:
        A list of record dictionaries. If a page still fails after all retries,
        an error line is printed and the records collected so far are returned,
        so the result may be partial.
    """
    all_results = []
    offset = 1  # NOAA API offset is 1-indexed
    url = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/data'
    headers = {'token': token}
    transient_statuses = {429, 500, 502, 503, 504}

    # Everything except the offset is the same for every page of this interval.
    base_params = {
        'datasetid': dataset_id,
        'locationid': location_id,
        'startdate': start_date.strftime('%Y-%m-%d'),
        'enddate': end_date.strftime('%Y-%m-%d'),
        'limit': limit,
    }
    attempts = max(0, max_retries) + 1

    while True:
        params = dict(base_params, offset=offset)

        response = None
        failure = None  # status code or exception describing the last failed attempt
        for attempt in range(attempts):
            try:
                response = requests.get(url, headers=headers, params=params, timeout=timeout)
            except requests.RequestException as exc:
                # Connection problems and timeouts are treated as transient.
                response, failure = None, exc
            else:
                failure = response.status_code
                if response.status_code not in transient_statuses:
                    # Success or a non-retryable error: stop retrying this page.
                    break
            if attempt < attempts - 1:
                time.sleep(backoff_seconds * (2 ** attempt))

        if response is None or response.status_code != 200:
            # Best effort: report the failure and keep what was already fetched.
            print(f"Error: {failure} for {start_date.date()} to {end_date.date()} (offset: {offset})")
            break

        data = response.json()
        results = data.get('results', [])
        if not results:
            break

        all_results.extend(results)

        # If fewer records than limit are returned, this interval is done.
        if len(results) < limit:
            break
        offset += limit

    return all_results


In [8]:
# Walk the overall date range in consecutive windows of INTERVAL_DAYS days and
# accumulate the records returned for each window in all_data.
all_data = []
current_start = overall_start

while not (current_start > overall_end):
    # Clamp the window so the last one never runs past overall_end.
    current_end = current_start + timedelta(days=INTERVAL_DAYS - 1)
    if current_end > overall_end:
        current_end = overall_end

    print(f"Fetching data from {current_start.date()} to {current_end.date()} ...")
    interval_data = fetch_noaa_data(
        current_start,
        current_end,
        DATASET_ID,
        LOCATION_ID,
        API_TOKEN,
    )
    print(f"Records fetched in this interval: {len(interval_data)}")

    all_data += interval_data
    # The next window starts the day after this one ends.
    current_start = current_end + timedelta(days=1)

print(f"Total records collected: {len(all_data)}")


Fetching data from 2021-01-01 to 2021-01-30 ...
Error: 503 for 2021-01-01 to 2021-01-30 (offset: 22001)
Records fetched in this interval: 22000
Fetching data from 2021-01-31 to 2021-03-01 ...
Error: 503 for 2021-01-31 to 2021-03-01 (offset: 24001)
Records fetched in this interval: 24000
Fetching data from 2021-03-02 to 2021-03-31 ...
Error: 503 for 2021-03-02 to 2021-03-31 (offset: 7001)
Records fetched in this interval: 7000
Fetching data from 2021-04-01 to 2021-04-30 ...
Error: 502 for 2021-04-01 to 2021-04-30 (offset: 17001)
Records fetched in this interval: 17000
Fetching data from 2021-05-01 to 2021-05-30 ...
Error: 503 for 2021-05-01 to 2021-05-30 (offset: 9001)
Records fetched in this interval: 9000
Fetching data from 2021-05-31 to 2021-06-29 ...
Error: 503 for 2021-05-31 to 2021-06-29 (offset: 20001)
Records fetched in this interval: 20000
Fetching data from 2021-06-30 to 2021-07-29 ...
Error: 503 for 2021-06-30 to 2021-07-29 (offset: 41001)
Records fetched in this interval: 41

In [9]:
# Name of the CSV file that will hold the raw records (written to the
# current working directory).
raw_csv_filename = 'raw_noaa_data_large.csv'

# Build a DataFrame from the list of record dictionaries and show a quick
# summary: its shape and its first rows.
raw_df = pd.DataFrame(all_data)
print("Raw DataFrame shape:", raw_df.shape)
print(raw_df.head())

# Persist the raw data without the index column.
raw_df.to_csv(raw_csv_filename, index=False)
print(f"Raw data saved as '{raw_csv_filename}'")


Raw DataFrame shape: (287708, 5)
                  date datatype            station attributes  value
0  2021-01-01T00:00:00     PRCP  GHCND:US1NCAG0005   ,,N,0700     15
1  2021-01-01T00:00:00     PRCP  GHCND:US1NCAG0007   ,,N,0700     10
2  2021-01-01T00:00:00     PRCP  GHCND:US1NCAL0014   ,,N,0900     13
3  2021-01-01T00:00:00     PRCP  GHCND:US1NCAL0036   ,,N,0800      5
4  2021-01-01T00:00:00     PRCP  GHCND:US1NCAL0038   ,,N,0700     10
Raw data saved as 'raw_noaa_data_large.csv'


In [10]:
# Offer the saved CSV as a browser download when running inside Google Colab.
# The google.colab package is only importable in the Colab runtime, so anywhere
# else the download step is skipped instead of failing with ImportError.
try:
    from google.colab import files
except ImportError:
    print(f"google.colab is not available; skipping download of '{raw_csv_filename}'.")
else:
    files.download(raw_csv_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>