<a href="https://colab.research.google.com/github/aa23amd/NOAA-DATASET-CSV/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import pandas as pd
from datetime import datetime, timedelta

# NOAA API token and parameters
API_TOKEN = 'meMJNjoRWdehUATKDiEJKsGcmCjJaUtN'  # Replace with your actual token
DATASET_ID = 'GHCND'  # Daily Summaries
LOCATION_ID = 'FIPS:37'  # Example: North Carolina

# Define the overall date range (choose a period with multiple days)
overall_start = datetime(2022, 1, 1)
overall_end = datetime(2022, 6, 30)  # 6 months, for example

# Set the maximum allowed interval (NOAA requires less than one year)
# We'll use a 30-day interval for each request.
interval_days = 30


In [2]:
def fetch_noaa_data(start_date, end_date, dataset_id, location_id, token, limit=1000):
    url = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/data'
    headers = {'token': token}
    params = {
        'datasetid': dataset_id,
        'locationid': location_id,
        'startdate': start_date.strftime('%Y-%m-%d'),
        'enddate': end_date.strftime('%Y-%m-%d'),
        'limit': limit
    }

    response = requests.get(url, headers=headers, params=params)

    if response.status_code == 200:
        data = response.json()
        return data.get('results', [])
    else:
        print(f"Error: {response.status_code} for {start_date.date()} to {end_date.date()}")
        return []


In [3]:
all_data = []
current_start = overall_start

while current_start < overall_end:
    # Set current_end to current_start + interval_days, not exceeding overall_end
    current_end = min(current_start + timedelta(days=interval_days - 1), overall_end)

    print(f"Fetching data from {current_start.date()} to {current_end.date()} ...")
    interval_data = fetch_noaa_data(current_start, current_end, DATASET_ID, LOCATION_ID, API_TOKEN)
    print(f"Records fetched in this interval: {len(interval_data)}")
    all_data.extend(interval_data)

    # Move to the next interval (the day after current_end)
    current_start = current_end + timedelta(days=1)

print(f"Total records collected: {len(all_data)}")


Fetching data from 2022-01-01 to 2022-01-30 ...
Records fetched in this interval: 1000
Fetching data from 2022-01-31 to 2022-03-01 ...
Records fetched in this interval: 1000
Fetching data from 2022-03-02 to 2022-03-31 ...
Records fetched in this interval: 1000
Fetching data from 2022-04-01 to 2022-04-30 ...
Records fetched in this interval: 1000
Fetching data from 2022-05-01 to 2022-05-30 ...
Records fetched in this interval: 1000
Fetching data from 2022-05-31 to 2022-06-29 ...
Records fetched in this interval: 1000
Total records collected: 6000


In [4]:
# Convert to DataFrame
raw_df = pd.DataFrame(all_data)
print("Raw DataFrame shape:", raw_df.shape)
print(raw_df.head())

# Convert 'date' to datetime and check unique dates
raw_df['date'] = pd.to_datetime(raw_df['date'], errors='coerce')
unique_dates = raw_df['date'].dt.date.unique()
print(f"Unique dates in raw data: {len(unique_dates)}")
print(unique_dates[:10])


Raw DataFrame shape: (6000, 5)
                  date datatype            station attributes  value
0  2022-01-01T00:00:00     PRCP  GHCND:US1NCAG0001   ,,N,0738      3
1  2022-01-01T00:00:00     SNOW  GHCND:US1NCAG0001   ,,N,0738      0
2  2022-01-01T00:00:00     WESD  GHCND:US1NCAG0001   ,,N,0738      0
3  2022-01-01T00:00:00     WESF  GHCND:US1NCAG0001   ,,N,0738      0
4  2022-01-01T00:00:00     PRCP  GHCND:US1NCAG0005   ,,N,0700      8
Unique dates in raw data: 6
[datetime.date(2022, 1, 1) datetime.date(2022, 1, 31)
 datetime.date(2022, 3, 2) datetime.date(2022, 4, 1)
 datetime.date(2022, 5, 1) datetime.date(2022, 5, 31)]


In [5]:
raw_df.to_csv('raw_noaa_data.csv', index=False)
print("Raw data saved as 'raw_noaa_data.csv'")


Raw data saved as 'raw_noaa_data.csv'
