<a href="https://colab.research.google.com/github/aa23amd/NOAA-DATASET-CSV/blob/main/PROCESSED_DATA_FROM_NOAA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import requests
import pandas as pd
from datetime import datetime, timedelta

# NOAA API token and parameters (replace with your actual token)
API_TOKEN = 'meMJNjoRWdehUATKDiEJKsGcmCjJaUtN'
DATASET_ID = 'GHCND'          # Daily Summaries dataset
LOCATION_ID = 'FIPS:37'       # Example: North Carolina (adjust as needed)

# Define the overall date range (nearly 11 months)
overall_start = datetime(2021, 1, 1)
overall_end = datetime(2021, 11, 30)

# We'll break the overall period into 30-day intervals.
INTERVAL_DAYS = 30
LIMIT = 1000  # Maximum records per request (pagination limit)


In [6]:
def fetch_noaa_data(start_date, end_date, dataset_id, location_id, token, limit=LIMIT):
    """
    Fetch NOAA data for a given date range and handle pagination.
    Returns a list of record dictionaries.
    """
    all_results = []
    offset = 1  # NOAA API offset is 1-indexed
    url = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/data'
    headers = {'token': token}

    while True:
        params = {
            'datasetid': dataset_id,
            'locationid': location_id,
            'startdate': start_date.strftime('%Y-%m-%d'),
            'enddate': end_date.strftime('%Y-%m-%d'),
            'limit': limit,
            'offset': offset
        }
        response = requests.get(url, headers=headers, params=params)

        if response.status_code != 200:
            print(f"Error: {response.status_code} for {start_date.date()} to {end_date.date()} (offset: {offset})")
            break

        data = response.json()
        results = data.get('results', [])
        if not results:
            break

        all_results.extend(results)

        # If fewer records than limit are returned, this interval is done.
        if len(results) < limit:
            break
        else:
            offset += limit

    return all_results


In [8]:
all_data = []
current_start = overall_start

while current_start <= overall_end:
    current_end = min(current_start + timedelta(days=INTERVAL_DAYS - 1), overall_end)

    print(f"Fetching data from {current_start.date()} to {current_end.date()} ...")
    interval_data = fetch_noaa_data(current_start, current_end, DATASET_ID, LOCATION_ID, API_TOKEN)
    print(f"Records fetched in this interval: {len(interval_data)}")

    all_data.extend(interval_data)
    current_start = current_end + timedelta(days=1)

print(f"Total records collected: {len(all_data)}")


Fetching data from 2021-01-01 to 2021-01-30 ...
Error: 503 for 2021-01-01 to 2021-01-30 (offset: 22001)
Records fetched in this interval: 22000
Fetching data from 2021-01-31 to 2021-03-01 ...
Error: 503 for 2021-01-31 to 2021-03-01 (offset: 24001)
Records fetched in this interval: 24000
Fetching data from 2021-03-02 to 2021-03-31 ...
Error: 503 for 2021-03-02 to 2021-03-31 (offset: 7001)
Records fetched in this interval: 7000
Fetching data from 2021-04-01 to 2021-04-30 ...
Error: 502 for 2021-04-01 to 2021-04-30 (offset: 17001)
Records fetched in this interval: 17000
Fetching data from 2021-05-01 to 2021-05-30 ...
Error: 503 for 2021-05-01 to 2021-05-30 (offset: 9001)
Records fetched in this interval: 9000
Fetching data from 2021-05-31 to 2021-06-29 ...
Error: 503 for 2021-05-31 to 2021-06-29 (offset: 20001)
Records fetched in this interval: 20000
Fetching data from 2021-06-30 to 2021-07-29 ...
Error: 503 for 2021-06-30 to 2021-07-29 (offset: 41001)
Records fetched in this interval: 41

In [9]:
# Convert the collected data into a DataFrame
raw_df = pd.DataFrame(all_data)
print("Raw DataFrame shape:", raw_df.shape)
print(raw_df.head())

# Save the raw data to a CSV file in the Colab environment
raw_csv_filename = 'raw_noaa_data_large.csv'
raw_df.to_csv(raw_csv_filename, index=False)
print(f"Raw data saved as '{raw_csv_filename}'")


Raw DataFrame shape: (287708, 5)
                  date datatype            station attributes  value
0  2021-01-01T00:00:00     PRCP  GHCND:US1NCAG0005   ,,N,0700     15
1  2021-01-01T00:00:00     PRCP  GHCND:US1NCAG0007   ,,N,0700     10
2  2021-01-01T00:00:00     PRCP  GHCND:US1NCAL0014   ,,N,0900     13
3  2021-01-01T00:00:00     PRCP  GHCND:US1NCAL0036   ,,N,0800      5
4  2021-01-01T00:00:00     PRCP  GHCND:US1NCAL0038   ,,N,0700     10
Raw data saved as 'raw_noaa_data_large.csv'


In [10]:
from google.colab import files
files.download(raw_csv_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

LOADING RAW DATA

In [11]:
import pandas as pd

# Load raw data (adjust the filename if necessary)
raw_df = pd.read_csv('raw_noaa_data_large.csv')
print("Raw DataFrame shape:", raw_df.shape)
print(raw_df.head())

Raw DataFrame shape: (287708, 5)
                  date datatype            station attributes  value
0  2021-01-01T00:00:00     PRCP  GHCND:US1NCAG0005   ,,N,0700     15
1  2021-01-01T00:00:00     PRCP  GHCND:US1NCAG0007   ,,N,0700     10
2  2021-01-01T00:00:00     PRCP  GHCND:US1NCAL0014   ,,N,0900     13
3  2021-01-01T00:00:00     PRCP  GHCND:US1NCAL0036   ,,N,0800      5
4  2021-01-01T00:00:00     PRCP  GHCND:US1NCAL0038   ,,N,0700     10


PIVOT DATA FRAME

In [12]:
# Pivot the DataFrame:
# Each unique date becomes a row and each datatype becomes its own column.
df_pivot = raw_df.pivot_table(index='date', columns='datatype', values='value', aggfunc='first').reset_index()


In [None]:
CONVERTING DATE COLUMN TO DATETIME

In [13]:
# Convert 'date' column to datetime
df_pivot['date'] = pd.to_datetime(df_pivot['date'], errors='coerce')
print("\nPivoted DataFrame:")
print(df_pivot.head())
print("Pivoted DataFrame shape:", df_pivot.shape)


Pivoted DataFrame:
datatype       date  ADPT     ASLP    ASTP  AWBT  AWND  DAPR  EVAP   MDPR  \
0        2021-01-01  44.0  10213.0  9793.0  50.0  31.0  11.0   NaN  533.0   
1        2021-01-02  61.0  10149.0  9726.0  67.0   0.0   2.0   NaN  239.0   
2        2021-01-03  50.0  10125.0  9702.0  67.0   0.0   3.0   NaN  251.0   
3        2021-01-04 -11.0  10149.0  9726.0  28.0   0.0   4.0   NaN  287.0   
4        2021-01-05  11.0  10132.0  9705.0  33.0  31.0   2.0   NaN    0.0   

datatype    PGTM  ...   WSF5   WSFG  WT01  WT02  WT03  WT04  WT05  WT06  WT08  \
0         2358.0  ...  116.0  197.0   1.0   1.0   1.0   NaN   NaN   NaN   1.0   
1          258.0  ...   45.0  237.0   1.0   1.0   NaN   NaN   NaN   NaN   NaN   
2          648.0  ...   72.0  130.0   1.0   1.0   1.0   NaN   NaN   NaN   NaN   
3           16.0  ...   49.0  197.0   1.0   NaN   NaN   NaN   NaN   1.0   NaN   
4         1538.0  ...   81.0  116.0   1.0   1.0   NaN   NaN   NaN   NaN   NaN   

datatype  WT11  
0          1.

In [None]:
DATA CLEANING

In [14]:
# Identify weather parameter columns (all except 'date')
weather_cols = df_pivot.columns.drop('date')

# Convert these columns to numeric (errors will be coerced to NaN)
for col in weather_cols:
    df_pivot[col] = pd.to_numeric(df_pivot[col], errors='coerce')

# Fill missing values:
# 1. Use forward fill to propagate last valid observation.
df_pivot.ffill(inplace=True)
# 2. Fill any remaining NaNs with 0 (or use a different strategy if needed).
df_pivot.fillna(0, inplace=True)

# Sort DataFrame by date (ensures correct time order)
df_pivot.sort_values('date', inplace=True)

print("\nCleaned DataFrame preview:")
print(df_pivot.head())
print("DataFrame shape after cleaning:", df_pivot.shape)



Cleaned DataFrame preview:
datatype       date  ADPT     ASLP    ASTP  AWBT  AWND  DAPR  EVAP   MDPR  \
0        2021-01-01  44.0  10213.0  9793.0  50.0  31.0  11.0   0.0  533.0   
1        2021-01-02  61.0  10149.0  9726.0  67.0   0.0   2.0   0.0  239.0   
2        2021-01-03  50.0  10125.0  9702.0  67.0   0.0   3.0   0.0  251.0   
3        2021-01-04 -11.0  10149.0  9726.0  28.0   0.0   4.0   0.0  287.0   
4        2021-01-05  11.0  10132.0  9705.0  33.0  31.0   2.0   0.0    0.0   

datatype    PGTM  ...   WSF5   WSFG  WT01  WT02  WT03  WT04  WT05  WT06  WT08  \
0         2358.0  ...  116.0  197.0   1.0   1.0   1.0   0.0   0.0   0.0   1.0   
1          258.0  ...   45.0  237.0   1.0   1.0   1.0   0.0   0.0   0.0   1.0   
2          648.0  ...   72.0  130.0   1.0   1.0   1.0   0.0   0.0   0.0   1.0   
3           16.0  ...   49.0  197.0   1.0   1.0   1.0   0.0   0.0   1.0   1.0   
4         1538.0  ...   81.0  116.0   1.0   1.0   1.0   0.0   0.0   1.0   1.0   

datatype  WT11  
0    

FEATURE ENGINEERING

In [15]:
# Check if the 'PRCP' column exists
if 'PRCP' in df_pivot.columns:
    # Create lag features: previous day's and two days ago's PRCP
    df_pivot['PRCP_lag1'] = df_pivot['PRCP'].shift(1)
    df_pivot['PRCP_lag2'] = df_pivot['PRCP'].shift(2)

    # Drop rows with NaN values resulting from shifting (this removes the first two rows)
    df_pivot.dropna(inplace=True)
else:
    print("PRCP column not found; please check your raw data.")

print("\nDataFrame after creating lag features:")
print(df_pivot.head())
print("Shape after lag feature creation:", df_pivot.shape)



DataFrame after creating lag features:
datatype       date  ADPT     ASLP    ASTP  AWBT  AWND  DAPR  EVAP   MDPR  \
2        2021-01-03  50.0  10125.0  9702.0  67.0   0.0   3.0   0.0  251.0   
3        2021-01-04 -11.0  10149.0  9726.0  28.0   0.0   4.0   0.0  287.0   
4        2021-01-05  11.0  10132.0  9705.0  33.0  31.0   2.0   0.0    0.0   
5        2021-01-06 -39.0  10207.0  9776.0  11.0   0.0  12.0  15.0  869.0   
6        2021-01-07 -28.0  10200.0  9776.0   0.0   0.0   2.0  10.0   13.0   

datatype    PGTM  ...  WT01  WT02  WT03  WT04  WT05  WT06  WT08  WT11  \
2          648.0  ...   1.0   1.0   1.0   0.0   0.0   0.0   1.0   1.0   
3           16.0  ...   1.0   1.0   1.0   0.0   0.0   1.0   1.0   1.0   
4         1538.0  ...   1.0   1.0   1.0   0.0   0.0   1.0   1.0   1.0   
5         1326.0  ...   1.0   1.0   1.0   0.0   0.0   1.0   1.0   1.0   
6         2340.0  ...   1.0   1.0   1.0   0.0   0.0   1.0   1.0   1.0   

datatype  PRCP_lag1  PRCP_lag2  
2             193.0      

NORMALIZATION

In [16]:
from sklearn.preprocessing import MinMaxScaler

# Identify columns to scale (all columns except 'date')
cols_to_scale = df_pivot.columns.drop('date')

scaler = MinMaxScaler()
df_pivot[cols_to_scale] = scaler.fit_transform(df_pivot[cols_to_scale])

print("\nScaled DataFrame preview:")
print(df_pivot.head())



Scaled DataFrame preview:
datatype       date      ADPT      ASLP      ASTP      AWBT      AWND  \
2        2021-01-03  0.525074  0.448795  0.424437  0.363985  0.000000   
3        2021-01-04  0.345133  0.521084  0.501608  0.214559  0.000000   
4        2021-01-05  0.410029  0.469880  0.434084  0.233716  0.169399   
5        2021-01-06  0.262537  0.695783  0.662379  0.149425  0.000000   
6        2021-01-07  0.294985  0.674699  0.662379  0.107280  0.000000   

datatype      DAPR      EVAP      MDPR      PGTM  ...  WT01  WT02  WT03  WT04  \
2         0.018519  0.000000  0.190008  0.271945  ...   0.0   0.0   0.0   0.0   
3         0.037037  0.000000  0.217260  0.000000  ...   0.0   0.0   0.0   0.0   
4         0.000000  0.000000  0.000000  0.654905  ...   0.0   0.0   0.0   0.0   
5         0.185185  0.067873  0.657835  0.563683  ...   0.0   0.0   0.0   0.0   
6         0.000000  0.045249  0.009841  1.000000  ...   0.0   0.0   0.0   0.0   

datatype  WT05  WT06  WT08  WT11  PRCP_lag1  PR

Prepare Data for XGBoost

In [17]:
from sklearn.model_selection import train_test_split

# Define features and target (adjust if you add more features later)
features = ['PRCP_lag1', 'PRCP_lag2']
target = 'PRCP'

# Verify that required columns exist
missing_cols = [col for col in features + [target] if col not in df_pivot.columns]
if missing_cols:
    print("Missing columns for tabular model:", missing_cols)
else:
    X = df_pivot[features]
    y = df_pivot[target]

    # Use sequential split for time series data: first 80% for training, last 20% for testing.
    train_size = int(len(df_pivot) * 0.8)
    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

    print("\nTabular Data Shapes:")
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_train shape:", y_train.shape)
    print("y_test shape:", y_test.shape)



Tabular Data Shapes:
X_train shape: (101, 2)
X_test shape: (26, 2)
y_train shape: (101,)
y_test shape: (26,)


Prepare Data for Sequence-Based Models LSTM

DEINE FUNCTION

In [18]:
import numpy as np

def create_sequences(data, seq_length, feature_cols, target_col):
    """
    Create sliding window sequences for sequence models.
    Returns:
        X_seq: 3D numpy array of shape (samples, seq_length, num_features)
        y_seq: 1D numpy array of targets.
    """
    sequences = []
    targets = []
    for i in range(len(data) - seq_length):
        seq = data[feature_cols].iloc[i:i+seq_length].values
        target_val = data[target_col].iloc[i+seq_length]
        sequences.append(seq)
        targets.append(target_val)
    return np.array(sequences), np.array(targets)


GENERATE SEQUENCE

In [19]:
# Define sequence parameters
sequence_length = 2  # For small data; typically 5 or more for larger datasets.
sequence_features = ['PRCP']  # You can include more features here if desired.
target_seq = 'PRCP'

# Check that required columns exist
missing_seq_cols = [col for col in sequence_features + [target_seq] if col not in df_pivot.columns]
if missing_seq_cols:
    print("Missing columns for sequence model:", missing_seq_cols)
else:
    X_seq, y_seq = create_sequences(df_pivot, sequence_length, sequence_features, target_seq)
    print("\nSequence Data Shapes:")
    print("X_seq shape (samples, timesteps, features):", X_seq.shape)
    print("y_seq shape:", y_seq.shape)



Sequence Data Shapes:
X_seq shape (samples, timesteps, features): (125, 2, 1)
y_seq shape: (125,)


DATA SAVING PROCESSED

In [20]:
# Save the processed DataFrame to CSV
df_pivot.to_csv('processed_noaa_weather_data.csv', index=False)
print("\nProcessed data saved as 'processed_noaa_weather_data.csv'")

# Save sequence data as NumPy arrays
np.save('X_seq.npy', X_seq)
np.save('y_seq.npy', y_seq)
print("Sequence data saved as 'X_seq.npy' and 'y_seq.npy'")



Processed data saved as 'processed_noaa_weather_data.csv'
Sequence data saved as 'X_seq.npy' and 'y_seq.npy'


In [21]:
from google.colab import files

# Replace the filename if necessary
files.download('processed_noaa_weather_data.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>