# Data from the National Data Buoy Center (NDBC / NOAA)

In [9]:
import requests
import json
import pandas as pd
import numpy as np
from io import StringIO

In [40]:
def read_ndbc_data(station_id, year, print_error = False, daily = False):
    url = "https://www.ndbc.noaa.gov/view_text_file.php?filename="+str(station_id)+"h"+str(year)+".txt.gz&dir=data/historical/stdmet/"
    response = requests.get(url)
    if response.status_code != 200:
        if print_error:
            print(f"Error: Received status code {response.status_code}" + ". year: " + str(year))
            print(url)
        return None
    # Remove comment lines0
    lines = response.text.strip().split('\n')
    # data_without_comments = '\n'.join(line for line in lines if not line.startswith('#'))
    data_with_headers = '\n'.join([lines[0]] + lines[2:])

    
    # Read the data into a pandas DataFrame
    df = pd.read_csv(StringIO(data_with_headers), delim_whitespace=True, header = 0)
    df = df.rename(columns={'#YY':'YY','YYYY':'YY','WD':'WDIR','BAR':'PRES'})

    # Fix years that are reported as only two digits (e.g. 94)
    def convert_year(year):
        if year < 100:  # Assuming 2-digit years are in the 20th century
            if year <= 25:
                return year + 2000
            return year + 1900
        return year
    df['YY'] = df['YY'].apply(convert_year)

    # Fix MWD == 999.0000
    # df.loc[round(df['MWD'],2)==999.00,'MWD'] = np.nan
    # Define a function to replace 99.00 and 999.00 with np.nan
    def replace_with_nan(value):
        if round(value, 2) == 99.00 or round(value, 2) == 999.00:
            return np.nan
        return value
    # Apply the function to the entire DataFrame
    #df = df.applymap(replace_with_nan)
    df = df.map(replace_with_nan)

    df['station_id'] = station_id

    df_selectcols = df[['station_id','YY', 'MM', 'DD', 'hh', 'WDIR', 'WSPD', 'GST', 'WVHT', 'DPD',
       'APD', 'MWD', 'PRES', 'ATMP', 'WTMP', 'DEWP','VIS']]

    if(daily):
        daily_avg_temp = df_selectcols.groupby(["station_id","YY","MM","DD"]).mean().reset_index()
        return daily_avg_temp

    monthly_avg_temp = df_selectcols.groupby(["station_id","YY","MM"]).mean().reset_index()
    return monthly_avg_temp

In [41]:
stations = {
    'Corpus Christi, TX':'42020',
    'Bay of Campeche':'42055',
    'West Tampa':'42036',
    'Garden Key, FL':'GKYF1',
    'Mid Gulf':'42001',
    'Yucatan Basin':'42056',
    'Western Caribbean':'42057',
    'Central Caribbean':'42058',
    'Eastern Caribbean':'42059',
    'West Indies':'41300',
    'North Equatorial Two':'41041',
    'NE St. Martin':'41044',
    'East Bahamas':'41046',
    'NE Bahamas':'41047',
    'South Hatteras':'41002',
    'Southwest Bermuda':'41425',
    'Canaveral East':'41010'
}

In [42]:
#read_ndbc_data(42001,1992)
read_ndbc_data(stations['NE Bahamas'], 2007).head()

#'https://www.ndbc.noaa.gov/view_text_file.php?filename=41047h2023.txt.gz&dir=data/historical/stdmet/'

Unnamed: 0,station_id,YY,MM,DD,hh,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS
0,41047,2007,9,25.600858,11.699571,103.926724,5.751502,6.939056,1.177586,7.464526,5.324871,82.497778,1016.898712,27.139914,28.528326,23.220601,
1,41047,2007,10,16.008075,11.495289,111.194558,7.396366,8.896904,1.905922,9.060162,6.05642,88.630081,1015.405518,26.485734,27.859354,22.147779,
2,41047,2007,11,15.486787,11.484006,135.198887,6.587761,8.15758,2.078748,9.205786,6.537677,105.380084,1017.974131,23.793602,25.853825,17.639525,
3,41047,2007,12,16.07337,11.516304,123.461853,6.783967,8.128261,1.911342,8.977603,6.328027,122.247238,1020.599457,23.016033,23.960136,18.701368,


In [43]:
## Save Monthly Averages
station_name = 'Mid Gulf'
station_id = stations[station_name]

dfs = []
df_2023 = read_ndbc_data(station_id, 2023, daily = False)
cols = df_2023.columns

for yr in reversed(range(1975,2024)):
    #df.append(yr)
    df0 = read_ndbc_data(42001,yr)
    if type(df0) == "list":
        print(yr)
        break
    if not all(x == y for x, y in zip(df0.columns, cols)):
        print(yr)
        print(df0.columns)
        break
    #dfs.append(read_ndbc_data(42001,yr))
    dfs.append(read_ndbc_data(station_id, yr, daily = False))
df = pd.concat(dfs, ignore_index = True)
df['station_name'] = station_name
#df.to_csv('1975to2023_MidGulf_buoydata.csv')
df.head()

Unnamed: 0,station_id,YY,MM,DD,hh,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS,station_name
0,42001,2023,1,16.000448,11.501456,134.401937,6.34722,8.104596,1.308026,6.209131,4.857487,144.777325,1017.862469,22.861143,24.196882,19.596565,,Mid Gulf
1,42001,2023,2,14.5,11.5,150.22095,6.900868,8.811039,1.390223,6.526369,4.900804,151.238628,1020.280704,23.235624,24.985757,20.216073,,Mid Gulf
2,42001,2023,3,16.0,11.5,123.338292,7.162489,9.010291,1.291415,6.073982,4.643875,128.785521,1017.14711,24.463543,26.135011,21.566121,,Mid Gulf
3,42001,2023,4,15.488151,11.527649,139.341902,6.525668,8.246106,1.367727,6.616688,4.94196,128.66526,1017.065125,25.136628,26.035371,22.323848,,Mid Gulf
4,42001,2023,5,16.0,11.5,138.880381,4.597848,5.881398,0.735773,5.382695,4.131866,133.090763,1027.067137,26.550651,27.315691,23.466659,,Mid Gulf


In [46]:
print(list(stations.items())[:4])

[('Corpus Christi, TX', '42020'), ('Bay of Campeche', '42055'), ('West Tampa', '42036'), ('Garden Key, FL', 'GKYF1')]


In [51]:
for station_name, station_id in list(stations.items())[:2]:
    print(station_name)

Corpus Christi, TX
Bay of Campeche


In [65]:
## Save Daily/Monthly Averages

# Initialize List of DataFrames:
DAILY = True
dfs = []  # List of dataframes
df_2023 = read_ndbc_data(stations['Mid Gulf'], 2023, daily = DAILY)
cols = df_2023.columns

start_year = 1970 # A sensible year might be 1975 (based on MidGulf)
end_year = 2023 # the last year for which you want results

# Loop over every station:
for station_name, station_id in list(stations.items()):
    print(station_name)
    # Loop over every year and scrape NDBC website:
    for yr in reversed(range(start_year, end_year+1)):
        
        # Check for errors:
        df0 = read_ndbc_data(42001,yr)
        if df0 is None:
            break
        if type(df0) == "list":
            print(yr)
            break
        if not all(x == y for x, y in zip(df0.columns, cols)):
            print(yr)
            print(df0.columns)
            break

        # Read Data:
        df1 = read_ndbc_data(station_id, yr, daily = DAILY)
        if df1 is not None:
            #df1['station_name'] = station_name
            df1.insert(0, "station_name", station_name)
        dfs.append(df1)

df = pd.concat(dfs, ignore_index = True)
#df.to_csv('1975to2023_MidGulf_buoydata.csv')
df.head()

Corpus Christi, TX
Bay of Campeche
West Tampa
Garden Key, FL
Mid Gulf
Yucatan Basin
Western Caribbean
Central Caribbean
Eastern Caribbean
West Indies
North Equatorial Two
NE St. Martin
East Bahamas
NE Bahamas
South Hatteras
Southwest Bermuda
Canaveral East


Unnamed: 0,station_name,station_id,YY,MM,DD,hh,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS
0,"Corpus Christi, TX",42020,2023,1,1,11.5,179.416667,4.028472,4.998611,0.865208,7.702708,5.256458,105.166667,1014.175694,21.734722,21.688889,20.75625,
1,"Corpus Christi, TX",42020,2023,1,2,11.5,156.604167,8.588889,10.549306,1.310208,6.398125,4.584583,133.604167,1010.929861,23.291667,21.958042,21.950694,
2,"Corpus Christi, TX",42020,2023,1,3,11.5,206.111111,4.736111,6.190278,2.101667,8.277708,6.554583,133.625,1009.859722,20.429861,19.714685,19.140278,
3,"Corpus Christi, TX",42020,2023,1,4,11.5,126.298611,3.11875,4.084028,1.304583,7.703125,5.93375,126.583333,1016.865972,19.440278,18.525694,15.725,
4,"Corpus Christi, TX",42020,2023,1,5,11.5,166.496503,2.475694,3.150694,0.868333,7.559792,5.719792,108.755556,1022.669444,19.191667,17.539161,14.959722,


In [66]:
df.tail()

Unnamed: 0,station_name,station_id,YY,MM,DD,hh,WDIR,WSPD,GST,WVHT,DPD,APD,MWD,PRES,ATMP,WTMP,DEWP,VIS
110411,Canaveral East,41010,1988,12,27,11.531915,81.790698,9.425532,11.378723,1.81087,6.669565,5.173913,,1026.470213,21.574468,23.653191,,
110412,Canaveral East,41010,1988,12,28,11.531915,147.744681,8.521277,10.382979,1.931915,7.882979,5.689362,,1023.891489,22.770213,23.731915,,
110413,Canaveral East,41010,1988,12,29,11.531915,184.510638,4.37234,5.312766,1.234783,8.169565,6.065217,,1023.648936,23.431915,24.465957,,
110414,Canaveral East,41010,1988,12,30,11.531915,121.382979,2.497872,3.234043,1.171739,8.78913,6.897826,,1023.904255,23.155319,24.282979,,
110415,Canaveral East,41010,1988,12,31,11.391304,194.934783,3.793478,4.793478,1.152174,8.797826,7.013043,,1022.045652,23.306522,24.213043,,


In [68]:
df['station_name'].value_counts()

station_name
Mid Gulf                15461
South Hatteras          14281
Canaveral East          11436
Corpus Christi, TX      10985
West Tampa               9913
Bay of Campeche          6442
Yucatan Basin            6213
East Bahamas             5662
NE Bahamas               5389
NE St. Martin            5282
North Equatorial Two     5076
Eastern Caribbean        5058
Central Caribbean        4626
Western Caribbean        4592
Name: count, dtype: int64

In [67]:
#df.to_csv('daily_buoydata.csv')

In [69]:
df.size

1987488

In [64]:
df_monthly = df