In [27]:
import pandas as pd

# Read in available stations
stn_ids = pd.read_fwf('http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-stations.txt', header=None, infer_nrows=1000)
stn_ids.columns = ['ID','LAT','LON','ELEV','UKN','NAME','GSN','WBAN']
stn_ids

Unnamed: 0,ID,LAT,LON,ELEV,UKN,NAME,GSN,WBAN
0,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,
1,ACW00011647,17.1333,-61.7833,19.2,,ST JOHNS,,
2,AE000041196,25.3330,55.5170,34.0,,SHARJAH INTER. AIRP,GSN,41196.0
3,AEM00041194,25.2550,55.3640,10.4,,DUBAI INTL,,41194.0
4,AEM00041217,24.4330,54.6510,26.8,,ABU DHABI INTL,,41217.0
...,...,...,...,...,...,...,...,...
129653,ZI000067969,-21.0500,29.3670,861.0,,WEST NICHOLSON,,67969.0
129654,ZI000067975,-20.0670,30.8670,1095.0,,MASVINGO,,67975.0
129655,ZI000067977,-21.0170,31.5830,430.0,,BUFFALO RANGE,,67977.0
129656,ZI000067983,-20.2000,32.6160,1132.0,,CHIPINGE,GSN,67983.0


In [28]:
# Read in station inventory file (gives years availablee for each variable)
periods = pd.read_fwf('http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-inventory.txt', header=None, infer_nrows=1000)
periods.columns = ['ID','LAT','LON','ELEM','TiMIN','TiMAX']
periods

Unnamed: 0,ID,LAT,LON,ELEM,TiMIN,TiMAX
0,ACW00011604,17.1167,-61.7833,TMAX,1949,1949
1,ACW00011604,17.1167,-61.7833,TMIN,1949,1949
2,ACW00011604,17.1167,-61.7833,PRCP,1949,1949
3,ACW00011604,17.1167,-61.7833,SNOW,1949,1949
4,ACW00011604,17.1167,-61.7833,SNWD,1949,1949
...,...,...,...,...,...,...
767053,ZI000067983,-20.2000,32.6160,PRCP,1951,2025
767054,ZI000067983,-20.2000,32.6160,TAVG,1962,2025
767055,ZI000067991,-22.2170,30.0000,TMAX,1951,1990
767056,ZI000067991,-22.2170,30.0000,TMIN,1951,1990


In [29]:
# Merge the inventory and station dfs based on ID
merged_stns = pd.merge(stn_ids,periods,how='left',left_on='ID',right_on='ID')
merged_stns

Unnamed: 0,ID,LAT_x,LON_x,ELEV,UKN,NAME,GSN,WBAN,LAT_y,LON_y,ELEM,TiMIN,TiMAX
0,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,17.1167,-61.7833,TMAX,1949.0,1949.0
1,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,17.1167,-61.7833,TMIN,1949.0,1949.0
2,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,17.1167,-61.7833,PRCP,1949.0,1949.0
3,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,17.1167,-61.7833,SNOW,1949.0,1949.0
4,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,17.1167,-61.7833,SNWD,1949.0,1949.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
767089,ZI000067983,-20.2000,32.6160,1132.0,,CHIPINGE,GSN,67983.0,-20.2000,32.6160,PRCP,1951.0,2025.0
767090,ZI000067983,-20.2000,32.6160,1132.0,,CHIPINGE,GSN,67983.0,-20.2000,32.6160,TAVG,1962.0,2025.0
767091,ZI000067991,-22.2170,30.0000,457.0,,BEITBRIDGE,,67991.0,-22.2170,30.0000,TMAX,1951.0,1990.0
767092,ZI000067991,-22.2170,30.0000,457.0,,BEITBRIDGE,,67991.0,-22.2170,30.0000,TMIN,1951.0,1990.0


In [30]:
# Grab stations where TMAX data is availble past 2020
# TiMIN is the earliest year available
# TiMAX is the most recent year available
merged_stns = merged_stns[(merged_stns['ELEM'] == 'TMAX') & (merged_stns['TiMAX'] == 2025)]
merged_stns

Unnamed: 0,ID,LAT_x,LON_x,ELEV,UKN,NAME,GSN,WBAN,LAT_y,LON_y,ELEM,TiMIN,TiMAX
18,AE000041196,25.3330,55.517,34.0,,SHARJAH INTER. AIRP,GSN,41196.0,25.3330,55.517,TMAX,1944.0,2025.0
22,AEM00041194,25.2550,55.364,10.4,,DUBAI INTL,,41194.0,25.2550,55.364,TMAX,1983.0,2025.0
26,AEM00041217,24.4330,54.651,26.8,,ABU DHABI INTL,,41217.0,24.4330,54.651,TMAX,1983.0,2025.0
30,AEM00041218,24.2620,55.609,264.9,,AL AIN INTL,,41218.0,24.2620,55.609,TMAX,1994.0,2025.0
54,AG000060390,36.7167,3.250,24.0,,ALGER-DAR EL BEIDA,GSN,60390.0,36.7167,3.250,TMAX,1940.0,2025.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
767021,ZA000067743,-17.8170,25.817,986.0,,LIVINGSTONE,GSN,67743.0,-17.8170,25.817,TMAX,1973.0,2025.0
767026,ZAM00067663,-14.4500,28.467,1207.0,,KABWE/MILLIKEN,,67663.0,-14.4500,28.467,TMAX,1973.0,2025.0
767038,ZI000067775,-17.9170,31.133,1480.0,,HARARE (KUTSAGA),GSN,67775.0,-17.9170,31.133,TMAX,1956.0,2025.0
767079,ZI000067975,-20.0670,30.867,1095.0,,MASVINGO,,67975.0,-20.0670,30.867,TMAX,1951.0,2025.0


In [31]:
merged_stns[merged_stns['NAME'].str.contains('BALTIMORE', regex=False)]

Unnamed: 0,ID,LAT_x,LON_x,ELEV,UKN,NAME,GSN,WBAN,LAT_y,LON_y,ELEM,TiMIN,TiMAX
757616,USW00093721,39.1733,-76.6842,42.1,MD,BALTIMORE-WASHINGTON INTL AP,,72406.0,39.1733,-76.6842,TMAX,1939.0,2025.0


In [32]:
# Read in actual data for selected station from AWS
df = pd.read_csv(
    "s3://noaa-ghcn-pds/csv/by_station/USW00093721.csv",
    storage_options={"anon": True},  # passed to `s3fs.S3FileSystem`
    dtype={'Q_FLAG': 'object', 'M_FLAG': 'object'},
    parse_dates=['DATE']
).set_index('DATE')

In [33]:
df

Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1939-07-01,USW00093721,TMAX,306,,,X,
1939-07-02,USW00093721,TMAX,322,,,X,
1939-07-03,USW00093721,TMAX,283,,,X,
1939-07-04,USW00093721,TMAX,311,,,X,
1939-07-05,USW00093721,TMAX,256,,,X,
...,...,...,...,...,...,...,...
2025-02-02,USW00093721,WSF5,89,,,W,
2025-02-03,USW00093721,WSF5,54,,,W,
2025-02-04,USW00093721,WSF5,152,,,W,
2025-02-05,USW00093721,WSF5,85,,,W,


In [34]:
# Grab max and min temps (in tenths of degrees C)
df_tmax = df.loc[df['ELEMENT'] == 'TMAX']
df_tmin = df.loc[df['ELEMENT'] == 'TMIN']

In [35]:
df_tmin
#df_tmax

Unnamed: 0_level_0,ID,ELEMENT,DATA_VALUE,M_FLAG,Q_FLAG,S_FLAG,OBS_TIME
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1939-07-01,USW00093721,TMIN,200,,,X,
1939-07-02,USW00093721,TMIN,189,,,X,
1939-07-03,USW00093721,TMIN,150,,,X,
1939-07-04,USW00093721,TMIN,206,,,X,
1939-07-05,USW00093721,TMIN,211,,,X,
...,...,...,...,...,...,...,...
2025-02-01,USW00093721,TMIN,-21,,,W,2400.0
2025-02-02,USW00093721,TMIN,-55,,,W,2400.0
2025-02-03,USW00093721,TMIN,-38,,,W,2400.0
2025-02-04,USW00093721,TMIN,17,,,W,2400.0


In [69]:
# Convert GHCN temps from tenths of °C to °C
df_tmin["TMIN"] = df_tmin["DATA_VALUE"] / 10
df_tmax["TMAX"] = df_tmax["DATA_VALUE"] / 10

# Add month-day identifier
df_tmin["MONTH_DAY"] = df_tmin.index.strftime("%m-%d")
df_tmax["MONTH_DAY"] = df_tmax.index.strftime("%m-%d")

# Group by month-day and find all time max/min
df_tmax_daily = (
    df_tmax.groupby(['MONTH_DAY'], as_index=True)
      .agg(record_max_temp=('TMAX','max'))
)

df_tmin_daily = (
    df_tmin.groupby(['MONTH_DAY'], as_index=True)
      .agg(record_min_temp=('TMIN','min'))
)


# Now get 1991-2020 averages
df_91_20_max = df_tmax[(df_tmax.index >= pd.to_datetime('1991-01-01')) & (df_tmax.index <= pd.to_datetime('2020-01-01'))]
df_91_20_min = df_tmin[(df_tmin.index >= pd.to_datetime('1991-01-01')) & (df_tmin.index <= pd.to_datetime('2020-01-01'))]

# Group by month-day and find means
df_tmax_daily_mean = (
    df_91_20_max.groupby(['MONTH_DAY'], as_index=True)
      .agg(average_max_temp=('TMAX','mean'))
)

df_tmin_daily_mean = (
    df_91_20_min.groupby(['MONTH_DAY'], as_index=True)
      .agg(average_min_temp=('TMIN','mean'))
)

# Merge all dataframes
df_merged = pd.concat([df_tmax_daily_mean, df_tmin_daily_mean, df_tmax_daily, df_tmin_daily], axis=1)
df_merged

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmin["TMIN"] = df_tmin["DATA_VALUE"] / 10
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmax["TMAX"] = df_tmax["DATA_VALUE"] / 10
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmin["MONTH_DAY"] = df_tmin.index.strftime("%m-%d")
A value is trying to be set on a copy of a slice from a Data

Unnamed: 0_level_0,average_max_temp,average_min_temp,record_max_temp,record_min_temp
MONTH_DAY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
01-01,7.653333,-2.803333,19.4,-16.0
01-02,6.610345,-1.993103,20.0,-17.8
01-03,7.010345,-2.044828,20.6,-14.9
01-04,7.748276,-2.327586,21.1,-14.3
01-05,6.344828,-2.700000,20.6,-13.9
...,...,...,...,...
12-27,7.472414,-2.334483,20.6,-12.8
12-28,7.555172,-2.368966,21.1,-12.2
12-29,7.565517,-0.865517,25.0,-12.1
12-30,7.624138,-1.748276,20.0,-13.9


In [13]:
import ipywidgets as widgets
from IPython.display import display, clear_output

In [71]:
def select_station(city):
    """ 
    This function returns the GHCN-d station ID based on the city/town
    you input and allows you to select the specfiic station from
    a dropdown menu.

    Inputs:
        city (str): City/town name you want station data from (MUST BE IN ALL CAPITAL LETTERS). 

    Outputs:
        selected_id (dict): Station ID based on dropdown menu selection.
    """
    
    # Read in available stations
    stn_ids = pd.read_fwf('http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-stations.txt', header=None, infer_nrows=1000)
    stn_ids.columns = ['ID','LAT','LON','ELEV','UKN','NAME','GSN','WBAN']

    # Read in station inventory file (gives years available for each variable)
    periods = pd.read_fwf('http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-inventory.txt', header=None, infer_nrows=1000)
    periods.columns = ['ID','LAT','LON','ELEM','TiMIN','TiMAX']

    # Merge the inventory and station dfs based on ID
    merged_stns = pd.merge(stn_ids,periods,how='left',left_on='ID',right_on='ID')

    # Grab stations where TMAX data is availble up to 2025
    # TiMIN is the earliest year available
    # TiMAX is the most recent year available
    merged_stns = merged_stns[(merged_stns['ELEM'] == 'TMAX') & (merged_stns['TiMAX'] == 2025)]

    # Select staions with city in name
    merged_stns = merged_stns[merged_stns['NAME'].str.contains(city, regex=False)]

    if merged_stns.empty:
        print(f"No stations found for city: {city}")
        return None

    # Create dropdown options (include empty option first)
    # Including the empty option allows callback to 
    # work properly when selection is made.
    options = {'-- Select a Station --': None}
    for _, row in merged_stns.iterrows():
        label = f"{row['NAME']} ({row['ID']})"
        options[label] = row['ID']

    dropdown = widgets.Dropdown(
        options=options,
        value=None,  # Ensures no pre-selection
        description='Station:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='70%')
    )

    output = widgets.Output()
    selected_id = {'value': None}

    def on_select(change):
        if change.new:  # Only trigger when actual station is chosen
            selected_id['value'] = change.new
            with output:
                clear_output()
                stn = merged_stns[merged_stns['ID'] == change.new].iloc[0]
                print("Selected Station:")
                print(stn[['ID', 'NAME', 'LAT_x', 'LON_x', 'ELEV', 'TiMIN', 'TiMAX']])

    dropdown.observe(on_select, names='value')
    display(dropdown, output)

    print("Please choose a station from the dropdown above.")

    return selected_id

def get_station_data(station_id=s['value']):
    """
    This function grabs the data from the selected station and returns
    a dataframe with the record min and max temp from each calendar day
    over the period of record, as well as the mean min and max daily
    temps for the 1991-2020 period.

    Inputs:
        station_id (str): Station ID of selected station, default is the one you
        selected using the previous function.

    Outputs:
        df_merged (pd.DataFrame): The dataframe containing average_max_temp,
        average_min_temp, record_max_temp, and record_min_temp. 
    """
    
    # Read in actual data for selected station from AWS
    df = pd.read_csv(
        f"s3://noaa-ghcn-pds/csv/by_station/{station_id}.csv",
        storage_options={"anon": True},  # passed to `s3fs.S3FileSystem`
        dtype={'Q_FLAG': 'object', 'M_FLAG': 'object'},
        parse_dates=['DATE']
    ).set_index('DATE')

    # Grab max and min temps (in tenths of degrees C)
    df_tmax = df.loc[df['ELEMENT'] == 'TMAX']
    df_tmin = df.loc[df['ELEMENT'] == 'TMIN']

    # Convert GHCN temps from tenths of °C to °C
    df_tmin["TMIN"] = df_tmin["DATA_VALUE"] / 10
    df_tmax["TMAX"] = df_tmax["DATA_VALUE"] / 10

    # Add month-day identifier
    df_tmin["MONTH_DAY"] = df_tmin.index.strftime("%m-%d")
    df_tmax["MONTH_DAY"] = df_tmax.index.strftime("%m-%d")

    # Group by month-day and find all time max/min
    df_tmax_daily = (
        df_tmax.groupby(['MONTH_DAY'], as_index=True)
        .agg(record_max_temp=('TMAX','max'))
    )

    df_tmin_daily = (
        df_tmin.groupby(['MONTH_DAY'], as_index=True)
        .agg(record_min_temp=('TMIN','min'))
    )

    # Now get 1991-2020 averages
    df_91_20_max = df_tmax[(df_tmax.index >= pd.to_datetime('1991-01-01')) & (df_tmax.index <= pd.to_datetime('2020-01-01'))]
    df_91_20_min = df_tmin[(df_tmin.index >= pd.to_datetime('1991-01-01')) & (df_tmin.index <= pd.to_datetime('2020-01-01'))]

    # Group by month-day and find means
    df_tmax_daily_mean = (
        df_91_20_max.groupby(['MONTH_DAY'], as_index=True)
        .agg(average_max_temp=('TMAX','mean'))
    )

    df_tmin_daily_mean = (
        df_91_20_min.groupby(['MONTH_DAY'], as_index=True)
        .agg(average_min_temp=('TMIN','mean'))
    )

    # Merge all dataframes
    df_merged = pd.concat([df_tmax_daily_mean, df_tmin_daily_mean, df_tmax_daily, df_tmin_daily], axis=1)

    return df_merged


In [None]:
s = select_station('BALTIMORE')

Dropdown(description='Station:', layout=Layout(width='70%'), options={'-- Select a Station --': None, 'BALTIMO…

Output()

Please choose a station from the dropdown above.


In [74]:
df_records = get_station_data()
df_records

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmin["TMIN"] = df_tmin["DATA_VALUE"] / 10
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmax["TMAX"] = df_tmax["DATA_VALUE"] / 10
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmin["MONTH_DAY"] = df_tmin.index.strftime("%m-%d")
A value is trying to be set on a copy of a slice from a Data

Unnamed: 0_level_0,average_max_temp,average_min_temp,record_max_temp,record_min_temp
MONTH_DAY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
01-01,7.653333,-2.803333,19.4,-16.0
01-02,6.610345,-1.993103,20.0,-17.8
01-03,7.010345,-2.044828,20.6,-14.9
01-04,7.748276,-2.327586,21.1,-14.3
01-05,6.344828,-2.700000,20.6,-13.9
...,...,...,...,...
12-27,7.472414,-2.334483,20.6,-12.8
12-28,7.555172,-2.368966,21.1,-12.2
12-29,7.565517,-0.865517,25.0,-12.1
12-30,7.624138,-1.748276,20.0,-13.9
