Zachary Torstrick  
HW3  
ATMS 523  
UIUC  
Fall 2025  

---

## Assignment Overview
This notebook contains the solution for Module 3 Project, which involves:
1. Adapting code to read GHCN Daily Data and calculate temperature statistics
2. Creating a matplotlib plot showing record, average, and actual temperatures


### Part 1: Temperature Statistics Function

Adapt the code from class that reads GHCN Daily Data from Amazon Web Services and write a function that will download the station you want (called with a GHCN station ID), and calculate (1) the all time record high and low and (2) the normal (mean) high and low temperature *FOR EACH CALENDAR DAY* for the 1991-2020 period for the desired station.  The function should return a pandas data frame with the columns ['record_min_temp', average_min_temp', 'average_max_temp', record_max_temp'] FOR EACH DAY.  Write a code that can call this function and successfully demonstrate that it works.

In [74]:
import pandas as pd

def convert_temperatures_to_celsius(df, temp_columns):
    """
    Convert GHCN temperature data from tenths of degrees Celsius to degrees Celsius.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing temperature columns
    temp_columns : list
        List of column names to convert
    
    Returns:
    --------
    pandas.DataFrame
        DataFrame with temperature columns converted to degrees Celsius
    """
    df_converted = df.copy()
    for col in temp_columns:
        if col in df_converted.columns:
            df_converted[col] = df_converted[col] / 10.0
    
    return df_converted

def get_ghcn_station_stats(ghcn_station_id):
    """
    Calculate temperature statistics for a GHCN weather station.
    
    This function downloads weather data for a specified station and calculates
    record and average temperature statistics for each calendar day during the
    1991-2020 period.
    
    Parameters:
    -----------
    ghcn_station_id : str
        The GHCN station identifier (e.g., 'USC00120784')
    
    Returns:
    --------
    pandas.DataFrame
        DataFrame with columns:
        - day_of_year: Day of year (1-366)
        - record_min_temp: Minimum temperature ever recorded (°C)
        - average_min_temp: Average minimum temperature (°C) 
        - record_max_temp: Maximum temperature ever recorded (°C)
        - average_max_temp: Average maximum temperature (°C)
    """
    df = pd.read_csv(
        f"s3://noaa-ghcn-pds/csv/by_station/{ghcn_station_id}.csv",
        storage_options={"anon": True},  # passed to `s3fs.S3FileSystem`
            dtype={'Q_FLAG': 'object', 'M_FLAG': 'object'},
            parse_dates=['DATE']
    ).set_index('DATE')
    
    # Filter to 1991-2020 period
    df_1991_2020 = df[(df.index.year >= 1991) & (df.index.year <= 2020)]

    # Filter to TMAX and TMIN elements
    df_tmax = df_1991_2020[df_1991_2020['ELEMENT'] == 'TMAX']
    df_tmin = df_1991_2020[df_1991_2020['ELEMENT'] == 'TMIN']

    # Drop any NaN before calculating the average
    df_tmax = df_tmax.dropna(subset=['DATA_VALUE'])
    df_tmin = df_tmin.dropna(subset=['DATA_VALUE'])

    tmax_by_calendar_day = df_tmax.groupby(df_tmax.index.dayofyear).agg(
        record_max_temp=('DATA_VALUE', 'max'),
        average_max_temp=('DATA_VALUE', 'mean')
    )

    tmin_by_calendar_day = df_tmin.groupby(df_tmin.index.dayofyear).agg(
        record_min_temp=('DATA_VALUE', 'min'),
        average_min_temp=('DATA_VALUE', 'mean')
    )
    
    # Join the two dataframes on the day_of_year column
    df_by_calendar_day = pd.merge(
        tmax_by_calendar_day, 
        tmin_by_calendar_day, 
        on=['DATE'])

    # Convert temperatures to Celsius
    temp_columns = ['record_min_temp', 'average_min_temp', 'record_max_temp', 'average_max_temp']
    df_by_calendar_day = convert_temperatures_to_celsius(df_by_calendar_day, temp_columns)

    return df_by_calendar_day

ghcn_station_id = 'USC00120784' # BLOOMINGTON INDIANA UNIV

df_bloomington = get_ghcn_station_stats(ghcn_station_id)
print(df_bloomington)



      record_max_temp  average_max_temp  record_min_temp  average_min_temp
DATE                                                                      
1                16.1          4.080000            -20.6         -4.036667
2                16.1          3.926667            -21.7         -5.436667
3                18.9          3.720000            -21.7         -4.073333
4                17.2          4.565517            -17.8         -4.730000
5                18.9          3.556667            -20.6         -5.280000
...               ...               ...              ...               ...
362              17.8          4.163333            -18.9         -4.660000
363              20.0          5.368966            -16.7         -3.679310
364              17.8          4.743333            -17.2         -3.920000
365              13.9          4.023333            -17.2         -3.443333
366              16.1          7.287500            -10.0         -2.275000

[366 rows x 4 columns]


  df = pd.read_csv(


## Part 2: Weather Visualization Plot

Develop a plot (using matplotlib) that displays for the city of choice a plot showing the record, average, and actual high and low temperatures for that year and city for each day over the calendar year.  Note: You do not need to make the chart interactive, you can just plot the data from a your city and for the year of your choice.

In [75]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import FixedTicker

def get_ghcn_station_stats_by_year(ghcn_station_id, year):
    df = pd.read_csv(
        f"s3://noaa-ghcn-pds/csv/by_station/{ghcn_station_id}.csv",
        storage_options={"anon": True},  # passed to `s3fs.S3FileSystem`
            dtype={'Q_FLAG': 'object', 'M_FLAG': 'object'},
            parse_dates=['DATE']
    ).set_index('DATE')
    
    # Filter by year
    df_year = df[df.index.year == year]
    
    # Split into TMAX and TMIN
    df_tmax = df_year[df_year['ELEMENT'] == 'TMAX']
    df_tmin = df_year[df_year['ELEMENT'] == 'TMIN']
    
    # Group by day of year and get actual values
    tmax_by_day = df_tmax.groupby(df_tmax.index.dayofyear)['DATA_VALUE'].max()
    tmin_by_day = df_tmin.groupby(df_tmin.index.dayofyear)['DATA_VALUE'].min()
    
    # Return max and min temperatures for each day
    merged = pd.DataFrame({
        'day_of_year': tmax_by_day.index,
        'max_temp': tmax_by_day.values,
        'min_temp': tmin_by_day.values
    })

    return convert_temperatures_to_celsius(merged, ['max_temp', 'min_temp'])

# Get data for 1999
df_1999 = get_ghcn_station_stats_by_year(ghcn_station_id, 1999)


output_notebook()

p = figure(
    title="Daily Temperature for Bloomington, IN (1991-2020)", 
    width=800, 
    height=500,
    x_axis_label='Date (MM/YYYY)',
    y_axis_label='Temperature (°C)')

# Record temperatures
p.varea(
    x=df_bloomington.index,
    y1=df_bloomington['record_min_temp'],
    y2=df_bloomington['record_max_temp'],
    alpha=0.3,
    color='lightblue',
    legend_label='Record Range'
)

# Average temperatures
p.varea(
    x=df_bloomington.index,
    y1=df_bloomington['average_min_temp'],
    y2=df_bloomington['average_max_temp'],
    alpha=0.5,
    color='blue',
    legend_label='Average Range'
)

# Actual temperatures for 1999
p.varea(
    x=df_1999.index,
    y1=df_1999['min_temp'],
    y2=df_1999['max_temp'],
    alpha=0.5,
    color='darkblue',
    legend_label='Actual Range'
)

# Add custom x-axis ticks
month_positions = [1, 32, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335]
month_labels = ['01/1999', '02/1999', '03/1999', '04/1999', '05/1999', '06/1999',
                '07/1999', '08/1999', '09/1999', '10/1999', '11/1999', '12/1999']

p.xaxis.ticker = FixedTicker(ticks=month_positions)
p.xaxis.major_label_overrides = dict(zip(month_positions, month_labels))

show(p)


  df = pd.read_csv(
