**Concat all datasets into one dataframe**

In [1]:
!pip freeze > requirements.txt

In [2]:
# Import 16 datasets from source
import pandas as pd
df = {}
for i in range (1, 17):
  file = f"/content/drive/MyDrive/Study/M2M Data Talent/Capstone 1/Dataset/climate_{i}.csv"
  df[i] = pd.read_csv(file, dtype={"LOCAL_YEAR": str, "LOCAL_MONTH": str})

In [3]:
# Check shape the column names and column orders before concat
for i in range (1, 17):
  print(f'df{i} shape is:', df[i].shape)

if (list(df[1].columns) == list(df[2].columns) == list(df[3].columns)
== list(df[4].columns) == list(df[5].columns) == list(df[6].columns)
== list(df[7].columns) == list(df[8].columns) == list(df[9].columns)
== list(df[10].columns) == list(df[11].columns) == list(df[12].columns)
== list(df[13].columns) == list(df[14].columns) == list(df[15].columns)
== list(df[16].columns)):
  print("All datasets have the same column names and orders")

df1 shape is: (10000, 34)
df2 shape is: (10000, 34)
df3 shape is: (10000, 34)
df4 shape is: (10000, 34)
df5 shape is: (10000, 34)
df6 shape is: (10000, 34)
df7 shape is: (10000, 34)
df8 shape is: (3900, 34)
df9 shape is: (10000, 34)
df10 shape is: (4322, 34)
df11 shape is: (10000, 34)
df12 shape is: (10000, 34)
df13 shape is: (10000, 34)
df14 shape is: (10000, 34)
df15 shape is: (10000, 34)
df16 shape is: (10000, 34)
All datasets have the same column names and orders


In [4]:
# Concat all file into one dataset
df = pd.concat([df[1],df[2],df[3],df[4],df[5],df[6],df[7],df[8], df[9], df[10],
                df[11], df[12], df[13], df[14], df[15], df[16]])
df

Unnamed: 0,x,y,LATITUDE,LONGITUDE,STATION_NAME,CLIMATE_IDENTIFIER,ID,LOCAL_DATE,LAST_UPDATED,PROVINCE_CODE,...,DAYS_WITH_PRECIP_GE_1MM,NORMAL_SNOWFALL,TOTAL_SNOWFALL,DAYS_WITH_VALID_SNOWFALL,SNOW_ON_GROUND_LAST_DAY,NORMAL_SUNSHINE,BRIGHT_SUNSHINE,DAYS_WITH_VALID_SUNSHINE,COOLING_DEGREE_DAYS,HEATING_DEGREE_DAYS
0,-112.966667,54.283333,54.283,-112.967,ABEE AGDM,3010010,32232.2020.1,Jan-20,2023-07-20 10:16,AB,...,3.0,,,0,,,,0.0,0.0,1075.4
1,-112.966667,54.283333,54.283,-112.967,ABEE AGDM,3010010,32232.2020.2,Feb-20,2023-07-20 10:07,AB,...,5.0,,,0,,,,0.0,0.0,844.5
2,-112.966667,54.283333,54.283,-112.967,ABEE AGDM,3010010,32232.2020.3,Mar-20,2023-07-20 9:51,AB,...,3.0,,,0,,,,0.0,0.0,860.9
3,-112.966667,54.283333,54.283,-112.967,ABEE AGDM,3010010,32232.2020.4,Apr-20,2023-07-20 10:16,AB,...,2.0,,,0,,,,0.0,0.0,566.3
4,-112.966667,54.283333,54.283,-112.967,ABEE AGDM,3010010,32232.2020.5,May-20,2023-07-20 10:03,AB,...,8.0,,,0,,,,0.0,0.7,252.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-130.444722,54.286111,54.286,-130.445,PRINCE RUPERT,1066482,48693.2019.11,2019-11,2019-12-01 14:50:45,BC,...,23.0,,,0,,,,0.0,0.0,329.8
9996,-130.444722,54.286111,54.286,-130.445,PRINCE RUPERT,1066482,48693.2019.12,2019-12,2020-01-01 11:32:32,BC,...,22.0,,,0,,,,0.0,0.0,441.4
9997,-130.290000,54.320278,54.320,-130.290,PRINCE RUPERT MONT CIRC,1066488,424.2015.1,2015-01,2015-02-26 17:12:55,BC,...,16.0,25.27,1.0,17,,,,,,
9998,-130.290000,54.320278,54.320,-130.290,PRINCE RUPERT MONT CIRC,1066488,424.2015.2,2015-02,2015-03-31 16:07:54,BC,...,15.0,22.47,0.0,18,,,,,,


**Cleanse dataset**

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148222 entries, 0 to 9999
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   x                          148222 non-null  float64
 1   y                          148222 non-null  float64
 2   LATITUDE                   148222 non-null  float64
 3   LONGITUDE                  148222 non-null  float64
 4   STATION_NAME               148222 non-null  object 
 5   CLIMATE_IDENTIFIER         148222 non-null  object 
 6   ID                         148222 non-null  object 
 7   LOCAL_DATE                 148222 non-null  object 
 8   LAST_UPDATED               148222 non-null  object 
 9   PROVINCE_CODE              148222 non-null  object 
 10  ENG_PROVINCE_NAME          148222 non-null  object 
 11  FRE_PROVINCE_NAME          148222 non-null  object 
 12  LOCAL_YEAR                 148222 non-null  object 
 13  LOCAL_MONTH                148222 no

In [6]:
# Change LOCAL_DATE column to have consistent dtype as datetime.It's the combination of data from LOCAL_YEAR and LOCAL_MONTH and default date as the 5th each month as recording date
df["LOCAL_DATE"] = pd.to_datetime(df["LOCAL_YEAR"] + "-" + df["LOCAL_MONTH"] + "-5")

In [7]:
# Keep structural NaN in NORMAL_ because not all stations have 30-year recording to have the baseline
# Remove any rows where DAYS_WITH_VALID < 25 days because of not reliable ("3/5 rule" by the World Meteorological Organization (WMO):
# If more than 3 consecutive daily values are missing or more than 5 daily values in total in a given month are missing)
df = df[df["DAYS_WITH_VALID_MEAN_TEMP"] >= 25]
df = df[df["DAYS_WITH_VALID_MIN_TEMP"] >= 25]
df = df[df["DAYS_WITH_VALID_MAX_TEMP"] >= 25]
df = df[df["DAYS_WITH_VALID_PRECIP"] >= 25]

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 97402 entries, 4 to 9996
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   x                          97402 non-null  float64       
 1   y                          97402 non-null  float64       
 2   LATITUDE                   97402 non-null  float64       
 3   LONGITUDE                  97402 non-null  float64       
 4   STATION_NAME               97402 non-null  object        
 5   CLIMATE_IDENTIFIER         97402 non-null  object        
 6   ID                         97402 non-null  object        
 7   LOCAL_DATE                 97402 non-null  datetime64[ns]
 8   LAST_UPDATED               97402 non-null  object        
 9   PROVINCE_CODE              97402 non-null  object        
 10  ENG_PROVINCE_NAME          97402 non-null  object        
 11  FRE_PROVINCE_NAME          97402 non-null  object        
 12  LOCAL_YEAR

In [9]:
# Keep analytical columns
df = df[["LATITUDE", "LONGITUDE", "STATION_NAME", "PROVINCE_CODE", "LOCAL_DATE",
         "LOCAL_YEAR", "LOCAL_MONTH", "MEAN_TEMPERATURE", "MIN_TEMPERATURE",
         "MAX_TEMPERATURE", "COOLING_DEGREE_DAYS", "HEATING_DEGREE_DAYS" ]]
df

Unnamed: 0,LATITUDE,LONGITUDE,STATION_NAME,PROVINCE_CODE,LOCAL_DATE,LOCAL_YEAR,LOCAL_MONTH,MEAN_TEMPERATURE,MIN_TEMPERATURE,MAX_TEMPERATURE,COOLING_DEGREE_DAYS,HEATING_DEGREE_DAYS
4,54.283,-112.967,ABEE AGDM,AB,2020-05-05,2020,5,9.877419,-4.6,25.4,0.7,252.5
6,54.283,-112.967,ABEE AGDM,AB,2020-07-05,2020,7,17.325806,7.2,28.7,16.5,37.4
7,54.283,-112.967,ABEE AGDM,AB,2020-08-05,2020,8,16.145161,1.3,29.7,21.2,78.7
8,54.283,-112.967,ABEE AGDM,AB,2020-09-05,2020,9,10.246667,-5.1,25.2,0.0,232.6
9,54.283,-112.967,ABEE AGDM,AB,2020-10-05,2020,10,0.522581,-17.6,22.7,0.0,541.8
...,...,...,...,...,...,...,...,...,...,...,...,...
9992,54.286,-130.445,PRINCE RUPERT,BC,2019-08-05,2019,8,14.341935,7.2,22.2,0.0,113.4
9993,54.286,-130.445,PRINCE RUPERT,BC,2019-09-05,2019,9,12.796667,2.7,21.2,0.0,156.1
9994,54.286,-130.445,PRINCE RUPERT,BC,2019-10-05,2019,10,8.106452,-0.1,13.7,0.0,306.7
9995,54.286,-130.445,PRINCE RUPERT,BC,2019-11-05,2019,11,7.006667,-5.2,12.5,0.0,329.8


In [10]:
df.describe()

Unnamed: 0,LATITUDE,LONGITUDE,LOCAL_DATE,MEAN_TEMPERATURE,MIN_TEMPERATURE,MAX_TEMPERATURE,COOLING_DEGREE_DAYS,HEATING_DEGREE_DAYS
count,97402.0,97402.0,97402,97402.0,97402.0,97402.0,97402.0,97402.0
mean,51.406515,-98.44989,2019-11-16 14:25:01.915771648,4.051678,-10.048371,18.588691,9.594525,423.198332
min,41.949,-140.868,2015-01-05 00:00:00,-40.010714,-54.7,-30.8,0.0,0.0
25%,48.369,-116.848,2017-05-05 00:00:00,-4.192857,-21.5,10.0,0.0,124.1
50%,50.144,-105.538,2019-11-05 00:00:00,5.53,-6.0,19.6,0.0,370.9
75%,53.386,-77.786,2022-05-05 00:00:00,14.04,1.7,28.4,5.2,659.175
max,82.5,-52.753,2024-12-05 00:00:00,26.658065,19.1,49.6,268.4,1788.6
std,5.926261,22.736515,,11.609667,14.501634,11.247921,23.372606,331.821884


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 97402 entries, 4 to 9996
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   LATITUDE             97402 non-null  float64       
 1   LONGITUDE            97402 non-null  float64       
 2   STATION_NAME         97402 non-null  object        
 3   PROVINCE_CODE        97402 non-null  object        
 4   LOCAL_DATE           97402 non-null  datetime64[ns]
 5   LOCAL_YEAR           97402 non-null  object        
 6   LOCAL_MONTH          97402 non-null  object        
 7   MEAN_TEMPERATURE     97402 non-null  float64       
 8   MIN_TEMPERATURE      97402 non-null  float64       
 9   MAX_TEMPERATURE      97402 non-null  float64       
 10  COOLING_DEGREE_DAYS  97402 non-null  float64       
 11  HEATING_DEGREE_DAYS  97402 non-null  float64       
dtypes: datetime64[ns](1), float64(7), object(4)
memory usage: 11.7+ MB


**Data analysis and visualization**

In [12]:
from bokeh.models.formatters import NumeralTickFormatter
# Import Bokeh library
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, NumeralTickFormatter
from bokeh.palettes import Spectral, OrRd, PuBu, Viridis11, Category20c
from bokeh.io import output_notebook
output_notebook()
from bokeh.models import(HoverTool)
from bokeh.transform import factor_cmap

In [13]:
df["MONTH_NAME"] = df["LOCAL_MONTH"].map(
    {"1": "Jan", "2": "Feb", "3": "Mar", "4": "Apr",
    "5": "May", "6": "Jun", "7": "Jul", "8": "Aug",
    "9": "Sep", "10": "Oct", "11": "Nov", "12": "Dec"}
)
month_order = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]
df["MONTH_NAME"] = pd.Categorical(df["MONTH_NAME"], categories=month_order, ordered=True)

In [14]:
df

Unnamed: 0,LATITUDE,LONGITUDE,STATION_NAME,PROVINCE_CODE,LOCAL_DATE,LOCAL_YEAR,LOCAL_MONTH,MEAN_TEMPERATURE,MIN_TEMPERATURE,MAX_TEMPERATURE,COOLING_DEGREE_DAYS,HEATING_DEGREE_DAYS,MONTH_NAME
4,54.283,-112.967,ABEE AGDM,AB,2020-05-05,2020,5,9.877419,-4.6,25.4,0.7,252.5,May
6,54.283,-112.967,ABEE AGDM,AB,2020-07-05,2020,7,17.325806,7.2,28.7,16.5,37.4,Jul
7,54.283,-112.967,ABEE AGDM,AB,2020-08-05,2020,8,16.145161,1.3,29.7,21.2,78.7,Aug
8,54.283,-112.967,ABEE AGDM,AB,2020-09-05,2020,9,10.246667,-5.1,25.2,0.0,232.6,Sep
9,54.283,-112.967,ABEE AGDM,AB,2020-10-05,2020,10,0.522581,-17.6,22.7,0.0,541.8,Oct
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9992,54.286,-130.445,PRINCE RUPERT,BC,2019-08-05,2019,8,14.341935,7.2,22.2,0.0,113.4,Aug
9993,54.286,-130.445,PRINCE RUPERT,BC,2019-09-05,2019,9,12.796667,2.7,21.2,0.0,156.1,Sep
9994,54.286,-130.445,PRINCE RUPERT,BC,2019-10-05,2019,10,8.106452,-0.1,13.7,0.0,306.7,Oct
9995,54.286,-130.445,PRINCE RUPERT,BC,2019-11-05,2019,11,7.006667,-5.2,12.5,0.0,329.8,Nov


In [15]:
# Scatter charts of Mean - Max - Min temperature by time
## Mean_temperature
mean_temp_scatter = figure(title="Mean temperature in 2020-2025 period",
                          x_axis_label="Time", y_axis_label="Temperature",
                          x_axis_type="datetime", height=300, width=800)

mean_temp_scatter.scatter(x=df["LOCAL_DATE"], y=df["MEAN_TEMPERATURE"], size=1,
                          color="#9E9E9E")

## Min_temperature
min_temp_scatter = figure(title="Min temperature in 2020-2025 period",
                          x_axis_label="Time", y_axis_label="Temperature",
                          x_axis_type="datetime", height=300, width=800)
min_temp_scatter.scatter(x=df["LOCAL_DATE"], y=df["MIN_TEMPERATURE"], size=1,
                         color="#A6BDDB")

## Max_temperature
max_temp_scatter = figure(title="Max temperature in 2020-2025 period",
                          x_axis_label="Time", y_axis_label="Temperature",
                          x_axis_type="datetime", height=300, width=800)
max_temp_scatter.scatter(x=df["LOCAL_DATE"], y=df["MAX_TEMPERATURE"], size=1,
                         color="#E34A33")

# Show charts in column
from bokeh.layouts import column
show(column(mean_temp_scatter, min_temp_scatter, max_temp_scatter))




**Are Canadian winters becoming milder between 2015–2024?**

In [16]:
#Mean temperature in winter months (Dec-Feb) by year
winter_mths = ["Dec","Jan","Feb"]
df_winter = df[df["MONTH_NAME"].isin(winter_mths)].groupby(["LOCAL_YEAR", "MONTH_NAME"])["MEAN_TEMPERATURE"].median().reset_index()
df_winter["LOCAL_YEAR"] = df_winter["LOCAL_YEAR"].astype(int)

colors = PuBu[4]

winter_fig = figure(title="Winter Mean Temperature by Month",
                    x_axis_label="Time", y_axis_label="Mean Temperature",
                    width=800, height=500,
                    tooltips=[("Year", "@LOCAL_YEAR"), ("Month", "@MONTH_NAME"),
                     ("Mean Temperature", "@MEAN_TEMPERATURE")])
winter_fig.xaxis.formatter = NumeralTickFormatter(format="0")
sources = {}
for i, month in enumerate(winter_mths):
  df_winter_mth = df_winter[df_winter["MONTH_NAME"] == month]
  source = ColumnDataSource(df_winter_mth)
  sources[month] = source

  winter_fig.line(x="LOCAL_YEAR", y="MEAN_TEMPERATURE",
                source=source, color=colors[i],
                  legend_label=month, line_width=2)

winter_fig.legend.click_policy = "hide"
handle = show(winter_fig, notebook_handle=True)

#Add widgets sliders for years
# from ipywidgets import IntRangeSlider, interact
# from bokeh.io import push_notebook

# min_year = df_winter["LOCAL_YEAR"].min()
# max_year = df_winter["LOCAL_YEAR"].max()

# year_range = IntRangeSlider(
#     value = [min_year, max_year],
#     min = min_year,
#     max = max_year
# )

# @interact(years = year_range)
# def update(years):
#   yr_min, yr_max = years
#   for month in winter_mths:
#     df_winter_mth = df_winter[
#         (df_winter["MONTH_NAME"] == month) &
#         (df_winter["LOCAL_YEAR"] >= yr_min) &
#         (df_winter["LOCAL_YEAR"] <= yr_max)
#     ]
  #   sources[month].data = dict(ColumnDataSource(df_winter_mth).data)
  # push_notebook(handle=handle)


  df_winter = df[df["MONTH_NAME"].isin(winter_mths)].groupby(["LOCAL_YEAR", "MONTH_NAME"])["MEAN_TEMPERATURE"].median().reset_index()


In [17]:
# Number of HDD days by years
df_HDD = df.groupby("LOCAL_YEAR")["HEATING_DEGREE_DAYS"].mean().reset_index()

HDD_fig = figure(title="Heating Degree Days by year in Canada",
                 x_axis_label="Year", y_axis_label="Degree Days",
                 x_range=list(df_HDD["LOCAL_YEAR"].unique()), width=800, height=500,
                 tooltips=[("Year", "@LOCAL_YEAR"),
                  ("Heating Degree Days", "@HEATING_DEGREE_DAYS")])

HDD_fig.vbar(x="LOCAL_YEAR", top="HEATING_DEGREE_DAYS", source=df_HDD,
             width=0.8, color="#a6bddb")
show(HDD_fig)


In [18]:
#Min temperature in Canada by month and year
df_min_temp = df.groupby("LOCAL_DATE")["MIN_TEMPERATURE"].median().reset_index()

source = ColumnDataSource(df_min_temp)

min_temp_fig = figure(title="Min temperature by month and year",
                      x_axis_label="Time", y_axis_label="Min temperature(°C)",
                      x_axis_type="datetime", width=800, height=500
)
min_temp_fig.line(x="LOCAL_DATE", y="MIN_TEMPERATURE", source=df_min_temp,
                  line_width=2, color="#a6bddb")
min_temp_fig.add_tools(HoverTool(
    tooltips = [("Time", "@LOCAL_DATE{%b %Y}"),
               ("Min temperature °C", "@MIN_TEMPERATURE{0.00}")],
    formatters = {"@LOCAL_DATE": "datetime"}
))
show(min_temp_fig)


In [19]:
#Frequency of extreme cold days
df_extreme_winter = df[df["MONTH_NAME"].isin(winter_mths)]

In [20]:
# Regional winter temperature by province
df_winter_temp = df[df["MONTH_NAME"].isin(winter_mths)].groupby(
    ["LOCAL_YEAR", "PROVINCE_CODE"])[["MEAN_TEMPERATURE", "MIN_TEMPERATURE"]].mean().reset_index()

prov_list = df["PROVINCE_CODE"].unique()
prov_colors = Category20c[len(prov_list)]

from bokeh.layouts import gridplot
plots = []

for prov in prov_list:
  df_winter_prov = df_winter_temp[df_winter_temp["PROVINCE_CODE"] == prov]
  winter_prov_fig = figure(title=f"Winter Mean vs. Min temperature of {prov}",
                           x_range=list(df_winter_temp["LOCAL_YEAR"].unique()),
                           y_range=[min(df_winter_temp["MIN_TEMPERATURE"]), max(df_winter_temp["MEAN_TEMPERATURE"])],
                           height=400)
  winter_prov_fig.line(x="LOCAL_YEAR", y="MEAN_TEMPERATURE", source=df_winter_prov,
                       legend_label="Mean temperature", color="#9E9E9E", line_width=3)
  winter_prov_fig.line(x="LOCAL_YEAR", y="MIN_TEMPERATURE", source=df_winter_prov,
                       legend_label="Min temperature", color="#A6BDDB", line_width=3)
  winter_prov_fig.add_tools(HoverTool(
      tooltips=[("Year", "@LOCAL_YEAR"),
                ("Province", "@PROVINCE_CODE"),
                ("Mean Temperature °C", "@MEAN_TEMPERATURE"),
                ("Min Temperature °C", "@MIN_TEMPERATURE")]
  ))
  winter_prov_fig.legend.orientation = "horizontal"
  winter_prov_fig.legend.border_line_color = None
  legend = winter_prov_fig.legend[0]
  winter_prov_fig.add_layout(legend, "above")

  plots.append(winter_prov_fig)
grid = gridplot(plots, ncols=4, sizing_mode="scale_both")

show(grid)


**Are Canadian summers becoming hotter or more variable?**

In [21]:
#Mean temperature in summer months (Jun-Aug) by year
summer_mths = ["Jun","Jul","Aug"]
df_summer = df[df["MONTH_NAME"].isin(summer_mths)].groupby(["LOCAL_YEAR", "MONTH_NAME"])["MEAN_TEMPERATURE"].median().reset_index()
df_summer["LOCAL_YEAR"] = df_summer["LOCAL_YEAR"].astype(int)

colors = OrRd[3]

summer_fig = figure(title="Summer Mean Temperature by Month",
                    x_axis_label="Time", y_axis_label="Mean Temperature",
                    width=800, height=500,
                    tooltips=[("Year", "@LOCAL_YEAR"), ("Month", "@MONTH_NAME"),
                     ("Mean Temperature", "@MEAN_TEMPERATURE")])
summer_fig.xaxis.formatter = NumeralTickFormatter(format="0")
sources = {}
for i, month in enumerate(summer_mths):
  df_summer_mth = df_summer[df_winter["MONTH_NAME"] == month]
  source = ColumnDataSource(df_summer_mth)
  sources[month] = source

  summer_fig.line(x="LOCAL_YEAR", y="MEAN_TEMPERATURE",
                source=source, color=colors[i],
                  legend_label=month, line_width=2)

summer_fig.legend.click_policy = "hide"
handle = show(summer_fig, notebook_handle=True)

# #Add widgets sliders for years
# from ipywidgets import IntRangeSlider, interact
# from bokeh.io import push_notebook

# min_year = df_summer["LOCAL_YEAR"].min()
# max_year = df_summer["LOCAL_YEAR"].max()

# year_range = IntRangeSlider(
#     value = [min_year, max_year],
#     min = min_year,
#     max = max_year
# )

# @interact(years = year_range)
# def update(years):
#   yr_min, yr_max = years
#   for month in summer_mths:
#     df_summer_mth = df_summer[
#         (df_summer["MONTH_NAME"] == month) &
#         (df_summer["LOCAL_YEAR"] >= yr_min) &
#         (df_summer["LOCAL_YEAR"] <= yr_max)
#     ]
  #   sources[month].data = dict(ColumnDataSource(df_summer_mth).data)
  # push_notebook(handle=handle)

  df_summer = df[df["MONTH_NAME"].isin(summer_mths)].groupby(["LOCAL_YEAR", "MONTH_NAME"])["MEAN_TEMPERATURE"].median().reset_index()


In [22]:
# Number of CDD days by years
df_CDD = df.groupby("LOCAL_YEAR")["COOLING_DEGREE_DAYS"].mean().reset_index()

CDD_fig = figure(title="Cooling Degree Days by year in Canada",
                 x_axis_label="Year", y_axis_label="Degree Days",
                 x_range=list(df_CDD["LOCAL_YEAR"].unique()),
                 width=800, height=500,
                 tooltips=[("Year", "@LOCAL_YEAR"),
                  ("Cooling Degree Days", "@COOLING_DEGREE_DAYS")])

CDD_fig.vbar(x="LOCAL_YEAR", top="COOLING_DEGREE_DAYS", source=df_CDD,
             width=0.8, color="#FC8D09")
show(CDD_fig)

In [23]:
#Max temperature in Canada by month and year
df_max_temp = df.groupby("LOCAL_DATE")["MAX_TEMPERATURE"].median().reset_index()

source = ColumnDataSource(df_max_temp)

max_temp_fig = figure(title="Max temperature by month and year",
                      x_axis_label="Time", y_axis_label="Max temperature(°C)",
                      x_axis_type="datetime", width=800, height=500
)
max_temp_fig.line(x="LOCAL_DATE", y="MAX_TEMPERATURE", source=df_max_temp,
                  line_width=2, color="#FC8D09")
max_temp_fig.add_tools(HoverTool(
    tooltips = [("Time", "@LOCAL_DATE{%b %Y}"),
               ("Max temperature °C", "@MAX_TEMPERATURE{0.00}")],
    formatters = {"@LOCAL_DATE": "datetime"}
))
show(max_temp_fig)

In [24]:
# Regional summer temperature
df_summer_temp = df[df["MONTH_NAME"].isin(summer_mths)].groupby(["LOCAL_YEAR", "PROVINCE_CODE"])[["MEAN_TEMPERATURE", "MAX_TEMPERATURE"]].mean().reset_index()

plots = []

for prov in prov_list:
  df_summer_prov = df_summer_temp[df_summer_temp["PROVINCE_CODE"] == prov]
  summer_prov_fig = figure(title=f"Summer Mean vs. Max temperature of {prov}",
                           x_range=list(df_summer_temp["LOCAL_YEAR"].unique()),
                           y_range=[min(df_summer_temp["MEAN_TEMPERATURE"]), max(df_summer_temp["MAX_TEMPERATURE"])],
                           height=400)
  summer_prov_fig.line(x="LOCAL_YEAR", y="MEAN_TEMPERATURE", source=df_summer_prov,
                       legend_label="Mean temperature", color="#9E9E9E", line_width=3)
  summer_prov_fig.line(x="LOCAL_YEAR", y="MAX_TEMPERATURE", source=df_summer_prov,
                       legend_label="Max temperature", color="#FC8D09", line_width=3)
  summer_prov_fig.add_tools(HoverTool(
      tooltips=[("Year", "@LOCAL_YEAR"),
                ("Province", "@PROVINCE_CODE"),
                ("Mean Temperature °C", "@MEAN_TEMPERATURE"),
                ("Max Temperature °C", "@MAX_TEMPERATURE")]
  ))
  summer_prov_fig.legend.orientation = "horizontal"
  summer_prov_fig.legend.border_line_color = None
  legend = summer_prov_fig.legend[0]
  summer_prov_fig.add_layout(legend, "above")

  plots.append(summer_prov_fig)
grid = gridplot(plots, ncols=4, sizing_mode="scale_both")

show(grid)

**How do temperature trends differ across Canadian regions from 2015–2024?**

In [25]:
# Monthly Mean temperature  of provinces by month

# Dataframe of median temperature by province and month
df_temp_prov = df.groupby(["PROVINCE_CODE", "MONTH_NAME"])["MEAN_TEMPERATURE"].mean().reset_index()

# Figure
temp_prov_line = figure(title="Average temperature of each province by month",
                        x_axis_label="Month", y_axis_label="Average of Mean temperature",
                        x_range=list(df_temp_prov["MONTH_NAME"].unique()),
                        width=800, height=500)
# Loop each province to draw lines
for i, prov in enumerate(prov_list):
  df1 = df_temp_prov[df_temp_prov["PROVINCE_CODE"] == prov]
  source = ColumnDataSource(df1)
  temp_prov_line.line(x="MONTH_NAME", y="MEAN_TEMPERATURE", source=source,
                      legend_label=prov, color=prov_colors[i])
# Show tooltips
temp_prov_line.add_tools(HoverTool(
    tooltips=[
        ("Province", "@PROVINCE_CODE"),
         ("Month", "@MONTH_NAME"),
         ("Mean Temperature", "@MEAN_TEMPERATURE")
    ]
))

#Baseline of general Canada
df_temp_canada = df.groupby("MONTH_NAME")["MEAN_TEMPERATURE"].mean().reset_index()
source_canada = ColumnDataSource(df_temp_canada)
temp_prov_line.line(x="MONTH_NAME", y="MEAN_TEMPERATURE", source=source_canada,
                    legend_label="Canada", line_width=3, color="#EE6677")
# Clickable legends to focused view
temp_prov_line.legend.click_policy = "hide"
# Show visual
show(temp_prov_line)


  df_temp_prov = df.groupby(["PROVINCE_CODE", "MONTH_NAME"])["MEAN_TEMPERATURE"].mean().reset_index()
  df_temp_canada = df.groupby("MONTH_NAME")["MEAN_TEMPERATURE"].mean().reset_index()


In [26]:
# Temperature amplitude (max - min) by province
from bokeh.models import ColorBar, LinearColorMapper
from bokeh.transform import transform

df["TEMP_AMPLITUDE"] = df["MAX_TEMPERATURE"] - df["MIN_TEMPERATURE"]
df_temp_amp = df.groupby(["PROVINCE_CODE", "LOCAL_YEAR"])["TEMP_AMPLITUDE"].mean().reset_index()

source = ColumnDataSource(df_temp_amp)

years = list(df["LOCAL_YEAR"].unique())
original_palette = OrRd[9]
color_mapper = LinearColorMapper(palette=original_palette[::-1],
                                 low=df_temp_amp["TEMP_AMPLITUDE"].min(),
                                 high=df_temp_amp["TEMP_AMPLITUDE"].max())

temp_amp_fig = figure(title="Temperature amplitude heatmap by province",
                      x_range=sorted(df_temp_amp["LOCAL_YEAR"].unique()),
                      y_range=sorted(df_temp_amp["PROVINCE_CODE"].unique()),
                      width=600, height=400, x_axis_location="above",
                      tooltips=[
                          ("Year", "@LOCAL_YEAR"),
                          ("Province", "@PROVINCE_CODE"),
                          ("Temperature Amplitude", "@TEMP_AMPLITUDE")]
)
temp_amp_fig.rect(x="LOCAL_YEAR", y="PROVINCE_CODE", source=source,
                  fill_color=transform("TEMP_AMPLITUDE", color_mapper),
                  line_color=None, width=1, height=1)

color_bar = ColorBar(
    color_mapper=color_mapper
)

temp_amp_fig.add_layout(color_bar, "right")

show(temp_amp_fig)
