In [138]:
# pip install torch 
import pandas as pd
import altair as alt
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from datetime import timedelta
import numpy as np 

In [139]:
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

Salmon Detection Data

In [140]:
salmon = pd.read_csv('data/data_salmon.csv')
salmon['earliest_detect_date'] = pd.to_datetime(salmon['earliest_detect_date'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce') # convert to date data type 
salmon['earliest_detect_date'] = pd.to_datetime(salmon['earliest_detect_date']).dt.floor('d') # keep date but not time 
salmon.head()

Unnamed: 0,watershed,location,earliest_detect_date,tagid
0,cowichan,202,2021-07-27,989.001039
1,cowichan,202,2023-06-14,989.001039
2,cowichan,202,2023-06-15,989.001039
3,cowichan,202,2023-06-14,989.001039
4,cowichan,202,2023-06-14,989.001039


In [141]:
locations = salmon['watershed'].unique()

charts = []

for loc in locations:
    salmon_filtered = salmon[salmon['watershed'] == loc]
    
    chart = alt.Chart(salmon_filtered).mark_bar().encode(
        x=alt.X('earliest_detect_date:T', title='Date'), 
        y=alt.Y('count()', title='Number of Records'),  
        tooltip=[alt.Tooltip('earliest_detect_date:T', title='Date'), alt.Tooltip('count()', title='Records')]
    ).properties(
        title=f'Detections for {loc}'
    )
    
    charts.append(chart)

for chart in charts:
    chart.display()

EDA of the detection data. Since only Cowichan and Englishman have sufficient data, the model building will only consider these two watersheds. 

Flow Data

| Location    | Station Number |
|-------------|----------------|
| cowichan    | 08HA003        |
| englishman  | 08HA001        |

In [142]:
flow_ori = pd.read_csv('data/flow_2023.csv')
flow_columns = [col for col in flow_ori.columns if 'FLOW' in col]
flow_long = pd.melt(flow_ori, id_vars=["STATION_NUMBER", "YEAR", "MONTH"], value_vars=flow_columns, var_name="FlowType", value_name="FlowValue")
flow_long['FlowType'] = flow_long['FlowType'].str.replace("FLOW", "").astype(int)
sorted_flow = flow_long.sort_values(by=["STATION_NUMBER", "YEAR", "MONTH", "FlowType"])
sorted_flow.rename(columns={'FlowType': 'DAY'}, inplace=True)
sorted_flow = sorted_flow.dropna()
sorted_flow['Date'] = pd.to_datetime(sorted_flow[['YEAR', 'MONTH', 'DAY']])
sorted_flow = sorted_flow.reset_index(drop=True)
sorted_flow.head()

Unnamed: 0,STATION_NUMBER,YEAR,MONTH,DAY,FlowValue,Date
0,08HA001,2018,1,1,24.200001,2018-01-01
1,08HA001,2018,1,2,19.9,2018-01-02
2,08HA001,2018,1,3,17.4,2018-01-03
3,08HA001,2018,1,4,16.0,2018-01-04
4,08HA001,2018,1,5,28.700001,2018-01-05


In [143]:
df_filtered = sorted_flow[sorted_flow['STATION_NUMBER'] == "08HA003"]
cowichan_flow_chart = alt.Chart(df_filtered).mark_line(point=False, color = 'red').encode(
    x='Date:T',  
    y='FlowValue:Q',  
    tooltip=['YEAR', 'MONTH', 'FlowValue']  
).properties(
    width=600,
    height=300,
    title='Flow of Cowichan'
)

cowichan_flow_chart 

Temperature Data 

In [144]:
tem_cow = pd.read_csv('data/northcochiwan.csv')
tem_cow['UTC_DATE'] = pd.to_datetime(tem_cow['UTC_DATE']).dt.normalize()
tem_cow_daily = tem_cow.groupby('UTC_DATE').mean().reset_index() 
tem_cow_daily = tem_cow_daily.drop(['Unnamed: 0', 'WINDCHILL'], axis=1)
tem_cow_daily.head() 

Unnamed: 0,UTC_DATE,TEMP,RELATIVE_HUMIDITY,WIND_SPEED,DEW_POINT_TEMP
0,2018-04-02,4.516667,63.625,3.625,-2.516667
1,2018-04-03,6.229167,64.75,6.041667,-0.191667
2,2018-04-04,5.308333,81.458333,1.083333,2.245833
3,2018-04-05,6.4375,95.708333,2.208333,5.779167
4,2018-04-06,9.470833,95.291667,2.75,8.741667


In [145]:
chartcctemp = alt.Chart(tem_cow_daily).mark_line(point=False, color = 'yellow').encode(
    x='UTC_DATE:T',  
    y='TEMP:Q',  
    tooltip=['UTC_DATE', 'TEMP', 'RELATIVE_HUMIDITY', 'WIND_SPEED', 'DEW_POINT_TEMP']  
).properties(
    width=600,
    height=300,
    title='Water Temperature of Cochiwen'
)
chartcctemp

Cowichan df 

In [146]:
tem_cow_daily.rename(columns={'UTC_DATE': 'Date'}, inplace=True)
tem_cow_daily.tail() 

Unnamed: 0,Date,TEMP,RELATIVE_HUMIDITY,WIND_SPEED,DEW_POINT_TEMP
2230,2024-05-10,17.1125,55.541667,3.125,6.65
2231,2024-05-11,17.7625,57.791667,2.958333,8.041667
2232,2024-05-12,16.395833,64.25,3.416667,8.645833
2233,2024-05-13,14.641667,66.833333,3.833333,7.933333
2234,2024-05-14,15.0,68.571429,2.285714,8.928571


In [147]:
flow_cow = sorted_flow[sorted_flow['STATION_NUMBER'] == '08HA003'][['Date', 'FlowValue']].reset_index(drop=True)
flow_cow.tail() 

Unnamed: 0,Date,FlowValue
2186,2023-12-27,15.4
2187,2023-12-28,14.1
2188,2023-12-29,15.0
2189,2023-12-30,13.0
2190,2023-12-31,11.0


In [148]:
salmon_cow_long = salmon[salmon['watershed']== 'cowichan'][['earliest_detect_date', 'tagid']]
salmon_cow = salmon_cow_long.groupby('earliest_detect_date').agg(count=('tagid', 'nunique')).reset_index()
salmon_cow.rename(columns={'earliest_detect_date': 'Date'}, inplace=True)
salmon_cow.tail() 

Unnamed: 0,Date,count
61,2023-08-30,1
62,2023-09-04,1
63,2023-09-18,1
64,2023-09-21,2
65,2023-10-18,4


In [149]:
firstmerge = pd.merge(tem_cow_daily, flow_cow, on='Date', how='left')
full_cow = pd.merge(firstmerge, salmon_cow, on='Date', how='left')

In [150]:
full_cow.head()

Unnamed: 0,Date,TEMP,RELATIVE_HUMIDITY,WIND_SPEED,DEW_POINT_TEMP,FlowValue,count
0,2018-04-02,4.516667,63.625,3.625,-2.516667,4.78,
1,2018-04-03,6.229167,64.75,6.041667,-0.191667,4.45,
2,2018-04-04,5.308333,81.458333,1.083333,2.245833,4.46,
3,2018-04-05,6.4375,95.708333,2.208333,5.779167,7.85,
4,2018-04-06,9.470833,95.291667,2.75,8.741667,14.5,


Build a Multivariant CNN 
- the following scratch is build using the framework in DSCI574- Lecture 6: Advanced Time Series Modelling 

In [151]:
# For simplicity, only using 2022, 2023 data for now 
full_cow2223 = full_cow[full_cow['Date'].dt.year.isin([2022, 2023])].reset_index()
full_cow2223.drop('index', axis=1, inplace=True)
full_cow2223['count'] = full_cow2223['count'].fillna(0)
full_cow2223.rename({'TEMP': 'temperature', 'RELATIVE_HUMIDITY': 'relative_humidity', 'WIND_SPEED': 'wind_speed', 'DEW_POINT_TEMP': 'dew_point_temperature', 'FlowValue': 'flow_value', 'count': 'salmon_count'}, axis=1, inplace=True)

full_cow2223.head() 

Unnamed: 0,Date,temperature,relative_humidity,wind_speed,dew_point_temperature,flow_value,salmon_count
0,2022-01-01,-6.220833,89.875,0.583333,-7.6125,5.6,0.0
1,2022-01-02,2.170833,93.083333,4.083333,1.154167,7.47,0.0
2,2022-01-03,2.579167,94.916667,3.833333,1.825,14.1,0.0
3,2022-01-04,0.7625,95.125,3.375,0.058333,12.5,0.0
4,2022-01-05,-0.2,96.833333,1.541667,-0.645833,9.64,0.0


In [152]:
# visualize all data 
full_cow2223_long = full_cow2223.melt('Date', var_name='Measurement', value_name='Value')

chart_full_cow2223 = alt.Chart(full_cow2223_long).mark_line().encode(
    x='Date:T',
    y='Value:Q',
    color='Measurement:N'
).properties(
    width=600,
    height=400
)
chart_full_cow2223 

In [153]:
# creating lag data 
full_cow2223.set_index('Date', inplace=True)
def lag_df(df, lag=1):
    lagged_df = pd.DataFrame()
    for col in df.columns:
        for i in range(lag + 1):
            lagged_df[f'{col}_t-{i}'] = df[col].shift(i)
    return lagged_df.dropna()

SEQUENCE_LENGTH = 5
cnn_data = lag_df(full_cow2223, lag=SEQUENCE_LENGTH)
cnn_data.head() 

Unnamed: 0_level_0,temperature_t-0,temperature_t-1,temperature_t-2,temperature_t-3,temperature_t-4,temperature_t-5,relative_humidity_t-0,relative_humidity_t-1,relative_humidity_t-2,relative_humidity_t-3,...,flow_value_t-2,flow_value_t-3,flow_value_t-4,flow_value_t-5,salmon_count_t-0,salmon_count_t-1,salmon_count_t-2,salmon_count_t-3,salmon_count_t-4,salmon_count_t-5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-06,-0.779167,-0.2,0.7625,2.579167,2.170833,-6.220833,97.625,96.833333,95.125,94.916667,...,12.5,14.1,7.47,5.6,0.0,0.0,0.0,0.0,0.0,0.0
2022-01-07,0.7625,-0.779167,-0.2,0.7625,2.579167,2.170833,98.833333,97.625,96.833333,95.125,...,9.64,12.5,14.1,7.47,0.0,0.0,0.0,0.0,0.0,0.0
2022-01-08,-1.275,0.7625,-0.779167,-0.2,0.7625,2.579167,96.958333,98.833333,97.625,96.833333,...,10.1,9.64,12.5,14.1,0.0,0.0,0.0,0.0,0.0,0.0
2022-01-09,2.45,-1.275,0.7625,-0.779167,-0.2,0.7625,92.333333,96.958333,98.833333,97.625,...,28.5,10.1,9.64,12.5,0.0,0.0,0.0,0.0,0.0,0.0
2022-01-10,-0.120833,2.45,-1.275,0.7625,-0.779167,-0.2,98.5,92.333333,96.958333,98.833333,...,21.700001,28.5,10.1,9.64,0.0,0.0,0.0,0.0,0.0,0.0


In [154]:
X_train = cnn_data.drop(columns=["temperature_t-0", "relative_humidity_t-0", "wind_speed_t-0", "dew_point_temperature_t-0", "flow_value_t-0", "salmon_count_t-0"]).to_numpy().reshape(-1,6,SEQUENCE_LENGTH)
y_train = cnn_data[["temperature_t-0", "relative_humidity_t-0", "wind_speed_t-0", "dew_point_temperature_t-0", "flow_value_t-0", "salmon_count_t-0"]].to_numpy()

In [155]:
dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                        torch.tensor(y_train, dtype=torch.float32))
BATCH_SIZE = 16
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

X, y = next(iter(dataloader))
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: torch.Size([16, 6, 5])
y shape: torch.Size([16, 6])


In [156]:
class CNN(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.main = nn.Sequential(
            nn.Conv1d(input_size, 24, kernel_size=3),  
            nn.ReLU(),
            nn.BatchNorm1d(24),
            nn.MaxPool1d(3),                         
            nn.Flatten(),                            
            nn.Linear(24, 50),                       
            nn.ReLU(),
            nn.Linear(50, output_size)               
        )

    def forward(self, x):
        return self.main(x)

In [157]:
model = CNN(input_size=6, output_size=6)
optimizer = optim.Adam(model.parameters())
criterion = nn.MSELoss()

# Training
EPOCHS = 200
for epoch in range(1, EPOCHS + 1):
    for X_batch, y_batch in dataloader:
        optimizer.zero_grad()
        y_hat = model(X_batch)
        loss = criterion(y_hat, y_batch)
        loss.backward()
        optimizer.step()
    if epoch % 20 == 0:
        print(f"Epoch {epoch}. Loss = {loss.item():.2f}")

Epoch 20. Loss = 26.15
Epoch 40. Loss = 23.31
Epoch 60. Loss = 9.78
Epoch 80. Loss = 25.39
Epoch 100. Loss = 49.89
Epoch 120. Loss = 150.29
Epoch 140. Loss = 17.38
Epoch 160. Loss = 14.32
Epoch 180. Loss = 17.08
Epoch 200. Loss = 16.42


In [158]:
forecast_index = pd.date_range(start=full_cow2223.index[-1] + timedelta(days=14), periods=36, freq='2W')
forecast_index

DatetimeIndex(['2024-01-14', '2024-01-28', '2024-02-11', '2024-02-25',
               '2024-03-10', '2024-03-24', '2024-04-07', '2024-04-21',
               '2024-05-05', '2024-05-19', '2024-06-02', '2024-06-16',
               '2024-06-30', '2024-07-14', '2024-07-28', '2024-08-11',
               '2024-08-25', '2024-09-08', '2024-09-22', '2024-10-06',
               '2024-10-20', '2024-11-03', '2024-11-17', '2024-12-01',
               '2024-12-15', '2024-12-29', '2025-01-12', '2025-01-26',
               '2025-02-09', '2025-02-23', '2025-03-09', '2025-03-23',
               '2025-04-06', '2025-04-20', '2025-05-04', '2025-05-18'],
              dtype='datetime64[ns]', freq='2W-SUN')

In [159]:
def recursive_CNN_forecast(input_data, model, n=20, responses=1):
    forecast = np.empty((n, responses))
    for i, n in enumerate(range(n)):
        forecast[i] = model(input_data).detach().numpy()
        input_data = torch.cat((torch.tensor([forecast[i]], dtype=torch.float32).unsqueeze(-1),
                                input_data[:, :, :-1]), -1)
    return forecast

In [160]:
input_data = torch.tensor(X_train[-1, :], dtype=torch.float32).unsqueeze(0)
cnn_multi = pd.DataFrame(recursive_CNN_forecast(input_data, model, n=36, responses=6),
                         columns=["temperature", "relative_humidity", "wind_speed", "dew_point_temperature", "flow_value", "salmon_count"],
                         index=forecast_index)


In [161]:
df = cnn_multi.reset_index().rename(columns={'index': 'Date'})
df['Date'] = pd.to_datetime(df['Date'])
df_melted = df.melt("Date", var_name='Variable', value_name='Value')

prediction_chart = alt.Chart(df_melted).mark_line().encode(
    x='Date:T',
    y='Value:Q',
    color='Variable:N',
    tooltip=['Date:T', 'Variable:N', 'Value:Q']
).properties(
    width=650,
    height=400,
    title="Prediction") 

prediction_chart