In [146]:
import pandas as pd
import altair as alt
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from datetime import timedelta
import numpy as np 
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

# Read In Data 

Flow Data

| Location    | Station Number |
|-------------|----------------|
| cowichan    | 08HA003        |
| englishman  | 08HA001        |

In [147]:
flow_ori = pd.read_csv('data/flow_2023.csv')
flow_columns = [col for col in flow_ori.columns if 'FLOW' in col]
flow_long = pd.melt(flow_ori, id_vars=["STATION_NUMBER", "YEAR", "MONTH"], value_vars=flow_columns, var_name="FlowType", value_name="FlowValue")
flow_long['FlowType'] = flow_long['FlowType'].str.replace("FLOW", "").astype(int)
sorted_flow = flow_long.sort_values(by=["STATION_NUMBER", "YEAR", "MONTH", "FlowType"])
sorted_flow.rename(columns={'FlowType': 'DAY'}, inplace=True)
sorted_flow = sorted_flow.dropna()
sorted_flow['Date'] = pd.to_datetime(sorted_flow[['YEAR', 'MONTH', 'DAY']])
sorted_flow = sorted_flow.reset_index(drop=True)

In [148]:
# select only cowichan 
flow_cow = sorted_flow[sorted_flow['STATION_NUMBER'] == '08HA003']

Temperature Data 

In [149]:
tem_cow = pd.read_csv('data/northcochiwan_daily_temp-2.csv')
tem_cow['UTC_DATE'] = pd.to_datetime(tem_cow['UTC_DATE'])
tem_cow.set_index('UTC_DATE', inplace=True)
tem_cow.rename_axis('Date', inplace=True)
tem_cow.head() 

Unnamed: 0_level_0,RELATIVE_HUMIDITY,WIND_SPEED,TEMP,WINDCHILL,DEW_POINT_TEMP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-09-02,75.818182,2.727273,19.127273,,14.372727
2013-09-03,83.125,2.458333,18.045833,,14.766667
2013-09-04,85.791667,2.0,17.0625,,14.379167
2013-09-05,94.708333,1.541667,16.8375,,15.9
2013-09-06,91.916667,1.583333,16.954167,,15.504167


Salmon Data

In [150]:
df_salmon = pd.read_csv('data/salmon_concat.csv') 
df_salmon.head() 

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co
0,2014-05-01,7,,,,,,True,False
1,2014-05-02,34,,,,,,True,False
2,2014-05-07,21,,,,,,True,False
3,2014-05-08,136,,,,,,True,False
4,2014-05-13,74,,,,,,True,False


# Create Features

discharge of oct and nov (flow)

In [151]:
# create monthly flow 
flow_cow['Date'] = pd.to_datetime(flow_cow['Date'])
flow_cow.set_index('Date', inplace=True)
monthly_average_flow = flow_cow.resample('M')['FlowValue'].mean()
monthly_average_df = monthly_average_flow.to_frame(name='AverageFlowValue')
monthly_average_df.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flow_cow['Date'] = pd.to_datetime(flow_cow['Date'])
  monthly_average_flow = flow_cow.resample('M')['FlowValue'].mean()


Unnamed: 0_level_0,AverageFlowValue
Date,Unnamed: 1_level_1
2013-01-31,14.82129
2013-02-28,14.317143
2013-03-31,21.153548
2013-04-30,9.061333
2013-05-31,2.744194


In [152]:
# create last winter flow 
df_filtered = monthly_average_df[monthly_average_df.index.month.isin([10, 11])]
annual_average_flow = df_filtered.resample('A').mean()
annual_average_flow.index = annual_average_flow.index.year  
annual_average_flow.index = pd.to_datetime(annual_average_flow.index, format='%Y') + pd.DateOffset(years=1)
annual_average_flow.rename(columns={'AverageFlowValue': 'LastWinterFlow'}, inplace=True)
annual_average_flow.index = annual_average_flow.index.year 
annual_average_flow.rename_axis('Year', inplace=True)

annual_average_flow.head() 


  annual_average_flow = df_filtered.resample('A').mean()


Unnamed: 0_level_0,LastWinterFlow
Year,Unnamed: 1_level_1
2014,5.316866
2015,12.066258
2016,11.385591
2017,22.321763
2018,19.36822


discharge for mar to may (flow)

In [153]:
flow35 = monthly_average_df[monthly_average_df.index.month.isin([3, 4, 5])]
flow35['Year'] = flow35.index.year
flow35['Month'] = flow35.index.month

pivot_df = flow35.pivot_table(index='Year', columns='Month', values='AverageFlowValue')
df_combined = annual_average_flow.merge(pivot_df, on='Year', how='outer')

df_combined.rename(columns={3: 'marFlow'}, inplace=True)
df_combined.rename(columns={4: 'aprFlow'}, inplace=True)
df_combined.rename(columns={5: 'mayFlow'}, inplace=True)

df_combined.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flow35['Year'] = flow35.index.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flow35['Month'] = flow35.index.month


Unnamed: 0_level_0,LastWinterFlow,marFlow,aprFlow,mayFlow
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,,21.153548,9.061333,2.744194
2014,5.316866,27.383871,8.355,3.838387
2015,12.066258,11.441936,5.063667,1.485129
2016,11.385591,29.127097,3.260333,0.955323
2017,22.321763,27.607419,17.052667,3.889355


temp for dec - feb

In [154]:
# create monthly temp 
monthly_average_temp = tem_cow.resample('M')['TEMP'].mean()
monthly_average_temp = monthly_average_temp.to_frame(name='AverageTemp')

  monthly_average_temp = tem_cow.resample('M')['TEMP'].mean()


In [155]:
# last winter temp 
df_filtered_temp = monthly_average_temp[monthly_average_temp.index.month.isin([12, 1, 2])]
df_filtered_temp['RollingMean'] = df_filtered_temp['AverageTemp'].rolling(window=3, min_periods=3).mean()
filtered_mean_temp = df_filtered_temp[df_filtered_temp.index.month.isin([2])]
filtered_mean_temp = filtered_mean_temp.drop('AverageTemp', axis=1)

filtered_mean_temp.index = filtered_mean_temp.index.year  # Adjust index to show only the year
filtered_mean_temp.rename_axis('Year', inplace=True)
filtered_mean_temp.rename(columns={'RollingMean': 'LastWinterTemp'}, inplace=True)

filtered_mean_temp.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_temp['RollingMean'] = df_filtered_temp['AverageTemp'].rolling(window=3, min_periods=3).mean()


Unnamed: 0_level_0,LastWinterTemp
Year,Unnamed: 1_level_1
2014,2.82403
2015,5.555037
2016,4.686489
2017,1.328632
2018,3.191699


temp for mar - may 

In [156]:
df_filtered_marmay = monthly_average_temp[monthly_average_temp.index.month.isin([3,4,5])]

df_filtered_marmay['Year'] = df_filtered_marmay.index.year
df_filtered_marmay['Month'] = df_filtered_marmay.index.month

pivot_df_temp = df_filtered_marmay.pivot_table(index='Year', columns='Month', values='AverageTemp')

df_combined_temp = filtered_mean_temp.merge(pivot_df_temp, on='Year', how='outer')

df_combined_temp.rename(columns={3: 'marTemp'}, inplace=True)
df_combined_temp.rename(columns={4: 'aprTemp'}, inplace=True)
df_combined_temp.rename(columns={5: 'mayTemp'}, inplace=True)
df_combined_temp.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_marmay['Year'] = df_filtered_marmay.index.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_marmay['Month'] = df_filtered_marmay.index.month


Unnamed: 0_level_0,LastWinterTemp,marTemp,aprTemp,mayTemp
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,2.82403,5.937401,9.200833,13.870775
2015,5.555037,7.991129,8.587107,14.717418
2016,4.686489,7.5125,11.871944,14.515945
2017,1.328632,5.671237,8.566111,13.086828
2018,3.191699,5.477688,9.018712,15.219624


In [157]:
# merge flow and temp 
flowTemp = df_combined_temp.merge(df_combined, on='Year', how='outer')
flowTemp.head() 

Unnamed: 0_level_0,LastWinterTemp,marTemp,aprTemp,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013,,,,,,21.153548,9.061333,2.744194
2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
2015,5.555037,7.991129,8.587107,14.717418,12.066258,11.441936,5.063667,1.485129
2016,4.686489,7.5125,11.871944,14.515945,11.385591,29.127097,3.260333,0.955323
2017,1.328632,5.671237,8.566111,13.086828,22.321763,27.607419,17.052667,3.889355


In [158]:
df_salmon['date'] = pd.to_datetime(df_salmon['date'])  

mask = ((df_salmon['date'].dt.month > 4) | ((df_salmon['date'].dt.month == 4) & (df_salmon['date'].dt.day >= 15))) & \
       ((df_salmon['date'].dt.month < 10) | ((df_salmon['date'].dt.month == 10) & (df_salmon['date'].dt.day <= 15)))

df_filtered = df_salmon.loc[mask]

start_year = df_filtered['date'].dt.year.min()
end_year = df_filtered['date'].dt.year.max()

all_dates = []
for year in range(start_year, end_year + 1):
    start_date = f"{year}-04-15"
    end_date = f"{year}-10-15"
    all_dates.extend(pd.date_range(start=start_date, end=end_date))

existing_dates = set(df_filtered['date'])
missing_dates = [date for date in all_dates if date not in existing_dates]

df_missing = pd.DataFrame({
    'date': missing_dates,
    'count': [0] * len(missing_dates),
    '70.2': [0] * len(missing_dates),
    'cow bay': [0] * len(missing_dates),
    'mainstem fence': [0] * len(missing_dates),
    'skutz': [0] * len(missing_dates),
    'vimy pool': [0] * len(missing_dates),
    'ck': [False] * len(missing_dates),
    'co': [False] * len(missing_dates)
})

df_final = pd.concat([df_filtered, df_missing]).sort_values('date')

In [159]:
df_final.replace({True: 1, False: 0, np.nan: 0}, inplace=True)
df_final.fillna(0, inplace=True)
df_final.head()

  df_final.replace({True: 1, False: 0, np.nan: 0}, inplace=True)


Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0


In [160]:
df_final['Year'] = df_final['date'].dt.year
df_final.head() 

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,Year
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0,2014
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0,2014
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0,2014
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0,2014
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0,2014


In [161]:
wMacro = df_final.merge(flowTemp, how='left', left_on='Year', right_index=True)
wMacro.head() 

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,Year,LastWinterTemp,marTemp,aprTemp,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387


In [162]:
df_joined = pd.merge(wMacro, tem_cow, how='left', left_on='date', right_index = True)
df_joined.head() 

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,Year,...,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow,RELATIVE_HUMIDITY,WIND_SPEED,TEMP,WINDCHILL,DEW_POINT_TEMP
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,13.870775,5.316866,27.383871,8.355,3.838387,52.416667,10.375,11.6375,,1.929167
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,13.870775,5.316866,27.383871,8.355,3.838387,88.291667,4.333333,9.508333,,7.583333
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,13.870775,5.316866,27.383871,8.355,3.838387,88.208333,6.041667,9.975,,8.05
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,13.870775,5.316866,27.383871,8.355,3.838387,63.291667,8.041667,9.441667,,2.55
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,13.870775,5.316866,27.383871,8.355,3.838387,75.25,3.25,8.529167,,4.15


In [163]:
df_joined.drop(['RELATIVE_HUMIDITY', 'WIND_SPEED', 'WINDCHILL', 'DEW_POINT_TEMP'], axis=1, inplace=True)
df_joined.head() 

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,Year,LastWinterTemp,marTemp,aprTemp,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow,TEMP
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,11.6375
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,9.508333
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,9.975
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,9.441667
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,8.529167


In [164]:
df_complete = pd.merge(df_joined, flow_cow, how='left', left_on='date', right_index = True)
df_complete

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,Year,...,LastWinterFlow,marFlow,aprFlow,mayFlow,TEMP,STATION_NUMBER,YEAR,MONTH,DAY,FlowValue
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,5.316866,27.383871,8.355000,3.838387,11.637500,08HA003,2014,4,15,4.300
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,5.316866,27.383871,8.355000,3.838387,9.508333,08HA003,2014,4,16,4.570
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,5.316866,27.383871,8.355000,3.838387,9.975000,08HA003,2014,4,17,6.020
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,5.316866,27.383871,8.355000,3.838387,9.441667,08HA003,2014,4,18,7.470
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,5.316866,27.383871,8.355000,3.838387,8.529167,08HA003,2014,4,19,6.110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423,2023-10-11,0,0.0,0.0,0.0,0.0,0.0,0,0,2023,...,1.746417,7.648065,12.996333,2.918065,12.458333,08HA003,2023,10,11,0.571
1424,2023-10-12,0,0.0,0.0,0.0,0.0,0.0,0,0,2023,...,1.746417,7.648065,12.996333,2.918065,9.975000,08HA003,2023,10,12,0.597
1425,2023-10-13,0,0.0,0.0,0.0,0.0,0.0,0,0,2023,...,1.746417,7.648065,12.996333,2.918065,10.087500,08HA003,2023,10,13,0.524
1426,2023-10-14,0,0.0,0.0,0.0,0.0,0.0,0,0,2023,...,1.746417,7.648065,12.996333,2.918065,12.341667,08HA003,2023,10,14,0.496


In [165]:
df_complete.drop(['STATION_NUMBER', 'YEAR', 'DAY', 'Year'], axis=1, inplace=True)
df_complete

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,LastWinterTemp,marTemp,aprTemp,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow,TEMP,MONTH,FlowValue
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,11.637500,4,4.300
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,9.508333,4,4.570
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,9.975000,4,6.020
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,9.441667,4,7.470
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,8.529167,4,6.110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423,2023-10-11,0,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,7.577862,15.933737,1.746417,7.648065,12.996333,2.918065,12.458333,10,0.571
1424,2023-10-12,0,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,7.577862,15.933737,1.746417,7.648065,12.996333,2.918065,9.975000,10,0.597
1425,2023-10-13,0,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,7.577862,15.933737,1.746417,7.648065,12.996333,2.918065,10.087500,10,0.524
1426,2023-10-14,0,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,7.577862,15.933737,1.746417,7.648065,12.996333,2.918065,12.341667,10,0.496


In [166]:
def lag_df(df, date_col, columns_to_lag, min_lag, max_lag):
    df = df.sort_values(date_col)
    lagged_df = pd.DataFrame(index=df.index)
    
    for col in columns_to_lag:
        for i in range(min_lag, max_lag + 1):
            lagged_name = f'{col}_t-{i}'  
            lagged_df[lagged_name] = df[col].shift(i)
    
    result_df = pd.concat([df, lagged_df], axis=1)
    result_df.dropna(inplace=True)

    return result_df

In [167]:
columns_to_lag = ['TEMP', 'FlowValue']
lagged_data = lag_df(df_complete, 'date', columns_to_lag, min_lag=30, max_lag=40)
lagged_data.head() 

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,LastWinterTemp,...,FlowValue_t-31,FlowValue_t-32,FlowValue_t-33,FlowValue_t-34,FlowValue_t-35,FlowValue_t-36,FlowValue_t-37,FlowValue_t-38,FlowValue_t-39,FlowValue_t-40
9,2014-05-25,36,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,...,9.09,6.5,6.99,6.97,6.98,6.11,7.47,6.02,4.57,4.3
10,2014-05-26,351,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,...,11.0,9.09,6.5,6.99,6.97,6.98,6.11,7.47,6.02,4.57
11,2014-05-27,803,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,...,9.09,11.0,9.09,6.5,6.99,6.97,6.98,6.11,7.47,6.02
12,2014-05-28,698,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,...,10.0,9.09,11.0,9.09,6.5,6.99,6.97,6.98,6.11,7.47
31,2014-05-29,0,0.0,0.0,0.0,0.0,0.0,0,0,2.82403,...,10.5,10.0,9.09,11.0,9.09,6.5,6.99,6.97,6.98,6.11


In [168]:
cols = [col for col in lagged_data.columns if col != 'count'] + ['count']
lagged_data = lagged_data[cols]

# Modeling 

In [169]:
X = lagged_data.drop(columns=["count", 'date'])
y = lagged_data["count"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
) 

In [170]:
pipe = make_pipeline(StandardScaler(), Ridge())

In [171]:
pipe.fit(X_train, y_train)

In [172]:
kk = pipe.predict(X_test)

In [173]:
predict = pd.DataFrame(kk, columns = ['prediction']) 

In [174]:
test = pd.DataFrame(y_test).reset_index() 

In [175]:
df_horizontal = pd.concat([test, predict], axis=1)
df_horizontal.head() 

Unnamed: 0,index,count,prediction
0,800,0,-31.283342
1,1061,0,-13.660415
2,384,0,60.041356
3,40,3,92.376187
4,582,31,191.98499


In [176]:
df_horizontal['test_bi'] = df_horizontal['count'].apply(lambda x: 1 if x >= 1 else 0)
df_horizontal.head() 

Unnamed: 0,index,count,prediction,test_bi
0,800,0,-31.283342,0
1,1061,0,-13.660415,0
2,384,0,60.041356,0
3,40,3,92.376187,1
4,582,31,191.98499,1


In [177]:
df_sorted = df_horizontal.sort_values('prediction')
df_sorted.drop(['index'], axis=1, inplace=True)
df_sorted.head() 

Unnamed: 0,count,prediction,test_bi
289,0,-102.522089,0
159,0,-75.989889,0
348,0,-75.197285,0
70,0,-74.307642,0
145,0,-74.097901,0


In [178]:
chart = alt.Chart(df_sorted).mark_point().encode(
    x='index:Q',  
    y= 'prediction:Q',
    color='test_bi:N'  
).transform_window(
    index='count()' 
)

In [179]:
chart 

Logistic Regression 

In [180]:
lagged_data['count_bi'] = lagged_data['count'].apply(lambda x: 1 if x >= 1 else 0)
lagged_data.head() 

Unnamed: 0,date,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,LastWinterTemp,marTemp,...,FlowValue_t-33,FlowValue_t-34,FlowValue_t-35,FlowValue_t-36,FlowValue_t-37,FlowValue_t-38,FlowValue_t-39,FlowValue_t-40,count,count_bi
9,2014-05-25,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,5.937401,...,6.99,6.97,6.98,6.11,7.47,6.02,4.57,4.3,36,1
10,2014-05-26,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,5.937401,...,6.5,6.99,6.97,6.98,6.11,7.47,6.02,4.57,351,1
11,2014-05-27,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,5.937401,...,9.09,6.5,6.99,6.97,6.98,6.11,7.47,6.02,803,1
12,2014-05-28,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,5.937401,...,11.0,9.09,6.5,6.99,6.97,6.98,6.11,7.47,698,1
31,2014-05-29,0.0,0.0,0.0,0.0,0.0,0,0,2.82403,5.937401,...,9.09,11.0,9.09,6.5,6.99,6.97,6.98,6.11,0,0


In [181]:
X_1 = lagged_data.drop(columns=["count", 'date', 'count_bi', 'TEMP', 'FlowValue', 'aprTemp', 'mayTemp', 'aprFlow', 'aprTemp'])
y_1 = lagged_data["count_bi"]

X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(
    X_1, y_1, test_size=0.2, random_state=123
) 

In [182]:
pipe_1 = make_pipeline(StandardScaler(), LogisticRegression())

In [183]:
pipe_1.fit(X_1_train, y_1_train)

In [184]:
cv_scores = cross_val_score(pipe_1, X_1_train, y_1_train, cv=10)
cv_scores

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [185]:
pipe_1.score(X_1_test, y_1_test)

1.0

Logistic Regression without Shuffle 

In [186]:
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(
    X_1, y_1, test_size=0.2, shuffle= False 
) 

In [187]:
pipe_2 = make_pipeline(StandardScaler(), LogisticRegression())

In [188]:
pipe_2.fit(X_2_train, y_2_train)

In [189]:
pipe_2.score(X_2_test, y_2_test)

1.0

In [190]:
train_3 = lagged_data[lagged_data['date'].dt.year != 2023]
test_3 = lagged_data[lagged_data['date'].dt.year == 2023]

In [191]:
X_3_train = train_3.drop(columns=["count", 'date', 'count_bi', 'TEMP', 'FlowValue', 'aprTemp', 'mayTemp', 'aprFlow', 'mayFlow'])
X_3_test = test_3.drop(columns=["count", 'date', 'count_bi', 'TEMP', 'FlowValue', 'aprTemp', 'mayTemp', 'aprFlow', 'mayFlow'])
y_3_train = train_3["count_bi"]
y_3_test = test_3["count_bi"]

In [192]:
pipe_3 = make_pipeline(StandardScaler(), LogisticRegression())

In [193]:
pipe_3.fit(X_3_train, y_3_train)

In [194]:
pipe_3.score(X_3_test, y_3_test)

1.0

In [195]:
prediction3 = pd.DataFrame(pipe_3.predict(X_3_test), columns=['prediction'])

In [196]:
X_3_test

Unnamed: 0,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,LastWinterTemp,marTemp,LastWinterFlow,...,FlowValue_t-31,FlowValue_t-32,FlowValue_t-33,FlowValue_t-34,FlowValue_t-35,FlowValue_t-36,FlowValue_t-37,FlowValue_t-38,FlowValue_t-39,FlowValue_t-40
1262,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,1.746417,...,0.096,0.096,0.105,0.116,0.111,0.118,0.127,0.131,0.147,0.148
1263,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,1.746417,...,0.102,0.096,0.096,0.105,0.116,0.111,0.118,0.127,0.131,0.147
1264,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,1.746417,...,0.101,0.102,0.096,0.096,0.105,0.116,0.111,0.118,0.127,0.131
1265,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,1.746417,...,0.104,0.101,0.102,0.096,0.096,0.105,0.116,0.111,0.118,0.127
1266,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,1.746417,...,0.103,0.104,0.101,0.102,0.096,0.096,0.105,0.116,0.111,0.118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,1.746417,...,0.193,0.199,0.195,0.211,0.237,0.242,0.234,0.229,0.238,0.246
1424,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,1.746417,...,0.199,0.193,0.199,0.195,0.211,0.237,0.242,0.234,0.229,0.238
1425,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,1.746417,...,0.201,0.199,0.193,0.199,0.195,0.211,0.237,0.242,0.234,0.229
1426,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,1.746417,...,0.199,0.201,0.199,0.193,0.199,0.195,0.211,0.237,0.242,0.234


In [197]:
y_3_test_withdate = test_3[["date", "count_bi"]].reset_index()
y_3_test_withdate = y_3_test_withdate.drop(columns=["index"])

In [198]:
result = pd.merge(y_3_test_withdate, prediction3, left_index=True, right_index=True, how='inner')
result

Unnamed: 0,date,count_bi,prediction
0,2023-04-15,0,0
1,2023-04-16,0,0
2,2023-04-17,0,0
3,2023-04-18,0,0
4,2023-04-19,0,0
...,...,...,...
191,2023-10-11,0,0
192,2023-10-12,0,0
193,2023-10-13,0,0
194,2023-10-14,0,0


In [199]:
test_3 = test_3.reset_index() 
test_3 = test_3.drop(columns = ["index"])

In [200]:
test_3

Unnamed: 0,date,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,LastWinterTemp,marTemp,...,FlowValue_t-33,FlowValue_t-34,FlowValue_t-35,FlowValue_t-36,FlowValue_t-37,FlowValue_t-38,FlowValue_t-39,FlowValue_t-40,count,count_bi
0,2023-04-15,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.105,0.116,0.111,0.118,0.127,0.131,0.147,0.148,0,0
1,2023-04-16,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.096,0.105,0.116,0.111,0.118,0.127,0.131,0.147,0,0
2,2023-04-17,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.096,0.096,0.105,0.116,0.111,0.118,0.127,0.131,0,0
3,2023-04-18,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.102,0.096,0.096,0.105,0.116,0.111,0.118,0.127,0,0
4,2023-04-19,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.101,0.102,0.096,0.096,0.105,0.116,0.111,0.118,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,2023-10-11,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.195,0.211,0.237,0.242,0.234,0.229,0.238,0.246,0,0
192,2023-10-12,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.199,0.195,0.211,0.237,0.242,0.234,0.229,0.238,0,0
193,2023-10-13,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.193,0.199,0.195,0.211,0.237,0.242,0.234,0.229,0,0
194,2023-10-14,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.199,0.193,0.199,0.195,0.211,0.237,0.242,0.234,0,0


In [201]:
result = pd.merge(test_3, prediction3, left_index=True, right_index=True, how='inner')
result = result[result['date'] <= '2023-06-16']

result

Unnamed: 0,date,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,LastWinterTemp,marTemp,...,FlowValue_t-34,FlowValue_t-35,FlowValue_t-36,FlowValue_t-37,FlowValue_t-38,FlowValue_t-39,FlowValue_t-40,count,count_bi,prediction
0,2023-04-15,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.116,0.111,0.118,0.127,0.131,0.147,0.148,0,0,0
1,2023-04-16,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.105,0.116,0.111,0.118,0.127,0.131,0.147,0,0,0
2,2023-04-17,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.096,0.105,0.116,0.111,0.118,0.127,0.131,0,0,0
3,2023-04-18,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.096,0.096,0.105,0.116,0.111,0.118,0.127,0,0,0
4,2023-04-19,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,0.102,0.096,0.096,0.105,0.116,0.111,0.118,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,2023-06-12,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,1.860,2.010,2.150,2.290,2.500,2.710,2.930,0,0,0
71,2023-06-13,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,1.860,1.860,2.010,2.150,2.290,2.500,2.710,0,0,0
72,2023-06-14,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,1.730,1.860,1.860,2.010,2.150,2.290,2.500,0,0,0
73,2023-06-15,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,...,1.610,1.730,1.860,1.860,2.010,2.150,2.290,0,0,0


In [202]:
base = alt.Chart(result).encode(
    x=alt.X('date:T', axis=alt.Axis(title='Date'))  
)

line_count = base.mark_line(color='blue').encode(
    y=alt.Y('count:Q', axis=alt.Axis(title='Count'))  
)

line_prediction = base.mark_point(color='red').encode(
    y=alt.Y('prediction:Q', axis=alt.Axis(title='Prediction'), scale=alt.Scale(domain=(0, 1)))
)

chart = alt.layer(line_count, line_prediction).resolve_scale(
    y='independent'  
)

chart


In [203]:
coefficients = pipe_3.named_steps['logisticregression'].coef_[0]
coefficients

array([ 1.87530653e-01,  1.19591154e-01,  4.67594019e-01,  1.05150085e-01,
        9.21663781e-02,  4.57957566e+00,  2.75189159e+00,  3.12391939e-02,
        9.32774671e-02, -2.36038702e-01, -5.73103596e-02, -1.87549545e-02,
       -1.35600626e-02, -2.57573863e-02,  2.31619704e-02,  2.06495562e-02,
        1.60884013e-02, -2.08638843e-02, -1.31979985e-02, -9.10308841e-03,
       -3.55419710e-02, -4.81443109e-02, -2.19454984e-02,  3.04829100e-02,
        7.75778850e-03,  9.53015778e-03, -5.53821118e-03,  8.63130939e-03,
        6.21466767e-03,  5.09552156e-03, -9.53809453e-03,  3.97883323e-03,
       -2.73850760e-03,  5.63981108e-03])

In [204]:
feature_names = X_3_train.columns
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
}).sort_values(by='Coefficient', ascending=False).reset_index().drop(columns = ['index'])

coef_df

Unnamed: 0,Feature,Coefficient
0,ck,4.579576
1,co,2.751892
2,mainstem fence,0.467594
3,70.2,0.187531
4,cow bay,0.119591
5,skutz,0.10515
6,marTemp,0.093277
7,vimy pool,0.092166
8,LastWinterTemp,0.031239
9,FlowValue_t-30,0.030483


In [205]:
coef_df.to_csv('data/coef_df.csv', index=False)


In [207]:
X_3_test.shape

(196, 34)

In [211]:
array_of_thousands = np.full(34, -1000)


In [212]:
reshaped = array_of_thousands.reshape(1, 34)
reshaped.shape

(1, 34)

In [213]:
pipe_3.predict(reshaped)



array([0])