In [287]:
import pandas as pd
import altair as alt
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from datetime import timedelta
import numpy as np 
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

# Read In Data 

Flow Data

| Location    | Station Number |
|-------------|----------------|
| cowichan    | 08HA003        |
| englishman  | 08HA001        |

In [288]:
flow_ori = pd.read_csv('data/flow_2023.csv')
flow_columns = [col for col in flow_ori.columns if 'FLOW' in col]
flow_long = pd.melt(flow_ori, id_vars=["STATION_NUMBER", "YEAR", "MONTH"], value_vars=flow_columns, var_name="FlowType", value_name="FlowValue")
flow_long['FlowType'] = flow_long['FlowType'].str.replace("FLOW", "").astype(int)
sorted_flow = flow_long.sort_values(by=["STATION_NUMBER", "YEAR", "MONTH", "FlowType"])
sorted_flow.rename(columns={'FlowType': 'DAY'}, inplace=True)
sorted_flow = sorted_flow.dropna()
sorted_flow['Date'] = pd.to_datetime(sorted_flow[['YEAR', 'MONTH', 'DAY']])
sorted_flow = sorted_flow.reset_index(drop=True)

In [289]:
# select only cowichan 
flow_cow = sorted_flow[sorted_flow['STATION_NUMBER'] == '08HA003']

Temperature Data 

In [290]:
tem_cow = pd.read_csv('data/northcochiwan_daily_temp-2.csv')
tem_cow['UTC_DATE'] = pd.to_datetime(tem_cow['UTC_DATE'])
tem_cow.set_index('UTC_DATE', inplace=True)
tem_cow.rename_axis('Date', inplace=True)
tem_cow.head() 

Unnamed: 0_level_0,RELATIVE_HUMIDITY,WIND_SPEED,TEMP,WINDCHILL,DEW_POINT_TEMP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-09-02,75.818182,2.727273,19.127273,,14.372727
2013-09-03,83.125,2.458333,18.045833,,14.766667
2013-09-04,85.791667,2.0,17.0625,,14.379167
2013-09-05,94.708333,1.541667,16.8375,,15.9
2013-09-06,91.916667,1.583333,16.954167,,15.504167


Salmon Data

In [291]:
df_salmon = pd.read_csv('data/salmon_concat.csv') 
df_salmon.head() 

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co
0,2014-05-01,7,,,,,,True,False
1,2014-05-02,34,,,,,,True,False
2,2014-05-07,21,,,,,,True,False
3,2014-05-08,136,,,,,,True,False
4,2014-05-13,74,,,,,,True,False


# Create Features

discharge of oct and nov (flow)

In [292]:
# create monthly flow 
flow_cow['Date'] = pd.to_datetime(flow_cow['Date'])
flow_cow.set_index('Date', inplace=True)
monthly_average_flow = flow_cow.resample('M')['FlowValue'].mean()
monthly_average_df = monthly_average_flow.to_frame(name='AverageFlowValue')
monthly_average_df.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flow_cow['Date'] = pd.to_datetime(flow_cow['Date'])
  monthly_average_flow = flow_cow.resample('M')['FlowValue'].mean()


Unnamed: 0_level_0,AverageFlowValue
Date,Unnamed: 1_level_1
2013-01-31,14.82129
2013-02-28,14.317143
2013-03-31,21.153548
2013-04-30,9.061333
2013-05-31,2.744194


In [293]:
# create last winter flow 
df_filtered = monthly_average_df[monthly_average_df.index.month.isin([10, 11])]
annual_average_flow = df_filtered.resample('A').mean()
annual_average_flow.index = annual_average_flow.index.year  
annual_average_flow.index = pd.to_datetime(annual_average_flow.index, format='%Y') + pd.DateOffset(years=1)
annual_average_flow.rename(columns={'AverageFlowValue': 'LastWinterFlow'}, inplace=True)
annual_average_flow.index = annual_average_flow.index.year 
annual_average_flow.rename_axis('Year', inplace=True)

annual_average_flow.head() 


  annual_average_flow = df_filtered.resample('A').mean()


Unnamed: 0_level_0,LastWinterFlow
Year,Unnamed: 1_level_1
2014,5.316866
2015,12.066258
2016,11.385591
2017,22.321763
2018,19.36822


discharge for mar to may (flow)

In [294]:
flow35 = monthly_average_df[monthly_average_df.index.month.isin([3, 4, 5])]
flow35['Year'] = flow35.index.year
flow35['Month'] = flow35.index.month

pivot_df = flow35.pivot_table(index='Year', columns='Month', values='AverageFlowValue')
df_combined = annual_average_flow.merge(pivot_df, on='Year', how='outer')

df_combined.rename(columns={3: 'marFlow'}, inplace=True)
df_combined.rename(columns={4: 'aprFlow'}, inplace=True)
df_combined.rename(columns={5: 'mayFlow'}, inplace=True)

df_combined.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flow35['Year'] = flow35.index.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flow35['Month'] = flow35.index.month


Unnamed: 0_level_0,LastWinterFlow,marFlow,aprFlow,mayFlow
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,,21.153548,9.061333,2.744194
2014,5.316866,27.383871,8.355,3.838387
2015,12.066258,11.441936,5.063667,1.485129
2016,11.385591,29.127097,3.260333,0.955323
2017,22.321763,27.607419,17.052667,3.889355


temp for dec - feb

In [295]:
# create monthly temp 
monthly_average_temp = tem_cow.resample('M')['TEMP'].mean()
monthly_average_temp = monthly_average_temp.to_frame(name='AverageTemp')

  monthly_average_temp = tem_cow.resample('M')['TEMP'].mean()


In [296]:
# last winter temp 
df_filtered_temp = monthly_average_temp[monthly_average_temp.index.month.isin([12, 1, 2])]
df_filtered_temp['RollingMean'] = df_filtered_temp['AverageTemp'].rolling(window=3, min_periods=3).mean()
filtered_mean_temp = df_filtered_temp[df_filtered_temp.index.month.isin([2])]
filtered_mean_temp = filtered_mean_temp.drop('AverageTemp', axis=1)

filtered_mean_temp.index = filtered_mean_temp.index.year  # Adjust index to show only the year
filtered_mean_temp.rename_axis('Year', inplace=True)
filtered_mean_temp.rename(columns={'RollingMean': 'LastWinterTemp'}, inplace=True)

filtered_mean_temp.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_temp['RollingMean'] = df_filtered_temp['AverageTemp'].rolling(window=3, min_periods=3).mean()


Unnamed: 0_level_0,LastWinterTemp
Year,Unnamed: 1_level_1
2014,2.82403
2015,5.555037
2016,4.686489
2017,1.328632
2018,3.191699


temp for mar - may 

In [297]:
df_filtered_marmay = monthly_average_temp[monthly_average_temp.index.month.isin([3,4,5])]

df_filtered_marmay['Year'] = df_filtered_marmay.index.year
df_filtered_marmay['Month'] = df_filtered_marmay.index.month

pivot_df_temp = df_filtered_marmay.pivot_table(index='Year', columns='Month', values='AverageTemp')

df_combined_temp = filtered_mean_temp.merge(pivot_df_temp, on='Year', how='outer')

df_combined_temp.rename(columns={3: 'marTemp'}, inplace=True)
df_combined_temp.rename(columns={4: 'aprTemp'}, inplace=True)
df_combined_temp.rename(columns={5: 'mayTemp'}, inplace=True)
df_combined_temp.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_marmay['Year'] = df_filtered_marmay.index.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_marmay['Month'] = df_filtered_marmay.index.month


Unnamed: 0_level_0,LastWinterTemp,marTemp,aprTemp,mayTemp
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,2.82403,5.937401,9.200833,13.870775
2015,5.555037,7.991129,8.587107,14.717418
2016,4.686489,7.5125,11.871944,14.515945
2017,1.328632,5.671237,8.566111,13.086828
2018,3.191699,5.477688,9.018712,15.219624


In [298]:
# merge flow and temp 
flowTemp = df_combined_temp.merge(df_combined, on='Year', how='outer')
flowTemp.head() 

Unnamed: 0_level_0,LastWinterTemp,marTemp,aprTemp,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013,,,,,,21.153548,9.061333,2.744194
2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
2015,5.555037,7.991129,8.587107,14.717418,12.066258,11.441936,5.063667,1.485129
2016,4.686489,7.5125,11.871944,14.515945,11.385591,29.127097,3.260333,0.955323
2017,1.328632,5.671237,8.566111,13.086828,22.321763,27.607419,17.052667,3.889355


In [299]:
df_salmon['date'] = pd.to_datetime(df_salmon['date'])  # Convert 'date' to datetime

# Create a mask for April 15 to October 15 for any year
mask = ((df_salmon['date'].dt.month > 4) | ((df_salmon['date'].dt.month == 4) & (df_salmon['date'].dt.day >= 15))) & \
       ((df_salmon['date'].dt.month < 10) | ((df_salmon['date'].dt.month == 10) & (df_salmon['date'].dt.day <= 15)))

# Apply mask to filter data
df_filtered = df_salmon.loc[mask]

# Determine the range of years in your data
start_year = df_filtered['date'].dt.year.min()
end_year = df_filtered['date'].dt.year.max()

# Generate all dates from April 15 to October 15 for each year within the range
all_dates = []
for year in range(start_year, end_year + 1):
    start_date = f"{year}-04-15"
    end_date = f"{year}-10-15"
    all_dates.extend(pd.date_range(start=start_date, end=end_date))

# Check which dates are missing
existing_dates = set(df_filtered['date'])
missing_dates = [date for date in all_dates if date not in existing_dates]

# Create a DataFrame for missing dates with all other columns filled with 0 or appropriate defaults
df_missing = pd.DataFrame({
    'date': missing_dates,
    'count': [0] * len(missing_dates),
    '70.2': [0] * len(missing_dates),
    'cow bay': [0] * len(missing_dates),
    'mainstem fence': [0] * len(missing_dates),
    'skutz': [0] * len(missing_dates),
    'vimy pool': [0] * len(missing_dates),
    'ck': [False] * len(missing_dates),
    'co': [False] * len(missing_dates)
})

# Concatenate the original filtered data with the missing data DataFrame
df_final = pd.concat([df_filtered, df_missing]).sort_values('date')

In [301]:
df_final.replace({True: 1, False: 0, np.nan: 0}, inplace=True)
df_final.fillna(0, inplace=True)
df_final.head()

  df_final.replace({True: 1, False: 0, np.nan: 0}, inplace=True)


Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0


In [302]:
df_final['Year'] = df_final['date'].dt.year
df_final.head() 

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,Year
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0,2014
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0,2014
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0,2014
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0,2014
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0,2014


In [303]:
wMacro = df_final.merge(flowTemp, how='left', left_on='Year', right_index=True)
wMacro.head() 

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,Year,LastWinterTemp,marTemp,aprTemp,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387


In [305]:
df_joined = pd.merge(wMacro, tem_cow, how='left', left_on='date', right_index = True)
df_joined.head() 

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,Year,...,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow,RELATIVE_HUMIDITY,WIND_SPEED,TEMP,WINDCHILL,DEW_POINT_TEMP
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,13.870775,5.316866,27.383871,8.355,3.838387,52.416667,10.375,11.6375,,1.929167
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,13.870775,5.316866,27.383871,8.355,3.838387,88.291667,4.333333,9.508333,,7.583333
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,13.870775,5.316866,27.383871,8.355,3.838387,88.208333,6.041667,9.975,,8.05
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,13.870775,5.316866,27.383871,8.355,3.838387,63.291667,8.041667,9.441667,,2.55
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,13.870775,5.316866,27.383871,8.355,3.838387,75.25,3.25,8.529167,,4.15


In [306]:
df_joined.drop(['RELATIVE_HUMIDITY', 'WIND_SPEED', 'WINDCHILL', 'DEW_POINT_TEMP'], axis=1, inplace=True)
df_joined.head() 

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,Year,LastWinterTemp,marTemp,aprTemp,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow,TEMP
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,11.6375
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,9.508333
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,9.975
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,9.441667
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,8.529167


In [308]:
df_complete = pd.merge(df_joined, flow_cow, how='left', left_on='date', right_index = True)
df_complete

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,Year,...,LastWinterFlow,marFlow,aprFlow,mayFlow,TEMP,STATION_NUMBER,YEAR,MONTH,DAY,FlowValue
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,5.316866,27.383871,8.355000,3.838387,11.637500,08HA003,2014,4,15,4.300
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,5.316866,27.383871,8.355000,3.838387,9.508333,08HA003,2014,4,16,4.570
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,5.316866,27.383871,8.355000,3.838387,9.975000,08HA003,2014,4,17,6.020
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,5.316866,27.383871,8.355000,3.838387,9.441667,08HA003,2014,4,18,7.470
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0,2014,...,5.316866,27.383871,8.355000,3.838387,8.529167,08HA003,2014,4,19,6.110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423,2023-10-11,0,0.0,0.0,0.0,0.0,0.0,0,0,2023,...,1.746417,7.648065,12.996333,2.918065,12.458333,08HA003,2023,10,11,0.571
1424,2023-10-12,0,0.0,0.0,0.0,0.0,0.0,0,0,2023,...,1.746417,7.648065,12.996333,2.918065,9.975000,08HA003,2023,10,12,0.597
1425,2023-10-13,0,0.0,0.0,0.0,0.0,0.0,0,0,2023,...,1.746417,7.648065,12.996333,2.918065,10.087500,08HA003,2023,10,13,0.524
1426,2023-10-14,0,0.0,0.0,0.0,0.0,0.0,0,0,2023,...,1.746417,7.648065,12.996333,2.918065,12.341667,08HA003,2023,10,14,0.496


In [309]:
df_complete.drop(['STATION_NUMBER', 'YEAR', 'DAY', 'Year'], axis=1, inplace=True)
df_complete

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,LastWinterTemp,marTemp,aprTemp,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow,TEMP,MONTH,FlowValue
0,2014-04-15,0,0.0,0.0,0.0,0.0,0.0,0,0,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,11.637500,4,4.300
1,2014-04-16,0,0.0,0.0,0.0,0.0,0.0,0,0,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,9.508333,4,4.570
2,2014-04-17,0,0.0,0.0,0.0,0.0,0.0,0,0,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,9.975000,4,6.020
3,2014-04-18,0,0.0,0.0,0.0,0.0,0.0,0,0,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,9.441667,4,7.470
4,2014-04-19,0,0.0,0.0,0.0,0.0,0.0,0,0,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,8.529167,4,6.110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423,2023-10-11,0,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,7.577862,15.933737,1.746417,7.648065,12.996333,2.918065,12.458333,10,0.571
1424,2023-10-12,0,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,7.577862,15.933737,1.746417,7.648065,12.996333,2.918065,9.975000,10,0.597
1425,2023-10-13,0,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,7.577862,15.933737,1.746417,7.648065,12.996333,2.918065,10.087500,10,0.524
1426,2023-10-14,0,0.0,0.0,0.0,0.0,0.0,0,0,3.049505,5.093952,7.577862,15.933737,1.746417,7.648065,12.996333,2.918065,12.341667,10,0.496


In [310]:
def lag_df(df, date_col, columns_to_lag, min_lag, max_lag):
    # Ensure the DataFrame is sorted by date
    df = df.sort_values(date_col)
    
    # Initialize an empty DataFrame to store the lagged data
    lagged_df = pd.DataFrame(index=df.index)
    
    # Loop through each specified column in the DataFrame
    for col in columns_to_lag:
        # Create lagged versions of the column for the specified range
        for i in range(min_lag, max_lag + 1):
            lagged_name = f'{col}_t-{i}'  # Create a name for the lagged column
            lagged_df[lagged_name] = df[col].shift(i)
    
    # Combine the original DataFrame with the lagged DataFrame
    result_df = pd.concat([df, lagged_df], axis=1)
    
    # Drop rows with missing values resulting from the lagging
    result_df.dropna(inplace=True)

    return result_df

In [311]:
columns_to_lag = ['TEMP', 'FlowValue']
lagged_data = lag_df(df_complete, 'date', columns_to_lag, min_lag=30, max_lag=40)
lagged_data.head() 

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,LastWinterTemp,...,FlowValue_t-31,FlowValue_t-32,FlowValue_t-33,FlowValue_t-34,FlowValue_t-35,FlowValue_t-36,FlowValue_t-37,FlowValue_t-38,FlowValue_t-39,FlowValue_t-40
9,2014-05-25,36,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,...,9.09,6.5,6.99,6.97,6.98,6.11,7.47,6.02,4.57,4.3
10,2014-05-26,351,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,...,11.0,9.09,6.5,6.99,6.97,6.98,6.11,7.47,6.02,4.57
11,2014-05-27,803,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,...,9.09,11.0,9.09,6.5,6.99,6.97,6.98,6.11,7.47,6.02
12,2014-05-28,698,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,...,10.0,9.09,11.0,9.09,6.5,6.99,6.97,6.98,6.11,7.47
31,2014-05-29,0,0.0,0.0,0.0,0.0,0.0,0,0,2.82403,...,10.5,10.0,9.09,11.0,9.09,6.5,6.99,6.97,6.98,6.11


In [312]:
cols = [col for col in lagged_data.columns if col != 'count'] + ['count']
lagged_data = lagged_data[cols]

# Modeling 

In [313]:
X = lagged_data.drop(columns=["count", 'date'])
y = lagged_data["count"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
) 

In [314]:
pipe = make_pipeline(StandardScaler(), Ridge())

In [315]:
pipe.fit(X_train, y_train)

In [316]:
kk = pipe.predict(X_test)

In [317]:
predict = pd.DataFrame(kk, columns = ['prediction']) 

In [318]:
test = pd.DataFrame(y_test).reset_index() 

In [319]:
df_horizontal = pd.concat([test, predict], axis=1)
df_horizontal.head() 

Unnamed: 0,index,count,prediction
0,800,0,-31.283342
1,1061,0,-13.660415
2,384,0,60.041356
3,40,3,92.376187
4,582,31,191.98499


In [320]:
df_horizontal['test_bi'] = df_horizontal['count'].apply(lambda x: 1 if x >= 1 else 0)
df_horizontal.head() 

Unnamed: 0,index,count,prediction,test_bi
0,800,0,-31.283342,0
1,1061,0,-13.660415,0
2,384,0,60.041356,0
3,40,3,92.376187,1
4,582,31,191.98499,1


In [321]:
df_sorted = df_horizontal.sort_values('prediction')
df_sorted.drop(['index'], axis=1, inplace=True)
df_sorted.head() 

Unnamed: 0,count,prediction,test_bi
289,0,-102.522089,0
159,0,-75.989889,0
348,0,-75.197285,0
70,0,-74.307642,0
145,0,-74.097901,0


In [322]:
chart = alt.Chart(df_sorted).mark_point().encode(
    x='index:Q',  # Quantitative scale for indices
    y= 'prediction:Q',
    color='test_bi:N'  # Nominal scale for binary values, differentiating lines by color
).transform_window(
    index='count()'  # Generate an index for x-axis
)

In [323]:
chart 

Logistic Regression 

In [324]:
lagged_data['count_bi'] = lagged_data['count'].apply(lambda x: 1 if x >= 1 else 0)
lagged_data.head() 

Unnamed: 0,date,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co,LastWinterTemp,marTemp,...,FlowValue_t-33,FlowValue_t-34,FlowValue_t-35,FlowValue_t-36,FlowValue_t-37,FlowValue_t-38,FlowValue_t-39,FlowValue_t-40,count,count_bi
9,2014-05-25,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,5.937401,...,6.99,6.97,6.98,6.11,7.47,6.02,4.57,4.3,36,1
10,2014-05-26,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,5.937401,...,6.5,6.99,6.97,6.98,6.11,7.47,6.02,4.57,351,1
11,2014-05-27,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,5.937401,...,9.09,6.5,6.99,6.97,6.98,6.11,7.47,6.02,803,1
12,2014-05-28,0.0,0.0,0.0,0.0,0.0,1,0,2.82403,5.937401,...,11.0,9.09,6.5,6.99,6.97,6.98,6.11,7.47,698,1
31,2014-05-29,0.0,0.0,0.0,0.0,0.0,0,0,2.82403,5.937401,...,9.09,11.0,9.09,6.5,6.99,6.97,6.98,6.11,0,0


In [325]:
X_1 = lagged_data.drop(columns=["count", 'date', 'count_bi'])
y_1 = lagged_data["count_bi"]

X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(
    X_1, y_1, test_size=0.2, random_state=123
) 

In [326]:
pipe_1 = make_pipeline(StandardScaler(), LogisticRegression())

In [327]:
pipe_1.fit(X_1_train, y_1_train)

In [328]:
cv_scores = cross_val_score(pipe_1, X_1_train, y_1_train, cv=10)
cv_scores

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [329]:
pipe_1.score(X_1_test, y_1_test)

1.0

Logistic Regression without Shuffle 

In [330]:
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(
    X_1, y_1, test_size=0.2, shuffle= False 
) 

In [331]:
pipe_2 = make_pipeline(StandardScaler(), LogisticRegression())

In [332]:
pipe_2.fit(X_2_train, y_2_train)

In [333]:
pipe_2.score(X_2_test, y_2_test)

1.0