In [2]:
import pandas as pd
import altair as alt
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from datetime import timedelta
import numpy as np 
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from itertools import product

alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

# Read In Data 

Flow Data

| Location    | Station Number |
|-------------|----------------|
| cowichan    | 08HA003        |
| englishman  | 08HA001        |

In [3]:
flow_ori = pd.read_csv('data/flow_2023.csv')
flow_columns = [col for col in flow_ori.columns if 'FLOW' in col]
flow_long = pd.melt(flow_ori, id_vars=["STATION_NUMBER", "YEAR", "MONTH"], value_vars=flow_columns, var_name="FlowType", value_name="FlowValue")
flow_long['FlowType'] = flow_long['FlowType'].str.replace("FLOW", "").astype(int)
sorted_flow = flow_long.sort_values(by=["STATION_NUMBER", "YEAR", "MONTH", "FlowType"])
sorted_flow.rename(columns={'FlowType': 'DAY'}, inplace=True)
sorted_flow = sorted_flow.dropna()
sorted_flow['Date'] = pd.to_datetime(sorted_flow[['YEAR', 'MONTH', 'DAY']])
sorted_flow = sorted_flow.reset_index(drop=True)

# select only cowichan 
flow_cow = sorted_flow[sorted_flow['STATION_NUMBER'] == '08HA003'] = sorted_flow[sorted_flow['STATION_NUMBER'] == '08HA003']
flow_cow.head() 

Unnamed: 0,STATION_NUMBER,YEAR,MONTH,DAY,FlowValue,Date
4018,08HA003,2013,1,1,10.4,2013-01-01
4019,08HA003,2013,1,2,9.18,2013-01-02
4020,08HA003,2013,1,3,8.1,2013-01-03
4021,08HA003,2013,1,4,7.69,2013-01-04
4022,08HA003,2013,1,5,7.48,2013-01-05


Temperature Data 

In [4]:
tem_cow = pd.read_csv('data/northcochiwan_daily_temp-2.csv')
tem_cow['UTC_DATE'] = pd.to_datetime(tem_cow['UTC_DATE'])
tem_cow.set_index('UTC_DATE', inplace=True)
tem_cow.rename_axis('Date', inplace=True)
tem_cow.head() 

Unnamed: 0_level_0,RELATIVE_HUMIDITY,WIND_SPEED,TEMP,WINDCHILL,DEW_POINT_TEMP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-09-02,75.818182,2.727273,19.127273,,14.372727
2013-09-03,83.125,2.458333,18.045833,,14.766667
2013-09-04,85.791667,2.0,17.0625,,14.379167
2013-09-05,94.708333,1.541667,16.8375,,15.9
2013-09-06,91.916667,1.583333,16.954167,,15.504167


Salmon Data

In [5]:
df_salmon = pd.read_csv('data/salmon_concat.csv') 
df_salmon.head() 

Unnamed: 0,date,count,70.2,cow bay,mainstem fence,skutz,vimy pool,ck,co
0,2014-05-01,7,,,,,,True,False
1,2014-05-02,34,,,,,,True,False
2,2014-05-07,21,,,,,,True,False
3,2014-05-08,136,,,,,,True,False
4,2014-05-13,74,,,,,,True,False


# Create Features

discharge of oct and nov (flow)

In [6]:
# create monthly flow 
flow_cow['Date'] = pd.to_datetime(flow_cow['Date'])
flow_cow.set_index('Date', inplace=True)
monthly_average_flow = flow_cow.resample('M')['FlowValue'].mean()
monthly_average_df = monthly_average_flow.to_frame(name='AverageFlowValue')
monthly_average_df.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flow_cow['Date'] = pd.to_datetime(flow_cow['Date'])
  monthly_average_flow = flow_cow.resample('M')['FlowValue'].mean()


Unnamed: 0_level_0,AverageFlowValue
Date,Unnamed: 1_level_1
2013-01-31,14.82129
2013-02-28,14.317143
2013-03-31,21.153548
2013-04-30,9.061333
2013-05-31,2.744194


In [7]:
# create last winter flow 
df_filtered = monthly_average_df[monthly_average_df.index.month.isin([10, 11])]
annual_average_flow = df_filtered.resample('A').mean()
annual_average_flow.index = annual_average_flow.index.year  
annual_average_flow.index = pd.to_datetime(annual_average_flow.index, format='%Y') + pd.DateOffset(years=1)
annual_average_flow.rename(columns={'AverageFlowValue': 'LastWinterFlow'}, inplace=True)
annual_average_flow.index = annual_average_flow.index.year 
annual_average_flow.rename_axis('Year', inplace=True)

annual_average_flow.head() 

  annual_average_flow = df_filtered.resample('A').mean()


Unnamed: 0_level_0,LastWinterFlow
Year,Unnamed: 1_level_1
2014,5.316866
2015,12.066258
2016,11.385591
2017,22.321763
2018,19.36822


discharge for mar to may (flow)

In [8]:
flow35 = monthly_average_df[monthly_average_df.index.month.isin([3, 4, 5])]
flow35['Year'] = flow35.index.year
flow35['Month'] = flow35.index.month

pivot_df = flow35.pivot_table(index='Year', columns='Month', values='AverageFlowValue')
df_combined = annual_average_flow.merge(pivot_df, on='Year', how='outer')

df_combined.rename(columns={3: 'marFlow'}, inplace=True)
df_combined.rename(columns={4: 'aprFlow'}, inplace=True)
df_combined.rename(columns={5: 'mayFlow'}, inplace=True)

df_combined.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flow35['Year'] = flow35.index.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flow35['Month'] = flow35.index.month


Unnamed: 0_level_0,LastWinterFlow,marFlow,aprFlow,mayFlow
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,,21.153548,9.061333,2.744194
2014,5.316866,27.383871,8.355,3.838387
2015,12.066258,11.441936,5.063667,1.485129
2016,11.385591,29.127097,3.260333,0.955323
2017,22.321763,27.607419,17.052667,3.889355


temp for dec - feb

In [9]:
# create monthly temp 
monthly_average_temp = tem_cow.resample('M')['TEMP'].mean()
monthly_average_temp = monthly_average_temp.to_frame(name='AverageTemp')

  monthly_average_temp = tem_cow.resample('M')['TEMP'].mean()


In [10]:
# last winter temp 
df_filtered_temp = monthly_average_temp[monthly_average_temp.index.month.isin([12, 1, 2])]
df_filtered_temp['RollingMean'] = df_filtered_temp['AverageTemp'].rolling(window=3, min_periods=3).mean()
filtered_mean_temp = df_filtered_temp[df_filtered_temp.index.month.isin([2])]
filtered_mean_temp = filtered_mean_temp.drop('AverageTemp', axis=1)

filtered_mean_temp.index = filtered_mean_temp.index.year  # Adjust index to show only the year
filtered_mean_temp.rename_axis('Year', inplace=True)
filtered_mean_temp.rename(columns={'RollingMean': 'LastWinterTemp'}, inplace=True)

filtered_mean_temp.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_temp['RollingMean'] = df_filtered_temp['AverageTemp'].rolling(window=3, min_periods=3).mean()


Unnamed: 0_level_0,LastWinterTemp
Year,Unnamed: 1_level_1
2014,2.82403
2015,5.555037
2016,4.686489
2017,1.328632
2018,3.191699


temp for mar - may 

In [11]:
df_filtered_marmay = monthly_average_temp[monthly_average_temp.index.month.isin([3,4,5])]

df_filtered_marmay['Year'] = df_filtered_marmay.index.year
df_filtered_marmay['Month'] = df_filtered_marmay.index.month

pivot_df_temp = df_filtered_marmay.pivot_table(index='Year', columns='Month', values='AverageTemp')

df_combined_temp = filtered_mean_temp.merge(pivot_df_temp, on='Year', how='outer')

df_combined_temp.rename(columns={3: 'marTemp'}, inplace=True)
df_combined_temp.rename(columns={4: 'aprTemp'}, inplace=True)
df_combined_temp.rename(columns={5: 'mayTemp'}, inplace=True)
df_combined_temp.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_marmay['Year'] = df_filtered_marmay.index.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_marmay['Month'] = df_filtered_marmay.index.month


Unnamed: 0_level_0,LastWinterTemp,marTemp,aprTemp,mayTemp
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,2.82403,5.937401,9.200833,13.870775
2015,5.555037,7.991129,8.587107,14.717418
2016,4.686489,7.5125,11.871944,14.515945
2017,1.328632,5.671237,8.566111,13.086828
2018,3.191699,5.477688,9.018712,15.219624


In [12]:
# merge flow and temp 
flowTemp = df_combined_temp.merge(df_combined, on='Year', how='outer')
flowTemp.head() 

Unnamed: 0_level_0,LastWinterTemp,marTemp,aprTemp,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013,,,,,,21.153548,9.061333,2.744194
2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
2015,5.555037,7.991129,8.587107,14.717418,12.066258,11.441936,5.063667,1.485129
2016,4.686489,7.5125,11.871944,14.515945,11.385591,29.127097,3.260333,0.955323
2017,1.328632,5.671237,8.566111,13.086828,22.321763,27.607419,17.052667,3.889355


In [13]:
# replace NAs with False 
locations = ['70.2', 'cow bay', 'mainstem fence', 'skutz', 'vimy pool']
df_salmon[locations] = df_salmon[locations].fillna(False)

  df_salmon[locations] = df_salmon[locations].fillna(False)


In [14]:
# Convert the 'date' column to datetime format
df_salmon['date'] = pd.to_datetime(df_salmon['date'])

# Define the mask for dates within April 15th to October 15th
mask = ((df_salmon['date'].dt.month > 3) | ((df_salmon['date'].dt.month == 3) & (df_salmon['date'].dt.day >= 15))) & \
       ((df_salmon['date'].dt.month < 10) | ((df_salmon['date'].dt.month == 10) & (df_salmon['date'].dt.day <= 15)))
df_filtered = df_salmon.loc[mask]

# Determine the range of years within the filtered data
start_year = df_filtered['date'].dt.year.min()
end_year = df_filtered['date'].dt.year.max()

# Generate all dates within the specified range for each year
all_dates = pd.date_range(start=f"{start_year}-03-15", end=f"{end_year}-10-15")

# Define the locations and species
species = [True, False]

# Create all possible combinations of date, location, and species
all_combinations = pd.DataFrame(list(product(all_dates, locations, species, species)), columns=['date', 'location', 'ck', 'co'])
all_combinations['count'] = 0

# Merge existing data by expanding df_filtered to have rows for all combinations
# Reshape df_filtered to have 'location', 'ck', 'co' as columns with True/False flags
df_long = pd.melt(df_filtered, id_vars=['date', 'count'], value_vars=locations, var_name='location')
df_long = df_long[df_long['value']].drop(columns='value')  # Keep only True entries
df_long['ck'] = df_filtered['ck']
df_long['co'] = df_filtered['co']

# Merge to find missing entries
df_full = pd.merge(all_combinations, df_long, on=['date', 'location', 'ck', 'co'], how='left', suffixes=('', '_existing'))
df_full['count'] = df_full['count_existing'].fillna(0).astype(int)
df_full.drop(columns='count_existing', inplace=True)

# Final DataFrame with all combinations filled where missing
df_final = df_full.sort_values('date')
filtered_df = df_final[(df_final['ck'] != df_final['co'])]
filtered_df.head() 

Unnamed: 0,date,location,ck,co,count
18,2014-03-15,vimy pool,False,True,0
17,2014-03-15,vimy pool,True,False,0
13,2014-03-15,skutz,True,False,0
10,2014-03-15,mainstem fence,False,True,0
14,2014-03-15,skutz,False,True,0


In [15]:
df_final = pd.get_dummies(filtered_df, columns=['location'])
df_final = df_final.reset_index() 
df_final = df_final.drop(columns = ['index'])
df_final.replace({True: 1, False: 0, np.nan: 0}, inplace=True)
df_final.fillna(0, inplace=True)
df_final.head()

  df_final.replace({True: 1, False: 0, np.nan: 0}, inplace=True)


Unnamed: 0,date,ck,co,count,location_70.2,location_cow bay,location_mainstem fence,location_skutz,location_vimy pool
0,2014-03-15,0,1,0,0,0,0,0,1
1,2014-03-15,1,0,0,0,0,0,0,1
2,2014-03-15,1,0,0,0,0,0,1,0
3,2014-03-15,0,1,0,0,0,1,0,0
4,2014-03-15,0,1,0,0,0,0,1,0


In [16]:
# merge with macro indicators 
df_final['Year'] = df_final['date'].dt.year
wMacro = df_final.merge(flowTemp, how='left', left_on='Year', right_index=True)
wMacro.head() 

Unnamed: 0,date,ck,co,count,location_70.2,location_cow bay,location_mainstem fence,location_skutz,location_vimy pool,Year,LastWinterTemp,marTemp,aprTemp,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow
0,2014-03-15,0,1,0,0,0,0,0,1,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
1,2014-03-15,1,0,0,0,0,0,0,1,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
2,2014-03-15,1,0,0,0,0,0,1,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
3,2014-03-15,0,1,0,0,0,1,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387
4,2014-03-15,0,1,0,0,0,0,1,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387


In [17]:
# merge with temperature 
df_joined = pd.merge(wMacro, tem_cow, how='left', left_on='date', right_index = True)
df_joined.drop(['RELATIVE_HUMIDITY', 'WIND_SPEED', 'WINDCHILL', 'DEW_POINT_TEMP'], axis=1, inplace=True)
df_joined.head() 

Unnamed: 0,date,ck,co,count,location_70.2,location_cow bay,location_mainstem fence,location_skutz,location_vimy pool,Year,LastWinterTemp,marTemp,aprTemp,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow,TEMP
0,2014-03-15,0,1,0,0,0,0,0,1,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,7.020833
1,2014-03-15,1,0,0,0,0,0,0,1,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,7.020833
2,2014-03-15,1,0,0,0,0,0,1,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,7.020833
3,2014-03-15,0,1,0,0,0,1,0,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,7.020833
4,2014-03-15,0,1,0,0,0,0,1,0,2014,2.82403,5.937401,9.200833,13.870775,5.316866,27.383871,8.355,3.838387,7.020833


In [18]:
# merge with daily flow 
df_complete = pd.merge(df_joined, flow_cow, how='left', left_on='date', right_index = True)
df_complete.drop(['STATION_NUMBER', 'YEAR', 'DAY', 'Year'], axis=1, inplace=True)
df_complete

Unnamed: 0,date,ck,co,count,location_70.2,location_cow bay,location_mainstem fence,location_skutz,location_vimy pool,LastWinterTemp,marTemp,aprTemp,mayTemp,LastWinterFlow,marFlow,aprFlow,mayFlow,TEMP,MONTH,FlowValue
0,2014-03-15,0,1,0,0,0,0,0,1,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,7.020833,3,25.700001
1,2014-03-15,1,0,0,0,0,0,0,1,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,7.020833,3,25.700001
2,2014-03-15,1,0,0,0,0,0,1,0,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,7.020833,3,25.700001
3,2014-03-15,0,1,0,0,0,1,0,0,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,7.020833,3,25.700001
4,2014-03-15,0,1,0,0,0,0,1,0,2.824030,5.937401,9.200833,13.870775,5.316866,27.383871,8.355000,3.838387,7.020833,3,25.700001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35017,2023-10-15,1,0,0,0,1,0,0,0,3.049505,5.093952,7.577862,15.933737,1.746417,7.648065,12.996333,2.918065,13.312500,10,0.528000
35018,2023-10-15,0,1,0,0,1,0,0,0,3.049505,5.093952,7.577862,15.933737,1.746417,7.648065,12.996333,2.918065,13.312500,10,0.528000
35019,2023-10-15,0,1,0,1,0,0,0,0,3.049505,5.093952,7.577862,15.933737,1.746417,7.648065,12.996333,2.918065,13.312500,10,0.528000
35020,2023-10-15,1,0,0,1,0,0,0,0,3.049505,5.093952,7.577862,15.933737,1.746417,7.648065,12.996333,2.918065,13.312500,10,0.528000


# This is where stopped- below code need revisit to make lag data. Proceed with ARIMA model for out migration 

In [19]:
def contextual_lag_df(df, date_col, group_cols, columns_to_lag, min_lag, max_lag):
    df = df.sort_values([date_col] + group_cols)
    
    lagged_df = pd.DataFrame(index=df.index)
    
    for _, group in df.groupby(group_cols):
        for col in columns_to_lag:
            for i in range(min_lag, max_lag + 1):
                lagged_name = f'{col}_t-{i}'
                lagged_df.loc[group.index, lagged_name] = group[col].shift(i)
    
    result_df = pd.concat([df, lagged_df], axis=1)
    result_df.dropna(inplace=True)  
    return result_df

exclude_lagging = ['LastWinterTemp', 'marTemp', 'aprTemp', 'mayTemp', 'LastWinterFlow', 'marFlow', 'aprFlow', 'mayFlow']
group_columns = ['date', 'ck', 'co'] + ['location_70.2', 'location_cow bay', 'location_mainstem fence', 'location_skutz', 'location_vimy pool']

columns_to_lag = ['TEMP', 'FlowValue']

lagged_data = contextual_lag_df(df_complete, 'date', group_columns, columns_to_lag, 30, 40) 

lagged_data.head() 

Unnamed: 0,date,ck,co,count,location_70.2,location_cow bay,location_mainstem fence,location_skutz,location_vimy pool,LastWinterTemp,...,FlowValue_t-31,FlowValue_t-32,FlowValue_t-33,FlowValue_t-34,FlowValue_t-35,FlowValue_t-36,FlowValue_t-37,FlowValue_t-38,FlowValue_t-39,FlowValue_t-40


In [41]:
# def lag_df(df, date_col, columns_to_lag, min_lag, max_lag):
#     df = df.sort_values(date_col)
#     lagged_df = pd.DataFrame(index=df.index)
    
#     for col in columns_to_lag:
#         for i in range(min_lag, max_lag + 1):
#             lagged_name = f'{col}_t-{i}'  
#             lagged_df[lagged_name] = df[col].shift(i)
    
#     result_df = pd.concat([df, lagged_df], axis=1)
#     result_df.dropna(inplace=True)

#     return result_df

In [42]:
# columns_to_lag = ['TEMP', 'FlowValue']
# lagged_data = lag_df(df_complete, 'date', columns_to_lag, min_lag=30, max_lag=40)
# lagged_data.head() 

In [43]:
cols = [col for col in lagged_data.columns if col != 'count'] + ['count']
lagged_data = lagged_data[cols]

In [44]:
lagged_data

Unnamed: 0,date,ck,co,location_70.2,location_cow bay,location_mainstem fence,location_skutz,location_vimy pool,LastWinterTemp,marTemp,...,FlowValue_t-2,FlowValue_t-3,FlowValue_t-4,FlowValue_t-5,FlowValue_t-6,FlowValue_t-7,FlowValue_t-8,FlowValue_t-9,FlowValue_t-10,count


# Modeling 

Logistic Regression 

In [45]:
lagged_data['count_bi'] = lagged_data['count'].apply(lambda x: 1 if x >= 1 else 0)
lagged_data.head() 

Unnamed: 0,date,ck,co,location_70.2,location_cow bay,location_mainstem fence,location_skutz,location_vimy pool,LastWinterTemp,marTemp,...,FlowValue_t-3,FlowValue_t-4,FlowValue_t-5,FlowValue_t-6,FlowValue_t-7,FlowValue_t-8,FlowValue_t-9,FlowValue_t-10,count,count_bi


In [46]:
X_1 = lagged_data.drop(columns=["count", 'date', 'count_bi', 'TEMP', 'FlowValue', 'aprTemp', 'mayTemp', 'aprFlow', 'aprTemp'])
y_1 = lagged_data["count_bi"]

Logistic Regression without Shuffle 

In [47]:
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(
    X_1, y_1, test_size=0.2, shuffle= False 
) 

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
pipe_2 = make_pipeline(StandardScaler(), LogisticRegression())

In [None]:
pipe_2.fit(X_2_train, y_2_train)

In [None]:
pipe_2.score(X_2_test, y_2_test)

In [None]:
train_3 = lagged_data[lagged_data['date'].dt.year != 2023]
test_3 = lagged_data[lagged_data['date'].dt.year == 2023]

In [None]:
X_3_train = train_3.drop(columns=["count", 'date', 'count_bi', 'TEMP', 'FlowValue', 'aprTemp', 'mayTemp', 'aprFlow', 'mayFlow'])
X_3_test = test_3.drop(columns=["count", 'date', 'count_bi', 'TEMP', 'FlowValue', 'aprTemp', 'mayTemp', 'aprFlow', 'mayFlow'])
y_3_train = train_3["count_bi"]
y_3_test = test_3["count_bi"]

In [None]:
pipe_3 = make_pipeline(StandardScaler(), LogisticRegression())

In [None]:
pipe_3.fit(X_3_train, y_3_train)

In [None]:
pipe_3.score(X_3_test, y_3_test)

In [None]:
prediction3 = pd.DataFrame(pipe_3.predict(X_3_test), columns=['prediction'])

In [None]:
X_3_test

In [None]:
y_3_test_withdate = test_3[["date", "count_bi"]].reset_index()
y_3_test_withdate = y_3_test_withdate.drop(columns=["index"])

In [None]:
result = pd.merge(y_3_test_withdate, prediction3, left_index=True, right_index=True, how='inner')
result

In [None]:
test_3 = test_3.reset_index() 
test_3 = test_3.drop(columns = ["index"])

In [None]:
test_3

In [None]:
result = pd.merge(test_3, prediction3, left_index=True, right_index=True, how='inner')
result = result[result['date'] <= '2023-06-16']

result

In [None]:
coefficients = pipe_3.named_steps['logisticregression'].coef_[0]
feature_names = X_3_train.columns
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
}).sort_values(by='Coefficient', ascending=False).reset_index().drop(columns = ['index'])

coef_df

In [None]:
# coef_df.to_csv('data/coef_df.csv', index=False)

In [None]:
array_of_thousands = np.full(34, 1000)
reshaped = array_of_thousands.reshape(1, 34)
reshaped.shape

In [None]:
pipe_3.predict(reshaped)

In [None]:
test_3