In [2]:
import pandas as pd
import numpy as np

In [17]:
# Month number -> name used to build the "<month>_<variable>" feature columns.
# NOTE: "feburary" is misspelled on purpose: the spelling is part of the column
# names this function returns (e.g. "feburary_Temp"), so it is left untouched.
_MONTH_KEY = {
    1: "january",
    2: "feburary",
    3: "march",
    4: "april",
    5: "may",
    6: "june",
    7: "july",
    8: "august",
    9: "september",
    10: "october",
    11: "november",
    12: "december",
}

# Rolling-window features are generated for every combination of these values.
_ROLLING_METRICS = ("mean", "std")
_ROLLING_VARIABLES = ("Temp", "Flow", "Level")
_ROLLING_WINDOW_STARTS = (45, 40, 35)
_ROLLING_WINDOW_END = 30


def _daily_station_mean(wide_df, prefix, value_name):
    """Reshape a wide per-station monthly table into one averaged value per date.

    Args:
        wide_df: Frame with "STATION_NUMBER", "YEAR", "MONTH" columns; every
            other column must be named f"{prefix}{day_of_month}".
        prefix: Text stripped from the day columns to get the day number
            ("FLOW" or "LEVEL").
        value_name: Name given to the measurement column in the result.

    Returns:
        Frame with columns "Date" and `value_name`, one row per calendar date,
        holding the mean of the measurement over all rows sharing that date.
    """
    long_df = wide_df.melt(
        id_vars=["STATION_NUMBER", "YEAR", "MONTH"],
        var_name="Day",
        value_name=value_name,
    )
    day_of_month = long_df["Day"].str.replace(prefix, "").astype(int)

    # Assemble dates from their numeric components. Day numbers that do not
    # exist in a given month (e.g. 31 November) become NaT and are dropped.
    date_parts = pd.DataFrame(
        {"year": long_df["YEAR"], "month": long_df["MONTH"], "day": day_of_month}
    )
    long_df["Date"] = pd.to_datetime(date_parts, errors="coerce")
    long_df = long_df.dropna(subset=["Date"])

    return (
        long_df[["Date", value_name]]
        .groupby("Date")
        .mean(numeric_only=True)
        .reset_index()
    )


def _add_monthly_means(df, variable, months, dict_key=_MONTH_KEY):
    """Add one "<month>_<variable>" column per requested month (modifies `df`).

    For every row, the column holds the mean of `variable` over the given
    calendar month: taken from the row's own year for January/February and
    from the previous year for every other month. Years without data for that
    month are left as NaN.
    """
    years = df["year"].unique().tolist()

    for month in months:
        column = f"{dict_key[month]}_{variable}"
        # Start as float NaN so the float means below never change the dtype.
        df[column] = np.nan

        month_rows = df.loc[df["month"] == month]
        year_lag = 0 if month in (1, 2) else 1

        for year in years:
            source_year = year - year_lag
            month_avg = month_rows.loc[month_rows["year"] == source_year, variable].mean()
            df.loc[df["year"] == year, column] = month_avg

    return df


def _add_rolling_stat(df, variable, mean_metric, window_start, window_end):
    """Add a lagged rolling mean/std column for `variable` (modifies `df`).

    The value at row position i is the statistic of `variable` over positions
    [i - window_start, i - window_end). Positions 0..window_start are left as
    NaN. Any `mean_metric` other than "mean" computes the standard deviation.
    The column is named f"rolling_{variable}_{mean_metric}_{window_start - window_end}".
    """
    diff = window_start - window_end
    column = f"rolling_{variable}_{mean_metric}_{diff}"

    # A trailing window of `diff` rows ending at position i - window_end - 1
    # covers exactly [i - window_start, i - window_end); the shift moves that
    # result onto row i. min_periods=1 keeps NaN-skipping semantics.
    windows = df[variable].rolling(window=diff, min_periods=1)
    stat = windows.mean() if mean_metric == "mean" else windows.std()
    stat = stat.shift(window_end + 1)

    has_full_history = np.arange(len(df)) > window_start
    df[column] = stat.where(has_full_history)
    return df


def preprocessing(salmon_df_name, temp_df_name, level_df_name, flow_df_name, data_dir="data"):
    """Build the daily modelling table from the salmon, temperature, level and flow CSVs.

    Steps: sum salmon counts per date, attach them to the daily temperature
    records (dates without a count get 0), join the per-date mean flow and
    level, add previous-season monthly means and lagged rolling mean/std
    features, then fill any remaining missing values with the column median.

    Args:
        salmon_df_name: CSV file name with at least "date" and "count" columns.
        temp_df_name: CSV file name with at least "UTC_DATE" and "TEMP" columns.
        level_df_name: CSV file name with "STATION_NUMBER", "YEAR", "MONTH" and
            "LEVEL<day>" columns.
        flow_df_name: CSV file name with "STATION_NUMBER", "YEAR", "MONTH" and
            "FLOW<day>" columns.
        data_dir: Folder containing the four files (defaults to "data").

    Returns:
        DataFrame with columns "date", "month", "year", "Temp", "Flow",
        "Level", "count" followed by the generated feature columns.
    """
    df_salmon = pd.read_csv(f"{data_dir}/{salmon_df_name}")[["date", "count"]]
    df_temp = pd.read_csv(f"{data_dir}/{temp_df_name}")
    df_level = pd.read_csv(f"{data_dir}/{level_df_name}")
    df_flow = pd.read_csv(f"{data_dir}/{flow_df_name}")

    # Several salmon records can share a date: collapse them to a daily total.
    df_salmon = df_salmon.groupby(["date"]).sum(numeric_only=True).reset_index()

    flow_daily = _daily_station_mean(df_flow, "FLOW", "Flow")
    level_daily = _daily_station_mean(df_level, "LEVEL", "Level")

    # Right join: keep every temperature date, even those with no salmon count.
    comb = df_salmon.merge(
        df_temp[["UTC_DATE", "TEMP"]], left_on="date", right_on="UTC_DATE", how="right"
    )
    comb = comb.drop(["date"], axis=1)
    comb["count"] = comb["count"].fillna(0)
    comb = comb.rename(columns={"UTC_DATE": "date"})
    comb["date"] = pd.to_datetime(comb["date"])
    comb["month"] = comb["date"].dt.month
    comb["year"] = comb["date"].dt.year

    # Inner joins: only dates present in temperature, flow and level survive.
    comb_df = comb.merge(flow_daily, left_on="date", right_on="Date")
    comb_df = comb_df.merge(level_daily, left_on="date", right_on="Date")
    keep_cols = ["date", "month", "year", "TEMP", "Flow", "Level", "count"]
    comb_df = comb_df[keep_cols].rename(columns={"TEMP": "Temp"})

    new_df = _add_monthly_means(comb_df, "Flow", [10, 11])
    new_df = _add_monthly_means(new_df, "Temp", [12, 1, 2])
    new_df = _add_monthly_means(new_df, "Level", [10, 11])

    # Loop order fixes the column order: all "mean" columns (Temp, Flow, Level;
    # widest window first), then all "std" columns in the same order.
    for metric in _ROLLING_METRICS:
        for variable in _ROLLING_VARIABLES:
            for window_start in _ROLLING_WINDOW_STARTS:
                new_df = _add_rolling_stat(
                    new_df, variable, metric, window_start, _ROLLING_WINDOW_END
                )

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection, which is not guaranteed to modify the frame.
    missing_cols = new_df.columns[new_df.isna().any()].tolist()
    for col in missing_cols:
        new_df[col] = new_df[col].fillna(new_df[col].median())

    return new_df

In [20]:
# Input file names; preprocessing() looks each of them up inside the "data" folder.
salmon_path = "salmon_concat.csv"
temp_path = "northcochiwan_daily_temp-2.csv"
flow_path = "flow_2023.csv"
level_path = "level_2023.csv"

# Keyword arguments make the (salmon, temp, level, flow) parameter order explicit.
final_df = preprocessing(
    salmon_df_name=salmon_path,
    temp_df_name=temp_path,
    level_df_name=level_path,
    flow_df_name=flow_path,
)
print(final_df.columns)
final_df.head()

Index(['date', 'month', 'year', 'Temp', 'Flow', 'Level', 'count',
       'october_Flow', 'november_Flow', 'december_Temp', 'january_Temp',
       'feburary_Temp', 'october_Level', 'november_Level',
       'rolling_Temp_mean_15', 'rolling_Temp_mean_10', 'rolling_Temp_mean_5',
       'rolling_Flow_mean_15', 'rolling_Flow_mean_10', 'rolling_Flow_mean_5',
       'rolling_Level_mean_15', 'rolling_Level_mean_10',
       'rolling_Level_mean_5', 'rolling_Temp_std_15', 'rolling_Temp_std_10',
       'rolling_Temp_std_5', 'rolling_Flow_std_15', 'rolling_Flow_std_10',
       'rolling_Flow_std_5', 'rolling_Level_std_15', 'rolling_Level_std_10',
       'rolling_Level_std_5'],
      dtype='object')


Unnamed: 0,date,month,year,Temp,Flow,Level,count,october_Flow,november_Flow,december_Temp,...,rolling_Level_mean_5,rolling_Temp_std_15,rolling_Temp_std_10,rolling_Temp_std_5,rolling_Flow_std_15,rolling_Flow_std_10,rolling_Flow_std_5,rolling_Level_std_15,rolling_Level_std_10,rolling_Level_std_5
0,2013-09-02,9,2013,19.127273,0.8755,0.548,0.0,7.679129,28.3825,3.965188,...,0.8237,2.160008,1.950006,1.571429,2.041962,1.416962,0.74561,0.060489,0.042766,0.024903
1,2013-09-03,9,2013,18.045833,0.7905,0.5405,0.0,7.679129,28.3825,3.965188,...,0.8237,2.160008,1.950006,1.571429,2.041962,1.416962,0.74561,0.060489,0.042766,0.024903
2,2013-09-04,9,2013,17.0625,0.749,0.5365,0.0,7.679129,28.3825,3.965188,...,0.8237,2.160008,1.950006,1.571429,2.041962,1.416962,0.74561,0.060489,0.042766,0.024903
3,2013-09-05,9,2013,16.8375,0.6945,0.53,0.0,7.679129,28.3825,3.965188,...,0.8237,2.160008,1.950006,1.571429,2.041962,1.416962,0.74561,0.060489,0.042766,0.024903
4,2013-09-06,9,2013,16.954167,0.668,0.528,0.0,7.679129,28.3825,3.965188,...,0.8237,2.160008,1.950006,1.571429,2.041962,1.416962,0.74561,0.060489,0.042766,0.024903
