In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error
import xgboost as xgb

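- `df_name = "jenny/final_preprocessed"` — dataset to load; it is read from `data/<df_name>.csv`
- `prediction_year = 2023` — rows from earlier years train the model; this is the year the date window is reported for
- `lower_percentile = 5` — lower bound (in %) of the cumulative predicted-count window
- `upper_percentile = 10` — upper bound (in %) of the cumulative predicted-count window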

In [2]:
# Months kept when the seasonal filter is enabled (April through August).
_SEASON_MONTHS = [4, 5, 6, 7, 8]

# Inclusive range of row lags generated for every lagged variable.
_LAG_RANGE = (31, 33)

# Position of the first candidate feature column once the "Level" columns are
# dropped. NOTE(review): presumably the columns before it are identifiers and
# raw targets (date, count, year, month, ...) -- confirm against the CSV schema.
_FIRST_FEATURE_COLUMN = 7


def _create_lags(df, variable, start_lag, end_lag):
    """Return a copy of ``df`` with ``{variable}_lag_{n}`` columns for n in [start_lag, end_lag]."""
    lagged = {
        f"{variable}_lag_{n}": df[variable].shift(n)
        for n in range(start_lag, end_lag + 1)
    }
    return df.assign(**lagged)


def _impute_for_missing(df):
    """Return a copy of ``df`` where every column containing NaNs is filled with its own median."""
    filled = df.copy()
    for col in filled.columns[filled.isna().any()]:
        filled[col] = filled[col].fillna(filled[col].median())
    return filled


def _split_test_train(df, year=2023, summer_months=False):
    """Split ``df`` into rows before ``year`` (train) and rows from ``year`` onwards (test).

    When ``summer_months`` is true, only rows whose ``month`` is in
    ``_SEASON_MONTHS`` are kept. Both frames are independent copies, so the
    caller can add columns without writing into a slice of ``df``.
    """
    if summer_months:
        df = df[df["month"].isin(_SEASON_MONTHS)]
    train = df[df["year"] < year].copy()
    test = df[df["year"] >= year].copy()
    return train, test


def _remove_features(df, keyword):
    """Return ``df`` without the columns whose name contains ``keyword``."""
    return df[[col for col in df.columns if keyword not in col]]


def _lasso_select(X_train, y_train, alpha=0.1):
    """Return the names of the columns a Lasso fit on standardised ``X_train`` keeps."""
    X_scaled = StandardScaler().fit_transform(X_train)
    lasso = Lasso(alpha=alpha).fit(X_scaled, y_train)
    keep_mask = SelectFromModel(lasso, prefit=True).get_support()
    return X_train.columns[keep_mask].tolist()


def _xgboost_predict(X_train, y_train, X_test, y_test, select_feats):
    """Fit an XGBoost regressor on ``select_feats`` and return ``(test predictions, test RMSE)``."""
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train[select_feats])
    X_test_scaled = scaler.transform(X_test[select_feats])

    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return y_pred, rmse


def _cumulative_percentile(df, col, year=None, start_percent=None, end_percent=None):
    """Report the dates between which the cumulative share of ``col`` goes from start to end percent.

    Rows are optionally restricted to ``year``. The running percentage of
    ``col`` (rounded to 3 decimals) is computed in row order, and the first and
    last ``date`` whose running percentage lies in
    ``[start_percent, end_percent]`` are printed.

    Returns:
        ``(start_date, end_date)`` as ``YYYY-MM-DD`` strings, or ``None`` when
        either bound is ``None`` or no row falls inside the window.
    """
    if year is not None:
        df = df[df["year"] == year]
    df = df.copy()  # never add the helper column to the caller's frame

    cum_col = f"{col}_cumulative_percentage"
    total = df[col].sum()
    df[cum_col] = (df[col] / total * 100).cumsum().round(3)

    if start_percent is None or end_percent is None:
        return None

    window = df[df[cum_col].between(start_percent, end_percent)]
    if window.empty:
        # Happens when the bounds are inverted, when no day lands inside the
        # window, or when every prediction is zero (total == 0 -> all NaN).
        print(f"No dates fall within the {start_percent}% to {end_percent}% cumulative range.")
        return None

    start_date_str = window.iloc[0]["date"].strftime("%Y-%m-%d")
    end_date_str = window.iloc[-1]["date"].strftime("%Y-%m-%d")
    print(f"{start_percent}% to {end_percent}% of salmon are predicted to be tagged between: {start_date_str} to {end_date_str}.")
    return start_date_str, end_date_str


def prediction(df_name, prediction_year, lower_percentile, upper_percentile):
    """Predict the date window in which a given share of the yearly count is reached.

    Reads ``data/{df_name}.csv`` (columns used: ``date``, ``count``, ``Temp``,
    ``Flow``, ``Level``, ``month``, ``year``), builds lagged features, trains an
    XGBoost regressor on the years before ``prediction_year`` to predict the
    day-over-day change in ``count``, and prints the dates between which the
    cumulative predicted change goes from ``lower_percentile`` to
    ``upper_percentile`` percent of the ``prediction_year`` total.

    Args:
        df_name: CSV path relative to ``data/``, without the ``.csv`` suffix.
        prediction_year: First year held out of training; also the year the
            window is reported for.
        lower_percentile: Lower bound of the cumulative-percentage window.
        upper_percentile: Upper bound of the cumulative-percentage window.

    Returns:
        ``(start_date, end_date)`` as ``YYYY-MM-DD`` strings, or ``None`` when
        no date falls inside the requested window. (The previous implementation
        always returned ``None`` because its helper had no ``return``.)

    Raises:
        ValueError: If the train or test split is empty, or if Lasso discards
            every feature for all candidate alpha values.
    """
    df = pd.read_csv(f"data/{df_name}.csv")
    df["date"] = pd.to_datetime(df["date"], format='%Y-%m-%d')
    # Day-over-day change; the first row is NaN and gets median-imputed below.
    df["count_diff"] = df["count"].diff()

    for variable in ("count_diff", "Temp", "Flow", "Level"):
        df = _create_lags(df, variable, *_LAG_RANGE)

    # NOTE(review): medians are computed over all years, so test-year values
    # influence the imputed training rows. Kept as-is to preserve results.
    df = _impute_for_missing(df)

    train, test = _split_test_train(df, year=prediction_year, summer_months=True)
    if train.empty or test.empty:
        raise ValueError(
            f"No rows available for training or testing with prediction_year={prediction_year}."
        )

    # Level data is dropped because it is highly correlated with the flow data;
    # count_diff is excluded because it is the target (its lags stay).
    candidate_cols = _remove_features(df, "Level").columns[_FIRST_FEATURE_COLUMN:]
    use_for_feats = [col for col in candidate_cols if col != "count_diff"]

    X_train, y_train = train[use_for_feats], train["count_diff"]
    X_test, y_test = test[use_for_feats], test["count_diff"]

    # Search the Lasso strength whose selected features give the lowest RMSE.
    # NOTE(review): the RMSE is measured on the test year itself, so alpha is
    # tuned on the data being predicted. Kept as-is to preserve results.
    best_rmse, best_preds = None, None
    for alpha in np.arange(0.1, 1.0, 0.05):
        selected = _lasso_select(X_train, y_train, alpha)
        if not selected:
            # Lasso zeroed every coefficient; there is nothing to fit on.
            continue
        preds, rmse = _xgboost_predict(X_train, y_train, X_test, y_test, selected)
        # Strict "<" keeps the first (smallest) alpha on ties.
        if best_rmse is None or rmse < best_rmse:
            best_rmse, best_preds = rmse, preds

    if best_preds is None:
        raise ValueError("Lasso removed every feature for all candidate alpha values.")

    # The winning predictions are reused rather than refitting the same model.
    test["pred"] = best_preds
    # Negative predicted changes are unreasonable, so floor them at zero.
    test["pred_adjust"] = test["pred"].astype("float64").clip(lower=0)

    return _cumulative_percentile(
        test, "pred_adjust", prediction_year, lower_percentile, upper_percentile
    )

In [3]:
prediction('jenny/final_preprocessed', 2023, 5, 10)

5% to 10% of salmon are predicted to be tagged between: 2023-04-19 to 2023-04-26.
