In [35]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from imblearn.over_sampling import SMOTE
from IPython.display import display
from sklearn.ensemble import RandomForestRegressor

In [44]:
# Load the cleaned woodchuck dataset and replace every missing value with 0.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists -- consider a configurable data dir.
pop = pd.read_csv(
    r'C:\Users\User\Desktop\EDurHack\Dataset\cleanData\woodchucks_with_wood_volume.csv'
).fillna(0)

# Chronologically ordered copy of the data; this is the frame forecast() reads.
yearly_data = pop.sort_values('year')

# Distinct years (in file order) and distinct latitudes (in year-sorted order).
years = pop['year'].unique().tolist()
lats = yearly_data['latitude'].unique().tolist()

In [None]:
_TARGET = 'total_wood_chucked_lbs'
_FEATURES = ['year', 'time_index', 'lag_1', 'lag_2', 'lag_3', 'rolling_mean_3']
_OUTPUT_COLUMNS = ['latitude', 'longitude', 'year', _TARGET, 'type']


def _build_training_frame(location_data):
    """Return one location's rows, sorted by year, with lag/rolling features added."""
    frame = location_data.copy().sort_values('year').reset_index(drop=True)

    frame['time_index'] = range(len(frame))
    for lag in (1, 2, 3):
        frame[f'lag_{lag}'] = frame[_TARGET].shift(lag)

    # Shift before rolling so the window covers only *previous* observations.
    # Rolling over the unshifted target would include the value being predicted
    # (target leakage) and would not match the forecast-time feature, which is
    # the mean of the last three known values.
    frame['rolling_mean_3'] = (
        frame[_TARGET].shift(1).rolling(window=3, min_periods=1).mean()
    )

    # The first three rows have no complete lag history and cannot be used.
    return frame.dropna(subset=['lag_1', 'lag_2', 'lag_3'])


def _forecast_location(history, lat, lon, horizon):
    """Fit a random forest on one location's history and roll it forward `horizon` years."""
    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=2,
        random_state=42,
        n_jobs=-1
    )
    model.fit(history[_FEATURES], history[_TARGET])

    last_year = history['year'].iloc[-1]
    last_time_index = history['time_index'].iloc[-1]
    # Sliding window of the three most recent values, oldest first.
    last_values = history[_TARGET].tail(3).values

    records = []
    for step in range(1, horizon + 1):
        future_X = pd.DataFrame(
            [{
                'year': last_year + step,
                'time_index': last_time_index + step,
                'lag_1': last_values[-1],
                'lag_2': last_values[-2],
                'lag_3': last_values[-3],
                'rolling_mean_3': np.mean(last_values[-3:]),
            }],
            columns=_FEATURES,  # same column order the model was fitted on
        )
        pred = model.predict(future_X)[0]
        records.append({
            'latitude': lat,
            'longitude': lon,
            'year': last_year + step,
            _TARGET: pred,
            'type': 'forecast'
        })
        # Recursive forecasting: the prediction becomes the newest "observation".
        last_values = np.append(last_values[1:], pred)

    return records


def forecast(years, lats, data=None, horizon=16, output_path="woodchuck_forecast_all.csv"):
    """Forecast `total_wood_chucked_lbs` per (latitude, longitude) location.

    For every latitude in `lats` and every longitude observed at that latitude,
    a RandomForestRegressor is fitted on that location's history (year, time
    index, three lags and the mean of the three previous values) and then
    applied recursively to predict the following `horizon` years.

    Parameters
    ----------
    years : list
        Kept for backward compatibility; not used. Previously the whole
        computation was repeated once per entry, which refitted every model
        and duplicated every output row len(years) times.
    lats : list
        Latitudes to process.
    data : pandas.DataFrame, optional
        Frame with 'latitude', 'longitude', 'year' and
        'total_wood_chucked_lbs' columns. Defaults to the notebook-level
        `yearly_data`.
    horizon : int, optional
        Number of future years to predict per location (default 16).
    output_path : str or None, optional
        CSV file to write the combined result to; pass None to skip writing.

    Returns
    -------
    pandas.DataFrame
        Columns latitude, longitude, year, total_wood_chucked_lbs and type
        ('historical' or 'forecast'), sorted by year, latitude, longitude.
        Historical rows exclude each location's first three observations
        (dropped for lack of lag values). Locations with fewer than three
        usable rows are skipped. If no location qualifies, an empty frame
        with these columns is returned instead of raising.
    """
    if data is None:
        data = yearly_data

    all_results = []  # one combined (historical + forecast) frame per location

    for lat in lats:
        lat_rows = data[data['latitude'] == lat]

        for lon in lat_rows['longitude'].unique().tolist():
            history = _build_training_frame(lat_rows[lat_rows['longitude'] == lon])

            if len(history) < 3:
                continue  # skip if too little data for forecasting

            past = history[['latitude', 'longitude', 'year', _TARGET]].copy()
            past['type'] = 'historical'
            future = pd.DataFrame(_forecast_location(history, lat, lon, horizon))
            all_results.append(pd.concat([past, future]))

    if all_results:
        final_df = (
            pd.concat(all_results)
            .sort_values(['year', 'latitude', 'longitude'])
            .reset_index(drop=True)
        )
    else:
        # pd.concat([]) raises ValueError; return a well-formed empty result.
        final_df = pd.DataFrame(columns=_OUTPUT_COLUMNS)

    if output_path is not None:
        final_df.to_csv(output_path, index=False)
        print(f"✅ Saved all forecasts to {output_path}")

    return final_df

# Run the forecast for every latitude in the dataset; the returned frame is
# displayed as the cell output.
forecast(years=years, lats=lats)
