In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Let pandas display up to 200 rows and 200 columns before it truncates output.
DISPLAY_LIMIT = 200
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, DISPLAY_LIMIT)

In [None]:
# Read the two input tables from CSV files in the working directory.
# NOTE(review): presumably MT_TREE.csv holds per-tree records and
# fia_treenames.csv maps FIA species codes to names -- confirm against the data.
fia_trees = pd.read_csv("fia_treenames.csv")
mt_tree = pd.read_csv("MT_TREE.csv")
# all_tree = pd.read_csv("ENTIRE_TREE.csv")

In [None]:
# Show the name-table row(s) whose "FIA Code" equals 202.
fia_trees.loc[fia_trees["FIA Code"] == 202]

In [None]:
# Every column name in the tree table, as a plain Python list.
mt_tree.columns.tolist()

In [None]:
# Number of rows per SPCD value, most frequent first.
mt_tree["SPCD"].value_counts()

In [None]:
# Subset of the tree table restricted to rows with SPCD == 202.
mt_tree_202 = mt_tree.loc[mt_tree["SPCD"] == 202]

In [None]:
# Keep only the rows that have a VOLCFNET value (rows where it is NaN are removed).
# mt_tree_202 = mt_tree_202[mt_tree_202["VOLCFNET"].notna()]
mt_tree = mt_tree.dropna(subset=["VOLCFNET"])
# all_tree = all_tree[all_tree["VOLCFNET"].notna()]
# all_tree = all_tree[all_tree["VOLCFGRS"].notna()]
# all_tree = all_tree[all_tree["HT"].notna()]

In [None]:
# How many rows are missing a CARBON_AG value.
mt_tree["CARBON_AG"].isna().sum()

In [None]:
# Missing-value count for every column of the tree table.
mt_tree.isna().sum(axis=0)

In [None]:
# Plot D^2 * H (DIA squared times HT) vs VOLCFNET for the SPCD 202 subset.
import matplotlib.pyplot as plt

# .assign returns a new frame, so the derived column is not written onto a
# filtered slice of mt_tree (which would raise SettingWithCopyWarning), and
# re-running this cell gives the same result.
mt_tree_202 = mt_tree_202.assign(
    D2H=mt_tree_202["DIA"] ** 2 * mt_tree_202["HT"]
)

fig, ax = plt.subplots()
# Same axis layout as before (VOLCFNET on x); y is now the derived D2H column
# instead of the raw diameter.
ax.scatter(mt_tree_202["VOLCFNET"], mt_tree_202["D2H"], s=1)
ax.set(
    xlabel="VOLCFNET",
    ylabel="D2H (DIA^2 * HT)",
    title="D2H vs VOLCFNET (SPCD 202)",
)
plt.show()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def train_random_forest(
    df: pd.DataFrame,
    feature_cols: list,
    target_col: str,
    test_size: float = 0.1,
    n_estimators: int = 200,
    random_state: int = 42,
):
    """
    Trains a Random Forest Regressor model, evaluates it, and returns the trained model.

    Rows whose target value is missing are dropped before the split, because a
    regressor cannot be fitted on NaN targets. Feature columns are routed by
    dtype: numeric and boolean columns are passed through unchanged, while
    object/category columns are one-hot encoded. Any feature column that
    matches neither group is reported and left out of the model.

    Progress and the hold-out metrics (MAE, MSE, R²) are printed to stdout.

    Args:
        df (pd.DataFrame): The input dataframe containing the data.
        feature_cols (list): A list of column names to be used as features.
        target_col (str): The name of the target column.
        test_size (float): Fraction of rows held out for evaluation.
        n_estimators (int): Number of trees in the forest.
        random_state (int): Seed used for both the train/test split and the
            forest, so repeated runs give the same result.

    Returns:
        A trained scikit-learn pipeline object (preprocessor + regressor).

    Raises:
        KeyError: If a feature or target column is not present in ``df``.
    """
    print("--- Starting Model Training ---")

    # 1. Define Features (X) and Target (y), keeping only rows with a known target
    has_target = df[target_col].notna()
    n_missing_target = int((~has_target).sum())
    if n_missing_target:
        print(f"Dropping {n_missing_target} rows with missing '{target_col}' values.")
    X = df.loc[has_target, feature_cols]
    y = df.loc[has_target, target_col]

    # 2. Identify categorical and numerical features
    # 'number' covers every numeric dtype (int32, float32, nullable ints, ...),
    # not just int64/float64; anything left unassigned would be silently
    # discarded by the ColumnTransformer below (its default is remainder='drop').
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=['number', 'bool']).columns

    print(f"Identified {len(numerical_features)} numerical features: {list(numerical_features)}")
    print(f"Identified {len(categorical_features)} categorical features: {list(categorical_features)}")

    # Make dropped features visible instead of losing them without a trace.
    assigned_features = set(numerical_features) | set(categorical_features)
    ignored_features = [col for col in X.columns if col not in assigned_features]
    if ignored_features:
        print(f"Warning: ignoring features with unsupported dtypes: {ignored_features}")

    # 3. Create a preprocessor object
    # This step handles categorical features by one-hot encoding them.
    # 'passthrough' means numerical features will be left as they are.
    # 'handle_unknown='ignore'' prevents errors if a category appears in test data
    # but not in training data.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    # 4. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    print(f"\nData split complete. Training set size: {len(X_train)}, Test set size: {len(X_test)}")

    # 5. Define the model
    # We use a random_state for reproducibility; n_jobs=-1 uses all CPU cores.
    rf_model = RandomForestRegressor(
        n_estimators=n_estimators, random_state=random_state, n_jobs=-1
    )

    # 6. Create the full pipeline
    # This pipeline first preprocesses the data then feeds it to the model.
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('regressor', rf_model)])

    # 7. Train the model
    print("\nTraining the model...")
    model_pipeline.fit(X_train, y_train)
    print("Training complete.")

    # 8. Make predictions on the test set
    y_pred = model_pipeline.predict(X_test)

    # 9. Evaluate the model on the held-out rows
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("\n--- Model Evaluation ---")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"R-squared (R²): {r2:.4f}")
    print("------------------------\n")

    return model_pipeline

# --- Example Usage ---
if __name__ == "__main__":

    # Predictor columns and the response column for the volume model
    features = ['HT', 'DIA', 'SPCD']
    target = 'VOLCFGRS'

    # Fit and evaluate the forest on the tree table; keep the fitted pipeline
    trained_model = train_random_forest(mt_tree, features, target)

In [None]:
# Single hypothetical tree to score. The frame carries exactly the columns the
# model was trained on (HT, DIA, SPCD); the former 'PLOT' column was not one of
# the training features, so it is omitted.
new_tree_data = pd.DataFrame({
    'HT': [88],
    'DIA': [15.8],
    'SPCD': [202],
})

prediction = trained_model.predict(new_tree_data)
print(f"Predicted value for the new tree: {prediction[0]:,.2f}")

In [None]:
# Spot-check: rows with SPCD 202, DIA 7.3 and HT 34.
is_match = mt_tree['SPCD'].eq(202) & mt_tree['DIA'].eq(7.3) & mt_tree['HT'].eq(34)
mt_tree.loc[is_match]  # optionally narrow to [["DIA", "HT", "SPCD", "VOLCFGRS", "VOLCFNET"]]