# Evaluate which parameters have most influence on metrics of interest

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
np.set_printoptions(threshold=1000)

In [36]:
path_to_metrics = '/glade/work/adamhb/processed_output/CZ2_equilibrium_042324_XX_-17e2acb6a_FATES-1449c787/afterOakFix_2560_042323_metrics_and_params.csv'
df = pd.read_csv(path_to_metrics)

## Functions

In [38]:
def get_corr(df,var,nrows = 20):
    df = df.copy()
    param_cols = df.columns.str.contains('fates',case=False)
    var_col = df.columns.isin([var])
    df = df.loc[:, param_cols | var_col]
    var_correlation = df.corrwith(df[var])
    cor_df = pd.DataFrame(var_correlation)
    cor_df.columns = ["cor"]
    cor_df["abs_cor"] = abs(cor_df["cor"])
    cor_df = cor_df.sort_values("abs_cor",ascending=False)
    return cor_df[:nrows]

def lasso_regression(df,var,nrows = 20):

    param_cols = df.columns.str.contains('fates',case=False)
    var_col = df.columns.isin([var])
    df = df.loc[:, param_cols | var_col]

    y = df[var]
    X = df.drop(var,axis = 1)
    
    # Standardize the features (important for Lasso)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.05, random_state=0)
    
    # Initialize Lasso model
    # You can change the alpha parameter, higher values give stronger regularization
    lasso = Lasso(alpha=0.01, max_iter=10000)
    
    # Fit the model
    lasso.fit(X_train, y_train)
    
    print(f"R-squared on training data: {lasso.score(X_train, y_train):.2f}")
    
    # Predict on the test data
    y_pred = lasso.predict(X_test)
    
    # Calculate the mean squared error of the predictions
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error on test data: {mse:.2f}")
    
    # View the coefficients of the model
    coef = pd.DataFrame()
    coef['Feature'] = X.columns
    coef['Coefficient'] = lasso.coef_
    
    # After computing the coefficients
    coef['Abs_Coefficient'] = coef['Coefficient'].abs()  # Add a column for the absolute values
    coef_sorted = coef.sort_values(by='Abs_Coefficient', ascending=False).reset_index(drop=True)  # Sort the DataFrame
    
    return coef_sorted[:nrows]

## Get correlation coefficients

In [39]:
get_corr(df,"BA_oak",20)

Unnamed: 0,cor,abs_cor
BA_oak,1.0,1.0
fates_fire_nignitions_0,-0.489492,0.489492
fates_frag_maxdecomp_0.3,0.393666,0.393666
fates_frag_maxdecomp_0.2,0.393666,0.393666
fates_frag_maxdecomp_0.1,0.393666,0.393666
fates_frag_maxdecomp_0,0.393666,0.393666
fates_leaf_slatop_4,0.351912,0.351912
fates_turnover_leaf_4,0.282946,0.282946
fates_leaf_vcmax25top_4,0.202696,0.202696
fates_fire_drying_ratio_0,0.164232,0.164232


## Lasso regression

In [40]:
lasso_regression(df,"BA_oak",nrows = 20)

R-squared on training data: 0.70
Mean Squared Error on test data: 1.32


Unnamed: 0,Feature,Coefficient,Abs_Coefficient
0,fates_fire_nignitions_0,-1.079874,1.079874
1,fates_frag_maxdecomp_0,0.787622,0.787622
2,fates_leaf_slatop_4,0.781598,0.781598
3,fates_turnover_leaf_4,0.548294,0.548294
4,fates_leaf_vcmax25top_4,0.465781,0.465781
5,fates_fire_drying_ratio_0,0.369186,0.369186
6,fates_allom_d2ca_coefficient_max_4,0.174321,0.174321
7,fates_leaf_slatop_3,-0.174107,0.174107
8,fates_allom_d2ca_coefficient_max_3,-0.117351,0.117351
9,fates_leaf_vcmax25top_3,-0.090467,0.090467


### Random Forest

In [None]:
#import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error

# # Load your dataframe
# # df = pd.read_csv('your_data.csv')

# # Suppose 'target' is the column you want to predict and rest are features
# X = df.drop(columns=to_drop)
# y = df['ShannonE']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)

# # Initialize Random Forest Regressor
# rf = RandomForestRegressor(n_estimators=100, random_state=0)

# # Fit the model
# rf.fit(X_train, y_train)

# # Check the R-squared on the training data
# print(f"R-squared on training data: {rf.score(X_train, y_train):.2f}")

# # Predict on the test data
# y_pred = rf.predict(X_test)

# # Calculate the mean squared error of the predictions
# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error on test data: {mse:.2f}")

# # Getting the feature importances
# importances = rf.feature_importances_

# # Creating a DataFrame for the feature importances
# features = pd.DataFrame()
# features['Feature'] = X.columns
# features['Importance'] = importances

# # Sorting the features based on their importances
# features_sorted = features.sort_values(by='Importance', ascending=False).reset_index(drop=True)

# # Displaying the feature importances
# print(features_sorted)