# Predict climate from berry fungal communities

> using `ritme` (https://github.com/adamovanja/ritme)

**Note:** use the **ritme_model** environment!


In [31]:
from ritme.find_best_model_config import (
    _load_experiment_config,
    _load_phylogeny,
    _load_taxonomy,
    find_best_model_config,
)
from ritme.split_train_test import _load_data, split_train_test
from ritme.evaluate_tuned_models import evaluate_tuned_models

import pandas as pd

# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell execution so edits to imported source files are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
# Working directory of this analysis. Relative paths used further down (the
# feature table and the model-log directory) resolve against it.
workdir = '/home/lfloerl/cloud/lfloerl/Microterroir/artifacts/ITS/lavaux/climate-berries'
# Change the kernel's current directory (IPython magic; `$workdir` is expanded
# by IPython before the command runs).
%cd $workdir

/home/lfloerl/cloud/lfloerl/Microterroir/artifacts/ITS/lavaux/climate-berries


In [33]:
# --- Input data -------------------------------------------------------------
# Sample metadata, feature table (relative to the working directory) and
# taxonomy. No phylogeny is used in this notebook.
path_to_md = "/home/lfloerl/microterroir/Microbiome/Metadata/ITS_Lavaux_Climate.tsv"
path_to_ft = "climate_filtered_table.qza"
path_to_tax = "/home/lfloerl/cloud/lfloerl/Microterroir/artifacts/ITS/taxonomy.qza"

# --- Experiment settings ----------------------------------------------------
# ritme experiment configuration (JSON, downloaded from the GitHub repo). The
# column to stratify by and the prediction target are specified in this file.
model_config_path = "/home/lfloerl/microterroir/Microbiome/Other_scripts/config/r_local_linreg_py.json"

# Train size handed to the train/test split.
train_size = 0.8

In [34]:
# Load the experiment configuration. Later cells read its "stratify_by_column",
# "feature_prefix" and "seed_data" entries and pass the whole object on to the
# tuning and evaluation steps.
config = _load_experiment_config(model_config_path)

In [35]:
# Preview the raw, tab-separated metadata table (read_table uses a tab
# delimiter by default).
md_df = pd.read_table(path_to_md)
md_df.head()

Unnamed: 0,id,SAMPLE_NAME,sample_type,Plot_ID,COLLECTION_DATE,Year,Year_Cat,Plot,Altitude,Average_slope,...,maximum_rh,minimum_rh,cv_rh,GDD,average_temperature,median_temperature,maximum_temperature,minimum_temperature,accumulated_temperature,cv_temperature
0,364526_290-LP3-ITS-0866,Lavaux_2021-08-31_bark_Plot4,bark,Lavaux_Plot_4,2021-08-31 00:00:00 +0200,2021,Year2021,4,450,30,...,99.036583,33.078261,16.567322,1325.721843,18.154261,18.650958,26.689636,8.40175,2940.990343,22.587667
1,364526_289-LP3-ITS-0865,Lavaux_2021-08-31_bark_Plot5,bark,Lavaux_Plot_5,2021-08-31 00:00:00 +0200,2021,Year2021,5,400,30,...,95.964333,32.830458,16.331714,1379.347141,18.494554,19.047366,26.351409,8.816375,2996.117777,22.104867
2,364526_285-LP3-ITS-0861,Lavaux_2021-08-31_bark_Plot9,bark,Lavaux_Plot_9,2021-08-31 00:00:00 +0200,2021,Year2021,9,520,35,...,98.181542,32.794125,16.570451,1286.254021,17.902585,18.256629,25.850773,8.052083,2900.218734,22.831881
3,364526_287-LP3-ITS-0863,Lavaux_2021-08-31_bark_Plot11,bark,Lavaux_Plot_11,2021-08-31 00:00:00 +0200,2021,Year2021,11,490,20,...,96.437167,32.357792,16.118351,1297.271087,17.964399,18.541438,26.775286,7.966542,2910.232671,23.246939
4,364526_282-LP3-ITS-0858,Lavaux_2021-08-31_bark_Plot12,bark,Lavaux_Plot_12,2021-08-31 00:00:00 +0200,2021,Year2021,12,520,15,...,97.113875,32.104636,16.661423,1286.822875,17.897069,18.244984,27.509143,7.779875,2899.325119,23.615955


## Read & split data


In [36]:
# Load metadata and feature table through ritme's loader and report their shapes.
# NOTE(review): in the saved run the metadata has 595 rows but the feature
# table only 326 - presumably the metadata also covers samples that are not
# part of this feature table; confirm the mismatch is expected.
md, ft = _load_data(path_to_md, path_to_ft)
print(md.shape, ft.shape)

(595, 35) (326, 6091)


In [37]:
# Split the data into a train/validation part and a held-out test part.
# Stratification column, feature prefix and seed come from the experiment
# config; the train size comes from the settings cell.
# NOTE(review): the saved output reports 221 train vs. 105 test rows (~68 %
# train) although train_size is 0.8 - presumably samples sharing a value in
# the stratification column are kept together; TODO confirm against ritme.
train_val, test = split_train_test(
    md,
    ft,
    stratify_by_column=config["stratify_by_column"],
    feature_prefix=config["feature_prefix"],
    train_size=train_size,
    seed=config["seed_data"],
)

Train: (221, 6126), Test: (105, 6126)




## Find best model config


In [39]:
# Taxonomy is loaded here and handed to the model search.
tax = _load_taxonomy(path_to_tax)

# Run the model search on the train/validation data. The log directory is a
# relative path, so it is created inside the current working directory.
# NOTE(review): the saved run took roughly 10 minutes; `path_to_exp` is not
# used by any cell visible in this notebook.
best_model_dict, path_to_exp = find_best_model_config(
    config, train_val, tax, path_store_model_logs="ritme_refact_logs_RF_LR")

0,1
Current time:,2025-02-11 12:08:29
Running for:,00:09:36.98
Memory:,10.8/91.6 GiB

Trial name,status,loc,bootstrap,data_aggregation,data_selection,data_selection_i,data_selection_t,data_transform,max_depth,max_features,min_impurity_decreas e,min_samples_leaf,min_samples_split,min_weight_fraction_ leaf,model,n_estimators,iter,total time (s),rmse_val,rmse_train,r2_val
train_rf_9a3983be,TERMINATED,172.31.181.85:3291874,True,,variance_threshold,,6.16517e-05,,,,0.0803584,0.0162992,0.00188113,0.00283828,rf,120,1,0.434552,0.951788,0.202687,0.0401544
train_rf_d1358641,TERMINATED,172.31.181.85:3291935,False,,abundance_ith,7.0,,,8.0,sqrt,0.497569,0.0903891,0.00452001,0.00334648,rf,160,1,1.35758,0.85023,0.219328,0.234061
train_rf_baa43e47,TERMINATED,172.31.181.85:3292028,True,,variance_threshold,,0.000245158,,32.0,0.3,0.138559,0.00276506,0.00113714,0.0030017,rf,100,1,0.359191,0.945276,0.232015,0.0532443
train_rf_b1eb494c,TERMINATED,172.31.181.85:3292095,True,,abundance_ith,18.0,,,32.0,0.2,0.462165,0.015499,0.0774752,0.0007347,rf,140,1,2.25172,0.853752,0.173506,0.227702
train_rf_fcf297b2,TERMINATED,172.31.181.85:3292168,False,,abundance_ith,20.0,,,4.0,0.5,0.455717,0.0193131,0.0691401,5.90936e-05,rf,100,1,0.801456,0.866908,0.307072,0.203718
train_rf_c5f28568,TERMINATED,172.31.181.85:3292235,True,,variance_threshold,,0.00452411,,,0.1,0.313265,0.00638526,0.00186582,0.00191711,rf,60,1,0.275107,1.09843,0.263291,-0.278399
train_rf_92a7cec7,TERMINATED,172.31.181.85:3292300,False,,abundance_ith,4.0,,,,0.3,0.329672,0.00218674,0.0558683,0.00912784,rf,40,1,0.550535,0.852758,0.176039,0.229501
train_rf_a8b7c8a6,TERMINATED,172.31.181.85:3292371,True,,abundance_ith,8.0,,,32.0,,0.160984,0.0793897,0.0140885,0.00777679,rf,200,1,2.52495,0.85044,0.172089,0.233684
train_rf_38c07b82,TERMINATED,172.31.181.85:3292438,False,,abundance_ith,4.0,,,4.0,0.3,0.212243,0.0946628,0.0646832,0.00587886,rf,200,1,0.912336,0.873181,0.30795,0.192152
train_rf_6ac83e8c,TERMINATED,172.31.181.85:3292502,False,,abundance_ith,16.0,,,16.0,0.3,0.132565,0.00372723,0.0110613,0.00507432,rf,200,1,2.91059,0.850821,0.174781,0.232997


2025-02-11 12:08:29,065	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/lfloerl/cloud/lfloerl/Microterroir/artifacts/ITS/lavaux/climate-berries/ritme_refact_logs_RF_LR/r_local_models_py/rf' in 3.1365s.
2025-02-11 12:08:29,093	INFO tune.py:1041 -- Total run time: 577.11 seconds (573.85 seconds for the tuning loop).


## Evaluate best models


In [22]:
# Evaluate the tuned model(s) on the train/validation data and the test set.
# NOTE(review): the saved output of this cell carries execution count 22, i.e.
# it was produced before the tuning cell was last run (count 39). It lists a
# `linreg` model while the saved tuning log only shows `rf` trials, so the
# displayed metrics do not belong to the models tuned above. Re-run this cell
# (ideally via Restart Kernel -> Run All) before interpreting the numbers.
metrics = evaluate_tuned_models(best_model_dict, config, train_val, test)
metrics

Unnamed: 0,rmse_train,r2_train,rmse_test,r2_test
linreg,0.766429,-0.016003,0.648017,-0.464744
