In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
from evolvepro.src.data import load_dms_data
from evolvepro.src.evolve import directed_evolution_simulation
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error
import MLDE_scripts.dimension_reduction as dimension_reduction
import simulation_pipeline
import math
import random
import umap

In [None]:
# Which DMS dataset and which embedding model to load.
dataset_name = "jones"
model_name = "esm1b_t33_650M_UR50S"

# Where the precomputed embeddings and labels live, and how they are stored.
embeddings_path = "output/dms/embeddings"
labels_path = "output/dms/labels"
embeddings_file_type = "csv"
embeddings_type_pt = "average"

# load_dms_data receives the six settings above positionally and returns the
# embeddings together with their labels.
embeddings, labels = load_dms_data(
    dataset_name,
    model_name,
    embeddings_path,
    labels_path,
    embeddings_file_type,
    embeddings_type_pt,
)

In [None]:
# Project the embeddings onto 5 UMAP components.
# UMAP is stochastic; pinning random_state makes the projection reproducible
# across kernel restarts (the value 1 matches the random forests' random_state).
umap_model = umap.UMAP(n_components=5, random_state=1)
X_umap = umap_model.fit_transform(embeddings)
# Reuse the embeddings' index so reduced rows can be matched to the originals.
umap_df = pd.DataFrame(X_umap, index=embeddings.index)

In [None]:
# Random-forest configuration.
# NOTE(review): `model` is constructed here but is NOT passed to
# model_simulation below, because the `model` argument is commented out.
# max_features evaluates to round(log2(1280)) == 10; 1280 is presumably the
# embedding width -- TODO confirm.
model = RandomForestRegressor(n_estimators=200, criterion='friedman_mse', max_depth=None, min_samples_split=2,
                                    min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=round(math.log2(1280)),
                                    max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False,
                                    n_jobs=None, random_state=1, verbose=0, warm_start=False, ccp_alpha=0.0,
                                    max_samples=None)

# Run the simulation 9 times, collecting each run's metrics in a list that is
# concatenated once at the end. The list has its own name so `metrics_df` only
# ever refers to a DataFrame.
run_metrics = []
for run_idx in range(0, 9):
    predicted_activities, metrics, final_cycl_predictions = simulation_pipeline.model_simulation(
        #embeddings = umap_df,
        embeddings = embeddings,
        labels = labels,
        output_dir = "output/RF_simulations",
        predict_all = True,
        activity = "activity_binary",
        cycles = 10,
        num_per_cycle = 16,
        #model = model,
        #random_seed = 1234,
        #selection_method= "limit_AA"
    )
    run_metrics.append(metrics)
metrics_df = pd.concat(run_metrics, ignore_index=True)

# ignore_index=True leaves a meaningless 0..N-1 index; writing it would make it
# reappear as an "Unnamed: 0" column when the file is read back with read_csv.
metrics_df.to_csv('./output/RF_average_performance.csv', index=False)

In [None]:
# Reload the saved metrics and average the binary-activity percentage over
# every row whose `iteration` value equals 9.
rf_performance_df = pd.read_csv('./output/RF_average_performance.csv')
is_iteration_9 = rf_performance_df['iteration'] == 9
rf_avg = rf_performance_df.loc[is_iteration_9, 'activity_binary_percentage']
rf_avg.mean()

In [None]:
# Hyperparameters for the random-forest regressor used in this single run.
# max_features evaluates to round(log2(1280)) == 10.
rf_params = dict(
    n_estimators=200,
    criterion='friedman_mse',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=round(math.log2(1280)),
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=1,
    verbose=0,
    warm_start=False,
    ccp_alpha=0.0,
    max_samples=None,
)
model = RandomForestRegressor(**rf_params)

# One seeded simulation: 10 cycles with 10 picks per cycle on the "activity" label.
simulation_kwargs = dict(
    embeddings=embeddings,
    labels=labels,
    output_dir="output/RF_simulations",
    predict_all=True,
    activity="activity",
    cycles=10,
    num_per_cycle=10,
    model=model,
    random_seed=1234,
)
predicted_activities, metrics, final_cycl_predictions = simulation_pipeline.model_simulation(
    **simulation_kwargs
)

# Display the metrics of this run as the cell output.
metrics

In [None]:
# Seed sweep with limit_AA_selection disabled: 30 simulations, one per seed
# 0..29, each with a freshly constructed regressor. Per-run metrics are
# gathered in `metrics_list`.
rf_params = {
    'n_estimators': 200,
    'criterion': 'friedman_mse',
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': round(math.log2(1280)),
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap': True,
    'oob_score': False,
    'n_jobs': None,
    'random_state': 1,
    'verbose': 0,
    'warm_start': False,
    'ccp_alpha': 0.0,
    'max_samples': None,
}

metrics_list = []
for seed in range(30):
    # The forest's own random_state stays fixed at 1; only the simulation's
    # random_seed changes between runs.
    model = RandomForestRegressor(**rf_params)
    predicted_activities, metrics, final_cycle_predictions = simulation_pipeline.model_simulation(
        embeddings=embeddings,
        labels=labels,
        output_dir="output/RF_simulations",
        predict_all=True,
        activity="activity",
        cycles=10,
        num_per_cycle=10,
        model=model,
        random_seed=seed,
        limit_AA_selection=False,
    )
    metrics_list.append(metrics)


In [None]:
# Overlay one line per simulation run. Iterating over metrics_list directly
# (rather than a hard-coded range(0, 30)) keeps this cell in step with however
# many runs were collected, instead of raising IndexError if that count changes.
for run_metrics in metrics_list:
    sns.lineplot(run_metrics['activity_binary_percentage'])

In [None]:
# Seed sweep with limit_AA_selection enabled: same setup as a plain 30-seed
# sweep, but every run passes limit_AA_selection=True. Results replace the
# contents of `metrics_list`.
metrics_list = []
for sweep_seed in range(30):
    # Fresh regressor for each run; max_features evaluates to 10.
    model = RandomForestRegressor(
        n_estimators=200,
        criterion='friedman_mse',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=round(math.log2(1280)),
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        bootstrap=True,
        oob_score=False,
        n_jobs=None,
        random_state=1,
        verbose=0,
        warm_start=False,
        ccp_alpha=0.0,
        max_samples=None,
    )
    simulation_result = simulation_pipeline.model_simulation(
        embeddings=embeddings,
        labels=labels,
        output_dir="output/RF_simulations",
        predict_all=True,
        activity="activity",
        cycles=10,
        num_per_cycle=10,
        model=model,
        random_seed=sweep_seed,
        limit_AA_selection=True,
    )
    # Unpack the three returned values under the usual notebook-level names.
    predicted_activities, metrics, final_cycle_predictions = simulation_result
    metrics_list.append(metrics)

In [None]:
# Overlay one line per simulation run. Looping over metrics_list itself avoids
# the hard-coded range(0, 30), which would raise IndexError (or silently skip
# runs) if the number of collected runs ever differs from 30.
for run_metrics in metrics_list:
    sns.lineplot(run_metrics['activity_binary_percentage'])

In [None]:
# Inputs for the PCA helper: the embeddings CSV and the predictions CSV found
# in the simulations' output directory.
pca_embedding_csv = "output/dms/embeddings/jones_esm1b_t33_650M_UR50S.csv"
pca_labels_csv = "output/RF_simulations/final_cycle_predictions.csv"

# Configure Create_PCA with 5 dimensions, labelled by the "predicted_activity"
# column, then ask it to visualise those labels.
pca = dimension_reduction.Create_PCA(
    embedding_path=pca_embedding_csv,
    labels_path=pca_labels_csv,
    label_name="predicted_activity",
    n_dims=5,
)

pca.visualise_labels()