# Predict matrix intensities for custom molecules
Use this notebook to predict matrix intensities on your custom set of molecules.

#### Initial imports

In [None]:
import pickle
from pathlib import Path
% load_ext autoreload
% autoreload 2

from pred_spot_intensity.io import load_molecule_features
import pandas as pd
import numpy as np

## Load molecule properties
First, provide the list of molecules (as a CSV file) for which you want to predict the matrix intensities.

In total the csv file should have 8 columns, with the following names:
- `molecule_name`
- `pka_strongest_acidic`
- `pka_strongest_basic`
- `polar_surface_area`
- `polarizability`
- `acceptor_count`
- `donor_count`
- `physiological_charge`

To see what the final CSV file should look like, have a look at the molecule features used for training the model at `./training_data/physchem_molecule_properties.csv`.

Finally, insert the path of the csv file in the cell below.

In [None]:
# Update this with the path of the csv file with molecule features.
# The default points to the molecule features that were used for training the model,
# so the notebook can be run as-is before switching to a custom file:
PATH_CSV_FILE_MOLECULE_FEATURES = "./training_data/physchem_molecule_properties.csv"


In [None]:
# Read the molecule features from the configured csv file
# (the loader is asked to normalize them):
molecule_features = load_molecule_features(
    PATH_CSV_FILE_MOLECULE_FEATURES,
    normalize=True,
)

## Predict the matrix intensities
In the cell below you can specify a custom path where to save the predictions. By default, predictions will be saved in `./prediction_results/predictions_my_custom_molecule_dataset.csv`.

In [None]:
# The output predictions will be saved in `<PREDICTIONS_DIR>/predictions_<EXPERIMENT_NAME>.csv`
PREDICTIONS_DIR = Path("./prediction_results")
EXPERIMENT_NAME = "my_custom_molecule_dataset"

In [None]:
# One regressor and one classifier are looked up for every (matrix, polarity) pair:
matrix_names = [
    '9AA', 'CHCA', 'ClCCA', 'DAN', 'DHAP', 'DHB',
    'MAPS', 'NEDC', 'NOR', 'CMBT', 'pNA',
]
polarities = ["positive", "negative"]

# Directories containing the pickled, already trained models:
regression_models_dir = Path(
    "./training_results/paper_results/regression_on_detected_per_mol_sum/trained_models"
)
classification_models_dir = Path(
    "./training_results/paper_results/detection_per_mol_sum/trained_models"
)

# Make sure the output directory exists before any prediction is written:
PREDICTIONS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Adduct names are not passed to the models in this cell (each model is applied
# once per molecule); the list is kept so code relying on the name keeps working.
adduct_names = ['+Cl', '+H', '+K', '+Na', '-H', '[M]+', '[M]-']


def _load_pickled_model(model_path):
    """Unpickle a trained model from `model_path` and close the file afterwards.

    NOTE(security): unpickling can execute arbitrary code. Only load model
    files that come from a trusted source.
    """
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)


intensity_column = "Predicted intensity (log10[intensity+1])"
detected_column = "Predicted as detected"

# The model input is identical for every matrix/polarity, so build it only once:
model_input = molecule_features.to_numpy()
molecule_names = molecule_features.index.tolist()

# Compute predictions for every matrix/polarity:
per_model_predictions = []
for matrix in matrix_names:
    for polarity in polarities:
        # Load models from disk:
        regression_model = _load_pickled_model(
            regression_models_dir / f"trained_regressor_model_{matrix}_{polarity}.pkl"
        )
        classification_model = _load_pickled_model(
            classification_models_dir / f"trained_classifier_model_{matrix}_{polarity}.pkl"
        )

        per_model_predictions.append(pd.DataFrame({
            "Molecule name": molecule_names,
            "matrix": matrix,
            "polarity": polarity,
            intensity_column: regression_model.predict(model_input),
            detected_column: classification_model.predict(model_input) > 0.5,
        }))

predictions_collected = pd.concat(per_model_predictions).reset_index(drop=True)

# Blank the intensity of molecules predicted as not detected. Masking with NaN
# (instead of assigning '') keeps the column numeric; NaN is still written as an
# empty field by `to_csv`.
predictions_collected[intensity_column] = predictions_collected[intensity_column].where(
    predictions_collected[detected_column]
)
predictions_collected.to_csv(PREDICTIONS_DIR / f"predictions_{EXPERIMENT_NAME}.csv")