# Generating Chemical Space Plots

The aim of this notebook is to plot the chemical space of the molecules in a results buffer after reducing their kernel representations to two or three dimensions with PCA or t-SNE.

## Setup

### Change Working Directory

In [3]:
# Change working directory to the project root (the parent of the parent of
# the directory the notebook is started from).

import os


def change_to_project_root(marker: str = "modules", levels_up: int = 2) -> str:
    """
    Make the project root the current working directory.

    Unconditionally moving two levels up is not idempotent: running the cell a
    second time without restarting the kernel lands outside the project. Here
    the move is skipped when the current directory already contains `marker`.

    Args:
        marker (str): name of a directory expected at the project root.
            NOTE(review): "modules" is assumed from the `modules.utils`
            imports used by this notebook - confirm it only exists at the root.
        levels_up (int): number of parent directories to climb when `marker`
            is not found in the current directory.

    Returns:
        str: the working directory after the (possible) change.
    """
    current_directory = os.getcwd()

    if not os.path.isdir(os.path.join(current_directory, marker)):
        parent_steps = [os.pardir] * levels_up
        target_directory = os.path.abspath(
            os.path.join(current_directory, *parent_steps)
        )
        os.chdir(target_directory)

    return os.getcwd()


# Verify the change by printing the new working directory
print("New working directory:", change_to_project_root())


New working directory: /Users/gordianimperial/Documents/Group Project/bo_molecules


### Imports

In [4]:
# Standard library
from typing import List, Callable, Tuple
import pandas as pd

# Third-party
from gauche.kernels.graph_kernels import WeisfeilerLehmanKernel
from dash import Dash, html, dcc, callback, Input, Output
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
from gauche import NonTensorialInputs


In [5]:
# Module Imports
from modules.utils.read_sample_store import read_compressed_or_regular_json, buffer_to_dataframe
from modules.utils.molecular_data_conversion import smiles_to_graph

### Configurations

In [6]:
# Results buffer to visualise. The path is relative, so it is resolved against
# the working directory set in the setup cell; it is read by
# `read_compressed_or_regular_json` in the cells below.
buffer_file_path = "experiments/results/mestranol_similarity/2024-04-21_02-11-54_mestranol_similarity_1000/buffer.json.gz"

## Function Definitions

### Dimensionality Reductions

In [7]:
def tsne_df(kernel_data: pd.DataFrame,
            output_dims: int = 2,
            pca_components: int = 50,
            perplexity: int = 40,
            random_state: int = None,
            ) -> pd.DataFrame:
    """
    Perform t-SNE on the kernel data, after a PCA pre-reduction.

    The data is first projected onto at most `pca_components` principal
    components and the t-SNE embedding is computed from that projection.

    Args:
        kernel_data (pd.DataFrame): 2D, table-like kernel data with one row per
            molecule (a dataframe or a list of equally sized rows).
        output_dims (int): number of t-SNE dimensions; either 2 or 3.
        pca_components (int): number of principal components kept before
            t-SNE. Capped at the number of rows/columns of `kernel_data`,
            because PCA cannot return more components than that.
        perplexity (int): t-SNE perplexity. Capped at `n_samples - 1`, since
            scikit-learn requires it to be smaller than the number of samples.
        random_state (int): optional seed forwarded to both PCA and t-SNE so
            the embedding can be reproduced. The default (None) keeps the
            unseeded behaviour.

    Returns:
        pd.DataFrame: a dataframe containing the t-SNE results in columns
            "Component 1", "Component 2" (and "Component 3" for 3D).
    """
    assert output_dims in [2, 3], "Please enter a valid number of dimensions of either 2 or 3"

    # Only the shape is needed here; the data itself is passed on unchanged.
    n_samples, n_features = pd.DataFrame(kernel_data).shape

    n_pca_components = min(pca_components, n_samples, n_features)
    pca = PCA(n_components=n_pca_components, random_state=random_state)
    pca_result = pca.fit_transform(kernel_data)

    effective_perplexity = min(perplexity, max(n_samples - 1, 1))

    # NOTE(review): scikit-learn 1.5 renamed `n_iter` to `max_iter`; switch the
    # keyword when the pinned scikit-learn version is upgraded.
    tsne = TSNE(n_components=output_dims, verbose=0,
                perplexity=effective_perplexity, n_iter=300,
                random_state=random_state)
    tsne_results = tsne.fit_transform(pca_result)

    component_names = [f"Component {i}" for i in range(1, output_dims + 1)]
    return pd.DataFrame(tsne_results, columns=component_names)
    
def pca_df(kernel_data: pd.DataFrame,
           output_dims: int = 2
           ) -> pd.DataFrame:
    """
    Project the kernel data onto its leading principal components.

    Args:
        kernel_data (pd.DataFrame): a pandas dataframe that contains the kernel data.
        output_dims (int): number of components to keep; either 2 or 3.

    Returns:
        pd.DataFrame: the projection, one column per component, named
            "Component 1", "Component 2" (and "Component 3" for 3D).
    """
    assert output_dims in [2, 3], "Please enter a valid number of dimensions of either 2 or 3"

    projected = PCA(n_components=output_dims).fit_transform(kernel_data)

    # Same column names as before, cut down to the requested dimensionality
    component_names = ["Component 1", "Component 2", "Component 3"][:output_dims]
    return pd.DataFrame(projected, columns=component_names)

### Applying Kernels

In [8]:
def applying_graph_kernel_to_molecules(df: pd.DataFrame,
                                       kernel_function: Callable,
                                       graph_col: str = "graph",
                                       node_label: str = "element") -> pd.DataFrame:
    """
    Evaluate a graph kernel on the molecules in a dataframe.

    The graphs in `graph_col` are wrapped in `NonTensorialInputs`, the kernel
    built by `kernel_function` is called on them, and each row of the result
    is stored in a new "kernel_data" column.

    Note: the column is added to `df` in place; the same dataframe object is
    returned.

    Args:
        df (pd.DataFrame): a pandas dataframe that contains the data.
        kernel_function (Callable): kernel class/factory (e.g.
            `WeisfeilerLehmanKernel`). It is called as
            `kernel_function(node_label=...)` and the returned kernel is then
            called on the wrapped graphs; the result must support
            `.detach().numpy()`.
        graph_col (str): name of the column in df containing the graph objects.
        node_label (str): node attribute passed to the kernel as `node_label`
            (default "element").

    Returns:
        pd.DataFrame: `df` with the added "kernel_data" column.
    """
    assert graph_col in df, "The specified graph_col is not in the dataframe"

    graphs = df[graph_col].tolist()
    non_tensorial_graphs = NonTensorialInputs(graphs)
    kernel = kernel_function(node_label=node_label)

    raw_kernel_data = kernel(non_tensorial_graphs)

    # One list entry per row of the kernel output, so each molecule gets its own row
    df["kernel_data"] = list(raw_kernel_data.detach().numpy())

    return df

def applying_smiles_kernel_to_molecules(df: pd.DataFrame,
                                        kernel_function: Callable,
                                        smiles_col: str = "smiles") -> pd.DataFrame:
    """
    Evaluate a kernel function on the SMILES column of a dataframe.

    Args:
        df (pd.DataFrame): a pandas dataframe that contains the data.
        kernel_function (Callable): called once with the whole SMILES column;
                                    its result becomes the "kernel_data" column.
        smiles_col (str): name of the column in df containing the smiles

    Returns:
        pd.DataFrame: the same dataframe object, with "kernel_data" added
                      in place.
    """
    assert smiles_col in df, "The specified smiles_col is not in the dataframe"

    smiles_values = df[smiles_col]
    kernel_values = kernel_function(smiles_values)
    df["kernel_data"] = kernel_values

    return df


### Graphing Chemical Space

In [9]:
def graph_chemical_space(
    df: pd.DataFrame,
    smiles_col: str = "smiles",
    kernel_col: str = "kernel_data",
    color_col: str = None,
    id_col: str = None,
    dim_reduction: str = "tsne",
    graph_title: str = None,
    hover_data: List[str] = None,
    size_col: str = None,
    opacity_col: str = None,
    fixed_size: float = 10,
    fixed_opacity: float = 0.5,
    number_of_dimensions: int = 2,
    min_iteration_slider: int = 1,
    max_iteration_slider: int = 100,
    min_oracle_score_slider: float = 0,
    max_oracle_score_slider: float = 1,
    color_continuous_scale: List[Tuple[float, str]] = None) -> "plotly.graph_objects.Figure":
    """
    Visualise the chemical space of a pandas dataframe.

    Can visualise molecules in 2D or 3D chemical space using PCA or t-SNE. One
    column of the input dataframe contains data that was calculated using a
    kernel function; that data is reduced to 2 or 3 components, which become
    the plot axes.

    The reduction is computed on every row of `df`. Rows are then filtered by
    the "iteration" and "oracle_score" columns (both must exist in `df`), while
    the axis ranges are taken from the unfiltered data so the view stays fixed
    when the filter bounds change.

    The SMILES strings are always included in the hover data.

    Args:
        df: a pandas dataframe that contains the data. It is not modified.
        smiles_col: name of the column in df containing the smiles
                          (default 'smiles').
        kernel_col: column name of the kernel data to be used for the
                          dimension reduction
        color_col: name of the column in df that will specify the point
                         colors plotted in fig (default None: no colouring)
        id_col: optional column holding an id for each point. It is only
                        checked for existence; it is not used in the plot.
        dim_reduction: dimension reduction technique, "pca" or "tsne"
        graph_title: title of graph; defaults to
                         "<dim_reduction> plot of chemical space"
        hover_data: list of column names to include in the hover data
        size_col: optional column intended for point sizes. Only checked for
                        existence; sizing is currently not applied.
        opacity_col: optional column intended for point opacities. Only
                        checked for existence; opacity is currently not applied.
        fixed_size: currently unused (sizing is not applied)
        fixed_opacity: currently unused (opacity is not applied)
        number_of_dimensions: number of dimensions to visualise the data;
                                    either 2 or 3.
        min_iteration_slider: smallest "iteration" value kept in the plot
        max_iteration_slider: largest "iteration" value kept in the plot
        min_oracle_score_slider: minimum oracle value being graphed
        max_oracle_score_slider: maximum oracle value being graphed
        color_continuous_scale: plotly colour scale for `color_col`; defaults
                                    to blue (0.0) - green (0.5) - red (1.0).

    Returns:
        The plotly figure (2D scatter or 3D scatter).
    """
    assert smiles_col in df, "The specified smiles_col is not in the dataframe"
    assert kernel_col in df, "The specified kernel_col is not in the dataframe"
    # color_col and id_col default to None, so only validate them when given
    if color_col is not None:
        assert color_col in df, "The specified color_col is not in the dataframe"
    if id_col is not None:
        assert id_col in df, "The specified id_col is not in the dataframe"

    assert number_of_dimensions in [2, 3], "Please enter a valid number of dimensions of either 2 or 3"

    assert dim_reduction in ["pca", "tsne"], "Please enter a valid dimension reduction technique"

    dim_reduction_function = pca_df if dim_reduction == "pca" else tsne_df

    if graph_title is None:
        graph_title = f"{dim_reduction} plot of chemical space"
    if size_col is not None:
        assert size_col in df, "The specified size_col is not in the dataframe"
    if opacity_col is not None:
        assert opacity_col in df, "The specified opacity_col is not in the dataframe"

    hover_data = [] if hover_data is None else list(hover_data)
    for col in hover_data:
        assert col in df, f"The specified column {col} is not in the dataframe"
    # SMILES first, then the requested columns (without listing SMILES twice)
    hover_columns = [smiles_col] + [col for col in hover_data if col != smiles_col]

    if color_continuous_scale is None:
        color_continuous_scale = [(0.0, "blue"), (0.5, "green"), (1.0, "red")]

    reduced_data = dim_reduction_function(list(df[kernel_col]),
                                          output_dims=number_of_dimensions)
    # The reducers return a fresh 0..n-1 index; re-use df's index so the join
    # lines rows up even when df does not have a default index.
    reduced_data = pd.DataFrame(reduced_data).set_axis(df.index, axis=0)
    df = df.join(reduced_data)

    # Adding one to the min and max values to make the graph look better.
    # Computed before filtering so the axes do not move with the sliders.
    component_cols = [f"Component {i}" for i in range(1, number_of_dimensions + 1)]
    axis_ranges = [[df[col].min() - 1, df[col].max() + 1] for col in component_cols]

    # Filter out the rows according to the iteration and oracle sliders
    in_iteration_range = df["iteration"].between(min_iteration_slider, max_iteration_slider)
    in_oracle_range = df["oracle_score"].between(min_oracle_score_slider, max_oracle_score_slider)
    df = df[in_iteration_range & in_oracle_range]

    shared_kwargs = dict(
        x="Component 1",
        y="Component 2",
        color=color_col,
        hover_data=hover_columns,
        title=graph_title,
        range_x=axis_ranges[0],
        range_y=axis_ranges[1],
        color_continuous_scale=color_continuous_scale,
    )

    if number_of_dimensions == 2:
        return px.scatter(df, **shared_kwargs)

    return px.scatter_3d(df, z="Component 3", range_z=axis_ranges[2], **shared_kwargs)
    
def applying_kernel_and_graphing(df: pd.DataFrame,
                                 is_graph: bool,
                                 kernel_function: Callable,
                                 smiles_col: str = "smiles",
                                 color_col: str = None,
                                 id_col: str = None,
                                 dim_reduction: str = "tsne",
                                 graph_title: str = None,
                                 hover_data: List[str] = None,
                                 size_col: str = None,
                                 opacity_col: str = None,
                                 fixed_size: float = 10,
                                 fixed_opacity: float = 0.5,
                                 number_of_dimensions: int = 2):
    """
    Compute the kernel data for a dataframe and plot its chemical space.

    Convenience wrapper: adds the "kernel_data" column with either the graph
    or the SMILES kernel helper, then forwards everything to
    `graph_chemical_space`.

    Note: the iteration / oracle-score bounds are not exposed here, so
    `graph_chemical_space` filters the rows with its own default bounds.

    Args:
        df: a pandas dataframe that contains the data.
        is_graph: if True use `applying_graph_kernel_to_molecules` (with its
            default graph column), otherwise
            `applying_smiles_kernel_to_molecules` on `smiles_col`.
        kernel_function: kernel passed on to the chosen helper.
        smiles_col: name of the column in df containing the smiles.
        color_col, id_col, dim_reduction, graph_title, hover_data, size_col,
        opacity_col, fixed_size, fixed_opacity, number_of_dimensions:
            forwarded unchanged to `graph_chemical_space`.

    Returns:
        The figure returned by `graph_chemical_space`.
    """
    # Fresh list per call instead of a shared mutable default
    hover_data = [] if hover_data is None else hover_data

    if is_graph:
        df = applying_graph_kernel_to_molecules(df, kernel_function)
    else:
        # Forward smiles_col so a non-default column name is respected
        df = applying_smiles_kernel_to_molecules(df, kernel_function,
                                                 smiles_col=smiles_col)

    fig = graph_chemical_space(df, smiles_col=smiles_col,
                               color_col=color_col, id_col=id_col,
                               dim_reduction=dim_reduction, graph_title=graph_title,
                               hover_data=hover_data, size_col=size_col,
                               opacity_col=opacity_col, fixed_size=fixed_size,
                               fixed_opacity=fixed_opacity,
                               number_of_dimensions=number_of_dimensions)

    return fig

## Dash Experiment

In [10]:
# Load the results buffer and turn it into a dataframe.
buffer_data = read_compressed_or_regular_json(buffer_file_path)
buffer_df = buffer_to_dataframe(buffer_data)
# Build one graph per SMILES string; the graph kernel below reads this column.
buffer_df["graph"] = buffer_df["smiles"].apply(smiles_to_graph)
# Adds the "kernel_data" column (one row of the kernel output per molecule).
buffer_df = applying_graph_kernel_to_molecules(buffer_df, WeisfeilerLehmanKernel, "graph")
print(buffer_df.shape)
buffer_df.head()

(990, 5)


Unnamed: 0,smiles,oracle_score,iteration,graph,kernel_data
0,O=C1CCC[C@H]1CCOc1ccccc1,0.245262,10,"(O:0, C:1, C:2, C:3, C:4, C:5, C:6, C:7, O:8, ...","[262.02765, 101.237946, 120.59226, 270.9604, 2..."
1,COC1=CC=C(OC)C1,0.106061,11,"(C:0, O:1, C:2, C:3, C:4, C:5, O:6, C:7, C:8)","[101.237946, 105.70433, 44.663803, 116.125885,..."
2,c1ccccc1,0.015326,12,"(C:0, C:1, C:2, C:3, C:4, C:5)","[120.59226, 44.663803, 160.78969, 116.125885, ..."
3,O=C1CCC[C@H]1CCOCCOCC[C@@H]1CCCC1=O,0.157078,13,"(O:0, C:1, C:2, C:3, C:4, C:5, C:6, C:7, O:8, ...","[270.9604, 116.125885, 116.125885, 393.04144, ..."
4,Fc1ccc(F)c(OCC[C@H]2CC=CCC2)c1,0.280922,14,"(F:0, C:1, C:2, C:3, C:4, F:5, C:6, O:7, C:8, ...","[215.13065, 97.51597, 120.59226, 241.18452, 28..."


In [11]:
app = Dash(__name__)

# Dropdown label -> value expected by graph_chemical_space
DIMENSION_CHOICES = {'2D': 2, '3D': 3}
DIM_REDUCTION_CHOICES = {'PCA': "pca", 'TSNE': "tsne"}

app.layout = html.Div([
    # Main title
    html.H1("Chemical Space Visualization", style={'text-align': 'center'}),

    # Subtitle
    html.H2("Explore the dimensionality and properties of chemical compounds", style={'text-align': 'center'}),

    # Dropdown for selecting the dimension
    html.Div([
        html.Label("Select Dimensionality:", style={'font-weight': 'bold'}),
        dcc.Dropdown(['2D', '3D'], '3D', id='dimension_dropdown')
    ], style={'margin': '10px'}),

    # Dropdown for selecting the dimension reduction method
    html.Div([
        html.Label("Select Dimension Reduction Method:", style={'font-weight': 'bold'}),
        dcc.Dropdown(['PCA', 'TSNE'], 'TSNE', id="dim_reduction_dropdown")
    ], style={'margin': '10px'}),

    # Slider for selecting iteration range.
    # The minimum is 0 so that it agrees with the initial value [0, 200] and
    # with the tick marks, which both start at 0.
    html.Div([
        html.Label("Select Iteration Range:", style={'font-weight': 'bold'}),
        dcc.RangeSlider(0, 1000, value=[0, 200], id='iteration_slider', marks={i: str(i) for i in range(0, 1001, 100)})
    ], style={'margin': '20px'}),

    # Slider for selecting Oracle score range
    html.Div([
        html.Label("Select Oracle Score Range:", style={'font-weight': 'bold'}),
        dcc.RangeSlider(0.0, 1.0, value=[0.0, 0.4], id='oracle_score_slider', marks={i/10: str(i/10) for i in range(0, 11, 1)})
    ], style={'margin': '20px'}),

    # Graph component
    dcc.Graph(id='chemical_space_graph')
])

@callback(
    Output('chemical_space_graph', 'figure'),
    Input('dimension_dropdown', 'value'),
    Input('dim_reduction_dropdown', 'value'),
    Input('iteration_slider', 'value'),
    Input('oracle_score_slider', 'value'))
def update_figure(dimension, dim_reduction_method,
                  iteration_range, oracle_score_range):
    """
    Rebuild the chemical-space figure whenever a dropdown or slider changes.

    Args:
        dimension: '2D' or '3D' from the dimensionality dropdown.
        dim_reduction_method: 'PCA' or 'TSNE' from the method dropdown.
        iteration_range: [min, max] from the iteration range slider.
        oracle_score_range: [min, max] from the oracle score range slider.

    Returns:
        The figure produced by `graph_chemical_space` for `buffer_df`.

    Raises:
        ValueError: if a dropdown delivers an unknown value.
    """
    if dimension not in DIMENSION_CHOICES:
        raise ValueError("Invalid dimension value")
    number_of_dimensions = DIMENSION_CHOICES[dimension]

    if dim_reduction_method not in DIM_REDUCTION_CHOICES:
        raise ValueError("Invalid dimension reduction method")
    dim_reduction = DIM_REDUCTION_CHOICES[dim_reduction_method]

    min_iteration, max_iteration = iteration_range[0], iteration_range[1]
    min_oracle_score, max_oracle_score = oracle_score_range[0], oracle_score_range[1]

    # NOTE: graph_chemical_space recomputes the embedding on every call, so
    # each widget change re-runs the reduction over all of buffer_df.
    fig = graph_chemical_space(buffer_df, smiles_col="smiles", kernel_col="kernel_data", color_col="oracle_score", id_col="iteration",
                         dim_reduction=dim_reduction, graph_title="Chemical Space", hover_data=["oracle_score", "iteration"],
                         number_of_dimensions=number_of_dimensions,
                         min_iteration_slider=min_iteration, max_iteration_slider=max_iteration,
                         min_oracle_score_slider=min_oracle_score, max_oracle_score_slider=max_oracle_score)
    return fig

if __name__ == '__main__':
    # NOTE(review): newer Dash releases prefer `app.run(...)` over
    # `run_server` - confirm against the installed Dash version.
    app.run_server(debug=True, port=8001, use_reloader=False, open_browser=True)




Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md




### Plots for Report

In [17]:
# Settings for the static report figures. The names mirror the slider
# parameters of `graph_chemical_space`; the bounds are inclusive when used
# to filter rows in the plotting cell.
min_iteration_slider = 0
max_iteration_slider = 1000
min_oracle_score_slider = 0.0
max_oracle_score_slider = 1.0
graph_title = "Chemical Space"

In [18]:
# Same loading steps as the cell in the "Dash Experiment" section, repeated so
# the report plots start from a freshly built dataframe.
buffer_data = read_compressed_or_regular_json(buffer_file_path)
buffer_df = buffer_to_dataframe(buffer_data)
# Build one graph per SMILES string; the graph kernel below reads this column.
buffer_df["graph"] = buffer_df["smiles"].apply(smiles_to_graph)
# Adds the "kernel_data" column (one row of the kernel output per molecule).
buffer_df = applying_graph_kernel_to_molecules(buffer_df, WeisfeilerLehmanKernel, "graph")
print(buffer_df.shape)
buffer_df.head()

(990, 5)


Unnamed: 0,smiles,oracle_score,iteration,graph,kernel_data
0,O=C1CCC[C@H]1CCOc1ccccc1,0.245262,10,"(O:0, C:1, C:2, C:3, C:4, C:5, C:6, C:7, O:8, ...","[262.02765, 101.237946, 120.59226, 270.9604, 2..."
1,COC1=CC=C(OC)C1,0.106061,11,"(C:0, O:1, C:2, C:3, C:4, C:5, O:6, C:7, C:8)","[101.237946, 105.70433, 44.663803, 116.125885,..."
2,c1ccccc1,0.015326,12,"(C:0, C:1, C:2, C:3, C:4, C:5)","[120.59226, 44.663803, 160.78969, 116.125885, ..."
3,O=C1CCC[C@H]1CCOCCOCC[C@@H]1CCCC1=O,0.157078,13,"(O:0, C:1, C:2, C:3, C:4, C:5, C:6, C:7, O:8, ...","[270.9604, 116.125885, 116.125885, 393.04144, ..."
4,Fc1ccc(F)c(OCC[C@H]2CC=CCC2)c1,0.280922,14,"(F:0, C:1, C:2, C:3, C:4, F:5, C:6, O:7, C:8, ...","[215.13065, 97.51597, 120.59226, 241.18452, 28..."


In [51]:
# Embed every molecule once so both report figures share the same coordinates.
reduced_data = tsne_df(list(buffer_df["kernel_data"]), output_dims=2)
df = buffer_df.join(pd.DataFrame(reduced_data))

# Adding one to the min and max values to make the graph look better.
# Taken before filtering so the axes do not depend on the filter bounds.
min_component_1 = df["Component 1"].min() - 1
max_component_1 = df["Component 1"].max() + 1
min_component_2 = df["Component 2"].min() - 1
max_component_2 = df["Component 2"].max() + 1

# Filter out the rows according to the iteration and oracle-score bounds
df = df[
    df["iteration"].between(min_iteration_slider, max_iteration_slider)
    & df["oracle_score"].between(min_oracle_score_slider, max_oracle_score_slider)
]

# The coordinates come from tsne_df, so the axes are t-SNE components
# (not principal components).
labels_dict = {
    "Component 1": "t-SNE Component 1",  # New label for x-axis
    "Component 2": "t-SNE Component 2",  # New label for y-axis
    "oracle_score": "Oracle Score",  # New label for color scale
    "iteration": "Iterations"
}


def report_scatter(data: pd.DataFrame, color_col: str, color_scale: str):
    """
    Build one report scatter plot of the 2D embedding.

    Args:
        data: dataframe with "Component 1" / "Component 2" and `color_col`.
        color_col: column used to colour the points.
        color_scale: name of the plotly continuous colour scale.

    Returns:
        The plotly figure, using the "seaborn" template and the shared axis
        ranges and labels defined in this cell.
    """
    figure = px.scatter(data, x="Component 1", y="Component 2",
                        color=color_col,
                        range_x=[min_component_1, max_component_1],
                        range_y=[min_component_2, max_component_2],
                        color_continuous_scale=color_scale,
                        labels=labels_dict)
    figure.update_layout(template="seaborn")
    return figure


fig = report_scatter(df, "oracle_score", "Magma")
fig.show()

fig2 = report_scatter(df, "iteration", "Greens")
# Customize the color bar's range and tick marks. Plotly Express drives
# continuous colours through the figure-level colour axis, so the settings go
# there rather than on the trace markers.
fig2.update_coloraxes(
    cmin=0,
    cmax=1000,
    colorbar=dict(
        tickvals=[0, 200, 400, 600, 800, 1000],
        ticktext=['0', '200', '400', '600', '800', '1000']
    )
)

fig2.show()