# Investigate Graph Kernel

The aim of this notebook is to investigate how the graph kernel (here the Weisfeiler-Lehman kernel) behaves on structurally similar molecules.

We created a CSV file of SMILES strings that are highly similar to one another, differing in only one of three ways. We aim to investigate whether this structural similarity is reflected in the graph kernel output.

## Setup

### Change Working Directory

In [1]:
# Change the working directory to the directory two levels above the one the
# notebook kernel was started in. Later cells use relative paths such as
# "data/..." and import from "modules...", which are resolved from there.

import os

# Remember the start directory the first time this cell runs. Without this
# guard, every re-run of the cell would climb two MORE levels, because
# os.getcwd() would already return the directory chosen by the previous run.
if "_notebook_start_directory" not in globals():
    _notebook_start_directory = os.getcwd()

# Directory the kernel was started in (stable across re-runs of this cell)
current_directory = _notebook_start_directory

# Get the parent of the parent directory
parent_parent_directory = os.path.abspath(os.path.join(current_directory, '..', '..'))

# Change the working directory to the parent of the parent directory
os.chdir(parent_parent_directory)

# Verify the change by printing the new working directory
print("New working directory:", os.getcwd())


New working directory: /Users/gordianimperial/Documents/Group Project/bo_molecules


### Imports

In [2]:
# Standard library
import random

# Third-party
from tdc import Oracle
from gauche.kernels.graph_kernels import WeisfeilerLehmanKernel
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
from gauche import NonTensorialInputs

In [3]:
# Module Imports
from modules.utils.read_and_sample_molecules import sample_graphs_from_smiles_csv
from modules.utils.molecular_data_conversion import graph_to_smiles, smiles_to_graph
from modules.visualisation.visualising_chemical_space import applying_graph_kernel_to_molecules, applying_smiles_kernel_to_molecules, graph_chemical_space, pca_df, tsne_df, applying_kernel_and_graphing

### Specifications

In [4]:
# Dimensionality-reduction technique used for plotting: either 'pca' or 'tsne'
dim_reduction = "pca"

# Number of components to reduce to (the plotting cell accepts 2 or 3)
number_of_dimensions = 2

# Graph kernel handed to applying_graph_kernel_to_molecules (not called here)
kernel = WeisfeilerLehmanKernel

# CSV with the SMILES strings under investigation, relative to the working directory
path_to_csv = "data/SMILES_for_graph_kernel_testing.csv"

## Experiment

In [5]:
# Read the SMILES table from disk
df = pd.read_csv(path_to_csv)

# Build one graph per SMILES string and store it alongside the original row
df['graph'] = list(map(smiles_to_graph, df['SMILES']))

# Apply the configured kernel to the 'graph' column, then show the
# 'kernel_data' column of the returned dataframe
df = applying_graph_kernel_to_molecules(df, kernel, 'graph')
print(df['kernel_data'])

0     [139.94658, 110.91511, 174.93323, 174.93323, 1...
1     [110.91511, 136.969, 173.44443, 173.44443, 109...
2     [174.93323, 173.44443, 326.04575, 297.75867, 1...
3     [174.93323, 173.44443, 297.75867, 326.04575, 1...
4     [110.91511, 109.426315, 173.44443, 176.42201, ...
5     [174.93323, 176.42201, 297.75867, 297.75867, 1...
6     [176.42201, 174.93323, 299.24747, 299.24747, 1...
7     [183.12158, 178.65521, 302.96945, 302.96945, 1...
8     [186.84357, 185.35478, 317.11298, 317.11298, 1...
9     [110.91511, 109.426315, 176.42201, 173.44443, ...
10    [112.4039, 110.91511, 174.93323, 174.93323, 11...
11    [121.33666, 119.84787, 197.26512, 197.26512, 1...
12    [121.33666, 111.6595, 183.12158, 183.12158, 11...
13    [110.91511, 119.10347, 181.6328, 181.6328, 110...
14    [174.93323, 174.18883, 315.6242, 295.52548, 17...
15    [174.93323, 174.18883, 295.52548, 315.6242, 17...
16    [110.91511, 110.17071, 181.6328, 184.61038, 11...
17    [174.93323, 177.16641, 295.52548, 295.5254

In [6]:
# Reduce the kernel output to `number_of_dimensions` components with the
# configured technique and draw an interactive scatter plot of the result.
# Writes the "Component N" columns into `df` and leaves the figure in `fig`.

assert number_of_dimensions in [2, 3], "Please enter a valid number of dimensions of either 2 or 3"
assert dim_reduction in ["pca", "tsne"], "Please enter a valid dimension reduction technique"

# Select the reducer and the extra keyword arguments it is called with
if dim_reduction == "pca":
    dim_reduction_function = pca_df
    dim_reduction_kwargs = {}
else:
    dim_reduction_function = tsne_df
    dim_reduction_kwargs = {"pca_components": 15, "perplexity": 10}

reduced_data = dim_reduction_function(
    list(df["kernel_data"]),
    output_dims=number_of_dimensions,
    **dim_reduction_kwargs,
)

# Copy "Component 1" .. "Component N" from the reducer output into df
component_columns = [f"Component {i}" for i in range(1, number_of_dimensions + 1)]
for component_column in component_columns:
    df[component_column] = reduced_data[component_column]

# Map plot axes to component columns; zip stops after N axes, so "z" is only
# present for the 3-D case
axis_columns = dict(zip("xyz", component_columns))

# Pad each axis by one unit below the minimum and above the maximum so that
# points do not sit on the plot border
axis_ranges = {
    f"range_{axis}": [df[column].min() - 1, df[column].max() + 1]
    for axis, column in axis_columns.items()
}

scatter_function = px.scatter if number_of_dimensions == 2 else px.scatter_3d
fig = scatter_function(
    df,
    color="colour_col",
    hover_data=["SMILES"],
    **axis_columns,
    **axis_ranges,
)

fig.show()

### PCA

In [None]:
# Look up the reducer matching the configured technique; any other value
# leaves dim_reduction_function as it was.
reducers_by_name = {"pca": pca_df, "tsne": tsne_df}
if dim_reduction in reducers_by_name:
    dim_reduction_function = reducers_by_name[dim_reduction]

In [None]:
# 2-D PCA view of the kernel output.
# The projection is recomputed here rather than read from
# df["Component 1"] / df["Component 2"]: those columns are overwritten by
# other cells (for example the t-SNE section), so reusing them would make this
# plot depend on which cell happened to run last.
pca_components = pca_df(list(df["kernel_data"]), output_dims=2)

# Work on a copy so the shared dataframe is not modified by this cell
pca_plot_df = df.assign(**{
    "Component 1": pca_components["Component 1"],
    "Component 2": pca_components["Component 2"],
})

# Pad each axis by one unit below the minimum and above the maximum to make
# the graph look better
pca_range_x = [pca_plot_df["Component 1"].min() - 1, pca_plot_df["Component 1"].max() + 1]
pca_range_y = [pca_plot_df["Component 2"].min() - 1, pca_plot_df["Component 2"].max() + 1]

fig = px.scatter(pca_plot_df, x="Component 1", y="Component 2",
                 range_x=pca_range_x,
                 range_y=pca_range_y,
                 color="colour_col",
                 hover_data=["SMILES"],
                 title="PCA of graph kernel output")

fig.show()

### TSNE

In [None]:
# Compute a 2-D t-SNE embedding of the kernel output and store it in df.
# tsne_df is called directly instead of overwriting the notebook-wide
# `dim_reduction` setting, so re-running earlier cells keeps the configured
# technique.
# NOTE(review): t-SNE is normally stochastic and tsne_df's signature is not
# visible here, so no seed is passed — confirm whether tsne_df fixes one.
reduced_data = tsne_df(list(df["kernel_data"]), output_dims=2,
                       pca_components=10, perplexity=10)

# There must be one embedded point per row of df; otherwise the column
# assignments below would silently misalign
assert reduced_data.shape[0] == df.shape[0], (
    f"t-SNE output has {reduced_data.shape[0]} rows but the dataframe has {df.shape[0]}"
)

df["Component 1"] = reduced_data["Component 1"]
df["Component 2"] = reduced_data["Component 2"]

# Last expression of the cell: rendered as a table by the notebook
df.head()

In [None]:
# Axis limits: one unit of padding below the smallest and above the largest
# value of each component, to make the graph look better
x_values = df["Component 1"]
y_values = df["Component 2"]
x_limits = [min(x_values) - 1, max(x_values) + 1]
y_limits = [min(y_values) - 1, max(y_values) + 1]

# Scatter plot of the two components currently stored in df, coloured by
# the "colour_col" column
fig = px.scatter(
    df,
    x="Component 1",
    y="Component 2",
    color="colour_col",
    range_x=x_limits,
    range_y=y_limits,
)

fig.show()