# Notebook to plot the different chemical spaces

In [1]:

from DomAdpQSAR.QSARsettings import Settings
from DomAdpQSAR.QSARsrgan import DomAdpQSARSRGAN

# Project settings object; later cells set `load_model_path` on it and read
# `mean_offset` and `batch_size` from it.
settings = Settings()

import torch
import numpy as np
import pandas as pd
from DomAdpQSAR.utility import MixtureModel, gpu

from scipy.stats import norm

In [2]:
# Load the trained model from its log directory
model_name = "SR GAN Ax LLx50 Feat size 32 Ang False Norm feat Norm True"
model_path = f"logs/{model_name}/"

# The model object reads its checkpoint location from the settings
settings.load_model_path = model_path
SRGAN = DomAdpQSARSRGAN(settings)

# Run the setup methods in their original order; each one is looked up
# immediately before it is called.
for setup_step_name in (
    "dataset_setup",
    "model_setup",
    "prepare_optimizers",
    "load_models",
    "eval_mode",
):
    getattr(SRGAN, setup_step_name)()


dataset rank:  None
7
Model loaded from `logs/SR GAN Ax LLx50 Feat size 32 Ang False Norm feat Norm True/model_1400.pth`.


In [3]:
# Gather the per-split dataframes and concatenate them into one global dataframe
split_frames = [
    SRGAN.federated_dataframe,
    SRGAN.clean_dataframe,
    SRGAN.validation_dataframe,
    SRGAN.test_dataframe,
]
global_df = pd.concat(split_frames, ignore_index=True)

# global_df.head()

In [4]:
# Reload the global dataframe from the parquet file written by a later cell;
# this replaces whatever `global_df` held before.
global_df = pd.read_parquet(f"{model_name}_global_df.parquet")

### Create generated data

In [4]:
# Create noise vectors: one row per sample, `SRGAN.G.input_size` values per row,
# drawn from a MixtureModel of two unit-scale normals located at -mean_offset
# and +mean_offset.
noise_model = MixtureModel([norm(-settings.mean_offset, 1),
                            norm(settings.mean_offset, 1)])
noise = noise_model.rvs(size=[settings.batch_size, SRGAN.G.input_size])
z = torch.tensor(noise.astype(np.float32)).to(gpu)

# Create fake fingerprints. This is inference only, so skip autograd tracking
# instead of building a graph that is immediately detached.
with torch.no_grad():
    fake_FP = SRGAN.G(z)
role = "Fake"
fake_FP = fake_FP.detach().cpu().numpy()

# Create the dataframe in one step: one fingerprint (as a list) per row, with
# the scalar role label broadcast to every row.
fake_df = pd.DataFrame({"FP": fake_FP.tolist(), "ROLE": role})



Transfer    50000
Training     7772
Test         4383
Fake         1000
Name: ROLE, dtype: int64

In [5]:
# Concatenate the fake and global dataframes.
# Rows already labelled "Fake" are dropped first so that re-running this cell
# (or running it on a reloaded parquet that already contains generated rows)
# replaces the fake samples instead of appending a second copy.
real_rows = global_df[global_df["ROLE"] != "Fake"]
global_df = pd.concat([real_rows, fake_df], ignore_index=True)

In [7]:
# Integer code for each dataset role, assigned in this fixed order (0..3)
role_order = ("Transfer", "Training", "Test", "Fake")
role_mapping = {role_name: code for code, role_name in enumerate(role_order)}

# Numeric counterpart of the "ROLE" column
global_df["Role"] = global_df["ROLE"].map(role_mapping)

0        0
1        0
2        0
3        0
4        0
        ..
63150    3
63151    3
63152    3
63153    3
63154    3
Name: Role, Length: 63155, dtype: int64

In [77]:
# Compute a similarity score for every fingerprint in the global dataframe
# NOTE(review): the original comment says "federated dataset", but the reference
# set passed below is SRGAN.clean_dataframe; confirm which one is intended.

from utils import calculate_target_similarity, calculate_set_similarity

total_rows = len(global_df)
similarity_scores = []
for position, fingerprint in enumerate(global_df["FP"]):
    # Progress as a percentage, overwritten in place on one line
    percent_done = 100 * (position / total_rows)
    print(percent_done, end="\r")
    similarity_scores.append(
        calculate_target_similarity(fingerprint, SRGAN.clean_dataframe, mean=None)
    )

global_df["Similarity"] = similarity_scores

5.1144010767160165643

In [58]:
# Compute DNN and GAN features and logits

# Stack the per-row fingerprints into one dense float32 matrix before handing
# it to torch. Building a tensor straight from an object-dtype Series depends
# on how the cells happen to be stored (lists vs. arrays); np.stack is the same
# conversion the embedding cell further down uses.
fp_matrix = np.stack(global_df["FP"].to_numpy(), axis=0).astype(np.float32)
FPs = torch.from_numpy(fp_matrix).to(gpu)

# Inference only: no autograd graph is needed. Each network's `.features`
# attribute is read right after that network's own forward pass.
with torch.no_grad():
    DNN_logits = SRGAN.DNN(FPs).detach().cpu().numpy()
    DNN_features = SRGAN.DNN.features.detach().cpu().numpy()
    GAN_logits = SRGAN.D(FPs).detach().cpu().numpy()
    GAN_features = SRGAN.D.features.detach().cpu().numpy()

# Element-wise absolute difference between the DNN and GAN outputs
# (sqrt(square(x)) is |x|; np.abs avoids the intermediate squaring).
logit_diff = np.abs(DNN_logits - GAN_logits)
feature_diff = np.abs(DNN_features - GAN_features)

# Add to dataframe (one Python list/scalar per row)
global_df["DNN_Logits"] = DNN_logits.tolist()
global_df["GAN_Logits"] = GAN_logits.tolist()
global_df["DNN_Features"] = DNN_features.tolist()
global_df["GAN_Features"] = GAN_features.tolist()
global_df["Logit_Diff"] = logit_diff.tolist()
global_df["Feature_Diff"] = feature_diff.tolist()

AxisError: axis 1 is out of bounds for array of dimension 1

0        0.487051
1        0.142762
2        0.006265
3        0.013557
4        0.043268
           ...   
63150    0.917429
63151    0.895201
63152    0.924473
63153    0.923102
63154    0.939591
Name: Logit_Diff, Length: 63155, dtype: float64


[0.2679201  0.16472697 0.20852098 ... 0.8389806  0.6579721  0.02896678]
[[5.8604997e-01 2.9445589e-03 4.3479800e-03 ... 4.3742085e+00
  9.7864866e-03 1.5914142e-03]
 [1.8017637e+00 1.1906178e+00 1.1273171e+00 ... 6.5069717e-01
  5.5891079e-01 7.5602680e-02]
 [8.8214558e-01 2.9445589e-03 4.3479800e-03 ... 1.9417758e+00
  1.3676919e+00 1.5029877e+00]
 ...
 [4.0481687e-03 2.7485309e+00 4.3479800e-03 ... 1.1650026e-02
  9.5187836e+00 8.0041771e+00]
 [2.2246838e-01 6.1458950e+00 1.3298645e+01 ... 2.3307071e+00
  3.6221719e+00 1.5914142e-03]
 [4.0481687e-03 6.2673812e+00 1.0051172e+01 ... 1.1650026e-02
  5.3421354e-01 1.1291233e+00]]


In [8]:
# Compare real logits and classes
#
# Absolute error between each model's logit and the true class label, computed
# column-wise instead of row-by-row with iterrows(). Rows whose ROLE is "Fake"
# are masked out and end up as missing values in the distance columns.
is_fake = global_df["ROLE"] == "Fake"
true_class = global_df["CLASS"]

for model_tag in ("DNN", "GAN"):
    abs_error = (global_df[f"{model_tag}_Logits"] - true_class).abs()
    global_df[f"{model_tag}_Distance"] = abs_error.mask(is_fake)


In [None]:
# save dataframe to pickle
# global_df.to_pickle(model_name+"_global_df.pkl")


In [13]:
# Persist the enriched global dataframe next to the notebook, keyed by model name
global_df.to_parquet(f"{model_name}_global_df.parquet")

In [None]:
# PHATE on FPs
import phate

# Build the input matrix on the host from the dataframe. `FPs` is a torch
# tensor that was moved to `gpu`, which PHATE cannot consume directly, and it
# only exists if the logits cell has been run in this session.
fp_matrix = np.stack(global_df["FP"].to_numpy(), axis=0)

phate_operator = phate.PHATE(n_components=3, n_jobs=-1)
phate_train = phate_operator.fit_transform(fp_matrix)

In [19]:
# Build the plotting frame from FP, ROLE and every colouring column
colour_indexes = ["Role", "CLASS", "DNN_Logits", "GAN_Logits", "Logit_Diff" ,"DNN_Distance", "GAN_Distance" ]

# Select all needed columns at once; the copy keeps plotting_df independent of global_df
plotting_df = global_df[["FP", "ROLE", *colour_indexes]].copy()

In [41]:
from sklearn.manifold import TSNE
import phate

# Columns carried over from global_df for colouring the embeddings
colour_indexes = ["Role", "CLASS", "DNN_Logits", "GAN_Logits", "Logit_Diff" ,"DNN_Distance", "GAN_Distance" ]

# Start the plotting frame from the fingerprints, role labels and colouring columns
plotting_df = global_df[["FP", "ROLE", *colour_indexes]].copy()

# Feature spaces to embed and the known dimension reducers
features = ["FP", "DNN_Features", "GAN_Features"]
dim_reducers = ["T-SNE_2D", "PHATE_2D", "PHATE_3D"]

# One zero-argument factory per reducer name, so a fresh operator is built for every fit
reducer_factories = {
    "T-SNE_2D": lambda: TSNE(n_components=2),
    "PHATE_2D": lambda: phate.PHATE(n_components=2, n_jobs=-1),
    "PHATE_3D": lambda: phate.PHATE(n_components=3, n_jobs=-1),
}

for feature in features:
    # Stack the per-row vectors of this feature into a single 2-D array
    data = np.stack(global_df[feature].to_numpy(), axis=0)
    # Only the first two reducers are run; "PHATE_3D" is defined but skipped by this slice
    for dim_reducer in dim_reducers[:2]:
        print(f"Computing {dim_reducer} for {feature}")
        transformed_data = reducer_factories[dim_reducer]().fit_transform(data)

        # Store one embedding vector per row under "<reducer>_<feature>"
        plotting_df[f"{dim_reducer}_{feature}"] = list(transformed_data)

plotting_df.to_parquet(f"{model_name}_plotting_df.parquet")

Computing T-SNE_2D for FP
Computing PHATE_2D for FP
Calculating PHATE...
  Running PHATE on 63155 observations and 2048 variables.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 7.94 seconds.
    Calculating KNN search...
    Calculated KNN search in 97.62 seconds.
    Calculating affinities...




    Calculated affinities in 0.88 seconds.
  Calculated graph and diffusion operator in 106.48 seconds.
  Calculating landmark operator...
    Calculating SVD...


  K.data = np.exp(-1 * np.power(K.data, self.decay))


    Calculated SVD in 4.18 seconds.
    Calculating KMeans...
    Calculated KMeans in 3.28 seconds.
  Calculated landmark operator in 8.48 seconds.
  Calculating optimal t...
    Automatically selected t = 61
  Calculated optimal t in 3.21 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.65 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 2.22 seconds.
Calculated PHATE in 121.05 seconds.
Computing T-SNE_2D for DNN_Features
Computing PHATE_2D for DNN_Features
Calculating PHATE...
  Running PHATE on 63155 observations and 32 variables.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 20.69 seconds.
    Calculating affinities...
    Calculated affinities in 0.20 seconds.


  K.data = np.exp(-1 * np.power(K.data, self.decay))


  Calculated graph and diffusion operator in 20.93 seconds.
  Calculating landmark operator...
    Calculating SVD...
    Calculated SVD in 4.03 seconds.
    Calculating KMeans...
    Calculated KMeans in 4.15 seconds.
  Calculated landmark operator in 9.31 seconds.
  Calculating optimal t...
    Automatically selected t = 33
  Calculated optimal t in 3.82 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.46 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 2.22 seconds.
Calculated PHATE in 36.74 seconds.
Computing T-SNE_2D for GAN_Features
Computing PHATE_2D for GAN_Features
Calculating PHATE...
  Running PHATE on 63155 observations and 32 variables.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 4.40 seconds.
    Calculating affinities...
    Calculated affinities in 0.18 seconds.


  K.data = np.exp(-1 * np.power(K.data, self.decay))


  Calculated graph and diffusion operator in 4.61 seconds.
  Calculating landmark operator...
    Calculating SVD...
    Calculated SVD in 3.81 seconds.
    Calculating KMeans...
    Calculated KMeans in 4.06 seconds.
  Calculated landmark operator in 8.91 seconds.
  Calculating optimal t...
    Automatically selected t = 26
  Calculated optimal t in 5.64 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.44 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 2.46 seconds.
Calculated PHATE in 22.06 seconds.


In [33]:
# Print only the first embedded row as a quick shape/value check
for first_row in transformed_data[:1]:
    print(first_row)

[ 0.01794917 -0.0150245  -0.0069665 ]


In [28]:

# Write the plotting frame to parquet, keyed by model name
plotting_df.to_parquet(f"{model_name}_plotting_df.parquet")

In [42]:
# Show which columns the plotting frame currently holds
plotting_df.columns

Index(['FP', 'ROLE', 'Role', 'CLASS', 'DNN_Logits', 'GAN_Logits', 'Logit_Diff',
       'DNN_Distance', 'GAN_Distance', 'T-SNE_2D_FP', 'PHATE_2D_FP',
       'T-SNE_2D_DNN_Features', 'PHATE_2D_DNN_Features',
       'T-SNE_2D_GAN_Features', 'PHATE_2D_GAN_Features'],
      dtype='object')

In [26]:
        # Add the transformed data as a column to the plotting_df DataFrame



Index(['FP', 'ROLE', 'Role', 'CLASS', 'DNN_Logits', 'GAN_Logits', 'Logit_Diff',
       'DNN_Distance', 'GAN_Distance'],
      dtype='object')


In [None]:
# PCA on DNN features
# Colour by dataset
# Colour by class
# Colour by logit distance from class

In [None]:
# PCA on GAN features
# Colour by dataset
# Colour by class
# Colour by logit distance from class

In [None]:
# 