# Notebook to plot the different chemical spaces

In [1]:

# Project classes: Settings is instantiated once below; DomAdpQSARSRGAN is built from
# it in a later cell and exposes the networks (G, D, DNN) and dataframes used in this notebook.
from DomAdpQSAR.QSARsettings import Settings
from DomAdpQSAR.QSARsrgan import DomAdpQSARSRGAN

# Single settings instance shared by the notebook; later cells set `load_model_path`
# on it and read `mean_offset` and `batch_size` from it.
settings = Settings()

import torch
import numpy as np
import pandas as pd
# MixtureModel is used to sample the generator noise; `gpu` is passed to `.to(...)`
# when moving tensors (presumably a torch device -- confirm in DomAdpQSAR.utility).
from DomAdpQSAR.utility import MixtureModel, gpu

# Normal distribution, used as the two components of the noise mixture.
from scipy.stats import norm

In [2]:
# Load the trained model.
# `model_name` is the name of the logged run; its files are read from logs/<model_name>/.
model_name = "SR GAN Ax LLx50 Feat size 32 Ang False Norm feat Norm True"
model_path = f"logs/{model_name}/"
settings.load_model_path = model_path

# NOTE(review): model_setup() / eval_mode() are project methods; presumably they build
# the networks from `settings.load_model_path` and switch them to evaluation mode -- confirm.
SRGAN = DomAdpQSARSRGAN(settings)
SRGAN.model_setup()
SRGAN.eval_mode()


In [3]:
# Concatenate the model's federated, clean, validation and test dataframes
# into one global dataframe with a fresh 0..n-1 index.
source_frames = [
    SRGAN.federated_dataframe,
    SRGAN.clean_dataframe,
    SRGAN.validation_dataframe,
    SRGAN.test_dataframe,
]
global_df = pd.concat(source_frames, ignore_index=True)

# global_df.head()

### Create generated data

In [4]:
# Sample generator noise from a MixtureModel of two unit-scale normals centred at
# -mean_offset and +mean_offset, one row per sample (batch_size rows, G.input_size columns).
noise_distribution = MixtureModel([norm(-settings.mean_offset, 1),
                                   norm(settings.mean_offset, 1)])
noise_shape = [settings.batch_size, SRGAN.G.input_size]
noise = noise_distribution.rvs(size=noise_shape).astype(np.float32)
z = torch.tensor(noise).to(gpu)

# Run the generator and bring the fake fingerprints back to the host as a NumPy array.
fake_FP = SRGAN.G(z).detach().cpu().numpy()

# Store one fingerprint (as a list) per row, all tagged with the "Fake" role.
role = "Fake"
fake_df = pd.DataFrame({"FP": fake_FP.tolist(), "ROLE": role})



Transfer    50000
Training     7772
Test         4383
Fake         1000
Name: ROLE, dtype: int64

In [5]:
# Concatenate the fake and global dataframes.
# Rows already tagged "Fake" (left over from an earlier run of this cell) are dropped
# first, so re-running the cell replaces the generated samples instead of stacking
# duplicate copies of them onto `global_df`.
rows_without_fake = global_df[global_df["ROLE"] != "Fake"]
global_df = pd.concat([rows_without_fake, fake_df], ignore_index=True)

In [7]:
# Encode each dataset role as an integer code (its position in `role_order`)
# and store it in a new "Role" column next to the string "ROLE" column.
role_order = ("Transfer", "Training", "Test", "Fake")
role_mapping = {role_name: code for code, role_name in enumerate(role_order)}
global_df["Role"] = global_df["ROLE"].map(role_mapping)

0        0
1        0
2        0
3        0
4        0
        ..
63150    3
63151    3
63152    3
63153    3
63154    3
Name: Role, Length: 63155, dtype: int64

In [77]:
# Compute Tanimoto similarity of every fingerprint against the federated dataset

from utils import calculate_target_similarity, calculate_set_similarity

# Loop invariants: the reference dataframe and the total row count do not change.
reference_df = SRGAN.federated_dataframe
n_rows = len(global_df)

# One entry per row of global_df, in row order.
similarity_scores = []
for idx, FP in enumerate(global_df["FP"]):
    # Fixed-width percentage so every "\r" update fully overwrites the previous one
    # (variable-width floats left stale digits on screen); counts completed rows up to 100%.
    print(f"{100 * (idx + 1) / n_rows:6.2f}%", end="\r")
    scores = calculate_target_similarity(FP, reference_df, mean=None)
    similarity_scores.append(scores)

global_df["Similarity"] = similarity_scores

5.1144010767160165643

In [58]:
# Compute DNN and GAN (discriminator) logits and features for every fingerprint

# Build one dense float32 matrix from the per-row fingerprint lists. float32 matches
# the dtype used for the generator noise; going through NumPy avoids handing torch a
# pandas Series of Python lists.
fp_matrix = np.asarray(global_df["FP"].tolist(), dtype=np.float32)
FPs = torch.from_numpy(fp_matrix).to(gpu)

# Inference only: disable autograd so no graph is kept for the whole dataset.
with torch.no_grad():
    DNN_logits = SRGAN.DNN(FPs).detach().cpu().numpy()
    GAN_logits = SRGAN.D(FPs).detach().cpu().numpy()
    # `.features` is read after the forward passes above
    # (presumably populated by each network's forward call -- confirm in the model code).
    DNN_features = SRGAN.DNN.features.detach().cpu().numpy()
    GAN_features = SRGAN.D.features.detach().cpu().numpy()

# Element-wise absolute difference between DNN and GAN outputs. This equals
# sqrt(square(a - b)) but cannot underflow/overflow in the intermediate square.
logit_diff = np.abs(DNN_logits - GAN_logits)
feature_diff = np.abs(DNN_features - GAN_features)


# Add to dataframe (one Python scalar/list per row)
global_df["DNN_Logits"] = DNN_logits.tolist()
global_df["GAN_Logits"] = GAN_logits.tolist()
global_df["DNN_Features"] = DNN_features.tolist()
global_df["GAN_Features"] = GAN_features.tolist()
global_df["Logit_Diff"] = logit_diff.tolist()
global_df["Feature_Diff"] = feature_diff.tolist()

AxisError: axis 1 is out of bounds for array of dimension 1

[0.2679201  0.16472697 0.20852098 ... 0.8389806  0.6579721  0.02896678]
[[5.8604997e-01 2.9445589e-03 4.3479800e-03 ... 4.3742085e+00
  9.7864866e-03 1.5914142e-03]
 [1.8017637e+00 1.1906178e+00 1.1273171e+00 ... 6.5069717e-01
  5.5891079e-01 7.5602680e-02]
 [8.8214558e-01 2.9445589e-03 4.3479800e-03 ... 1.9417758e+00
  1.3676919e+00 1.5029877e+00]
 ...
 [4.0481687e-03 2.7485309e+00 4.3479800e-03 ... 1.1650026e-02
  9.5187836e+00 8.0041771e+00]
 [2.2246838e-01 6.1458950e+00 1.3298645e+01 ... 2.3307071e+00
  3.6221719e+00 1.5914142e-03]
 [4.0481687e-03 6.2673812e+00 1.0051172e+01 ... 1.1650026e-02
  5.3421354e-01 1.1291233e+00]]


In [None]:
# Compare predicted logits with the class labels

# Generated ("Fake") fingerprints are left out of the comparison.
non_fake_rows = global_df.loc[global_df["ROLE"] != "Fake"]

# Absolute difference between each network's logit and the class value
# (equivalent to sqrt(square(logit - class))).
dnn_distance = (non_fake_rows["DNN_Logits"] - non_fake_rows["CLASS"]).abs()
gan_distance = (non_fake_rows["GAN_Logits"] - non_fake_rows["CLASS"]).abs()

# Plain lists of the non-Fake distances, in row order.
DNN_distances = dnn_distance.tolist()
GAN_distances = gan_distance.tolist()

# Assign the Series rather than the shorter lists: pandas aligns on the index, so the
# skipped Fake rows get NaN instead of the assignment failing on a length mismatch.
global_df["DNN_Distance"] = dnn_distance
global_df["GAN_Distance"] = gan_distance


[0.5518028140068054, 0.4242725372314453, 0.3973803222179413, 0.577191174030304, 0.6345763802528381, 0.6457944512367249, 0.6040388941764832, 0.4928770661354065, 0.48152801394462585, 0.457169771194458, 0.5613800883293152, 0.40844208002090454, 0.5639904141426086, 0.4832313358783722, 0.3114410638809204, 0.3651423454284668, 0.4813862442970276, 0.5174031257629395, 0.4237247109413147, 0.5076888203620911, 0.38507914543151855, 0.3995843231678009, 0.46204617619514465, 0.5789271593093872, 0.6665743589401245, 0.5267836451530457, 0.7055845856666565, 0.47105565667152405, 0.38110265135765076, 0.42285776138305664, 0.49772587418556213, 0.5357380509376526, 0.3947755694389343, 0.5105466246604919, 0.4529479146003723, 0.5917133688926697, 0.6488379240036011, 0.527290940284729, 0.5700823068618774, 0.644271969795227, 0.7507355809211731, 0.38780850172042847, 0.5438299179077148, 0.5596475601196289, 0.4814378619194031, 0.494184285402298, 0.34612369537353516, 0.5305931568145752, 0.5716675817966461, 0.438949882984

In [None]:
# Persist the assembled dataframe as a pickle named after the model run
# (written to the current working directory).
pickle_path = f"{model_name}_global_df.pkl"
global_df.to_pickle(pickle_path)


In [None]:
# PHATE on FPs


In [None]:
# PCA on tanimoto similarity
# Colour by dataset
# Colour by class
# Colour by DNN-GAN feature distance
# Colour by DNN-GAN logit distance 

In [None]:
# PCA on DNN features
# Colour by dataset
# Colour by class
# Colour by logit distance from class

In [None]:
# PCA on GAN features
# Colour by dataset
# Colour by class
# Colour by logit distance from class

In [None]:
# 