In [2]:
import numpy as np
import pandas as pd
import random
from sklearn.utils import shuffle

In [3]:
# Seed NumPy's global generator and the stdlib generator with the same value
# so that any implicitly random step gives the same result on every run.
SEED = 1
for seed_rng in (np.random.seed, random.seed):
    seed_rng(SEED)

# Existing dataset

In [4]:
# Read the receptor and ligand feature tables; the 'id' column becomes the row index.
df_receptors, df_ligands = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('data/Final_Receptor_dataset.csv', 'data/Final_Ligand_dataset.csv')
)

In [5]:
# Check that they have the correct ordering: every later step pairs receptors
# and ligands row by row, so both tables must list the same ids in the same order.
# Index.equals also handles tables of different length, where an element-wise
# `==` raises instead of reporting the mismatch.
same_ordering = df_receptors.index.equals(df_ligands.index)
print(f'Indices are the same: {same_ordering}')

# Stop here rather than silently building a mislabelled dataset.
assert same_ordering, 'Receptor and ligand tables are not aligned on the same ids.'

Indices are the same: True


In [6]:
# The column names of each table are its feature names.
feats_receptor = df_receptors.columns.tolist()
feats_ligand = df_ligands.columns.tolist()

n_receptor_feats, n_ligand_feats = len(feats_receptor), len(feats_ligand)
print(f'Number of receptor features: {n_receptor_feats}')
print(f'Number of ligand features: {n_ligand_feats}')

Number of receptor features: 568
Number of ligand features: 43


### Concatenate all features into a single dataframe

In [7]:
# Place receptor and ligand features side by side, matched on id;
# join='inner' keeps only the ids present in both tables.
df = pd.concat([df_receptors, df_ligands], join='inner', axis='columns')

In [8]:
# Expected column layout of the joined table: receptor features first, then ligand features.
feats = [*feats_receptor, *feats_ligand]

n_complex_feats = len(feats)
columns_match = df.columns.tolist() == feats
print(f'Number of complex (receptor-ligand) features: {n_complex_feats}')
print(f'Concatenation is correct: {columns_match}')

Number of complex (receptor-ligand) features: 611
Concatenation is correct: True


### Add a class column with 1's

In [9]:
# Every row of the original dataset gets class 1.
# 'class' is a Python keyword, hence the dict unpacking.
df = df.assign(**{'class': 1})

# Augmented dataset

Since ``df_receptors`` and ``df_ligands`` have the same ordering, we can create invalid pairs as follows:
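1. Shuffle both of them in the same way (to remove any ordering bias in the original dataset) and reverse one of them, so that row *k* of the receptors is paired with row *n − 1 − k* of the ligands. Note that when *n* is odd, the middle row is still paired with its own ligand.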
2. Create new indices of the form ``receptor_ligand``.
3. Concatenate the new dataframes.
4. Add a class column with all ``0``.
5. Concatenate with the original dataset.

### 1. Shuffle the dataframes and reverse one of them

In [10]:
# Shuffle both tables together with a fixed random_state, then flip the row
# order of the ligand table so that its rows no longer line up with the receptors.
shuffled_pair = shuffle(df_receptors, df_ligands, random_state=1)
df_receptors_shuffled = shuffled_pair[0]
df_ligands_shuffled = shuffled_pair[1].iloc[::-1]

### 2. Create and set new indices

In [15]:
# Original ids sitting in each row after the shuffle/reverse step.
receptors_idx = list(df_receptors_shuffled.index)
ligands_idx = list(df_ligands_shuffled.index)

# Fail loudly if this cell is re-run on frames that were already re-indexed:
# the 'receptor_ligand' labels would otherwise be mistaken for ids.
for frame, source in ((df_receptors_shuffled, df_receptors), (df_ligands_shuffled, df_ligands)):
    if not frame.index.isin(source.index).all():
        raise RuntimeError('Shuffled frames are already re-indexed; re-run the shuffle cell first.')

# Reversing a table with an odd number of rows leaves the middle row paired
# with itself. That row is a true complex, so it must not be labelled invalid.
is_mismatched = np.array(
    [receptor != ligand for receptor, ligand in zip(receptors_idx, ligands_idx)],
    dtype=bool,
)
n_self_pairs = int((~is_mismatched).sum())
if n_self_pairs:
    print(f'Dropped {n_self_pairs} self-paired row(s) from the invalid set.')
    df_receptors_shuffled = df_receptors_shuffled.loc[is_mismatched]
    df_ligands_shuffled = df_ligands_shuffled.loc[is_mismatched]
    receptors_idx = list(df_receptors_shuffled.index)
    ligands_idx = list(df_ligands_shuffled.index)

new_indices = [f'{receptor}_{ligand}' for receptor, ligand in zip(receptors_idx, ligands_idx)]

# It is necessary to use the same indices, otherwise pd.concat will produce incorrect results.
# The labels are passed as an array because set_index reads a plain list of
# strings as column names. The frames are rebound rather than modified with
# inplace=True, so no object is mutated behind the reader's back.
pair_labels = np.array(new_indices, dtype=object)
df_receptors_shuffled = df_receptors_shuffled.set_index(pair_labels)
df_ligands_shuffled = df_ligands_shuffled.set_index(pair_labels)

# DO NOT USE df_receptors_shuffled and df_ligands_shuffled for retrieving data.
# USE df_receptors and df_ligands instead.

### 3. Concatenate them

In [16]:
# Join the re-indexed receptor and ligand rows column-wise on their shared pair labels.
df_invalid = pd.concat([df_receptors_shuffled, df_ligands_shuffled], join='inner', axis='columns')

### 4. Add a class column with 0's

In [17]:
# Every generated pair gets class 0.
# 'class' is a Python keyword, hence the dict unpacking.
df_invalid = df_invalid.assign(**{'class': 0})

### 5. Concatenate with the original dataset

In [18]:
# Stack the original rows (class 1) on top of the generated rows (class 0).
df_augmentation = pd.concat([df, df_invalid], axis='index')

# Store the new dataset

In [28]:
# Write the augmented table to disk, keeping the row labels in a column named 'id'.
output_csv = 'data/augmented_dataset.csv'
df_augmentation.to_csv(output_csv, index_label='id', index=True)

In [24]:
# Show the first generated pair; its two halves are looked up separately below.
df_invalid.iloc[:1]

Unnamed: 0,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,BHAR880101,BIGC670101,BIOV880101,...,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi,class
5ulp_5td7,-0.670686,-1.140303,-1.190598,-1.326421,-1.021373,-1.229295,0.990042,1.048619,-1.446034,-0.936449,...,-0.652265,-0.799467,-0.799467,-0.915775,1.219509,-0.430922,-0.310395,0.128943,-0.065743,0


In [25]:
# Receptor half of the first generated pair, looked up in the untouched receptor table.
# receptors_idx follows the row order of df_invalid, so taking the id from it
# (rather than hard-coding one) keeps this check in step with the shuffle.
df_receptors.loc[[receptors_idx[0]]]

Unnamed: 0_level_0,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,BHAR880101,BIGC670101,BIOV880101,...,KARS160115,KARS160116,KARS160117,KARS160118,KARS160119,KARS160120,KARS160121,KARS160122,pocket volume,pocket surface
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5ulp,-0.670686,-1.140303,-1.190598,-1.326421,-1.021373,-1.229295,0.990042,1.048619,-1.446034,-0.936449,...,-1.380413,-0.85848,-0.949009,-1.181963,-1.431808,0.382601,-1.504937,-0.777399,0.358147,0.47082


In [23]:
# Ligand half of the first generated pair, looked up in the untouched ligand table.
# ligands_idx follows the row order of df_invalid, so taking the id from it
# (rather than hard-coding one) keeps this check in step with the shuffle.
df_ligands.loc[[ligands_idx[0]]]

Unnamed: 0_level_0,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,NumHBD,NumHBA,NumHeavyAtoms,NumAtoms,NumHeteroatoms,...,chi0n,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5td7,-0.707971,-0.708537,-0.695038,0.285538,0.055987,-0.084862,-0.956048,-0.750997,-0.482998,-0.443678,...,-0.685809,-0.652265,-0.799467,-0.799467,-0.915775,1.219509,-0.430922,-0.310395,0.128943,-0.065743
