In [293]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression


# Data Preparation

In [294]:
# Read the descriptor table from CSV and report its (rows, columns) shape.
descriptors = pd.read_csv("descriptors.csv")
print('Dimensions of data frame is', descriptors.shape)

Dimensions of data frame is (1050, 247)


In [295]:
# Count every missing cell in the table (all rows, all columns).
nNullVals = descriptors.isnull().values.sum()
print('There are', nNullVals, 'null values in the dataframe')
# Discard the columns that hold no data at all (every entry missing).
descriptors = descriptors.dropna(axis=1, how='all')
print('Dimensions of data frame are', descriptors.shape)

There are 31503 null values in the dataframe
Dimensions of data frame are (1050, 217)


In [296]:
# Individual cells can still be NaN after the all-empty columns were dropped;
# count them over the whole table.
nans = np.isnan(descriptors).values.sum()
print('There are', nans, 'nans in the dataframe')

There are 3 nans in the dataframe


In [298]:
# Method for imputing nans with mean of the column,
# so we don't have to remove the whole row if a couple of values are missing
def imputeNansWithMean (X):
    """Return a copy of ``X`` with every NaN replaced by its column's mean.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Same column labels as ``X`` on a fresh default integer index.
        A DataFrame is returned even when there is nothing to impute (a
        "No nans" message is printed in that case), so callers can always
        use DataFrame methods on the result. A column that is entirely NaN
        has no mean and is left as NaN.
    """
    header = list(X)
    # Work on a private copy: ``X.values`` can share memory with the frame,
    # and the caller's data must not change behind its back.
    values = np.array(X.values, copy=True)

    nan_mask = np.isnan(values)
    if not nan_mask.any():
        print("No nans")
        return pd.DataFrame(data=values, columns=header)

    # One NaN-ignoring mean per column, computed before anything is filled in,
    # then written into all missing cells in a single vectorized assignment.
    rows, cols = np.nonzero(nan_mask)
    col_means = np.nanmean(values, axis=0)
    values[rows, cols] = col_means[cols]
    return pd.DataFrame(data=values, columns=header)
        

In [299]:
# Fill the remaining NaNs with column means; the result replaces `descriptors`.
descriptors = imputeNansWithMean(descriptors)

In [315]:
# Re-count the NaNs in the table now that the imputation step has run.
nans = np.isnan(descriptors).values.sum()
print('There are', nans, 'nans in the dataframe')

There are 0 nans in the dataframe


In [301]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV —
# confirm), then preview the first rows.
descriptors = descriptors.drop('Unnamed: 0', axis='columns')
descriptors.head()

Unnamed: 0,molSIDs,XLogP,MW,LipinskiFailures,nRotB,MLogP,nAtomLAC,...,ATSc3,ATSc4,ATSc5,nHBDon,nHBAcc,bpol,apol
0,124897530.0,1.135,369.168856,0.0,10.0,2.89,3.0,...,-0.098779,0.094417,-0.105979,4.0,6.0,28.677761,57.044239
1,124897303.0,4.498,249.095378,0.0,2.0,3.11,2.0,...,-0.015299,0.003398,0.002364,0.0,1.0,15.641484,39.578516
2,124753561.0,4.554,657.230885,0.0,11.0,4.21,0.0,...,0.077083,0.204425,-0.347424,0.0,8.0,57.764452,98.933548
3,124753354.0,8.321,501.146461,1.0,9.0,2.89,4.0,...,-0.243732,0.222481,-0.064752,2.0,4.0,40.743589,70.138411
4,121286038.0,4.515,637.224657,0.0,10.0,4.32,2.0,...,0.081751,0.016048,0.040308,2.0,6.0,53.306245,97.709755


In [302]:
# Load the assay results and keep only the substance ID and activity score.
# Rows are taken from index label 5 onward; the earlier rows are presumably
# non-data metadata rows of the export — TODO confirm against data.csv.
scoreData = pd.read_csv("data.csv")
# .copy() makes this an independent frame, so the dtype conversions applied to
# it further down do not operate on a slice of scoreData (which pandas may
# flag with SettingWithCopyWarning).
sid_activityScore = scoreData.loc[5:, ['PUBCHEM_SID', 'PUBCHEM_ACTIVITY_SCORE']].copy()

  interactivity=interactivity, compiler=compiler, result=result)


In [303]:
# Because some columns have mixed values, the pandas dataframe will parse values as floating points.
# Preview the first rows: both columns show up as floats (e.g. 842121.0) at this point.
sid_activityScore.head()

Unnamed: 0,PUBCHEM_SID,PUBCHEM_ACTIVITY_SCORE
5,842121.0,0.0
6,842122.0,0.0
7,842123.0,0.0
8,842124.0,0.0
9,842125.0,0.0


In [304]:
# Both the substance ID and the activity score are stored as 64-bit integers
# from here on, replacing the float representation.
for column in ('PUBCHEM_SID', 'PUBCHEM_ACTIVITY_SCORE'):
    sid_activityScore[column] = sid_activityScore[column].astype(np.int64)

In [305]:
# Preview the first rows again to check the integer conversion.
sid_activityScore.head()

Unnamed: 0,PUBCHEM_SID,PUBCHEM_ACTIVITY_SCORE
5,842121,0
6,842122,0
7,842123,0
8,842124,0
9,842125,0


In [306]:
# Add an 'ActivityScore' column as the first column, filled with the integer 0
# as a placeholder. Note: insert() modifies `descriptors` in place.
descriptors.insert(loc=0, column='ActivityScore', value=0)

In [307]:
# Show the last rows; ActivityScore is now the first column.
descriptors.tail()

Unnamed: 0,ActivityScore,molSIDs,XLogP,MW,LipinskiFailures,nRotB,MLogP,...,ATSc3,ATSc4,ATSc5,nHBDon,nHBAcc,bpol,apol
1045,0,843609.0,3.454,321.114713,0.0,8.0,2.34,...,0.116596,0.019901,-0.292837,0.0,4.0,31.140933,47.675067
1046,0,843396.0,2.894,334.121195,0.0,3.0,2.01,...,0.466517,-0.486335,0.420218,0.0,6.0,34.433726,47.746274
1047,0,843202.0,1.316,423.179421,0.0,6.0,3.11,...,-0.15136,0.434316,-0.400282,1.0,7.0,38.656175,64.459825
1048,0,842577.0,2.752,271.024932,0.0,4.0,1.79,...,-0.03079,0.078024,-0.033099,1.0,3.0,19.27107,34.72693
1049,0,842218.0,3.82,368.140389,0.0,4.0,3.0,...,-0.115749,0.07217,0.032923,0.0,4.0,28.295347,56.584653


In [308]:
# Look up the activity score for each molSID of the descriptor table in the
# sid_activityScore dataframe and store it in the ActivityScore column.
#
# The SID -> score lookup is built once instead of filtering the whole score
# table for every descriptor row. keep='first' means a SID listed more than
# once resolves to its first row.
score_by_sid = (
    sid_activityScore
    .drop_duplicates(subset='PUBCHEM_SID', keep='first')
    .set_index('PUBCHEM_SID')['PUBCHEM_ACTIVITY_SCORE']
)
matched_scores = descriptors['molSIDs'].map(score_by_sid)

# Fail loudly, naming the offending IDs, if any molSID has no score.
unmatched = descriptors.loc[matched_scores.isnull(), 'molSIDs']
if len(unmatched) > 0:
    raise KeyError(
        'No activity score found for molSIDs (first 10 shown): %s'
        % unmatched.unique()[:10].tolist()
    )

# The mapped Series carries the row labels of `descriptors`, so the assignment
# is aligned by label and does not depend on the index being 0..n-1.
descriptors['ActivityScore'] = matched_scores.astype(np.int64)


In [309]:
# Preview the first rows with the looked-up activity scores filled in.
descriptors.head()

Unnamed: 0,ActivityScore,molSIDs,XLogP,MW,LipinskiFailures,nRotB,MLogP,...,ATSc3,ATSc4,ATSc5,nHBDon,nHBAcc,bpol,apol
0,0,124897530.0,1.135,369.168856,0.0,10.0,2.89,...,-0.098779,0.094417,-0.105979,4.0,6.0,28.677761,57.044239
1,0,124897303.0,4.498,249.095378,0.0,2.0,3.11,...,-0.015299,0.003398,0.002364,0.0,1.0,15.641484,39.578516
2,0,124753561.0,4.554,657.230885,0.0,11.0,4.21,...,0.077083,0.204425,-0.347424,0.0,8.0,57.764452,98.933548
3,10,124753354.0,8.321,501.146461,1.0,9.0,2.89,...,-0.243732,0.222481,-0.064752,2.0,4.0,40.743589,70.138411
4,0,121286038.0,4.515,637.224657,0.0,10.0,4.32,...,0.081751,0.016048,0.040308,2.0,6.0,53.306245,97.709755


In [310]:
# Split the table into model inputs and target as numpy arrays:
#   X - every column from position 2 onward (skips ActivityScore and molSIDs)
#   Y - the first column, ActivityScore
X = descriptors.iloc[:, 2:].values
Y = descriptors.iloc[:, 0].values

In [311]:
# Create test and training sets with 20:80 distribution.
# The label array is bound to the name `Y` where it is created; the lowercase
# `y` is never defined in this notebook, so passing it only worked with
# leftover kernel state. random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=2423)
print('Training data dimensions',X_train.shape, 'and trainng labels dimensions', y_train.shape)
print('Test data dimensions', X_test.shape,'and trainng labels dimensions', y_test.shape)

Training data dimensions (840, 215) and trainng labels dimensions (840,)
Test data dimensions (210, 215) and trainng labels dimensions (210,)


In [312]:
# Two-component PLS regression, fitted on the training split only so the
# held-out X_test / y_test rows stay unseen and can be used for evaluation.
pls2 = PLSRegression(n_components=2)
pls2.fit(X_train, y_train)

PLSRegression(copy=True, max_iter=500, n_components=2, scale=True,tol=1e-06)

PLSRegression(copy=True, max_iter=500, n_components=2, scale=True, tol=1e-06)