In [4]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor

from tqdm import tqdm

from scipy.stats import pearsonr

### Predict descriptors for individual compounds

We will use the precalculated Morgan fingerprints from the challenge data, to predict the descriptors based on the 2015 DREAM dataset (in ../data/olfaction/).

TODO:
* try other molecular features
* calculate Morgan fingerprints for the whole 2015 DREAM data

In [33]:
### index pubchem CID, columns fingerprint features
features = pd.read_csv('../data/olfaction_mixture/Morgan_Fingerprint.csv', index_col=0)
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
176,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
177,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
240,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
### index pubchem CID, columns smell descriptors
target = pd.read_csv('../data/olfaction/TrainSet.txt', sep='\t')
target = target[['Compound Identifier'] + list(target.columns[6:])]
target = target.groupby('Compound Identifier').mean()
target.head()

Unnamed: 0_level_0,INTENSITY/STRENGTH,VALENCE/PLEASANTNESS,BAKERY,SWEET,FRUIT,FISH,GARLIC,SPICES,COLD,SOUR,...,ACID,WARM,MUSKY,SWEATY,AMMONIA/URINOUS,DECAYED,WOOD,GRASS,FLOWER,CHEMICAL
Compound Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
126,37.102041,49.85,0.5375,21.0875,7.05,0.1625,2.0,4.2125,4.9875,5.375,...,4.3,3.1875,7.1375,1.2875,2.75,4.4625,0.825,1.5375,7.4875,14.3625
176,8.05102,48.0,1.785714,4.904762,1.047619,1.809524,5.809524,6.619048,3.595238,11.642857,...,3.071429,5.190476,6.97619,7.238095,3.785714,3.380952,2.809524,3.166667,4.714286,7.595238
177,22.387755,48.666667,8.257576,17.924242,2.651515,0.636364,1.045455,2.181818,6.242424,9.363636,...,3.439394,3.515152,5.181818,1.621212,2.333333,3.363636,0.651515,2.878788,3.469697,14.090909
196,14.530612,46.472727,1.254545,11.527273,0.763636,0.418182,3.909091,5.945455,4.254545,9.618182,...,6.290909,7.2,8.418182,6.109091,1.8,4.963636,1.0,2.327273,5.6,8.763636
239,24.683673,50.575758,1.19697,13.575758,3.863636,1.30303,3.984848,5.621212,4.363636,8.121212,...,2.848485,5.80303,6.318182,1.348485,0.818182,4.848485,4.257576,5.787879,8.909091,9.909091


In [35]:
### shared CIDs between datasets
shared = list(set(target.index) & set(features.index))
target_shared = target.loc[shared]
features_shared = features.loc[shared]

At first we do a train / test split to check how well we can predict smell descriptors.

In [36]:
tr = np.random.choice(target_shared.index, 50, False).tolist()
ts = list(set(target_shared.index) - set(tr))
len(shared), len(tr), len(ts)

(77, 50, 27)

In [37]:
model = RandomForestRegressor(n_estimators=300, n_jobs=-1)
model.fit(features_shared.loc[tr], target_shared.loc[tr])

In [39]:
pred = pd.DataFrame(model.predict(features_shared.loc[ts]),
                    index=ts,
                    columns=target_shared.columns)

In [40]:
correlations = pd.Series(index=pred.columns)
for desc in correlations.index:
    correlations[desc] = pearsonr(pred[desc], target.loc[ts, desc])[0]

In [41]:
correlations.sort_values()

CHEMICAL                0.003102
WOOD                    0.057385
COLD                    0.066522
SPICES                  0.175838
WARM                    0.180124
BURNT                   0.279016
GRASS                   0.300777
AMMONIA/URINOUS         0.392813
INTENSITY/STRENGTH      0.423072
BAKERY                  0.425838
FLOWER                  0.472725
ACID                    0.482700
MUSKY                   0.558550
SWEATY                  0.562500
VALENCE/PLEASANTNESS    0.628693
SOUR                    0.649630
DECAYED                 0.669838
SWEET                   0.799813
GARLIC                  0.806876
FISH                    0.829527
FRUIT                   0.834600
dtype: float64

Looks like we can predict nicely the desciptors (or at least some of them).
So let's train a full model.

In [51]:
np.random.seed(1989)
model = RandomForestRegressor(n_estimators=300, n_jobs=-1)
model.fit(features_shared, target_shared)

In [52]:
pred = pd.DataFrame(model.predict(features),
                    index=features.index,
                    columns=target.columns)
pred

Unnamed: 0,INTENSITY/STRENGTH,VALENCE/PLEASANTNESS,BAKERY,SWEET,FRUIT,FISH,GARLIC,SPICES,COLD,SOUR,...,ACID,WARM,MUSKY,SWEATY,AMMONIA/URINOUS,DECAYED,WOOD,GRASS,FLOWER,CHEMICAL
176,14.101327,45.363021,2.275435,6.291366,1.417037,1.642045,5.095697,5.998557,3.717427,11.618481,...,3.873489,5.061521,7.970453,6.730368,3.968219,3.965545,2.768606,3.134885,4.591248,9.295177
177,23.150731,45.863241,6.131270,14.092372,2.219854,1.114443,2.567279,3.934538,5.091863,10.187188,...,3.922476,4.142599,6.956252,3.068619,3.227003,4.202162,1.563189,3.013345,3.888959,12.765263
180,18.203554,44.681382,2.880157,7.525532,1.521309,1.357815,4.408951,5.551266,4.090857,10.767666,...,4.818062,4.624739,8.035859,5.410785,4.012127,3.632105,2.812047,3.375950,4.528856,11.342950
240,45.987126,59.717315,10.540068,31.188542,18.698201,0.659014,1.416160,5.865233,4.648748,3.945944,...,2.552953,7.261761,4.684898,0.850160,2.795197,1.641013,1.210893,2.896361,10.538175,15.375629
261,49.067279,30.574401,2.236672,4.617792,1.254323,2.140267,4.614367,6.750028,1.890743,16.574681,...,8.528236,6.153786,18.208150,9.116642,8.429901,9.352686,2.863024,1.710076,3.559845,15.197495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6259976,43.158673,52.463273,5.089786,18.270515,5.919370,0.784197,2.077145,6.529402,5.108830,5.496327,...,5.321679,5.739094,6.237150,2.059216,4.392370,2.343158,2.756298,3.675372,10.435668,15.727440
2307,39.985068,49.402842,3.375996,17.206187,7.561457,1.018812,2.472159,5.270678,5.421212,7.685839,...,5.133353,5.486879,7.059685,3.075463,4.605248,3.802840,2.513674,3.380844,8.348503,15.995055
612,19.149575,39.529585,2.512905,6.981641,2.188108,1.401922,4.012283,5.521726,2.928331,11.562768,...,5.267036,6.342120,10.390119,6.637820,4.791120,5.390400,2.939940,2.615952,3.984428,10.511469
81035,47.264133,47.829498,3.510346,14.611470,5.694095,0.903168,2.489215,6.140282,5.229755,7.061330,...,5.690103,5.982128,7.917024,3.365983,5.737896,4.374159,2.816827,3.039424,8.371991,16.308594


In [53]:
pred.to_csv('../results/features/mono-descriptors-v1.csv')

We will use these mono features to calulate mixture features.