In [1]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


In [2]:
# Read in the AlvaDesc property dataset
df_properties = pd.read_csv("flavornet_molecules_properties.txt", sep="\t")
df_properties.drop(['No.'], axis=1, inplace=True) # Drop numbered column

# Read in the RDKit property dataset
df = pd.read_csv("../Datasets/flavornet_dataset.csv")
df.dropna(inplace=True) # Drop N/A values
df.reset_index(drop=True, inplace=True)

# Add odor column to AlvaDesc properties dataframe
df_properties['Odor'] = df['Odor']
"""
Code to create .smiles file which AlvaDesc uses to generate molecular properties

df = df[['Odorant', 'SMILES representation']]
df.to_csv("molecules.smiles","\t",index=False, header=False, columns=['SMILES representation','Odorant'])
"""

df_properties.head(15)

Unnamed: 0,NAME,MW,AMW,Sv,Se,Sp,Si,Mv,Me,Mp,...,N%,O%,X%,nCsp3,nCsp2,nCsp,max_conj_path,nStructures,totalcharge,Odor
0,trimethylamine,59.13,4.548462,6.1286,12.6362,7.0513,15.1591,0.471431,0.972015,0.542408,...,7.692308,0.0,0,3,0,0,0,1,0,fish
1,ethanal,44.06,6.294286,3.7684,7.0945,3.9773,8.0398,0.538343,1.0135,0.568186,...,0.0,14.285714,0,1,1,0,0,1,0,"pungent, ether"
2,methanethiol,48.12,8.02,3.2407,5.8436,4.1705,6.7504,0.540117,0.973933,0.695083,...,0.0,0.0,0,1,0,0,0,1,0,"sulfur, gasoline, garlic"
3,propanal,58.09,5.809,5.2952,9.9781,5.7387,11.455,0.52952,0.99781,0.57387,...,0.0,10.0,0,2,1,0,0,1,0,"solvent, pungent"
4,pentane,72.17,4.245294,8.1608,16.3016,9.5684,19.4912,0.480047,0.958918,0.562847,...,0.0,0.0,0,5,0,0,0,1,0,alkane
5,propanol,60.11,5.009167,5.822,11.8617,6.5001,13.8702,0.485167,0.988475,0.541675,...,0.0,8.333333,0,3,0,0,0,1,0,"alcohol, pungent"
6,dimethylsulfide,62.15,6.905556,4.7675,8.7272,5.9319,10.1656,0.529722,0.969689,0.6591,...,0.0,0.0,0,2,0,0,0,1,0,"cabbage, sulfur, gasoline"
7,ethylformate,74.09,6.735455,6.01,11.3054,6.1932,12.6644,0.546364,1.027764,0.563018,...,0.0,18.181818,0,2,1,0,0,1,0,pungent
8,methylethylketone,72.12,5.547692,6.822,12.8617,7.5001,14.8702,0.524769,0.989362,0.576931,...,0.0,7.692308,0,3,1,0,0,1,0,ether
9,methylethylsulfide,76.18,6.348333,6.2943,11.6108,7.6933,13.5808,0.524525,0.967567,0.641108,...,0.0,0.0,0,3,0,0,0,1,0,"sulfur, garlic"


In [3]:
odors_series = df_properties['Odor']
odors_series = odors_series.str.split(",")

for index in odors_series.index:
    # Clean up the odor names for each molecule
    odors_series[index] = [odor.strip().strip(',').lower() for odor in odors_series[index]]

In [4]:
all_odors = [] # List of all the odors present in the dataset
def get_labels_counts(array):
    for label in array:
        if label not in all_odors:
            all_odors.append(label)

odors_series.apply(get_labels_counts)

mlb = MultiLabelBinarizer() # Encoding the labels as boolean arrays
encoding = mlb.fit_transform(odors_series.tolist())
classes = mlb.classes_

In [5]:
# Need to add it to the data frame because we shuffle before splitting
encoding_df = pd.DataFrame(encoding, columns=classes)
encoding_df.reset_index(drop=True, inplace=True)
final_df = pd.concat([df_properties, encoding_df], axis=1) # Adding the matrix as columns to the data frame

# final_df is now the data frame to apply algorithms on

In [6]:
labels_to_drop = all_odors.copy()
labels_to_drop += ['NAME', 'Odor'] # List of labels to drop for x_train and x_test

#odors_to_keep = [odor for odor in odor_counts if odor_counts[odor] > 1] # Only keep odors that appear more than once

x = final_df.drop(labels=labels_to_drop, axis=1)
y = final_df.loc[:, all_odors]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [11]:
# Apply neural network model #

model = Sequential()
model.add(Dense(75, activation='relu', input_dim=x_train.shape[1]))
model.add(Dense(100, activation='relu'))
model.add(Dense(80, activation='relu'))
model.add(Dense(y_train.shape[1], activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

model.fit(x_train, y_train,epochs=25, batch_size=500)

preds = model.predict(x_test) # Predictions
preds[preds>=0.5] = 1
preds[preds<0.5] = 0

print("Hamming loss = ", hamming_loss(y_test, preds))
print("Precision score = ", precision_score(y_test, preds, average='micro'))
print("Recall score = ", recall_score(y_test, preds, average='micro'))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Hamming loss =  0.34701436130007557
Precision score =  0.008643422209200117
Recall score =  0.33053221288515405
