In [1]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


In [2]:
# Load the dataset and discard every row that has a missing value.
df = pd.read_csv("../Datasets/flavornet_dataset.csv").dropna()


def _clean_odor_labels(raw_labels):
    """Normalise one molecule's list of raw odor tokens.

    Args:
        raw_labels: list of strings obtained by splitting an 'Odor' cell on commas.

    Returns:
        A list of labels with surrounding whitespace removed and lower-cased.
        Tokens that are empty after stripping (e.g. from a trailing or doubled
        comma) are left out so they cannot turn into a bogus '' odor class.
    """
    stripped = (token.strip().lower() for token in raw_labels)
    return [label for label in stripped if label]


# Split the comma-separated 'Odor' strings, then clean every list in one pass
# rather than assigning into the Series element by element.
odors_series = df['Odor'].str.split(",").apply(_clean_odor_labels)

In [3]:
all_odors = []  # Every distinct odor seen so far, in first-seen order


def get_labels_counts(array):
    """Record the labels of `array` that are not yet in `all_odors`.

    Despite its name, this function counts nothing: it only appends each
    previously unseen label to the module-level `all_odors` list.

    Args:
        array: iterable of odor labels belonging to one molecule.

    Returns:
        None. The result is the mutation of `all_odors`.
    """
    for label in array:
        if label in all_odors:
            continue  # Already recorded; keep the list free of duplicates
        all_odors.append(label)

# Visit every molecule's label list so that all_odors collects each distinct odor.
for molecule_odors in odors_series:
    get_labels_counts(molecule_odors)

# Encode the label lists as a binary indicator matrix (one column per class).
mlb = MultiLabelBinarizer()
encoding = mlb.fit_transform(list(odors_series))
classes = mlb.classes_

In [4]:

# The label matrix has to live in the same frame as the features, because the
# rows are shuffled before the train/test split and both must move together.
encoding_df = pd.DataFrame(data=encoding, columns=classes).reset_index(drop=True)

# Renumber df from 0 so both frames share the same index; concat along the
# column axis aligns rows by index label, and rows were dropped from df earlier.
df.reset_index(drop=True, inplace=True)

# Append the indicator columns to the right of the original columns.
final_df = pd.concat(objs=[df, encoding_df], axis="columns")

# final_df is now the data frame to apply algorithms on

In [5]:
# Columns that must not be fed to the model as features: every odor indicator
# column plus the identifying / raw-text columns of the dataset.
labels_to_drop = all_odors + ['Odorant', 'Odor', 'SMILES representation']

x = final_df.drop(columns=labels_to_drop)  # Features: whatever columns remain
y = final_df[all_odors]                    # Targets: one indicator column per odor

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.33,
)

In [14]:
# Apply neural network model #

# Layer widths at the two ends of the network follow the data.
n_features = x_train.shape[1]
n_labels = y_train.shape[1]

# Three ReLU hidden layers followed by one sigmoid unit per odor label, so each
# label gets its own independent probability (multi-label setup).
model = Sequential([
    Dense(100, activation='relu', input_dim=n_features),
    Dense(75, activation='relu'),
    Dense(75, activation='relu'),
    Dense(n_labels, activation='sigmoid'),
])
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(x_train, y_train, batch_size=400, epochs=25)

# Turn the predicted probabilities into hard 0/1 decisions at a 0.5 cut-off.
preds = model.predict(x_test)
preds[preds < 0.5] = 0
preds[preds >= 0.5] = 1

# Micro-averaged scores pool the decisions of all labels before computing the ratio.
test_hamming = hamming_loss(y_test, preds)
test_precision = precision_score(y_test, preds, average='micro')
test_recall = recall_score(y_test, preds, average='micro')

print("Hamming loss = ", test_hamming)
print("Precision score = ", test_precision)
print("Recall score = ", test_recall)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Hamming loss =  0.3672965482489292
Precision score =  0.009316127251397419
Recall score =  0.37815126050420167
