In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, hamming_loss, recall_score

In [2]:
# Load the Flavornet dataset and keep only the rows that have no missing values
df = pd.read_csv("../Datasets/flavornet_dataset.csv").dropna()

In [3]:
# Clean up the odor names: split each comma-separated 'Odor' string into a list
# of lower-cased, whitespace-trimmed names.

def _clean_odor_names(raw_odors):
    """Turn one comma-separated odor string into a list of normalised names.

    Each name is stripped of surrounding whitespace and lower-cased. Empty
    names (produced by a trailing or doubled comma) are discarded so they do
    not end up as a spurious '' label later on.
    """
    names = (name.strip().lower() for name in raw_odors.split(","))
    return [name for name in names if name]

# One list of odor names per molecule, indexed like df
odors_series = df['Odor'].apply(_clean_odor_names)

In [4]:
all_odors = [] # Every distinct odor in the dataset, in order of first appearance

def get_labels_counts(array):
    """Append to the module-level all_odors every label of array not already in it."""
    for label in array:
        if label in all_odors:
            continue
        all_odors.append(label)

# Walk over the per-molecule odor lists to collect the distinct odors
for molecule_odors in odors_series:
    get_labels_counts(molecule_odors)

# Encode each molecule's odor list as a binary indicator row (one column per odor)
mlb = MultiLabelBinarizer()
encoding = mlb.fit_transform(odors_series.tolist())
classes = mlb.classes_

In [5]:
# The label matrix has to live in the data frame because the rows are shuffled before splitting
df.reset_index(drop=True, inplace=True) # positional index so both frames line up row-for-row
encoding_df = pd.DataFrame(data=encoding, columns=classes).reset_index(drop=True)
final_df = pd.concat([df, encoding_df], axis="columns") # indicator matrix appended as extra columns

# final_df is now the data frame to apply algorithms on

In [6]:
# Implementing k-fold cross validation #

# Columns that must not be used as features: the odor indicator columns
# (the targets) plus the three columns named below.
labels_to_drop = all_odors.copy()
labels_to_drop += ['Odorant', 'Odor', 'SMILES representation'] # List of labels to drop for x_train and x_test

k = 5 # number of splits
RANDOM_SEED = 42 # fixed seed so the folds, the forest and the printed metrics are reproducible

# Running sums of each metric over the folds; divided by k when printed
avg_loss_dict = {"hamming": 0, "precision": 0, "recall": 0}

kf = KFold(n_splits=k, shuffle=True, random_state=RANDOM_SEED)
for train_index, test_index in kf.split(final_df):

    train, test = final_df.iloc[train_index], final_df.iloc[test_index]

    # Features are every remaining column; targets are the odor indicator columns
    # NOTE(review): assumes all remaining columns are numeric features - confirm against the CSV
    x_train = train.drop(labels=labels_to_drop, axis=1)
    y_train = train.loc[:, all_odors]

    x_test = test.drop(labels=labels_to_drop, axis=1)
    y_test = test.loc[:, all_odors]

    # Fitting a random forest to the data #

    model = RandomForestClassifier(class_weight='balanced', random_state=RANDOM_SEED)
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)

    # Accumulate this fold's scores (precision and recall are micro-averaged over all labels)
    avg_loss_dict["hamming"] += hamming_loss(y_test, y_predict)
    avg_loss_dict["precision"] += precision_score(y_test, y_predict, average='micro')
    avg_loss_dict["recall"] += recall_score(y_test, y_predict, average='micro')

print("Average loss values: \n")
for name in avg_loss_dict:
    print(name, " = ", avg_loss_dict[name]/k)


Average loss values: 

hamming  =  0.011216058310171546
precision  =  0.2220895147971876
recall  =  0.09720919281602068
