In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import hamming_loss, accuracy_score
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

In [None]:
# Read the flavornet dataset and keep only the rows that have no missing value
df = pd.read_csv("../Datasets/flavornet_dataset.csv").dropna()

In [None]:
# Turn each comma-separated 'Odor' string into a list of normalized odor names

def _clean_odor_names(raw_names):
    """Return the odor names of one molecule, trimmed and lower-cased.

    Empty names (e.g. produced by a trailing or doubled comma in the raw
    string) are dropped so they never end up as a label of their own.
    """
    trimmed = (name.strip().lower() for name in raw_names)
    return [name for name in trimmed if name]

# Splitting on "," already removes every comma, so only surrounding whitespace
# and letter case are left to normalize. The result keeps the index of df.
odors_series = df['Odor'].str.split(",").apply(_clean_odor_names)

In [None]:
all_odors = [] # List of all the odors present in the dataset, in first-seen order
_seen_odors = set() # Same content as all_odors, kept for O(1) membership tests

def get_labels_counts(array):
    """Register every label of `array` that has not been seen yet.

    Despite its name this function does not count anything: it only appends
    new labels to the module-level `all_odors` list (preserving the order in
    which they are first encountered) and returns None.

    Parameters
    ----------
    array : iterable of hashable labels
        The labels (odor names) attached to one molecule.
    """
    for label in array:
        if label not in _seen_odors:
            _seen_odors.add(label)
            all_odors.append(label)

# A plain loop instead of Series.apply: the function is called only for its
# side effect, and apply() would display a Series of None as the cell output.
for molecule_odors in odors_series:
    get_labels_counts(molecule_odors)

In [None]:
# Manual one-hot encoding because labels are in lists
#
# All indicator columns are built first and attached with a single concat:
# inserting them one at a time (df[odor] = ...) fragments the DataFrame and
# makes pandas emit a PerformanceWarning once there are many labels.
one_hot_odors = pd.DataFrame(
    {
        # 1 if the molecule carries this odor, 0 otherwise
        odor: [int(odor in molecule_odors) for molecule_odors in odors_series]
        for odor in all_odors
    },
    index=odors_series.index,  # odors_series was derived from df['Odor'], so rows line up with df
)

# Drop indicator columns left over from a previous run of this cell so that
# re-running it replaces them instead of duplicating them.
df = pd.concat([df.drop(columns=all_odors, errors="ignore"), one_hot_odors], axis=1)

In [None]:
# Create training and test sets
# A fixed seed keeps the shuffled 75/25 split (and therefore every metric
# computed from it) identical across kernel restarts.
RANDOM_STATE = 42
train, test = train_test_split(df, test_size=0.25, shuffle=True, random_state=RANDOM_STATE)

# Columns that must not be used as model inputs: the one-hot odor labels
# (they are the targets) plus the 'Odorant', 'Odor' and 'SMILES representation'
# columns. Every remaining column of df is treated as a feature.
labels_to_drop = all_odors + ['Odorant', 'Odor', 'SMILES representation']

# Features (x) and multilabel targets (y) for the training rows
x_train = train.drop(labels=labels_to_drop, axis=1)
y_train = train.loc[:, all_odors]

# Same column selection for the held-out rows
x_test = test.drop(labels=labels_to_drop, axis=1)
y_test = test.loc[:, all_odors]

In [None]:
# Fitting a decision tree to the data #

# scikit-learn permutes the candidate features at every split, so without a
# seed two fits on the same data can yield different trees; fix it so the
# reported metrics are repeatable.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)

y_predict = model.predict(x_test)

# Hamming loss: fraction of individual (sample, label) entries predicted wrong.
# Accuracy score: with a multi-column label matrix such as y_test this is the
# subset accuracy, i.e. a sample only counts as correct when every one of its
# labels is predicted correctly, so it is expected to be far stricter.
print("Hamming loss = ",hamming_loss(y_test, y_predict))
print("Accuracy score = ",accuracy_score(y_test, y_predict))