Making the

In [1]:
!pip install pyrfume -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m41.0/62.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m948.8 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.0/99.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m8

In [2]:
import re
import pyrfume
import numpy as np
import pandas as pd
import seaborn as sns
import rdkit.Chem as Chem
import matplotlib.pyplot as plt
from rdkit.Chem import rdMolDescriptors
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
def leffingwell_reverse_one_hot(row):
    """
    Takes a row of the Leffingwell dataset
    and reverses one-hot-encoding.

    :param row: A given row of the Leffingwell dataset.
    :type row: pandas Dataframe row
    :return: A list of classes/labels for each row.
    :rtype: List
    """
    labels = [col for col in leffingwell.columns if row[col] == 1]
    return ';'.join(labels)

def dravnieks_top_n_columns(row, n):
    """
    This is for the Dravnieks dataset
    since its format is unique. It
    selects the labels with the highest
    values and returns them in the standard
    format used for the other datasets.

    :param row: A given row of the Dravnieks dataset.
    :type row: pandas Dataframe row
    :param n: The number of labels to consider.
    :type n: int
    :return: top n labels.
    :rtype: string
    """
    sorted_columns = row.sort_values(ascending=False).index
    top_n = sorted_columns[:n]
    return ';'.join(top_n)

def get_unique(df):
    """
    This function takes in an odor dataset as a
    dataframe and returns a dataframe containing
    all the unique labels of the input dataframe.

    :requirements: labels should be called 'Descriptors'.

    :param df: A multilabel dataframe with labels separated by ';'
    :type df: pandas Dataframe
    :return: A dataframe containing the unique descriptors.
    :rtype: pandas Dataframe
    """
    all_descriptors = []

    for des in df['Descriptors']:
        all_descriptors.extend(des.split(';'))

    unique_descriptors = list(set(all_descriptors))
    unique_descriptors.sort()

    df = pd.DataFrame(unique_descriptors)
    return df

def get_dataset(name):
    """
    This function takes in a string which is the
    name of the dataset and returns the fetched
    dataset.

    :param name: Name of the dataset according to Pyrfume
    :type name: string
    :return: The dataset called
    :rtype: Dataframe
    """
    # Load molecular and stimulus data
    mols =  pyrfume.load_data(f'{name}/molecules.csv')["IsomericSMILES"]
    stim =  pyrfume.load_data(f'{name}/stimuli.csv')

    # Deal with exceptions for behavior data
    try:
      behav =  pyrfume.load_data(f'{name}/behavior.csv')
    except:
      try:
        behav =  pyrfume.load_data(f'{name}/behavior_1_sparse.csv')
      except:
        behav =  pyrfume.load_data(f'{name}/behavior_1.csv')

    if name == 'ifra_2019':
      behav['Descriptor 1'] = behav[['Descriptor 1', 'Descriptor 2', 'Descriptor 3']].astype(str).apply(';'.join, axis=1)

      behav = behav['Descriptor 1']

    labels = pd.merge(stim, behav, on='Stimulus')

    # Deal with exceptions during Merging
    try:
      df = pd.merge(mols, labels, on='CID')
    except:
      labels.rename(columns={'new_CID': 'CID'}, inplace=True)
      df = pd.merge(mols, labels, on='CID')

    return df

def check_and_replace(description):
    """
    Iterates through a given ";" separated strings
    and replaces them with the mapping assigned by
    any list labelled "mapping".

    :param description: Text separated by ';'
    :type name: string
    :return: Text replaced according to the mapping
    :rtype: string
    """
    descriptors = description.split(';')
    new_descriptors = []

    for descriptor in descriptors:
        for row in replace:
            if descriptor == row[0]:
                new_descriptors.append(row[1])

    return ';'.join(new_descriptors)

def make_unique(labels):
   """
   Takes a text separated by ";" and makes them
   unique.

   :param description: Text separated by ';'.
   :type name: string
   :return: words within text made unique.
   :rtype: string
    """
   return ';'.join(list(set(labels.split(';'))))

def count_words(label_str):
    """
    Takes a ";" separated strings and counts them

   :param description: Text separated by ';'.
   :type name: string
   :return: number of words separated by ";"
   :rtype: int
    """
    return len(label_str.split(';'))

In [4]:
# Arctander
arctander = get_dataset('arctander_1960')
arctander.drop(['ChemicalName', 'CAS'], axis=1, inplace=True)
arctander.rename(columns={'Labels': 'Descriptors'}, inplace=True)
arctander = arctander.dropna()

# AromaDB
aromadb = get_dataset('aromadb')
aromadb.drop(['Raw Descriptors', 'Modifiers'], axis=1, inplace=True)
aromadb.rename(columns={'Filtered Descriptors': 'Descriptors'}, inplace=True)

# Dravnieks
dravnieks = get_dataset('dravnieks_1985')
dravnieks = dravnieks.drop(["Name", "Conc", "CAS"], axis=1)
dravnieks['Descriptors'] = dravnieks.drop(["CID", "IsomericSMILES"], axis=1).apply(lambda row: dravnieks_top_n_columns(row, 3), axis=1)
dravnieks = dravnieks[['CID', 'IsomericSMILES', 'Descriptors']]

# FlavorDB
flavordb = get_dataset('flavordb')
flavordb = flavordb[['CID', 'Odor Percepts', 'IsomericSMILES']]
flavordb.rename(columns={'Odor Percepts': 'Descriptors'}, inplace=True)
flavordb.rename(columns={'IsomericSMILES': 'IsomericSMILES'}, inplace=True)
flavordb = flavordb.dropna()

# Flavornet
flavornet = get_dataset('flavornet')
flavornet.rename(columns={'Descriptors':'Descriptors'}, inplace=True)
flavornet.rename(columns={'IsomericSMILES': 'IsomericSMILES'}, inplace=True)

# Goodscents
goodscents = get_dataset('goodscents')
goodscents.drop(['TGSC ID', 'Concentration %','Solvent'], axis=1, inplace=True)
goodscents = goodscents.dropna()

# IFRA
ifra = get_dataset('ifra_2019')
ifra.rename(columns={'Descriptor 1': 'Descriptors'}, inplace=True)

# Leffingwell
leffingwell = get_dataset('leffingwell')

#This filtering was done as CIDs below 0 were not well documented.
leffingwell = leffingwell[leffingwell['CID']>0]

# Apply the function to create the 'labels' column
leffingwell['Descriptors'] = leffingwell.apply(leffingwell_reverse_one_hot, axis=1)

leffingwell.rename(columns={'IsomericSMILES_x':'IsomericSMILES'}, inplace=True)
leffingwell = leffingwell[['CID', 'IsomericSMILES', 'Descriptors']]
leffingwell = leffingwell.dropna()


Merging the Datasets.

In [5]:
# Concatentating all datasets
all_compounds = pd.concat([arctander, aromadb, dravnieks, flavordb, flavornet, goodscents, ifra, leffingwell], axis=0, ignore_index=True)

# Here all rows are merged on CID and labels are joined together.
all_compounds = all_compounds.astype({'CID': 'int64'}) #Setting the CID column to dtype int64 because it works better with other code.

all_compounds.reset_index(drop=True, inplace=True)
agg_functions = {'CID': 'first', 'IsomericSMILES': 'first', 'Descriptors': lambda x: ';'.join(x)} #Takes the first CID and SMILES and joins all the descriptors together.
all_compounds = all_compounds.groupby('CID').aggregate(agg_functions)
print(all_compounds.shape)

(7541, 3)


Removing SMILES with dots in them as they are mixtures.

In [6]:
#Get SMILES with '.' in them
all_compounds['HasDot'] = all_compounds['IsomericSMILES'].apply(lambda x: '.' in x)

#Drop all rows with '.' in SMILES
all_compounds = all_compounds[all_compounds['HasDot'] == False]
all_compounds.drop(['HasDot'], axis=1, inplace=True)
print(all_compounds.shape)

(7267, 3)


Removing Capitalization and Duplicates in the labels.

In [7]:
all_compounds['Descriptors'] = all_compounds['Descriptors'].str.lower()
all_compounds['Descriptors'] = all_compounds['Descriptors'].str.split(';').apply(set).apply(list).apply(lambda x: ';'.join(x))

Canonicalization step

In [8]:
# labels_df is changed to a list called "replace" to make it go
# through the check_and_replace function.

replace = pd.read_excel('labels_canonicalization.xlsx')
replace = replace.fillna('')
replace = replace[::-1]
replace = replace.values.tolist()

#Normalizing the labels in the combined dataset and removing hedonic descriptors
all_compounds['Descriptors'] = all_compounds['Descriptors'].apply(check_and_replace)
all_compounds['Descriptors'] = all_compounds['Descriptors'].apply(make_unique)
all_compounds['Descriptors'] = all_compounds['Descriptors'].dropna()
all_compounds = all_compounds[all_compounds['Descriptors'] != '']
print(all_compounds.shape)

(7258, 3)


Remove odor descriptors that occur less than 30 times

In [9]:
#Checks descriptor frequency
labels_to_remove = all_compounds['Descriptors'].str.split(';', expand=True).stack().value_counts()

#Remove labels that occur less than 30 times
labels_to_remove = labels_to_remove[labels_to_remove < 30]
all_compounds['Descriptors'] = all_compounds['Descriptors'].apply(lambda x: ';'.join([item for item in x.split(';') if item not in labels_to_remove.index]))
all_compounds = all_compounds[all_compounds['Descriptors'] != '']
print(all_compounds.shape)

(7221, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_compounds['Descriptors'] = all_compounds['Descriptors'].apply(lambda x: ';'.join([item for item in x.split(';') if item not in labels_to_remove.index]))


Remove compounds that are heavier than 296 g/mol as it is the heaviest odorant.

In [10]:
#Get molecular weights of compounds.
all_compounds['MolecularWeight'] = all_compounds['IsomericSMILES'].apply(lambda x: rdMolDescriptors.CalcExactMolWt(Chem.MolFromSmiles(x)))

#Remove compounds with molecular weights above 296.
all_compounds = all_compounds[all_compounds['MolecularWeight'] <= 296]
all_compounds = all_compounds.drop(['MolecularWeight'], axis=1)
print(all_compounds.shape)

(6969, 3)


In [11]:
#Remove compounds containing other elements besides C, N, O, S, P.
all_compounds['HasOtherElements'] = all_compounds['IsomericSMILES'].apply(lambda x: Chem.MolFromSmiles(x).HasSubstructMatch(Chem.MolFromSmarts('[!#6;!#7;!#8;!#16;!#15]')))

all_compounds = all_compounds[all_compounds['HasOtherElements'] == False]
all_compounds.drop(['HasOtherElements'], axis=1, inplace=True)
print(all_compounds.shape)

(6933, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_compounds.drop(['HasOtherElements'], axis=1, inplace=True)


One hot encode the labels

In [12]:
combined_dataset = all_compounds.copy()

#Turn descriptors column into lists
combined_dataset['Descriptors'] = combined_dataset['Descriptors'].apply(lambda x: x.split(';'))

mlb = MultiLabelBinarizer(sparse_output=True)
mlb.fit(combined_dataset['Descriptors'])

data_bin = combined_dataset.join(pd.DataFrame.sparse.from_spmatrix(mlb.transform(combined_dataset['Descriptors']), index=combined_dataset.index, columns=mlb.classes_))
data_bin.drop([''], axis=1, inplace=True)
# uncomment to download
data_bin.to_csv('alldesc_dataset.csv', index=False)

## Getting the dataset with a Computationally derived Ontology (CO)

In [13]:
labels_df = pd.read_excel('computer_derived_ontology_11.xlsx')

# Labels that did not cluster with any other labels were labelled as 'NaN'
labels_to_remove = labels_df[labels_df['Umbrella Terms'].isna()]['Original Descriptors']
labels_df = labels_df.dropna()
replace = labels_df.values.tolist()

In [14]:
# Removing NaN values
umbrella_dataset = all_compounds.copy()
umbrella_dataset['Descriptors'] = all_compounds['Descriptors'].apply(lambda x: ';'.join([item for item in x.split(';') if item not in labels_to_remove.index]))
umbrella_dataset = umbrella_dataset[umbrella_dataset['Descriptors'] != '']
print(umbrella_dataset.shape)

(6933, 3)


In [15]:
# Replacing the odor descriptors with the umbrella terms
umbrella_dataset['Descriptors'] = umbrella_dataset['Descriptors'].apply(check_and_replace)
umbrella_dataset['Descriptors'] = umbrella_dataset['Descriptors'].apply(make_unique)
umbrella_dataset['Descriptors'] = umbrella_dataset['Descriptors'].dropna()
umbrella_dataset = umbrella_dataset[umbrella_dataset['Descriptors'] != '']
print(umbrella_dataset.shape)

(6933, 3)


In [16]:
#Turn descriptors column into lists
umbrella_dataset['Descriptors'] = umbrella_dataset['Descriptors'].apply(lambda x: x.split(';'))

mlb = MultiLabelBinarizer(sparse_output=True)
mlb.fit(umbrella_dataset['Descriptors'])

data_bin = umbrella_dataset.join(pd.DataFrame.sparse.from_spmatrix(mlb.transform(umbrella_dataset['Descriptors']), index=umbrella_dataset.index, columns=mlb.classes_))
data_bin.to_csv('computer_dataset_11.csv', index=False)