In [1]:
!pip install annoy
!pip install faiss
!sudo apt-get install libomp-dev
!pip install rank_bm25
!python -m spacy download en_core_web_sm

Collecting annoy
  Downloading annoy-1.17.0.tar.gz (646 kB)
[?25l[K     |▌                               | 10 kB 26.5 MB/s eta 0:00:01[K     |█                               | 20 kB 33.2 MB/s eta 0:00:01[K     |█▌                              | 30 kB 39.8 MB/s eta 0:00:01[K     |██                              | 40 kB 30.5 MB/s eta 0:00:01[K     |██▌                             | 51 kB 22.7 MB/s eta 0:00:01[K     |███                             | 61 kB 25.2 MB/s eta 0:00:01[K     |███▌                            | 71 kB 24.6 MB/s eta 0:00:01[K     |████                            | 81 kB 25.2 MB/s eta 0:00:01[K     |████▋                           | 92 kB 27.0 MB/s eta 0:00:01[K     |█████                           | 102 kB 27.6 MB/s eta 0:00:01[K     |█████▋                          | 112 kB 27.6 MB/s eta 0:00:01[K     |██████                          | 122 kB 27.6 MB/s eta 0:00:01[K     |██████▋                         | 133 kB 27.6 MB/s eta 0:00:01[K   

### Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import time
import faiss
import annoy
from rank_bm25 import BM25Okapi
import tqdm
import spacy
from rank_bm25 import BM25Okapi
from tqdm import tqdm
import en_core_web_sm
nlp = en_core_web_sm.load()
import warnings
warnings.filterwarnings('ignore')

### Data analysis

###### Initial dataset 1

In [3]:
import pandas as pd
df1 = pd.read_excel('Aroma_Chemical.xlsx')
df1.head(3)

Unnamed: 0,CAS-Number,Aroma_Chemical,Odor_Description,Family,Odor_Strength,Perfumary_Note
0,112-31-2,Aldehyde C10 – Decanal,"Sweet, aldehydic, orange, waxy and citrus rind",Aldehydic,High,Base
1,112-45-8,Aldehyde C11,"Citrus, waxy, fatty and aldehydic with a green...",Aldehydic,High,Base
2,112-54-9,Aldehyde C12 Lauric,"Soapy, waxy, aldehydic, citrus, orange rindy w...",Aldehydic,High,Base


In [4]:
df1.tail(3)

Unnamed: 0,CAS-Number,Aroma_Chemical,Odor_Description,Family,Odor_Strength,Perfumary_Note
128,,Traseolide (Quest),"Dry, sweet, amber, musk, herbal, creamy",Musky,,Base
129,,Ysamber K (Symrise),"Woody, woody-ambery, cedar wood, fruityYsamber...",Woody,,Base
130,,Ambrocenide Crystals (Symrise),"Amber, woody, amberyPowerful and long-lasting ...",Ambery,,Base


#### Some columns contain NaN values, so we impute them before encoding

In [5]:
# Replace missing CAS numbers and odor strengths with the literal string 'NA'.
for column in ('CAS-Number', 'Odor_Strength'):
    df1[column] = df1[column].replace({np.nan: 'NA'})

In [6]:
list(df1.Perfumary_Note.unique())

['Base', 'Middle', 'Top']

In [7]:
df1.Perfumary_Note.value_counts()

Base      68
Middle    40
Top       23
Name: Perfumary_Note, dtype: int64

In [8]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the perfumery note. The classes are fixed up front; LabelEncoder
# sorts them, so the mapping is Base=0, Middle=1, Top=2.
le = LabelEncoder().fit(['Base', 'Middle', 'Top'])
df1['Perfumary_Note_1'] = le.transform(df1['Perfumary_Note'])


In [9]:
list(df1.Odor_Strength.unique())

['High', 'Medium', 'Low', 'NA']

In [10]:
df1.Odor_Strength.value_counts()

Medium    65
NA        43
High      20
Low        3
Name: Odor_Strength, dtype: int64

In [11]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode odor strength using the categories observed in the column
# (including the 'NA' placeholder).
le = LabelEncoder().fit(df1['Odor_Strength'].unique().tolist())
df1['Odor_Strength_1'] = le.transform(df1['Odor_Strength'])

In [12]:
# Fill missing Family values with the string 'None' so the column holds only strings
# before it is label-encoded.
df1['Family'] = df1['Family'].replace({np.nan: 'None'})

In [13]:
df1.Family.value_counts()

Floral              36
Woody               13
Fruity              12
Green               12
Musk                 8
Gourmand             6
Aldehydic            5
Spicy                4
Herbal               4
Musky                3
Citrus               3
Melon                2
Vanilla              2
Coconut              2
Amber                2
Animal               2
Balsamic             2
Anisic               1
Soapy                1
Caramellic           1
Tonka                1
Woody/Amber          1
Marine               1
None                 1
Minty                1
Mossy                1
Animalic             1
Musky - Animalic     1
Anti-oxidant         1
Ambery               1
Name: Family, dtype: int64

In [14]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the fragrance family using the categories present in the data.
le = LabelEncoder().fit(df1['Family'].unique().tolist())
df1['Family_1'] = le.transform(df1['Family'])
df1.head()

Unnamed: 0,CAS-Number,Aroma_Chemical,Odor_Description,Family,Odor_Strength,Perfumary_Note,Perfumary_Note_1,Odor_Strength_1,Family_1
0,112-31-2,Aldehyde C10 – Decanal,"Sweet, aldehydic, orange, waxy and citrus rind",Aldehydic,High,Base,0,0,0
1,112-45-8,Aldehyde C11,"Citrus, waxy, fatty and aldehydic with a green...",Aldehydic,High,Base,0,0,0
2,112-54-9,Aldehyde C12 Lauric,"Soapy, waxy, aldehydic, citrus, orange rindy w...",Aldehydic,High,Base,0,0,0
3,110-41-8,Aldehyde C12 MNA,fresh amber aldehydic moss citrus tuberose met...,Aldehydic,High,Base,0,0,0
4,104-67-6,Aldehyde C14 – gamma Undecalactone,"Creamy, fatty, fruity, coconut, peach, Iactoni...",Aldehydic,Medium,Base,0,2,0


### After some imputation and feature engineering, here is the transformed dataset 1

In [16]:
# Reload dataset 1 from a pre-computed CSV; this rebinds `df1`, replacing the frame
# that was built from 'Aroma_Chemical.xlsx'.
# NOTE(review): no cell shown in this notebook writes 'chem_modified.csv' -- the step that
# adds the per-keyword columns appears to be missing; confirm where the file comes from.
import pandas as pd
df1 = pd.read_csv('chem_modified.csv',index_col=0)  # first CSV column becomes the row index
df1.head(3)

Unnamed: 0,CAS-Number,Aroma_Chemical,Odor_Description,Family,Odor_Strength,Perfumary_Note,Perfumary_Note_1,Odor_Strength_1,Family_1,sweet,...,musk,wood,herbal,amber,animal,fresh,sandal,lavender,floral,spice
0,112-31-2,Aldehyde C10 – Decanal,"Sweet, aldehydic, orange, waxy and citrus rind",Aldehydic,High,Base,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,112-45-8,Aldehyde C11,"Citrus, waxy, fatty and aldehydic with a green...",Aldehydic,High,Base,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,112-54-9,Aldehyde C12 Lauric,"Soapy, waxy, aldehydic, citrus, orange rindy w...",Aldehydic,High,Base,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [17]:
# Keep the free-text description plus the 23 numeric columns used as the search vector:
# three label-encoded attributes followed by twenty keyword columns.
feature_columns = [
    'Odor_Description',
    'Perfumary_Note_1', 'Odor_Strength_1', 'Family_1',
    'sweet', 'aldehydic', 'waxy', 'citrus', 'orange',
    'fatty', 'fruity', 'moss', 'tuberose', 'coconut',
    'musk', 'wood', 'herbal', 'amber', 'animal',
    'fresh', 'sandal', 'lavender', 'floral', 'spice',
]
df_v1 = df1[feature_columns]
df_v1.head(3)

Unnamed: 0,Odor_Description,Perfumary_Note_1,Odor_Strength_1,Family_1,sweet,aldehydic,waxy,citrus,orange,fatty,...,musk,wood,herbal,amber,animal,fresh,sandal,lavender,floral,spice
0,"Sweet, aldehydic, orange, waxy and citrus rind",0,0,0,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,"Citrus, waxy, fatty and aldehydic with a green...",0,0,0,0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,"Soapy, waxy, aldehydic, citrus, orange rindy w...",0,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,1,0


# Search Algorithm (1) : Approximate Nearest Neighbor (vector encoding using LSH) 

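1.   Encode each aroma chemical as a numeric feature vector
2.   Hash the vectors with LSH and retrieve the nearest neighbours of a query vector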



#### LSH refers to a family of functions (known as LSH families) to hash data points into buckets so that data points near each other are located in the same buckets with high probability, while data points far from each other are likely to be in different buckets. This makes it easier to identify observations with various degrees of similarity.

In [19]:
# The idea here is to specify an external description (attribute) and find similar entries in our database

# Map every odor description to the list of its numeric feature values.
df_v2 = df_v1.set_index('Odor_Description').T.to_dict('list')

# Split the mapping into two parallel sequences: descriptions and their vectors.
attr = list(df_v2.keys())
vectors = list(df_v2.values())

data = {
    'attr': np.array(attr, dtype=object),
    'vector': np.array(vectors, dtype=float),
}

class LSHIndex():
    """Nearest-neighbour lookup over fixed-length vectors backed by ``faiss.IndexLSH``.

    Parameters
    ----------
    vectors : numpy.ndarray
        2-D array with one row per item; stored as float32.
    labels : sequence
        ``labels[i]`` is what ``query`` returns for row ``i`` of ``vectors``.
    """

    def __init__(self, vectors, labels):
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels
        # Created by build(); None until then so query() can fail with a clear message.
        self.index = None

    def build(self, num_bits=8):
        """Create the LSH index with ``num_bits`` hash bits and add all stored vectors."""
        self.index = faiss.IndexLSH(self.dimension, num_bits)
        self.index.add(self.vectors)

    def query(self, vectors, k=7):
        """Return the labels of up to ``k`` nearest neighbours of the first query vector.

        ``vectors`` is reshaped to ``(-1, self.dimension)``, so a single 1-D vector is
        accepted; only the neighbours of the first row are returned.

        Raises
        ------
        RuntimeError
            If ``build`` has not been called yet.
        ValueError
            If ``vectors`` cannot be reshaped to the index dimensionality.
        """
        if self.index is None:
            raise RuntimeError("LSHIndex.build() must be called before query()")
        # Use the index's own dimensionality instead of a hard-coded 23 so the class
        # works for any feature set.
        batch = np.ascontiguousarray(vectors, dtype='float32').reshape(-1, self.dimension)
        distances, indices = self.index.search(batch, k)
        # faiss pads missing results with -1; skip those so they do not wrap around
        # to the last label.
        return [self.labels[i] for i in indices[0] if i >= 0]

LSH_index = LSHIndex(data["vector"], data["attr"])
LSH_index.build()

# Here you specify the description of your choice. It must match an existing
# Odor_Description exactly, otherwise list.index raises ValueError.
query_description = "mossy oakmoss woody phenolic earthy"
query_position = list(data['attr']).index(query_description)
# perfume_vector is the feature vector of that description, used to find similar entries
perfume_vector, perfume_attr = data['vector'][query_position], data['attr'][query_position]

start = time.time()
# Query the index once and reuse the result for both the joined string and the report.
simlar_perfume_v1 = LSH_index.query(perfume_vector)
simlar_perfume = '\n --'.join(simlar_perfume_v1)
for i in simlar_perfume_v1:
    # Locate the first row carrying this description once, then read all fields from it.
    row = df1.loc[df1['Odor_Description'] == str(i)].iloc[0]
    print('Aroma_Chemical - ' + str(row['Aroma_Chemical']))
    print('Perfume Family - ' + str(row['Family']))
    print('Perfume Odor_Strength - ' + str(row['Odor_Strength']))
    print('Perfumary_Note - ' + str(row['Perfumary_Note']))
    print('************')
end = time.time()
print('time taken '+ str(end - start))

Aroma_Chemical - Farnesol
Perfume Family - Floral
Perfume Odor_Strength - Low
Perfumary_Note - Base
************
Aroma_Chemical - Melonal
Perfume Family - Melon
Perfume Odor_Strength - High
Perfumary_Note - Middle
************
Aroma_Chemical - Veramoss / Evernyl
Perfume Family - Mossy
Perfume Odor_Strength - High
Perfumary_Note - Base
************
Aroma_Chemical - Calone liquid
Perfume Family - Melon
Perfume Odor_Strength - High
Perfumary_Note - Base
************
Aroma_Chemical - Ethyl Maltol
Perfume Family - Caramellic
Perfume Odor_Strength - High
Perfumary_Note - Base
************
Aroma_Chemical - Aldehyde C12 MNA
Perfume Family - Aldehydic
Perfume Odor_Strength - High
Perfumary_Note - Base
************
Aroma_Chemical - Aldehyde C18 – gamma Nonalactone
Perfume Family - Coconut
Perfume Odor_Strength - Medium
Perfumary_Note - Base
************
time taken 0.03795218467712402


# Search Algorithm (2) : Approximate Nearest Neighbor (vector encoding using trees)
#### There are several other libraries for nearest neighbor search. Annoy is almost as fast as the fastest of them, but one feature really sets it apart: it can use static files as indexes. In particular, this means you can share an index across processes. Annoy also decouples creating indexes from loading them, so you can pass indexes around as files and map them into memory quickly. Another nice thing about Annoy is that it tries to minimize memory footprint, so the indexes are quite small.

In [21]:
# The idea of this code is to find the similarity of existing observations
# This is between a specific description of one data point in the database with other data points
import annoy

class AnnoyIndex():
    """Nearest-neighbour lookup over fixed-length vectors backed by ``annoy.AnnoyIndex``.

    Parameters
    ----------
    vectors : numpy.ndarray
        2-D array with one row per item; stored as float32.
    labels : sequence
        ``labels[i]`` is what ``query`` returns for row ``i`` of ``vectors``.
    """

    def __init__(self, vectors, labels):
        self.dimension = vectors.shape[1]
        # Misspelled attribute name kept as an alias so existing code reading it still works.
        self.dimention = self.dimension
        self.vectors = vectors.astype('float32')
        self.labels = labels
        # Created by build(); None until then so query() can fail with a clear message.
        self.index = None

    def build(self, number_of_trees=5, metric='angular'):
        """Add every stored vector to a new Annoy index and build ``number_of_trees`` trees.

        ``metric`` is passed explicitly because calling ``annoy.AnnoyIndex`` without it is
        deprecated; 'angular' is the value Annoy used when the argument was omitted.
        """
        self.index = annoy.AnnoyIndex(self.dimension, metric)
        for i, vec in enumerate(self.vectors):
            self.index.add_item(i, vec.tolist())
        self.index.build(number_of_trees)

    def query(self, vector, k=7):
        """Return the labels of the ``k`` nearest neighbours of ``vector``.

        Raises
        ------
        RuntimeError
            If ``build`` has not been called yet.
        """
        if self.index is None:
            raise RuntimeError("AnnoyIndex.build() must be called before query()")
        indices = self.index.get_nns_by_vector(vector.tolist(), k)
        return [self.labels[i] for i in indices]


ANN_index = AnnoyIndex(data["vector"], data["attr"])
ANN_index.build()

# Here you specify one existing data point (by position) and find the entries most similar to it
perfume_vector, perfume_attr = data['vector'][3], data['attr'][3]

start = time.time()
# Query the index once and reuse the result for both the joined string and the report.
simlar_perfume_v1 = ANN_index.query(perfume_vector)
simlar_perfumes = '\n* '.join(simlar_perfume_v1)
for i in simlar_perfume_v1:
    # Locate the first row carrying this description once, then read all fields from it.
    row = df1.loc[df1['Odor_Description'] == str(i)].iloc[0]
    print('Aroma_Chemical - ' + str(row['Aroma_Chemical']))
    print('Perfume Family - ' + str(row['Family']))
    print('Perfume Odor_Strength - ' + str(row['Odor_Strength']))
    print('Perfumary_Note - ' + str(row['Perfumary_Note']))
    print('************')
end = time.time()
print('time taken '+ str(end - start))

Aroma_Chemical - Aldehyde C12 MNA
Perfume Family - Aldehydic
Perfume Odor_Strength - High
Perfumary_Note - Base
************
Aroma_Chemical - Aldehyde C11
Perfume Family - Aldehydic
Perfume Odor_Strength - High
Perfumary_Note - Base
************
Aroma_Chemical - Aldehyde C10 – Decanal
Perfume Family - Aldehydic
Perfume Odor_Strength - High
Perfumary_Note - Base
************
Aroma_Chemical - Aldehyde C12 Lauric
Perfume Family - Aldehydic
Perfume Odor_Strength - High
Perfumary_Note - Base
************
Aroma_Chemical - Ambrocenide Crystals (Symrise)
Perfume Family - Ambery
Perfume Odor_Strength - None
Perfumary_Note - Base
************
Aroma_Chemical - Fixateur 505
Perfume Family - Amber
Perfume Odor_Strength - Medium
Perfumary_Note - Base
************
Aroma_Chemical - Dihydromyrcenol
Perfume Family - Citrus
Perfume Odor_Strength - Medium
Perfumary_Note - Top
************
time taken 0.03936624526977539


In [24]:
perfume_vector

array([0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1.,
       0., 1., 0., 0., 0., 0.])

## Search Algorithm (3) : Okapi Best Match 25 (BM25)
#### BM25 improves upon TF*IDF. BM25 stands for “Best Match 25”. Released in 1994, it’s the 25th iteration of tweaking the relevance computation. BM25 has its roots in probabilistic information retrieval. Probabilistic information retrieval is a fascinating field unto itself. Basically, it casts relevance as a probability problem. A relevance score, according to probabilistic information retrieval, ought to reflect the probability a user will consider the result relevant. For this dataset we use description-based search: BM25 scores the overlap between the query terms and the tokenized odor descriptions (it does not use word embeddings).

In [26]:
df2 = pd.read_excel('Essential_oils.xlsx')
df2.head()

Unnamed: 0,Ess_Oil_Name,Odor_Description,Perfumary_Note,Family,Ext_Method,Blends_with
0,Allspice Essential Oil,"Spicy, sweet, woody, and true to Allspice that...",Middle,Myrtaceae,Steam Distilled,"Ginger, and other carminative spices; Sweet Or..."
1,Amyris Essential Oil,"Woody, herbaceous, dry, and earthy",Base,Rutaceae,Steam Distilled,"Cedarwood, and other woodsy aromas"
2,Angelica Root Essential Oil,"Warm, mapley and slightly-sweet",Middle – Base,Apiaceae,Steam Distilled,"Mandarin, Patchouli, Clary Sage, Vetiver, Citr..."
3,Basil Essential Oil,"Sweet, herbaceous, fresh, and true to Basil us...",Middle - Top,Lamiaceae,Steam Distilled,"Lavender and Lemon, and other florals and citrus"
4,Basil- Lemon Essential Oil,"Sweet, lemony, spicy, herbaceous",Top,Lamiaceae,Steam Distilled,Citruses and florals such as Rose


In [27]:
# Lower-cased odor descriptions form the BM25 corpus, one document per essential oil.
text_list = df2.Odor_Description.str.lower().values

# Tokenise with spaCy, keeping only the alphabetic tokens of each description.
tok_text = [
    [t.text for t in doc if t.is_alpha]
    for doc in tqdm(nlp.pipe(text_list, disable=["tagger", "parser","ner"]))
]

bm25 = BM25Okapi(tok_text)

def sim_search(query, n=7):
    """Print the ``n`` essential oils whose odor description best matches ``query``.

    The query is lower-cased and tokenised with the same spaCy tokenizer and
    alphabetic-token filter used to build the BM25 corpus, so punctuation in the
    query (e.g. "woody, sweet") does not prevent matches.

    Parameters
    ----------
    query : str
        Free-text description to search for.
    n : int, default 7
        Maximum number of results to print.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # make_doc runs only the tokenizer, which is all the is_alpha filter needs.
    tokenized_query = [t.text for t in nlp.make_doc(query.lower()) if t.is_alpha]
    if not tokenized_query:
        print('No alphabetic search terms found in the query.')
        return

    # Rank rows by BM25 score (same ordering as BM25Okapi.get_top_n) but keep the row
    # positions, so each hit is read from its own row rather than re-located by its
    # description text, which picks the wrong row when two oils share a description.
    scores = bm25.get_scores(tokenized_query)
    top_positions = np.argsort(scores)[::-1][:n]
    for position in top_positions:
        row = df2.iloc[position]
        print('**********')
        print('Oil name - '+ str(row['Ess_Oil_Name']))
        print('Perfumary_Note - '+str(row['Perfumary_Note']))
        print('Perfume family - '+ str(row['Family']))
        print('Good for blend with - '+ str(row['Blends_with']))
        print('**********')

117it [00:00, 7841.83it/s]


In [28]:
query = "fresh amber aldehydic moss citrus tuberose metallic waxy coumarinic"	

In [29]:
sim_search(query)

**********
Oil name - Gingergrass Essential Oil
Perfumary_Note - Middle
Perfume family - Poaceae
Good for blend with - Citruses and florals such as Lavender
**********
**********
Oil name - Neroli Essential Oil
Perfumary_Note - Middle
Perfume family - Rutaceae
Good for blend with - Frankincense, and other resins
**********
**********
Oil name - Frankincense- Frereana Essential Oil
Perfumary_Note - Base
Perfume family - Burseraceae
Good for blend with - Lemon, Myrrh, and other citrus and resins
**********
**********
Oil name - Palmarosa Essential Oil
Perfumary_Note - Middle - Top
Perfume family - Poaceae
Good for blend with - Lavender, and other florals
**********
**********
Oil name - Camphor- White Essential Oil
Perfumary_Note - Base
Perfume family - Lauraceae
Good for blend with - Eucalyptus, and other camphoraceous aromas
**********
**********
Oil name - Pine- Scots Essential Oil
Perfumary_Note - Top
Perfume family - Pinaceae
Good for blend with - Myrrh, and other resins
**********


In [32]:
# This is to be user interface for BM-25 application

search = input("Please enter the search string!!")
sim_search(search)


Please enter the search string!!Woody
**********
Oil name - Cedarwood- Atlas Essential Oil
Perfumary_Note - Base
Perfume family - Pinaceae
Good for blend with - Patchouli, Sandalwood, and blends exceptionally well with woods
**********
**********
Oil name - Muhuhu Essential Oil
Perfumary_Note - Base
Perfume family - Asteraceae
Good for blend with - Frankincense, and other resins
**********
**********
Oil name - Cardamom Essential Oil
Perfumary_Note - Middle
Perfume family - Zingiberaceae
Good for blend with - Bergamot, Cilantro, Ginger, Ylang Ylang, woods and other carminative spices
**********
**********
Oil name - Amyris Essential Oil
Perfumary_Note - Base
Perfume family - Rutaceae
Good for blend with - Cedarwood, and other woodsy aromas
**********
**********
Oil name - Buddha Wood Essential Oil
Perfumary_Note - Base
Perfume family - Myoporaceae
Good for blend with - Vetiver, and other resins
**********
**********
Oil name - Myrrh Essential Oil
Perfumary_Note - Base
Perfume family - 