In [1]:
import numpy as np
import pandas as pd
import spacy
import string
import gensim
import operator
import re
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.similarities import MatrixSimilarity
from operator import itemgetter
from fastapi import FastAPI
import nest_asyncio
import uvicorn

##### Load the data

In [2]:
df_drugs = pd.read_json('drug_corpus.json')
df_drugs.head(10)

Unnamed: 0,uid,text
0,0103050P0AAAFAF,Omeprazole_Cap E/C 10mg
1,20030100167,Dressit Ster Dress Pack
2,20031700015,Flaminal Forte 15g Tube Wound Dress Prot
3,0101010G0AAABAB,Co-Magaldrox_Susp 195mg/220mg/5ml S/F
4,0101010N0AAAAAA,Antacid/Oxetacaine_Oral Susp S/F
5,0101010R0AAABAB,Simeticone_Susp 40mg/ml S/F
6,0101010R0BCAAAB,Infacol_Susp 40mg/ml S/F
7,0101021B0AAAHAH,Gppe Liq_Gaviscon S/F
8,0101021B0AAALAL,Sod Algin/Pot Bicarb_Susp S/F
9,0101021B0AAAPAP,Sod Alginate/Pot Bicarb_Tab Chble 500mg


##### Data Cleaning and Pre-processing

In [3]:
spacy_nlp = spacy.load('en_core_web_sm')

#create list of punctuations and stopwords
punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS

#function for data cleaning and processing
#This can be further enhanced by adding / removing reg-exps as desired.

def tokenizer_spacy(sentence):
 
    #remove distracting single quotes
    sentence = re.sub('\'','',sentence)

    #remove digits adnd words containing digits
    sentence = re.sub('\w*\d\w*','',sentence)

    #replace extra spaces with single space
    sentence = re.sub(' +',' ',sentence)

    #remove unwanted lines starting from special charcters
    sentence = re.sub(r'\n: \'\'.*','',sentence)
    sentence = re.sub(r'\n!.*','',sentence)
    sentence = re.sub(r'^:\'\'.*','',sentence)
    
    #remove non-breaking new line characters
    sentence = re.sub(r'\n',' ',sentence)
    
    #remove punctunations
    sentence = re.sub(r'[^\w\s]',' ',sentence)
    
    #creating token object
    tokens = spacy_nlp(sentence)
    
    #lower, strip and lemmatize
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    
    #remove stopwords, and exclude words less than 2 characters
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations and len(word) > 2]
    
    #return tokens
    return tokens




In [4]:
print ('Cleaning and Tokenizing started now.....')
#from nltk.tokenize
df_drugs['wiki_tokenized'] = df_drugs['text'].map(lambda x: tokenizer_spacy(x))
print(df_drugs.head())
print ('Completed')
#Store the tokenized column into a sepearte variable for ease of operations in subsequent sections
drugs_plot = df_drugs['wiki_tokenized']

Cleaning and Tokenizing started now.....
               uid                                      text  \
0  0103050P0AAAFAF                   Omeprazole_Cap E/C 10mg   
1      20030100167                   Dressit Ster Dress Pack   
2      20031700015  Flaminal Forte 15g Tube Wound Dress Prot   
3  0101010G0AAABAB     Co-Magaldrox_Susp 195mg/220mg/5ml S/F   
4  0101010N0AAAAAA          Antacid/Oxetacaine_Oral Susp S/F   

                                wiki_tokenized  
0                             [omeprazole_cap]  
1                 [dressit, ster, dress, pack]  
2  [flaminal, forte, tube, wound, dress, prot]  
3                             [magaldrox_susp]  
4             [antacid, oxetacaine_oral, susp]  
Completed


##### Building Word Dictionary

In [5]:
#creating term dictionary
dictionary = corpora.Dictionary(drugs_plot)

In [6]:
#print top 30 items from the dictionary with their unique token-id
dict_tokens = [[[dictionary[key], dictionary.token2id[dictionary[key]]] for key, value in dictionary.items() if key <= 30]]
print (dict_tokens)

[[['omeprazole_cap', 0], ['dress', 1], ['dressit', 2], ['pack', 3], ['ster', 4], ['flaminal', 5], ['forte', 6], ['prot', 7], ['tube', 8], ['wound', 9], ['magaldrox_susp', 10], ['antacid', 11], ['oxetacaine_oral', 12], ['susp', 13], ['simeticone_susp', 14], ['infacol_susp', 15], ['gppe', 16], ['liq_gaviscon', 17], ['algin', 18], ['bicarb_susp', 19], ['pot', 20], ['sod', 21], ['alginate', 22], ['bicarb_tab', 23], ['chble', 24], ['gastrocote_tab', 25], ['dual', 26], ['gaviscon', 27], ['infant_sach', 28], ['advance_liq', 29], ['aniseed', 30]]]


##### Feature Extraction (Bag of Words)

In [7]:
corpus = [dictionary.doc2bow(desc) for desc in drugs_plot]
word_frequencies = [[(dictionary[id], frequency) for id, frequency in line] for line in corpus[0:3]]
#vocabs
#The above results shows vocabulary with their frequency.
print(word_frequencies)

[[('omeprazole_cap', 1)], [('dress', 1), ('dressit', 1), ('pack', 1), ('ster', 1)], [('dress', 1), ('flaminal', 1), ('forte', 1), ('prot', 1), ('tube', 1), ('wound', 1)]]


### Build Tf-Idf and LSI Model

Tf-Idf means, Term frequency-Inverse Document Frequency. it is a commonly used NLP model that helps you determine the most important words in each document in the corpus. Once the Tf-Idf is build, pass it to LSI model and specify the num of features to build

In [8]:
tfidf_drugs_model = gensim.models.TfidfModel(corpus, id2word=dictionary)
lsi_drugs_model = gensim.models.LsiModel(tfidf_drugs_model[corpus], id2word=dictionary, num_topics=300)

#Serialize and Store the corpus locally for easy retrival whenver required.
gensim.corpora.MmCorpus.serialize('tfidf_drugs_model_s', tfidf_drugs_model[corpus])
gensim.corpora.MmCorpus.serialize('lsi_drugs_model_s',lsi_drugs_model[tfidf_drugs_model[corpus]])
corpus_tfidf_drugs = gensim.corpora.MmCorpus('tfidf_drugs_model_s')
corpus_lsi_drugs = gensim.corpora.MmCorpus('lsi_drugs_model_s')
print(corpus_tfidf_drugs)
print(corpus_lsi_drugs)

MmCorpus(23585 documents, 10906 features, 76223 non-zero entries)
MmCorpus(23585 documents, 300 features, 7070668 non-zero entries)


In [9]:
drugs_i = MatrixSimilarity(corpus_lsi_drugs, num_features = corpus_lsi_drugs.num_terms)

##### Search Function

Below is the helper function to search the index, sort and return the results

In [10]:
def drugs_search_engine(search_term,relevence_result_number):
    query_bow = dictionary.doc2bow(tokenizer_spacy(search_term))
    query_tfidf = tfidf_drugs_model[query_bow]
    query_lsi = lsi_drugs_model[query_tfidf]
    drugs_i.num_best = relevence_result_number
    drugs_list = drugs_i[query_lsi]
    drugs_list.sort(key=itemgetter(1), reverse=True)
    drugs_details = []
    for j, d in enumerate(drugs_list):
        drugs_details.append (
            {
                'UID': df_drugs['uid'][d[0]],
                'Text': df_drugs['text'][d[0]],
                'Relevance': round((d[1] * 100),2)
            }
        )
        if j == (drugs_i.num_best-1):
            break
    return pd.DataFrame(drugs_details, columns=['UID','Text','Relevance'])

In [11]:
drugs_search_engine('Antacid',6)

Unnamed: 0,UID,Text,Relevance
0,0101021B0AAAFAF,Gppe Tab_Topal Antacid,60.68
1,0106040N0AAADAD,Gppe Enem_Fleet Phos 133ml,59.76
2,130201000AABYBY,Gppe Shower Emollient_Dermol 200,59.59
3,0407010X0AAAYAY,Gppe Tab_Ultramol Solb,59.56
4,090502100AAAAAA,Gppe Tab_Phos-Sandoz,59.41
5,1302020E0AAALAL,Gppe Crm_Sudocrem Antis,59.32


In [12]:
drugs_search_engine('Gppe Tab_Topal Antacid',7)

Unnamed: 0,UID,Text,Relevance
0,0101021B0AAAFAF,Gppe Tab_Topal Antacid,100.0
1,1303000J0AAABAB,Gppe Crm_Benadryl,99.98
2,1305020I0AAADAD,Gppe Crm_Alphosyl H.C.,99.98
3,090607000AAAXAX,Gppe Tab_Ketovite,99.98
4,1302011M0AAAEAE,Gppe Emollient_Emulsiderm,99.98
5,130201000AAA4A4,Gppe Crm_Diprobase,99.98
6,1302011L0AAACAC,Gppe Emollient_Diprobath,99.98


In [13]:
app = FastAPI(
    title="Search API",
    description="A simple API that would return the relevant search result",
    version="0.1",
)

In [14]:
@app.get("/search")
def return_similar_drugs(search_string: str,relevence_result_number: int):
    aaa=drugs_search_engine(search_string,relevence_result_number)
    return aaa

In [15]:
nest_asyncio.apply()

In [None]:
if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8000, debug=True)

INFO:     Started server process [22216]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:61155 - "GET /docs HTTP/1.1" 200 OK
INFO:     127.0.0.1:61155 - "GET /openapi.json HTTP/1.1" 200 OK
INFO:     127.0.0.1:61158 - "GET /search?search_string=Gppe%20Tab_Topal%20Antacid&relevence_result_number=6 HTTP/1.1" 200 OK
