In [1]:
!pip install gensim



In [2]:
import os, re, io
import pandas as pd
import numpy as np
import nltk

## stopwords
from gensim.parsing.preprocessing import remove_stopwords
## lemma functionality provide by NLTK
from nltk.stem import WordNetLemmatizer
## make sure you downloaded model for lemmatization
nltk.download('wordnet')
from nltk import word_tokenize
## make sure you downloaded model for tokenization
nltk.download('punkt')
import spacy
nlp = nlp = spacy.load("en_core_web_sm")

from gensim.models import TfidfModel
from gensim.corpora import Dictionary
## cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package wordnet to C:\Users\Abhijit
[nltk_data]     Morye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Abhijit
[nltk_data]     Morye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df = pd.read_excel("WHO_FAQ (1).xlsx")
df

Unnamed: 0,Context,Answer
0,Hi!,"Hi ! How can I help you, ask me questions abou..."
1,"Hello, is anybody there?","Hello, I am CovBot, you can ask me questions a..."
2,Hey there!,"Hey there, I am CovBot, you can ask me questio..."
3,What is your name?,"Hello, I am CovBot, you can ask me questions a..."
4,Tell me a joke,Why did two 4s skip dinner? Because they alrea...
...,...,...
85,Are smokers and tobacco users at higher risk o...,Smokers are likely to be more vulnerable to CO...
86,Are smokers and tobacco users at higher risk o...,Smoking products such as water pipes often inv...
87,How large does a meeting or event need to be i...,High profile international sporting events suc...
88,How large does a meeting or event need to be i...,An event counts as a “mass gatherings” if the ...


In [4]:
df.shape

(90, 2)

In [5]:
df.describe().T

Unnamed: 0,count,unique,top,freq
Context,90,47,What can I do to protect myself and prevent th...,7
Answer,90,89,"Hello, I am CovBot, you can ask me questions a...",2


In [6]:
class TextPreprocessing():    
    def __init__(self, data_df, column_name=None):
        self.data_df = data_df
        self.column_name = column_name
        self.processed_column_name = f"processed_{self.column_name}"
        
    def convert_lowercase(self):
        self.data_df.fillna('', inplace=True)
        self.data_df[self.column_name] = self.data_df[self.column_name].apply(lambda column: column.lower())
#         self.data_df = self.data_df.apply(lambda column: column.astype(str).str.lower(), axis=0) 
    
    def remove_special_symbol(self):
        pattern = '[^\w\s]'
        self.data_df[self.column_name] = self.data_df[self.column_name].apply(lambda row: re.sub(pattern, ' ', row))
    
    def remove_stopwords(self):
        for idx, question in enumerate(self.data_df[self.column_name]):
            self.data_df.loc[idx, self.processed_column_name] = remove_stopwords(question)
    
    def apply_lemmatization(self):
        lemma = WordNetLemmatizer()
        for idx, question in enumerate(self.data_df[self.processed_column_name]):
            lemmatized_sentences = []
            doc = nlp(question.strip())
#             print(doc)
            for word in doc:
                lemmatized_sentences.append(word.lemma_)
            self.data_df.loc[idx, self.processed_column_name] = " ".join(lemmatized_sentences)    
            
    def preprocessing(self):
        self.convert_lowercase()
        self.remove_special_symbol()
        self.remove_stopwords()
        self.apply_lemmatization()
        return self.data_df

In [7]:
txp = TextPreprocessing(df.copy(), 'Context')
preprocessed_data_df = txp.preprocessing()
preprocessed_data_df.head(10)

Unnamed: 0,Context,Answer,processed_Context
0,hi,"Hi ! How can I help you, ask me questions abou...",hi
1,hello is anybody there,"Hello, I am CovBot, you can ask me questions a...",hello anybody
2,hey there,"Hey there, I am CovBot, you can ask me questio...",hey
3,what is your name,"Hello, I am CovBot, you can ask me questions a...",
4,tell me a joke,Why did two 4s skip dinner? Because they alrea...,tell joke
5,what is a coronavirus,Coronaviruses are a large family of viruses wh...,coronavirus
6,what is a coronavirus,"In humans, several coronaviruses are known to ...",coronavirus
7,what is covid 19,COVID-19 is the infectious disease caused by t...,covid 19
8,what are the symptoms of covid 19,The most common symptoms of COVID-19 are fever...,symptom covid 19
9,what are the symptoms of covid 19,Some people become infected but don’t develop ...,symptom covid 19


## TFIDF Representation        

In [8]:
class TFIDFCalculation():
    def __init__(self, data_df, column_name):
        self.data_df = data_df
        self.column_name = column_name
        self.dictionary = None
        self.model = None
        self.bow = None
        
    def create_tf_idf_model(self):
        sentence_token_list = [sentence.split(" ") for sentence in self.data_df[self.column_name]]
#         print(sentence_token_list)
        self.dictionary = Dictionary(sentence_token_list)
#         print(self.dictionary)
        self.bow = [self.dictionary.doc2bow(sentence_token) for sentence_token in sentence_token_list]
        self.model = TfidfModel(self.bow)
        
        #first 10 representation of TFIDF vectors for dataset
        print("First 10 representation for dataset")
        for idx, question in enumerate(self.data_df[self.column_name]):
            if idx <= 10:
                print(f"{question} --> {self.model[self.bow[idx]]}")
            else:
                break
                
    def get_test_vector(self, test_df, test_column_name):
        test_tf_idf_vector = []
        test_sentence_list = [t_sentence.split(" ") for t_sentence in test_df[test_column_name]]
        test_bow = [self.dictionary.doc2bow(t_sentence_token) for t_sentence_token in test_sentence_list]
        for t_token in test_bow:
            test_tf_idf_vector.append(self.model[t_token])
        return test_tf_idf_vector
    
    def get_training_tf_vector(self):
        train_tf_vector = []
        for bow_token in self.bow:
            train_tf_vector.append(self.model[bow_token])
        return train_tf_vector
    
    def get_train_vocab(self):
        train_vocab = []
        for index in self.dictionary:
            train_vocab.append(self.dictionary[index])
        return train_vocab
        

In [9]:
def retrieve_similar_question(train_tf_vector, test_tf_vector, train_qa_df, train_column_name, test_qa_df, test_column_name, train_answer_column_name):
    
    similar_question_index = []
    for test_index, test_vector in enumerate(test_tf_vector):
        sim_score = -1
        sim_q_index = -1
        
        for train_index, train_vecor in enumerate(train_tf_vector):
            cos_sim_score = cosine_similarity(train_vecor, test_vector)[0][0]
#             print(cos_sim_score)
            
            if sim_score < cos_sim_score:
                sim_score = cos_sim_score
                sim_q_index = train_index
            
        print("*"*100)
        print(f"Test Question --> {test_qa_df[test_column_name].iloc[test_index]}")
        print(f"Train Question with similarity --> {train_qa_df[train_column_name].iloc[sim_q_index]}")
        print(f"Response Answer --> {train_qa_df[train_answer_column_name].iloc[sim_q_index]}")
        print("*"*100)

In [10]:
tfidf = TFIDFCalculation(preprocessed_data_df, 'processed_Context')
tfidf.create_tf_idf_model()
training_qa_tf_vector = tfidf.get_training_tf_vector()
# training_qa_tf_vector

First 10 representation for dataset
hi --> [(0, 1.0)]
hello anybody --> [(1, 0.7071067811865476), (2, 0.7071067811865476)]
hey --> [(3, 1.0)]
 --> [(4, 1.0)]
tell joke --> [(5, 0.7071067811865476), (6, 0.7071067811865476)]
coronavirus --> [(7, 1.0)]
coronavirus --> [(7, 1.0)]
covid 19 --> [(8, 0.7071067811865476), (9, 0.7071067811865476)]
symptom covid 19 --> [(8, 0.15813539237473356), (9, 0.15813539237473356), (10, 0.9746724554212961)]
symptom covid 19 --> [(8, 0.15813539237473356), (9, 0.15813539237473356), (10, 0.9746724554212961)]
symptom covid 19 --> [(8, 0.15813539237473356), (9, 0.15813539237473356), (10, 0.9746724554212961)]


In [11]:
train_tf_vocab = tfidf.get_train_vocab()
train_tf_vocab

['hi',
 'anybody',
 'hello',
 'hey',
 '',
 'joke',
 'tell',
 'coronavirus',
 '19',
 'covid',
 'symptom',
 'spread',
 'air',
 'cause',
 'transmit',
 'virus',
 'catch',
 'person',
 'disease',
 'fece',
 'prevent',
 'protect',
 '14',
 'andprotection',
 'area',
 'day',
 'measure',
 'past',
 'recently',
 'visit',
 'protection',
 'likely',
 'worry',
 'develop',
 'illness',
 'risk',
 'severe',
 'antibiotic',
 'effective',
 'treating',
 'cure',
 'medicine',
 'therapy',
 'drug',
 'treatment',
 'vaccine',
 'sar',
 'mask',
 'wear',
 'dispose',
 'use',
 'incubation',
 'long',
 'period',
 'animal',
 'human',
 'infect',
 'source',
 'pet',
 'surface',
 'survive',
 'package',
 'receive',
 'report',
 'safe',
 'know',
 '2',
 'cov',
 'infection',
 'occur',
 'sars',
 'airborne',
 'high',
 'pregnant',
 'woman',
 'm',
 'test',
 'baby',
 'pass',
 'unborn',
 'available',
 'care',
 'childbirth',
 'pregnancy',
 'birth',
 'ceasarean',
 'confirm',
 'need',
 'section',
 'suspect',
 'breastfeed',
 'hold',
 'newborn'

In [12]:
## Evaluating TF-IDF for test question

In [13]:
test_query_string = ["how does covid-19 spread?", 
                     "What are the symptoms of COVID-19?",
                "Should I wear a mask to protect myself from covid-19",              
                "Is there a vaccine for COVID-19",
                "can the virus transmit through air?",
                "can the virus spread through air?"]
# test_query_string = ['covid 19']
test_df = pd.DataFrame(test_query_string, columns=['test_questions'])
test_text_preprocessor = TextPreprocessing(test_df, 'test_questions')
processed_test_df = test_text_preprocessor.preprocessing()
test_qa_vector = tfidf.get_test_vector(test_df, 'test_questions')
test_qa_vector

[[(4, 0.8177025321429846),
  (8, 0.11539086677417733),
  (9, 0.11539086677417733),
  (11, 0.5520257826018269)],
 [(4, 0.9806620453934152), (8, 0.1383870527267691), (9, 0.1383870527267691)],
 [(8, 0.08958803862845245),
  (9, 0.08958803862845245),
  (21, 0.3945231996784215),
  (47, 0.5893545423414265),
  (48, 0.6935132551050787)],
 [(8, 0.13838705272676913),
  (9, 0.13838705272676913),
  (45, 0.9806620453934154)],
 [(4, 0.41975631080487424),
  (12, 0.6066530332946903),
  (14, 0.6066530332946903),
  (15, 0.2962242966540469)],
 [(4, 0.49736346049195473),
  (11, 0.33576691122146923),
  (12, 0.7188148079985568),
  (15, 0.35099208153213385)]]

In [14]:
retrieve_similar_question(training_qa_tf_vector, test_qa_vector, preprocessed_data_df, 'Context', processed_test_df, 'test_questions', 'Answer')

****************************************************************************************************
Test Question --> how does covid 19 spread 
Train Question with similarity --> what is your name 
Response Answer --> Hello, I am CovBot, you can ask me questions about covid 19 general question regrading symptoms, spread.
****************************************************************************************************
****************************************************************************************************
Test Question --> what are the symptoms of covid 19 
Train Question with similarity --> what is your name 
Response Answer --> Hello, I am CovBot, you can ask me questions about covid 19 general question regrading symptoms, spread.
****************************************************************************************************
****************************************************************************************************
Test Question --> should i wear a mask t

In [15]:
## BERT Embedding
!pip install bert-embedding

Collecting numpy==1.14.6
  Using cached numpy-1.14.6.zip (4.9 MB)
Building wheels for collected packages: numpy
  Building wheel for numpy (setup.py): started
  Building wheel for numpy (setup.py): still running...
  Building wheel for numpy (setup.py): finished with status 'error'
  Running setup.py clean for numpy
Failed to build numpy
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.3
    Uninstalling numpy-1.22.3:
      Successfully uninstalled numpy-1.22.3
    Running setup.py install for numpy: started
    Running setup.py install for numpy: finished with status 'done'


  ERROR: Command errored out with exit status 1:
   command: 'D:\Anaconda\python.exe' -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Abhijit Morye\\AppData\\Local\\Temp\\pip-install-p6sec_jy\\numpy\\setup.py'"'"'; __file__='"'"'C:\\Users\\Abhijit Morye\\AppData\\Local\\Temp\\pip-install-p6sec_jy\\numpy\\setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d 'C:\Users\Abhijit Morye\AppData\Local\Temp\pip-wheel-8fk75bmb'
       cwd: C:\Users\Abhijit Morye\AppData\Local\Temp\pip-install-p6sec_jy\numpy\
  Complete output (1868 lines):
  Running from numpy source directory.
    return is_string(s) and ('*' in s or '?' is s)
  blas_opt_info:
  blas_mkl_info:
  No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
  customize MSVCCompiler
    libraries mkl_rt not found in ['D:\\Anaconda\\lib', 'C:\\',

In [16]:
from bert_embedding import BertEmbedding
bert_embedding = BertEmbedding()
QA_questions = preprocessed_data_df["Context"].to_list()
query_QA_questions = test_df["test_questions"].to_list()



  removing: _configtest.c _configtest.obj _configtest.exe
  C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Tools\MSVC\14.28.29910\bin\HostX86\x64\cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -Inumpy\core\src\private -Inumpy\core\src -Inumpy\core -Inumpy\core\src\npymath -Inumpy\core\src\multiarray -Inumpy\core\src\umath -Inumpy\core\src\npysort -ID:\Anaconda\include -ID:\Anaconda\include -I"C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Tools\MSVC\14.28.29910\include" -I"C:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\ucrt" -I"C:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\shared" -I"C:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\um" -I"C:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\winrt" -I"C:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\cppwinrt" /Tc_configtest.c /Fo_configtest.obj
  C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Tools\MSVC\14.28.29910\bin\HostX86\x6

  C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Tools\MSVC\14.28.29910\bin\HostX86\x64\cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -Inumpy\core\include -Ibuild\src.win-amd64-3.8\numpy\core\include/numpy -Inumpy\core\src\private -Inumpy\core\src -Inumpy\core -Inumpy\core\src\npymath -Inumpy\core\src\multiarray -Inumpy\core\src\umath -Inumpy\core\src\npysort -ID:\Anaconda\include -ID:\Anaconda\include -Ibuild\src.win-amd64-3.8\numpy\core\src\private -Ibuild\src.win-amd64-3.8\numpy\core\src\npymath -Ibuild\src.win-amd64-3.8\numpy\core\src\private -Ibuild\src.win-amd64-3.8\numpy\core\src\npymath -Ibuild\src.win-amd64-3.8\numpy\core\src\private -Ibuild\src.win-amd64-3.8\numpy\core\src\npymath -I"C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Tools\MSVC\14.28.29910\include" -I"C:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\ucrt" -I"C:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\shared" -I"C:\Program Files (x86)\Windows Kits\10\i

In [17]:
question_QA_bert_embeddings_list = bert_embedding(QA_questions)
query_QA_bert_embeddings_list = bert_embedding(query_QA_questions)

In [18]:
## store QA bert embeddings in list
question_QA_bert_embeddings = []
for embeddings in question_QA_bert_embeddings_list:
  question_QA_bert_embeddings.append(embeddings[1])

## store query string bert embeddings in list
query_QA_bert_embeddings = []
for embeddings in query_QA_bert_embeddings_list:
  query_QA_bert_embeddings.append(embeddings[1])

In [19]:
# retrieveSimilarFAQ(question_QA_bert_embeddings, query_QA_bert_embeddings, processed_QA_df, "questions", query_QA_df, "test_questions")
retrieve_similar_question(question_QA_bert_embeddings, query_QA_bert_embeddings, preprocessed_data_df, 'Context', processed_test_df, 'test_questions', 'Answer')

****************************************************************************************************
Test Question --> how does covid 19 spread 
Train Question with similarity --> how does covid 19 spread 
Response Answer --> People can catch COVID-19 from others who have the virus. The disease can spread from person to person through small droplets from the nose or mouth which are spread when a person with COVID-19 coughs or exhales. 
****************************************************************************************************
****************************************************************************************************
Test Question --> what are the symptoms of covid 19 
Train Question with similarity --> what are the symptoms of covid 19 
Response Answer --> The most common symptoms of COVID-19 are fever, tiredness, and dry cough. Some patients may have aches and pains, nasal congestion, runny nose, sore throat or diarrhea. These symptoms are usually mild and begin grad