In [63]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.1.2-cp38-cp38-win_amd64.whl (24.0 MB)
Collecting Cython==0.29.23
  Downloading Cython-0.29.23-cp38-cp38-win_amd64.whl (1.7 MB)
Installing collected packages: Cython, gensim
  Attempting uninstall: Cython
    Found existing installation: Cython 0.29.21
    Uninstalling Cython-0.29.21:
      Successfully uninstalled Cython-0.29.21
Successfully installed Cython-0.29.23 gensim-4.1.2


In [1]:
import os, re, io
import pandas as pd
import numpy as np
import nltk

## stopwords
from gensim.parsing.preprocessing import remove_stopwords
## lemma functionality provide by NLTK
from nltk.stem import WordNetLemmatizer
## make sure you downloaded model for lemmatization
#nltk.download('wordnet')
from nltk import word_tokenize
## make sure you downloaded model for tokenization
nltk.download('punkt')
import spacy
nlp = nlp = spacy.load("en_core_web_sm")

from gensim.models import TfidfModel
from gensim.corpora import Dictionary
## cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to C:\Users\Abhijit
[nltk_data]     Morye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [165]:
df = pd.read_excel("WHO_FAQ (1).xlsx")
df

Unnamed: 0,Context,Answer
0,What is a coronavirus?,Coronaviruses are a large family of viruses wh...
1,What is a coronavirus?,"In humans, several coronaviruses are known to ..."
2,What is COVID-19?,COVID-19 is the infectious disease caused by t...
3,What are the symptoms of COVID-19?,The most common symptoms of COVID-19 are fever...
4,What are the symptoms of COVID-19?,Some people become infected but don’t develop ...
...,...,...
80,Are smokers and tobacco users at higher risk o...,Smokers are likely to be more vulnerable to CO...
81,Are smokers and tobacco users at higher risk o...,Smoking products such as water pipes often inv...
82,How large does a meeting or event need to be i...,High profile international sporting events suc...
83,How large does a meeting or event need to be i...,An event counts as a “mass gatherings” if the ...


In [166]:
df.shape

(85, 2)

In [167]:
df.describe().T

Unnamed: 0,count,unique,top,freq
Context,85,42,What can I do to protect myself and prevent th...,7
Answer,85,85,"Remember, a mask should only be used by health...",1


In [168]:
class TextPreprocessing():    
    def __init__(self, data_df, column_name=None):
        self.data_df = data_df
        self.column_name = column_name
        self.processed_column_name = f"processed_{self.column_name}"
        
    def convert_lowercase(self):
        self.data_df.fillna('', inplace=True)
        self.data_df[self.column_name] = self.data_df[self.column_name].apply(lambda column: column.lower())
#         self.data_df = self.data_df.apply(lambda column: column.astype(str).str.lower(), axis=0) 
    
    def remove_special_symbol(self):
        pattern = '[^\w\s]'
        self.data_df[self.column_name] = self.data_df[self.column_name].apply(lambda row: re.sub(pattern, ' ', row))
    
    def remove_stopwords(self):
        for idx, question in enumerate(self.data_df[self.column_name]):
            self.data_df.loc[idx, self.processed_column_name] = remove_stopwords(question)
    
    def apply_lemmatization(self):
        lemma = WordNetLemmatizer()
        for idx, question in enumerate(self.data_df[self.processed_column_name]):
            lemmatized_sentences = []
            doc = nlp(question.strip())
#             print(doc)
            for word in doc:
                lemmatized_sentences.append(word.lemma_)
            self.data_df.loc[idx, self.processed_column_name] = " ".join(lemmatized_sentences)    
            
    def preprocessing(self):
        self.convert_lowercase()
        self.remove_special_symbol()
        self.remove_stopwords()
        self.apply_lemmatization()
        return self.data_df

In [169]:
txp = TextPreprocessing(df.copy(), 'Context')
preprocessed_data_df = txp.preprocessing()
preprocessed_data_df.head(10)

Unnamed: 0,Context,Answer,processed_Context
0,what is a coronavirus,Coronaviruses are a large family of viruses wh...,coronavirus
1,what is a coronavirus,"In humans, several coronaviruses are known to ...",coronavirus
2,what is covid 19,COVID-19 is the infectious disease caused by t...,covid 19
3,what are the symptoms of covid 19,The most common symptoms of COVID-19 are fever...,symptom covid 19
4,what are the symptoms of covid 19,Some people become infected but don’t develop ...,symptom covid 19
5,what are the symptoms of covid 19,Around 1 out of every 6 people who gets COVID-...,symptom covid 19
6,what are the symptoms of covid 19,"Older people, and those with underlying medica...",symptom covid 19
7,how does covid 19 spread,People can catch COVID-19 from others who have...,covid 19 spread
8,how does covid 19 spread,People can also catch COVID-19 if they breathe...,covid 19 spread
9,can the virus that causes covid 19 be transmit...,Studies to date suggest that the virus that ca...,virus cause covid 19 transmit air


## TFIDF Representation        

In [171]:
class TFIDFCalculation():
    def __init__(self, data_df, column_name):
        self.data_df = data_df
        self.column_name = column_name
        self.dictionary = None
        self.model = None
        self.bow = None
        
    def create_tf_idf_model(self):
        sentence_token_list = [sentence.split(" ") for sentence in self.data_df[self.column_name]]
#         print(sentence_token_list)
        self.dictionary = Dictionary(sentence_token_list)
#         print(self.dictionary)
        self.bow = [self.dictionary.doc2bow(sentence_token) for sentence_token in sentence_token_list]
        self.model = TfidfModel(self.bow)
        
        #first 10 representation of TFIDF vectors for dataset
        print("First 10 representation for dataset")
        for idx, question in enumerate(self.data_df[self.column_name]):
            if idx <= 10:
                print(f"{question} --> {self.model[self.bow[idx]]}")
            else:
                break
                
    def get_test_vector(self, test_df, test_column_name):
        test_tf_idf_vector = []
        test_sentence_list = [t_sentence.split(" ") for t_sentence in test_df[test_column_name]]
        test_bow = [self.dictionary.doc2bow(t_sentence_token) for t_sentence_token in test_sentence_list]
        for t_token in test_bow:
            test_tf_idf_vector.append(self.model[t_token])
        return test_tf_idf_vector
    
    def get_training_tf_vector(self):
        train_tf_vector = []
        for bow_token in self.bow:
            train_tf_vector.append(self.model[bow_token])
        return train_tf_vector
    
    def get_train_vocab(self):
        train_vocab = []
        for index in self.dictionary:
            train_vocab.append(self.dictionary[index])
        return train_vocab
        

In [193]:
def retrieve_similar_question(train_tf_vector, test_tf_vector, train_qa_df, train_column_name, test_qa_df, test_column_name, train_answer_column_name):
    
    similar_question_index = []
    for test_index, test_vector in enumerate(test_tf_vector):
        sim_score = -1
        sim_q_index = -1
        
        for train_index, train_vecor in enumerate(train_tf_vector):
            cos_sim_score = cosine_similarity(train_vecor, test_vector)[0][0]
#             print(cos_sim_score)
            
            if sim_score < cos_sim_score:
                sim_score = cos_sim_score
                sim_q_index = train_index
            
        print("*"*100)
        print(f"Test Question --> {test_qa_df[test_column_name].iloc[test_index]}")
        print(f"Train Question with similarity --> {train_qa_df[train_column_name].iloc[sim_q_index]}")
        print(f"Response Answer --> {train_qa_df[train_answer_column_name].iloc[sim_q_index]}")
        print("*"*100)

In [194]:
tfidf = TFIDFCalculation(preprocessed_data_df, 'processed_Context')
tfidf.create_tf_idf_model()
training_qa_tf_vector = tfidf.get_training_tf_vector()
# training_qa_tf_vector

First 10 representation for dataset
coronavirus --> [(0, 1.0)]
coronavirus --> [(0, 1.0)]
covid 19 --> [(1, 0.7071067811865476), (2, 0.7071067811865476)]
symptom covid 19 --> [(1, 0.14127408491077922), (2, 0.14127408491077922), (3, 0.9798383876258594)]
symptom covid 19 --> [(1, 0.14127408491077922), (2, 0.14127408491077922), (3, 0.9798383876258594)]
symptom covid 19 --> [(1, 0.14127408491077922), (2, 0.14127408491077922), (3, 0.9798383876258594)]
symptom covid 19 --> [(1, 0.14127408491077922), (2, 0.14127408491077922), (3, 0.9798383876258594)]
covid 19 spread --> [(1, 0.18071375858322683), (2, 0.18071375858322683), (4, 0.9667911226927182)]
covid 19 spread --> [(1, 0.18071375858322683), (2, 0.18071375858322683), (4, 0.9667911226927182)]
virus cause covid 19 transmit air --> [(1, 0.05001203800998319), (2, 0.05001203800998319), (5, 0.5813219517438333), (6, 0.49062348171196185), (7, 0.5813219517438333), (8, 0.2800281559816603)]
covid 19 catch person symptom --> [(1, 0.0810362336573543), (2

In [195]:
train_tf_vocab = tfidf.get_train_vocab()
train_tf_vocab

['coronavirus',
 '19',
 'covid',
 'symptom',
 'spread',
 'air',
 'cause',
 'transmit',
 'virus',
 'catch',
 'person',
 'disease',
 'fece',
 'prevent',
 'protect',
 '14',
 'andprotection',
 'area',
 'day',
 'measure',
 'past',
 'recently',
 'visit',
 'protection',
 'likely',
 'worry',
 'develop',
 'illness',
 'risk',
 'severe',
 'antibiotic',
 'effective',
 'treating',
 'cure',
 'medicine',
 'therapy',
 'drug',
 'treatment',
 'vaccine',
 'sar',
 'mask',
 'wear',
 'dispose',
 'use',
 'incubation',
 'long',
 'period',
 'animal',
 'human',
 'infect',
 'source',
 'pet',
 'surface',
 'survive',
 'package',
 'receive',
 'report',
 'safe',
 '',
 'know',
 '2',
 'cov',
 'infection',
 'occur',
 'sars',
 'airborne',
 'high',
 'pregnant',
 'woman',
 'm',
 'test',
 'baby',
 'pass',
 'unborn',
 'available',
 'care',
 'childbirth',
 'pregnancy',
 'birth',
 'ceasarean',
 'confirm',
 'need',
 'section',
 'suspect',
 'breastfeed',
 'hold',
 'newborn',
 'touch',
 'influenza',
 'similar',
 'different',
 'i

In [196]:
## Evaluating TF-IDF for test question

In [215]:
test_query_string = ["how does covid-19 spread?", 
                     "What are the symptoms of COVID-19?",
                "Should I wear a mask to protect myself from covid-19",              
                "Is there a vaccine for COVID-19",
                "can the virus transmit through air?",
                "can the virus spread through air?"]
# test_query_string = ['covid 19']
test_df = pd.DataFrame(test_query_string, columns=['test_questions'])
test_text_preprocessor = TextPreprocessing(test_df, 'test_questions')
processed_test_df = test_text_preprocessor.preprocessing()
test_qa_vector = tfidf.get_test_vector(test_df, 'test_questions')
test_qa_vector

[[(1, 0.09659675999196742),
  (2, 0.09659675999196742),
  (4, 0.5167779740362349),
  (58, 0.8451500798492022)],
 [(1, 0.11283093631740237),
  (2, 0.11283093631740237),
  (58, 0.9871870945365303)],
 [(1, 0.07964811034113145),
  (2, 0.07964811034113145),
  (14, 0.39129299178843197),
  (40, 0.5904113567462889),
  (41, 0.6968619529293003)],
 [(1, 0.12314261383935593),
  (2, 0.12314261383935593),
  (38, 0.9847191443826114)],
 [(5, 0.5977616450300318),
  (7, 0.5977616450300318),
  (8, 0.28794730815203434),
  (58, 0.44994263988866146)],
 [(4, 0.3246026496397085),
  (5, 0.7052657533629679),
  (8, 0.33973303055012904),
  (58, 0.5308623220134117)]]

In [210]:
retrieve_similar_question(training_qa_tf_vector, test_qa_vector, preprocessed_data_df, 'Context', processed_test_df, 'test_questions', 'Answer')

****************************************************************************************************
Test Question --> how does covid 19 spread 
Train Question with similarity --> how likely am i to catch covid 19 
Response Answer --> The risk depends on where you  are - and more specifically, whether there is a COVID-19 outbreak unfolding there.
****************************************************************************************************
****************************************************************************************************
Test Question --> what are the symptoms of covid 19 
Train Question with similarity --> should i worry about covid 19
Response Answer --> Illness due to COVID-19 infection is generally mild, especially for children and young adults. However, it can cause serious illness: about 1 in every 5 people who catch it need hospital care.
****************************************************************************************************
*****************

In [203]:
## BERT Embedding
!pip install bert-embedding

Collecting bert-embedding
  Using cached bert_embedding-1.0.1-py3-none-any.whl (13 kB)
Collecting typing==3.6.6
  Using cached typing-3.6.6-py3-none-any.whl (25 kB)
Collecting mxnet==1.4.0
  Using cached mxnet-1.4.0-py2.py3-none-win_amd64.whl (21.9 MB)
Processing c:\users\abhijit morye\appdata\local\pip\cache\wheels\75\d0\a1\57ea55532e4ff6e3efbec7a851724a8f7a5b073ff648dd4160\gluonnlp-0.6.0-py3-none-any.whl
Collecting requests<2.19.0,>=2.18.4
  Using cached requests-2.18.4-py2.py3-none-any.whl (88 kB)


ERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

yfinance 0.1.67 requires numpy>=1.15, but you'll have numpy 1.14.6 which is incompatible.
yfinance 0.1.67 requires requests>=2.20, but you'll have requests 2.18.4 which is incompatible.
tweepy 4.5.0 requires requests<3,>=2.27.0, but you'll have requests 2.18.4 which is incompatible.
transformers 4.13.0 requires numpy>=1.17, but you'll have numpy 1.14.6 which is incompatible.
tensorboard 2.8.0 requires requests<3,>=2.21.0, but you'll have requests 2.18.4 which is incompatible.
spacy 3.2.1 requires numpy>=1.15.0, but you'll have numpy 1.14.6 which is incompatible.
huggingface-hub 0.2.1 requires packaging>=20.9, but you'll have packaging 20.4 which is incompatible.
fin-news 0.1.2 requires requests>=

Collecting graphviz<0.9.0,>=0.8.1
  Using cached graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Collecting urllib3<1.23,>=1.21.1
  Using cached urllib3-1.22-py2.py3-none-any.whl (132 kB)
Collecting idna<2.7,>=2.5
  Using cached idna-2.6-py2.py3-none-any.whl (56 kB)
Installing collected packages: typing, urllib3, idna, requests, graphviz, mxnet, gluonnlp, bert-embedding
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.25.11
    Uninstalling urllib3-1.25.11:
      Successfully uninstalled urllib3-1.25.11
  Attempting uninstall: idna
    Found existing installation: idna 2.10
    Uninstalling idna-2.10:
      Successfully uninstalled idna-2.10
  Attempting uninstall: requests
    Found existing installation: requests 2.27.1
    Uninstalling requests-2.27.1:
      Successfully uninstalled requests-2.27.1
Successfully installed bert-embedding-1.0.1 gluonnlp-0.6.0 graphviz-0.8.4 idna-2.6 mxnet-1.4.0 requests-2.18.4 typing-3.6.6 urllib3-1.22


In [211]:
from bert_embedding import BertEmbedding
bert_embedding = BertEmbedding()
QA_questions = preprocessed_data_df["Context"].to_list()
query_QA_questions = test_df["test_questions"].to_list()



In [212]:
question_QA_bert_embeddings_list = bert_embedding(QA_questions)
query_QA_bert_embeddings_list = bert_embedding(query_QA_questions)

In [213]:
## store QA bert embeddings in list
question_QA_bert_embeddings = []
for embeddings in question_QA_bert_embeddings_list:
  question_QA_bert_embeddings.append(embeddings[1])

## store query string bert embeddings in list
query_QA_bert_embeddings = []
for embeddings in query_QA_bert_embeddings_list:
  query_QA_bert_embeddings.append(embeddings[1])

In [214]:
# retrieveSimilarFAQ(question_QA_bert_embeddings, query_QA_bert_embeddings, processed_QA_df, "questions", query_QA_df, "test_questions")
retrieve_similar_question(question_QA_bert_embeddings, query_QA_bert_embeddings, preprocessed_data_df, 'Context', processed_test_df, 'test_questions', 'Answer')

****************************************************************************************************
Test Question --> how does covid 19 spread 
Train Question with similarity --> how does covid 19 spread 
Response Answer --> People can catch COVID-19 from others who have the virus. The disease can spread from person to person through small droplets from the nose or mouth which are spread when a person with COVID-19 coughs or exhales. 
****************************************************************************************************
****************************************************************************************************
Test Question --> what are the symptoms of covid 19 
Train Question with similarity --> what are the symptoms of covid 19 
Response Answer --> The most common symptoms of COVID-19 are fever, tiredness, and dry cough. Some patients may have aches and pains, nasal congestion, runny nose, sore throat or diarrhea. These symptoms are usually mild and begin grad