In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

#gsheet read db
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow, Flow 
from google.auth.transport.requests import Request 

import os, sys, pickle

%matplotlib inline

# 1. Fetch Data

In [2]:
# Gsheet read to pdf
def gsheetRead_GoogleWay(dpath):
    results = None
    
    scope =  ['https://www.googleapis.com/auth/spreadsheets'] 

    creds = None
    if os.path.exists( 'token.pickle'):
        with open( 'token.pickle', 'rb') as fd:
            creds = pickle.load( fd ) 
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh( Request() ) 
        else:
            flow = InstalledAppFlow.from_client_secrets_file( 'gsheet_get.json', scope) 
            creds = flow.run_local_server(port=0)
        with open('token.pickle', 'wb') as fd:
            pickle.dump( creds, fd) 
    
    service = build('sheets', 'v4', credentials=creds, cache_discovery=False) 

    sheet = service.spreadsheets()
    reader = sheet.values().get(spreadsheetId=dpath[0], range=dpath[1]).execute() 

    results = reader.get('values', None)  

    return results  
''

'''
Returns two dicts : responses_db(class_category : response_paragraph) and input_phrases(input_que : class_category )
'''
def unpack_FaqGsheet(db_path, training_set, removeHeader=True):     
    start_idx = 1 if removeHeader else 0
    ## 1. unpack responses set @ retrieval class_cat : response 
    gsheet_faq_db = {} 
    tmp = gsheetRead_GoogleWay( db_path )[start_idx: ] ## ignore header row TODO: refactor at caller to decide
    for row in tmp:
        if len(row) > 2:
            gsheet_faq_db[ row[1] ] = row[2] 

    ## 2. unpack training set  que : class_cat       
    gsheet_faq_training_set_db = {}
    tmp = gsheetRead_GoogleWay( training_set )[start_idx: ]
    for row in tmp:
        if len(row) > 1:
            gsheet_faq_training_set_db[ row[0] ] = row[1] 

    return gsheet_faq_db, gsheet_faq_training_set_db 


db_pathz = [ ('1EuvcPe9WXSQTsmSqhq0LWJG4xz2ZRQ1FEdnQ_LQ-_Ks', 'FAQ responses!A1:G1000'), ('1EuvcPe9WXSQTsmSqhq0LWJG4xz2ZRQ1FEdnQ_LQ-_Ks', 'Classify_Phrases!A1:G1000')]


In [3]:
res_path, que_path = db_pathz

dict_responses_db, dict_inputs_db = unpack_FaqGsheet( res_path, que_path)

print( "Loaded {} responses and {} training input text".format(len(dict_responses_db), len(dict_inputs_db)))

Loaded 91 responses and 141 training input text


In [4]:
# make pandas data frame
db_faq = pd.DataFrame( dict_responses_db.items() )
db_faq.columns = ['class_category', 'response_paragraph']
db_faq.head()

Unnamed: 0,class_category,response_paragraph
0,pandemic_define,A pandemic is an epidemic (infectious disease ...
1,pandemic_causes,A pandemic can occur when a new virus emerges ...
2,pandemic_WHO,On 11 March WHO declared COVID-19 a pandemic. ...
3,pandemic_impact,The health impact of a pandemic on the communi...
4,corana_viruses,Coronaviruses are a large family of viruses. S...


In [5]:
# make pandas data frame
db_training = pd.DataFrame( dict_inputs_db.items() )
db_training.columns = [ 'input_text', 'class_category']
db_training.head()

Unnamed: 0,input_text,class_category
0,what is a pandemic?,pandemic_define
1,what causes pandemics?,pandemic_causes
2,what does it mean that WHO has declared a pand...,pandemic_WHO
3,Why do pandemics occur?,pandemic_why
4,How serious will the impact be?,pandemic_impact


# 2. Clean and Tokenize/Vectorize

In [49]:
import nltk
import string , re 
from nltk.corpus import stopwords
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

In [7]:

'''
Use nltk english puctuations dict. 
Replace all punctions with None

Input: dataset to be operated on. Must be a list (or array or vector)
Output: lower case word tokens after operation 
'''
def wordTokenizeWithoutPunctuations(dataset):
    punctuations = dict( (ord(p), None) for p in string.punctuation )
    text = dataset if isinstance(dataset, str) else " ".join(dataset)
    return nltk.word_tokenize( text.lower().translate( punctuations) ) 

'''
Use WordNetLemmatizer english dict

Input: dataset to be operated on
Output: dataset after operation 
'''
def lemmatizeTokens(tokenz ):
    lemertizer = nltk.stem.WordNetLemmatizer() 
    tokenz =  wordTokenizeWithoutPunctuations( tokenz )  
    return sorted( [ lemertizer.lemmatize( token )  for token in tokenz ] ) 

'''
Work on paragraphs and 
Return sentence as opposed to a list of tokens 
'''
def lemmatizeSentences( sentz_list):
    result = []
    sentz_list = sentz_list if isinstance(sentz_list, list) else nltk.sent_tokenize( sentz_list ) 
    for sent in sentz_list:
        result.extend( lemmatizeTokens(sent) )
    return " ".join(result )

lemmatizeSentences( "The quick brown fox jumped over the lazy dogs. The dog had 23 bones. This was a surprise to us.")

'brown dog fox jumped lazy over quick the the 23 bone dog had the a surprise this to u wa'

In [8]:
## 1. Tokenize input strings
db_training['lemma_sentences'] = db_training.input_text.apply( lambda x: lemmatizeSentences(x) )
db_training.head()

Unnamed: 0,input_text,class_category,lemma_sentences
0,what is a pandemic?,pandemic_define,a is pandemic what
1,what causes pandemics?,pandemic_causes,cause pandemic what
2,what does it mean that WHO has declared a pand...,pandemic_WHO,a declared doe ha it mean pandemic that what who
3,Why do pandemics occur?,pandemic_why,do occur pandemic why
4,How serious will the impact be?,pandemic_impact,be how impact serious the will


In [9]:
# 2. Vectorize tokenized input strings
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3))
cv_out = count_vectorizer.fit_transform( db_training.lemma_sentences ) 
# print( count_vectorizer.get_feature_names() )
cv_out

db_training = db_training.join( pd.DataFrame( 
        cv_out.toarray(), 
        columns=count_vectorizer.get_feature_names(), index=db_training.index
        ) )

db_training.to_csv('training_matrix.csv')

db_training.head(2)

## TODO: min_df and max_df=1 to remove terms that appear too (in)frequently << corpus specific stop words

Unnamed: 0,input_text,class_category,lemma_sentences,14,14 day,14 day doctor,14 day finishing,19,19 corana,19 corana virus,...,visit,warm,warm weather,wash,water,weather,woman,work,worried,young
0,what is a pandemic?,pandemic_define,a is pandemic what,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,what causes pandemics?,pandemic_causes,cause pandemic what,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
pickle.dump(count_vectorizer, open('count_vectorizer.pkl', 'wb') )
db_faq.to_pickle( 'faq_responses.pkl' )
db_training.to_pickle( 'training_set.pkl' )

# 3. Exploration (EDA)
Seek apparent patterns and areas for further cleaning/preprocessing

- Most commond words; threshold to add them to the stopwords and update the CV_matrix
- Vocabulary 

In [38]:
from collections import Counter # has common_words 

dftmp = pd.DataFrame( db_training.iloc[:, 3:].sum() ) 
# print( list(dftmp.index )  )
dftmp.head()
dftmp[0].sort_values(ascending=False).head(30)

covid19         36
virus           27
risk            10
doe              9
child            9
spread           8
sanitizer        8
hand             7
safe             6
pet              6
tested           6
im               6
people           6
infected         6
corana           6
school           6
protect          6
use              6
infection        5
covid19 doe      5
home             5
long             5
spread virus     4
family           4
isolate          4
test             4
make             4
self             4
isolation        4
person           4
Name: 0, dtype: int64

In [42]:
## get top 30 words for each category
top_dict = {}
words = []
class_cats = db_training.class_category.unique()

for class_cat in class_cats:
    dftmp =  pd.DataFrame( db_training[ db_training.class_category == class_cat ].iloc[:, 3:].sum())
    top = dftmp[0].sort_values(ascending=False).head(30)     
    top_dict[ "{}".format(class_cat) ] = list(zip(top.index, top.values) ) #[  ]
        
print( top_dict['covid19_define'])
    

[('virus', 3), ('corana', 2), ('ncov19 virus', 1), ('ncov19', 1), ('covid19 virus', 1), ('19', 1), ('19 corana', 1), ('19 corana virus', 1), ('rona', 1), ('corana virus', 1), ('covid19', 1), ('diy okay', 0), ('diy', 0), ('young', 0), ('distancing social', 0), ('doctor', 0), ('doctor finishing', 0), ('doctor finishing isolation', 0), ('doctor im', 0), ('diy okay sanitizer', 0), ('distancing practice', 0), ('distancing practice social', 0), ('doe', 0), ('distancing', 0), ('disability higher people', 0), ('disability higher', 0), ('disability', 0), ('different symptom', 0), ('different', 0), ('difference flu virus', 0)]


In [44]:
 ## Visualize top 10 words per category
for class_cat, top_words in top_dict.items():
    print( "{} \t{}".format(class_cat, ", ".join([ w for w, c in top_words[0:10]])))


pandemic_define 	pandemic, young, distancing practice social, doe ha, doe, doctor im speak, doctor im, doctor finishing isolation, doctor finishing, doctor
pandemic_causes 	pandemic, cause pandemic, cause, young, diy, doe ha mean, doe ha, doe, doctor im speak, doctor im
pandemic_WHO 	declared doe, ha mean, mean pandemic, doe, doe ha, doe ha mean, ha mean pandemic, ha, declared doe ha, declared
pandemic_why 	occur, pandemic, occur pandemic, young, distancing social, doe, doctor im speak, doctor im, doctor finishing isolation, doctor finishing
pandemic_impact 	impact, young, distancing social, doe ha, doe, doctor im speak, doctor im, doctor finishing isolation, doctor finishing, doctor
corana_viruses 	corona virus, corona, virus, young, distancing social, doe, doctor im speak, doctor im, doctor finishing isolation, doctor finishing
covid19_define 	virus, corana, ncov19 virus, ncov19, covid19 virus, 19, 19 corana, 19 corana virus, rona, corana virus
covid19_spread 	spread, spread virus, c

In [45]:
## add most common words to stop_words
words = []
for class_cat in top_dict.keys():
    top = [ w for (w, c) in top_dict[class_cat] ]
    for t in top:
        words.append( t )

words[:3]

['pandemic', 'young', 'distancing practice social']

In [46]:
Counter( words ).most_common()

[('diy', 81),
 ('doctor', 80),
 ('diy okay', 80),
 ('distancing practice', 79),
 ('doctor finishing', 77),
 ('diy okay sanitizer', 77),
 ('distancing', 76),
 ('distancing practice social', 74),
 ('distancing social', 74),
 ('doctor finishing isolation', 72),
 ('disability higher people', 72),
 ('disability higher', 72),
 ('disability', 70),
 ('different symptom', 67),
 ('doctor im', 64),
 ('different', 61),
 ('doe', 57),
 ('doctor im speak', 56),
 ('difference flu virus', 50),
 ('difference flu', 46),
 ('doe ha', 45),
 ('young', 44),
 ('doe ha mean', 36),
 ('difference', 31),
 ('covid19', 31),
 ('doe heat', 28),
 ('doe heat prevent', 24),
 ('died passed risk', 22),
 ('virus', 20),
 ('doe infection', 12),
 ('died passed', 10),
 ('child', 9),
 ('risk', 8),
 ('spread', 7),
 ('died', 6),
 ('dog make', 6),
 ('im', 6),
 ('covid19 doe', 5),
 ('infected', 5),
 ('people', 5),
 ('protect', 5),
 ('use', 5),
 ('school', 5),
 ('pandemic', 4),
 ('drinking public safe', 4),
 ('corana', 4),
 ('spread 

In [48]:
add_stop_words = [ w for (w, c) in Counter(words).most_common() if c > 20]
add_stop_words[ : 5]

['diy', 'doctor', 'diy okay', 'distancing practice', 'doctor finishing']

In [54]:

add_stop_words = text.ENGLISH_STOP_WORDS.union( add_stop_words)
new_cv = CountVectorizer( stop_words = add_stop_words, ngram_range=(1,3) )
data_cv = new_cv.fit_transform( db_training.lemma_sentences )

db_training.drop( db_training.iloc[:, 3:], axis=1, inplace=True)

db_training = db_training.join( pd.DataFrame( 
        data_cv.toarray(), 
        columns=new_cv.get_feature_names(), index=db_training.index
        ) )

db_training.to_csv('training_matrix.csv')

db_training.head(2)

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,input_text,class_category,lemma_sentences,14,14 day,14 day finishing,19,19 corana,19 corana virus,access,...,virus worried,visit,warm,warm weather,wash,water,weather,woman,work,worried
0,what is a pandemic?,pandemic_define,a is pandemic what,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,what causes pandemics?,pandemic_causes,cause pandemic what,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:

dftmp = pd.DataFrame( db_training.iloc[:, 3:].sum() ) 
dftmp[0].sort_values(ascending=False).head(30)

virus           27
risk            10
child            9
sanitizer        8
spread           8
hand             7
safe             6
tested           6
use              6
im               6
infected         6
people           6
pet              6
school           6
protect          6
corana           6
infection        5
long             5
home             5
isolate          4
pandemic         4
person           4
test             4
family           4
self             4
isolation        4
ha               4
face             4
spread virus     4
contact          4
Name: 0, dtype: int64

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [93]:
removeStopWords = True 
'''
TODO: alternative similarity measures Vs acurracy

Input: a doc/sentence observation to find matching docs for
Return: Doc with highest matching score or None if nothing found
TODO: set a threshold for matching scores; don't just use max
'''
def predict(observation):                 
    sent_tokenz = list( db_training.lemma_sentences )
    sent_class_cat = list( db_training.class_category )
    sent_tokenz.append( observation )       
    tfidf = model.fit_transform( sent_tokenz ) 

    valz = cosine_similarity( tfidf[-1], tfidf) 
    idx = valz.argsort()[0][-2] 
    flatz = valz.flatten()
    flatz.sort()

    resp = flatz[-2]

    if resp == 0:
        return (None, None)
    else:
        # return "{}\n\t{}".format(sent_tokenz[ idx ] , sent_tokenz[ idx+1])
        return  sent_class_cat[idx], "{} <<< {}".format( sent_class_cat[idx], sent_tokenz[ idx ] )

In [96]:
quez = ['What is corana?', 'what is covid virus', 'what is covid-19 virus',  'What is COVID-19', 
        'What is pandemic?', 'Can my cat infect me', 'can an insect infect me', 'should i get my pet tested', 
       'how do i protect myself', 'is my child at risk', 'is my old man at risk', 
        'can i go swimming', 'is bottled water a risk', 'is bottled drinking water a risk', 
        'is it safe to drink bottled water', 'is public water bad', 'are frozen foods safe', 
       'can i get infect from handling dead bodies of previously infected persons', 
       'do i have corona']

labelz = [
    'covid19_define', 'covid19_define', 'covid19_define', 'covid19_define', 
    'pandemic_define', 'pets_infection_cdc', 'covid19_spread_insects', 'pets_infection_cdc',
    'covid19_self_protect', 'covid19_at_risk_kids', 'covid19_at_risk',
    'covid19_swimming', 'covid19_public_water', 'covid19_public_water', 
    'covid19_public_water', 'çovid19_public_water', 'covid19_spread_foods',
    'covid19_dead',
    'covid19_symptoms'
]

In [97]:
## With corpus specific stop words

model = TfidfVectorizer(
            tokenizer = lemmatizeTokens,
            stop_words = add_stop_words if removeStopWords else None,
            ngram_range = (1,3),             
        )

predicted = []

for i, que in enumerate(quez):
    cat, pred = predict( que )
    predicted.append( cat )
    print( "{}. [{}] {} ====> {}".format( i, (cat == labelz[i]), que, pred))
    
predicted = np.array(predicted) 
labelz = np.array(labelz ) 
y_true = len( predicted[ predicted == labelz]) 
n = len(labelz) 

print( 'y_true = {} \tn={} \taccuracy = {}%'.format( y_true, n, (y_true*100/n) ) )

  'stop_words.' % sorted(inconsistent))


0. [True] What is corana? ====> covid19_define <<< corana is what
1. [True] what is covid virus ====> covid19_define <<< covid19 is virus what
2. [True] what is covid-19 virus ====> covid19_define <<< covid19 is virus what
3. [False] What is COVID-19 ====> None
4. [True] What is pandemic? ====> pandemic_define <<< a is pandemic what
5. [True] Can my cat infect me ====> pets_infection_cdc <<< cat dog infect me my or or other pet will
6. [True] can an insect infect me ====> covid19_spread_insects <<< a bedbug can cockroach covid19 infect insect mite mosquito or or spread such the tick u virus with
7. [False] should i get my pet tested ====> testing_who <<< be i should tested
8. [True] how do i protect myself ====> covid19_self_protect <<< can how i myself protect
9. [True] is my child at risk ====> covid19_at_risk_kids <<< becoming child covid19 is my of risk sick the what with
10. [True] is my old man at risk ====> covid19_at_risk <<< at is most risk who
11. [True] can i go swimming ===

In [95]:
## same thing with default stop words

model = TfidfVectorizer(
            tokenizer = lemmatizeTokens,
            stop_words = 'english' if removeStopWords else None,
            ngram_range = (1,3),             
        )

predicted = []

for i, que in enumerate(quez):
    cat, pred = predict( que )
    predicted.append( cat )
    print( "{}. [{}] {} ====> {}".format( i, (cat == labelz[i]), que, pred))
    
predicted = np.array(predicted) 
labelz = np.array(labelz ) 
y_true = len( predicted[ predicted == labelz]) 
n = len(labelz) 

print( 'y_true = {} \tn={} \taccuracy = {}%'.format( y_true, n, (y_true*100/n) ) )

  'stop_words.' % sorted(inconsistent))


0. [True] What is corana? ====> covid19_define <<< corana is what
1. [True] what is covid virus ====> covid19_define <<< covid19 is virus what
2. [True] what is covid-19 virus ====> covid19_define <<< covid19 is virus what
3. [True] What is COVID-19 ====> covid19_define <<< covid19 is virus what
4. [True] What is pandemic? ====> pandemic_define <<< a is pandemic what
5. [True] Can my cat infect me ====> pets_infection_cdc <<< cat dog infect me my or or other pet will
6. [True] can an insect infect me ====> covid19_spread_insects <<< a bedbug can cockroach covid19 infect insect mite mosquito or or spread such the tick u virus with
7. [False] should i get my pet tested ====> testing_who <<< be i should tested
8. [True] how do i protect myself ====> covid19_self_protect <<< can how i myself protect
9. [False] is my child at risk ====> covid19_at_risk <<< at is most risk who
10. [True] is my old man at risk ====> covid19_at_risk <<< at is most risk who
11. [True] can i go swimming ====> co

In [98]:
# no n-grams
model = TfidfVectorizer(
            tokenizer = lemmatizeTokens,
            stop_words = 'english' if removeStopWords else None,
        )

predicted = []

for i, que in enumerate(quez):
    cat, pred = predict( que )
    predicted.append( cat )
    print( "{}. [{}] {} ====> {}".format( i, (cat == labelz[i]), que, pred))
    
predicted = np.array(predicted) 
labelz = np.array(labelz ) 
y_true = len( predicted[ predicted == labelz]) 
n = len(labelz) 

print( 'y_true = {} \tn={} \taccuracy = {}%'.format( y_true, n, (y_true*100/n) ) )

  'stop_words.' % sorted(inconsistent))


0. [True] What is corana? ====> covid19_define <<< corana is what
1. [True] what is covid virus ====> covid19_define <<< covid19 is virus what
2. [True] what is covid-19 virus ====> covid19_define <<< covid19 is virus what
3. [True] What is COVID-19 ====> covid19_define <<< covid19 is virus what
4. [True] What is pandemic? ====> pandemic_define <<< a is pandemic what
5. [True] Can my cat infect me ====> pets_infection_cdc <<< cat dog infect me my or or other pet will
6. [True] can an insect infect me ====> covid19_spread_insects <<< a bedbug can cockroach covid19 infect insect mite mosquito or or spread such the tick u virus with
7. [False] should i get my pet tested ====> testing_who <<< be i should tested
8. [True] how do i protect myself ====> covid19_self_protect <<< can how i myself protect
9. [True] is my child at risk ====> covid19_at_risk_kids <<< becoming child covid19 is my of risk sick the what with
10. [True] is my old man at risk ====> covid19_at_risk <<< are at more old p

In [99]:
# no n-grams & corpus specific stop_words
model = TfidfVectorizer(
            tokenizer = lemmatizeTokens,
            stop_words = add_stop_words if removeStopWords else None,
        )

predicted = []

for i, que in enumerate(quez):
    cat, pred = predict( que )
    predicted.append( cat )
    print( "{}. [{}] {} ====> {}".format( i, (cat == labelz[i]), que, pred))
    
predicted = np.array(predicted) 
labelz = np.array(labelz ) 
y_true = len( predicted[ predicted == labelz]) 
n = len(labelz) 

print( 'y_true = {} \tn={} \taccuracy = {}%'.format( y_true, n, (y_true*100/n) ) )

  'stop_words.' % sorted(inconsistent))


0. [True] What is corana? ====> covid19_define <<< corana is what
1. [True] what is covid virus ====> covid19_define <<< covid19 is virus what
2. [True] what is covid-19 virus ====> covid19_define <<< covid19 is virus what
3. [False] What is COVID-19 ====> None
4. [True] What is pandemic? ====> pandemic_define <<< a is pandemic what
5. [True] Can my cat infect me ====> pets_infection_cdc <<< cat dog infect me my or or other pet will
6. [True] can an insect infect me ====> covid19_spread_insects <<< a bedbug can cockroach covid19 infect insect mite mosquito or or spread such the tick u virus with
7. [False] should i get my pet tested ====> testing_who <<< be i should tested
8. [True] how do i protect myself ====> covid19_self_protect <<< can how i myself protect
9. [True] is my child at risk ====> covid19_at_risk_kids <<< becoming child covid19 is my of risk sick the what with
10. [True] is my old man at risk ====> covid19_at_risk <<< are at more old people risk
11. [True] can i go swim

In [100]:
# no n-grams, english stopwords only and default tokenizer 
model = TfidfVectorizer(
            stop_words = 'english' if removeStopWords else None,
        )

predicted = []

for i, que in enumerate(quez):
    cat, pred = predict( que )
    predicted.append( cat )
    print( "{}. [{}] {} ====> {}".format( i, (cat == labelz[i]), que, pred))
    
predicted = np.array(predicted) 
labelz = np.array(labelz ) 
y_true = len( predicted[ predicted == labelz]) 
n = len(labelz) 

print( 'y_true = {} \tn={} \taccuracy = {}%'.format( y_true, n, (y_true*100/n) ) )

0. [True] What is corana? ====> covid19_define <<< corana is what
1. [True] what is covid virus ====> covid19_define <<< covid19 is virus what
2. [True] what is covid-19 virus ====> covid19_define <<< 19 corana is virus what
3. [True] What is COVID-19 ====> covid19_define <<< 19 corana is virus what
4. [True] What is pandemic? ====> pandemic_define <<< a is pandemic what
5. [True] Can my cat infect me ====> pets_infection_cdc <<< cat dog infect me my or or other pet will
6. [True] can an insect infect me ====> covid19_spread_insects <<< a bedbug can cockroach covid19 infect insect mite mosquito or or spread such the tick u virus with
7. [False] should i get my pet tested ====> testing_who <<< be i should tested
8. [True] how do i protect myself ====> covid19_self_protect <<< can how i myself protect
9. [True] is my child at risk ====> covid19_at_risk_kids <<< becoming child covid19 is my of risk sick the what with
10. [True] is my old man at risk ====> covid19_at_risk <<< are at more o

In [101]:
## zero effort
model = TfidfVectorizer(       )

predicted = []

for i, que in enumerate(quez):
    cat, pred = predict( que )
    predicted.append( cat )
    print( "{}. [{}] {} ====> {}".format( i, (cat == labelz[i]), que, pred))
    
predicted = np.array(predicted) 
labelz = np.array(labelz ) 
y_true = len( predicted[ predicted == labelz]) 
n = len(labelz) 

print( 'y_true = {} \tn={} \taccuracy = {}%'.format( y_true, n, (y_true*100/n) ) )

0. [True] What is corana? ====> covid19_define <<< corana is what
1. [True] what is covid virus ====> covid19_define <<< covid19 is virus what
2. [True] what is covid-19 virus ====> covid19_define <<< 19 corana is virus what
3. [True] What is COVID-19 ====> covid19_define <<< 19 corana is virus what
4. [True] What is pandemic? ====> pandemic_define <<< a is pandemic what
5. [True] Can my cat infect me ====> pets_infection_cdc <<< cat dog infect me my or or other pet will
6. [False] can an insect infect me ====> pets_infection_cdc <<< cat dog infect me my or or other pet will
7. [False] should i get my pet tested ====> testing_who <<< can get i tested
8. [True] how do i protect myself ====> covid19_self_protect <<< can how i myself protect
9. [False] is my child at risk ====> covid19_at_risk <<< at is most risk who
10. [True] is my old man at risk ====> covid19_at_risk <<< are at more old people risk
11. [True] can i go swimming ====> covid19_swimming <<< can go i swimming
12. [True] is