In [None]:
# ! conda install -c conda-forge spacy
# ! python -m spacy download en_core_web_sm 

In [13]:
import spacy 
import nltk
from itertools import product
import csv, sqlite3
from spacy.lookups import Lookups
import re

# Shared spaCy pipeline (small English model); every later cell parses text through this `nlp` object.
nlp = spacy.load('en_core_web_sm') 

In [14]:
# sentence = "What are total death cases in US on 27-09-2020?"
# sentence = "number of cities with total cases greater than 1000?"

def print_entities(sentence):
    """Print the named entities spaCy detects in `sentence`.

    Output is a "----> Entities:" header followed by one line per entity
    giving its text, start/end character offsets and entity label.
    """
    parsed = nlp(sentence)
    print("----> Entities:")
    for entity in parsed.ents:
        fields = (entity.text, entity.start_char, entity.end_char, entity.label_)
        print("-------->", *fields)

def print_tokens(sentence):
    """Print every token of `sentence` with its part-of-speech and dependency tags.

    Output is a "----> Tokens:" header followed by one line per token
    giving its text, coarse POS tag and dependency label.
    """
    parsed = nlp(sentence)
    print("----> Tokens:")
    for tok in parsed:
        fields = (tok.text, tok.pos_, tok.dep_)
        print("-------->", *fields)

In [15]:
# Load the sample questions: one query per line, blank lines skipped.
queries = []
with open("../possible-questions.txt", "r") as f:
    for line in f:
        # Strip only the line terminator. Slicing off the last character
        # (line[:-1]) would truncate the final query when the file does not
        # end with a newline.
        line = line.rstrip("\n")
        if line:
            queries.append(line)

In [16]:
def process_query(query):
    """Dump spaCy's analysis of `query` to stdout.

    Prints, in order: the named entities, the tokens with their tags, the
    list of noun-chunk texts, and a trailing blank line.
    """
    print_entities(query)
    print_tokens(query)
    noun_chunk_texts = []
    for chunk in nlp(query).noun_chunks:
        noun_chunk_texts.append(chunk.text)
    print(noun_chunk_texts)
    print()

In [None]:
# Quick look at what the lemmatizer produces for a single word.
doc = nlp('found')
query = ' '.join(token.lemma_ for token in doc)
query

In [None]:
# Extra words flagged as stop words on the shared vocabulary.
additional_stopwords = ['case', 'find', 'covid', 'coronavirus', 'covid-19', 'covid19', 'world']
for stopword in additional_stopwords:
    nlp.vocab[stopword].is_stop = True

# Base word -> the word forms that should be rewritten to it.
assign_base_words = {
    'recover': ['recover', 'recovery', 'cure', 'heal'],
    'death': ['death', 'fatality', 'fatal', 'demise', 'decease', 'die', 'expire'],
    'confirm': ['confirm'],
}

# Inverted lookup: word form -> base word.
reverse_base_word_dict = {
    variant: base_word
    for base_word, variants in assign_base_words.items()
    for variant in variants
}

# table = nlp.vocab.lookups.get_table("lemma_lookup")
# for base, l in assign_base_words.items():
#     for item in l:
#         table[item]=base

# doc=nlp("recovery")
# print(doc[0].lemma_)

In [None]:
def remove_unnecessary(query):
    """Reduce `query` to its normalised content words.

    Steps:
      1. Cut out every span spaCy labels as a GPE or DATE entity.
      2. Lower-case the remainder and lemmatize it.
      3. Rewrite each lemma through the notebook-level
         ``reverse_base_word_dict`` (word form -> base word).
      4. Drop stop words, prepositions and punctuation.

    Every step works on whole tokens or exact character spans. Substring
    replacement would corrupt unrelated words: 'heal' would rewrite the
    inside of 'health', and removing the stop word 'a' would delete every
    letter "a" in the query.

    Args:
        query: The raw question text.

    Returns:
        The remaining tokens joined by single spaces (possibly "").

    Side effects:
        Prints the query as it looks after entity removal.
    """
    doc = nlp(query)

    # Remove entity spans back to front so earlier offsets stay valid.
    for ent in sorted(doc.ents, key=lambda e: e.start_char, reverse=True):
        if ent.label_ in ('GPE', 'DATE'):
            query = query[:ent.start_char] + query[ent.end_char:]
    print(query)

    # Lemmatize, then map each whole lemma onto its base word (if any).
    lemmas = [token.lemma_ for token in nlp(query.lower())]
    normalised = ' '.join(reverse_base_word_dict.get(lemma, lemma) for lemma in lemmas)

    # Re-parse so stop-word and dependency tags refer to the normalised text.
    kept = [
        token.text
        for token in nlp(normalised)
        if not (token.is_stop or token.dep_ in ('prep', 'punct'))
    ]
    return ' '.join(kept)

In [None]:
# Echo each sample query, then print its entities, tokens and noun chunks.
for query in queries:
    print(query)
    process_query(query)
    # Disabled: reduce the query to its content words and show the result.
    # processed_query = remove_unnecessary(query)
    # print(processed_query)
    print()

In [None]:
from nltk.corpus import wordnet

# Explore WordNet: show every synset of "most" with its lemmas, and collect
# all lemma names into `synonyms`.
synonyms = []

for synset in wordnet.synsets("most"):
    synset_lemmas = synset.lemmas()
    print(synset.name())
    print(synset_lemmas)
    synonyms.extend(lemma.name() for lemma in synset_lemmas)
print(set(synonyms))

In [None]:
from nltk.corpus import wordnet 

def caseTypeProbability(sent):
    """Guess which case type `sent` refers to via WordNet similarity.

    Each token of the sentence is compared with each of the case words
    "death", "recovery", "active" and "confirm": every pair of their WordNet
    synsets is scored with Wu-Palmer similarity, and the highest-scoring pair
    wins (the first one encountered on ties).

    Returns:
        A tuple ``(case_type, token, score)``. When no pair produces a
        positive score the result is ``("total", "", 0.0)``.
    """
    tokens = nlp(sent)

    best_score = 0.0
    best_case = "total"
    best_token = ""

    for candidate in ("death", "recovery", "active", "confirm"):
        candidate_synsets = wordnet.synsets(candidate)
        for token in tokens:
            token_synsets = wordnet.synsets(str(token.text))
            for candidate_synset in candidate_synsets:
                for token_synset in token_synsets:
                    score = candidate_synset.wup_similarity(token_synset)
                    # Falsy scores (None / 0) are skipped; only a strictly
                    # better score replaces the current best.
                    if score and score > best_score:
                        best_score, best_case, best_token = score, candidate, token

    return best_case, best_token, best_score

In [None]:
# Show the (case type, matching token, similarity) guess for every sample query.
for query in queries:
    print(query)
    print(caseTypeProbability(query))

In [None]:
from spacy.kb import KnowledgeBase

# nlp = spacy.load('en_core_web_sm')
# Knowledge base on the shared vocabulary, configured for 3-dimensional entity vectors.
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

# adding entities (currently disabled, so nothing is registered)
# kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0, 3, 5])
# kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3])
# kb.add_entity(entity="Q5301561", freq=12, entity_vector=[-2, 4, 2])

# # adding aliases (currently disabled)
# kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.6, 0.1, 0.2])

print("Number of entities in KB:",kb.get_size_entities()) # 3 only if the add_entity calls above are re-enabled
print("Number of aliases in KB:", kb.get_size_aliases()) # original note said 2; the single add_alias call above is disabled

# Look up candidate entities for the alias "Douglas" and print each one's id, prior probability and vector.
candidates = kb.get_candidates("Douglas")
for c in candidates:
    print(" ", c.entity_, c.prior_prob, c.entity_vector)

In [None]:
def csv_to_table(path, csv_name, table_name, limit=None):
    """Bulk-insert the rows of a CSV file into an existing SQLite table.

    The first CSV line is treated as a header; it is used only to count the
    columns, and values are inserted positionally in header order.

    Args:
        path: Directory prefix of the CSV file. It is concatenated with
            `csv_name` as-is, so it must end with a path separator.
        csv_name: File name of the CSV inside `path`.
        table_name: Name of an existing table. It is interpolated directly
            into the SQL text (placeholders cannot stand for identifiers), so
            it must come from trusted code, never from user input.
        limit: Optional maximum number of data rows to insert; ``None``
            (default) inserts every row.

    Notes:
        Uses the notebook-level cursor ``cur``; committing is left to the
        caller. An empty CSV file (no header) inserts nothing.
    """
    # Open from the `path` argument (it used to be ignored in favour of a
    # global), with newline='' as the csv module recommends.
    with open(path + csv_name, 'r', newline='') as fin:
        reader = csv.DictReader(fin)
        # Read the header while the file is still open.
        fieldnames = reader.fieldnames
        if not fieldnames:
            return
        rows = [tuple(record[name] for name in fieldnames) for record in reader]

    if limit is not None:
        rows = rows[:limit]

    placeholders = ", ".join("?" * len(fieldnames))
    cur.executemany("INSERT INTO " + table_name + " VALUES (" + placeholders + ");", rows)

# def csv_to_table(path, csv_name, table_name):
#     with open(dataset_path+csv_name,'r') as fin:
#         dr = csv.DictReader(fin)
#         to_db=[tuple(i.values()) for i in dr]
    
#     count=len(dr.fieldnames)
#     bindings="?, "*count

#     cur.executemany("INSERT INTO "+table_name+" VALUES ("+bindings[:-2]+");", to_db[:500])


In [None]:
# con = sqlite3.connect(r"D:\My Study Folder\Others\NLP-Search-Engine-COVID-19-Dataset\dataset\covid-19\mysql_database\covid19.db")
# cur = con.cursor()

# dataset_path="../dataset/covid-19/required_only/"

# tables=[("worldwide_aggregate"),("reference"),("timeseries"),("us")]
# for table in tables:
#     cur.execute("DROP TABLE IF EXISTS "+table+";")

# cur.execute("create table worldwide_aggregate(Date Date NOT NULL, Confirmed BIGINT NOT NULL, Recovered BIGINT NOT NULL, Deaths BIGINT NOT NULL, Increase_rate FLOAT default NULL, PRIMARY KEY (Date));")

# csv_to_table(dataset_path,"worldwide-aggregate.csv", tables[0])

# cur.execute("create table us(Date Date NOT NULL, Admin2 VARCHAR(100) NOT NULL, Province_State VARCHAR(100) NOT NULL, Confirmed BIGINT NOT NULL, Deaths BIGINT NOT NULL, Country_Region VARCHAR(100) NOT NULL, PRIMARY KEY (Date, Admin2, Province_State));")

# csv_to_table(dataset_path,"us_simplified.csv", tables[3])

# cur.execute("create table reference(UID INT NOT NULL, iso2 VARCHAR(20), iso3 VARCHAR(20), code3 INT, FIPS INT, Admin2 VARCHAR(100) NOT NULL, Province_State VARCHAR(100) NOT NULL, Country_Region VARCHAR(100) NOT NULL, Lat FLOAT NOT NULL, Long_ FLOAT NOT NULL, Combined_Key VARCHAR(100), Popolation BIGINT NOT NULL, PRIMARY KEY (UID));")

# csv_to_table(dataset_path,"reference.csv", tables[1])

# cur.execute("create table timeseries(Date Date NOT NULL, Country_Region VARCHAR(100) NOT NULL, Province_State VARCHAR(100), Confirmed BIGINT NOT NULL, Recovered BIGINT NOT NULL, Deaths BIGINT NOT NULL, PRIMARY KEY (Date, Country_Region, Province_State));")

# csv_to_table(dataset_path,"time-series-19-covid-combined.csv", tables[2])

# con.commit()

# with open('../dataset/covid-19/mysql_database/dump.sql','w') as fp:
#     for line in con.iterdump():
#         fp.write('%s\n' % line)

# con.close()

# con = sqlite3.connect(r"D:\My Study Folder\Others\NLP-Search-Engine-COVID-19-Dataset\dataset\covid-19\mysql_database\covid19-small.db")
# cur = con.cursor()

# dataset_path="../dataset/covid-19/required_only/"

# tables=[("worldwide_aggregate"),("reference"),("timeseries"),("us")]
# for table in tables:
#     cur.execute("DROP TABLE IF EXISTS "+table+";")

# cur.execute("create table worldwide_aggregate(Date Date NOT NULL, Confirmed BIGINT NOT NULL, Recovered BIGINT NOT NULL, Deaths BIGINT NOT NULL, Increase_rate FLOAT default NULL, PRIMARY KEY (Date));")

# csv_to_table(dataset_path,"worldwide-aggregate.csv", tables[0])

# cur.execute("create table us(Date Date NOT NULL, Admin2 VARCHAR(100) NOT NULL, Province_State VARCHAR(100) NOT NULL, Confirmed BIGINT NOT NULL, Deaths BIGINT NOT NULL, Country_Region VARCHAR(100) NOT NULL, PRIMARY KEY (Date, Admin2, Province_State));")

# csv_to_table(dataset_path,"us_simplified.csv", tables[3])

# cur.execute("create table reference(UID INT NOT NULL, iso2 VARCHAR(20), iso3 VARCHAR(20), code3 INT, FIPS INT, Admin2 VARCHAR(100) NOT NULL, Province_State VARCHAR(100) NOT NULL, Country_Region VARCHAR(100) NOT NULL, Lat FLOAT NOT NULL, Long_ FLOAT NOT NULL, Combined_Key VARCHAR(100), Popolation BIGINT NOT NULL, PRIMARY KEY (UID));")

# csv_to_table(dataset_path,"reference.csv", tables[1])

# cur.execute("create table timeseries(Date Date NOT NULL, Country_Region VARCHAR(100) NOT NULL, Province_State VARCHAR(100), Confirmed BIGINT NOT NULL, Recovered BIGINT NOT NULL, Deaths BIGINT NOT NULL, PRIMARY KEY (Date, Country_Region, Province_State));")

# csv_to_table(dataset_path,"time-series-19-covid-combined.csv", tables[2])

# con.commit()

# with open('../dataset/covid-19/mysql_database/dump-small.sql','w') as fp:
#     for line in con.iterdump():
#         fp.write('%s\n' % line)

# con.close()

In [None]:
# database_path="D:/My Study Folder/Others/NLP-Search-Engine-COVID-19-Dataset/dataset/covid-19/mysql_database/"

# with open(database_path+"dump.sql", 'r') as f:
#     lines=f.read()
#     lines=lines.replace('"','')
#     # print(lines[:1000])

#     tables=[("worldwide_aggregate"),("reference"),("timeseries"),("us")]

#     for table in tables:
#         lines=lines.replace(table,"`"+table+"`")
    
#     with open(database_path+"covid19-schema.sql", 'w') as out:
#         out.write(lines)

In [None]:
# sqlite3mysql -f "D:/My Study Folder/Others/NLP-Search-Engine-COVID-19-Dataset/dataset/covid-19/mysql_database/covid19.db" -u "root" -d "covid19"
# sqlite3mysql -f "D:/My Study Folder/Others/NLP-Search-Engine-COVID-19-Dataset/dataset/covid-19/mysql_database/covid19-small.db" -u "root" -d "covid19_small"

In [None]:
import json
from sutime import SUTime

# Single SUTime instance reused by the parsing cells below.
sutime = SUTime(mark_time_ranges=True, include_range=True)

In [None]:
# Run SUTime over every sample query and list each item it returns.
for q in queries:
    # parsed=json.dumps(sutime.parse(q), sort_keys=True, indent=4)
    parsed=sutime.parse(q)
    # print(parsed)
    print(q)
    for item in parsed:
        print("--> ",item)
    print()
    # break

In [None]:
# Same SUTime dump as the loop over `queries`, for one hand-picked question.
q="Which country saw highest number of death in the month of April?"
# q="State having maximum number of active cases in USA till now?"
parsed=sutime.parse(q)
# print(parsed)
print(q)
for item in parsed:
    print("--> ",item)
print()

In [None]:
from spacy import displacy

# For every sample query: print it, list (token, dependency label) pairs,
# and render the dependency parse with displaCy.
for q in queries:
    doc=nlp(q)
    print(q)
    # print([token.text for token in doc])
    print([(token.text, token.dep_) for token in doc])
    displacy.render(doc, style="dep")
    print()

In [None]:
! pip install geonamescache

In [None]:
import geonamescache

gc = geonamescache.GeonamesCache()
# Dictionary of country records keyed by country name.
countries_by_name = gc.get_countries_by_names()
# Keep just the names; `countries` stays a plain list from here on.
countries = list(countries_by_name.keys())
print(len(countries))

# Collect the city records returned for each country name.
# GeonamesCache's lookup is the singular `get_cities_by_name`; the plural
# spelling used before is not part of its API.
# NOTE(review): this finds cities that share a country's *name*. If the goal
# was "all cities of each country", filter gc.get_cities() instead - confirm.
cities = []
for country in countries:
    city = list(gc.get_cities_by_name(country))
    cities.extend(city)

# Size of the countries dictionary. `countries` is a list by now, so calling
# .items() on it would raise AttributeError.
print(len(countries_by_name))

In [None]:
!pip install geograpy

In [None]:
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
con = sqlite3.connect(r"D:\My Study Folder\Others\NLP-Search-Engine-COVID-19-Dataset\dataset\covid-19\mysql_database\covid19.db")
# cur = con.cursor()

# Close the connection even if the query or the iteration fails.
try:
    # cur = con.execute("SELECT DISTINCT Country_Region FROM reference;")
    # cur = con.execute("SELECT count(*) FROM reference;")
    cur = con.execute("SELECT Confirmed, Date FROM worldwide_aggregate;")

    for row in cur:
        print(row)
finally:
    con.close()

In [None]:
import pickle
# Load the previously saved parsed parameters and display them.
parsed_parameter_save_path='../dataset/covid-19/parsed_parameters.pickle'
# SECURITY: pickle.load can execute arbitrary code; only open pickle files you created yourself.
with open(parsed_parameter_save_path, 'rb') as f:
    temp=pickle.load(f)
temp

In [None]:
import re
# Sample question used to try out base-word substitution in this cell.
query = "which place has the highest number of cases in US?"

# Base word -> word forms that should be rewritten to it. Extends the earlier
# case-type table with 'active', aggregate (maximum/minimum/average) and
# location (state/country) vocabulary.
assign_base_words={
    'recover' : ['recover','recovery','cure','heal'],
    'death' : ['death','fatality','fatal','demise','decease','die','expire'],
    'confirm': ['confirm'],
    'active' : ['active', 'live'],
    'maximum' : ['maximum', 'high', 'max', 'maximal', 'most'],
    'minimum' : ['minimum', 'low', 'least', 'min'],
    'average' : ['average', 'avg', 'normally', 'usually', 'generally'],
    'state' : ['state', 'province'],
    'country' : ['country', 'region', 'nation', 'place']
}

def get_reverse_dict(assign_base_words):
    """Invert a ``{base word: [word forms]}`` table into ``{lemma: base word}``.

    Each word form is parsed with spaCy and the lemma of its first token
    becomes the key; when two forms share a lemma, the later entry wins.
    """
    return {
        nlp(variant)[0].lemma_: base_word
        for base_word, variants in assign_base_words.items()
        for variant in variants
    }
# Build the lookup from the table defined in this cell. The loop used to read
# a `reverse_base_word_dict` left over from an earlier cell, which knows
# nothing about the vocabulary added here.
reverse_base_word_dict = get_reverse_dict(assign_base_words)
for word, base in reverse_base_word_dict.items():
    # Whole-word match only, so e.g. 'high' no longer rewrites the inside of
    # 'highest' and 'min' leaves 'minimum' alone.
    # NOTE(review): inflected forms such as 'highest' are therefore left
    # untouched unless the query is lemmatized first - confirm intended flow.
    query = re.sub(r'\b' + re.escape(word) + r'\b', base, query)
query

In [None]:
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
con = sqlite3.connect(r"D:\My Study Folder\Others\NLP-Search-Engine-COVID-19-Dataset\dataset\covid-19\mysql_database\covid19.db")
# cur = con.cursor()

# cur = con.execute("SELECT Province_State FROM (SELECT Province_State, (MAX(sum)-MIN(sum)) as cases FROM (SELECT Date, Province_State, SUM(Confirmed) as sum FROM us WHERE Date BETWEEN '2020-04-01' AND '2020-04-31' GROUP BY Date, Province_State) GROUP BY Province_State) WHERE cases = (SELECT MAX(cases) from (SELECT (MAX(sum)-MIN(sum)) as cases FROM (SELECT Date, Province_State, SUM(Confirmed) as sum FROM us WHERE Date BETWEEN '2020-04-01' AND '2020-04-31' GROUP BY Date, Province_State) GROUP BY Province_State));")

# Question being explored: which state of US was worst affected?
# (This line was bare prose inside the code cell, which is a SyntaxError.)
#
# Per state: largest minus smallest daily total of Confirmed within the window.
# NOTE(review): '2020-04-31' is not a calendar date; it only acts as an upper
# bound if Date is compared as text - confirm, or use '2020-04-30'.
# Close the connection even if the query or the iteration fails.
try:
    cur = con.execute("SELECT Province_State, (MAX(sum)-MIN(sum)) as cases FROM (SELECT Date, Province_State, SUM(Confirmed) as sum FROM us WHERE Date BETWEEN '2020-04-01' AND '2020-04-31' GROUP BY Date, Province_State) GROUP BY Province_State;")

    for row in cur:
        print(row)
finally:
    con.close()