In [1]:
# ! conda install -c conda-forge spacy
# ! python -m spacy download en_core_web_sm

In [15]:
import spacy 
import nltk
from itertools import product
import csv, sqlite3

# Load the small English spaCy pipeline once; the helper functions defined
# in the following cells all call this shared `nlp` object.
nlp = spacy.load('en_core_web_sm') 

In [43]:
# sentence = "What are total death cases in US on 27-09-2020?"
# sentence = "number of cities with total cases greater than 1000?"

def print_entities(sentence):
	"""Print every named entity spaCy finds in ``sentence``.

	After an "Entities" header, each entity is written on its own line as
	text, start character offset, end character offset and label.
	Nothing is returned.
	"""
	entity_rows = [
		(found.text, found.start_char, found.end_char, found.label_)
		for found in nlp(sentence).ents
	]
	print("----> Entities:")
	for entity_row in entity_rows:
		print("-------->", *entity_row)

def print_tokens(sentence):
	"""Print every token of ``sentence`` with its POS tag and dependency label.

	After a "Tokens" header, each token is written on its own line as
	text, coarse part-of-speech tag and dependency relation. No filtering
	is applied; nothing is returned.
	"""
	token_rows = [(tok.text, tok.pos_, tok.dep_) for tok in nlp(sentence)]
	print("----> Tokens:")
	for token_row in token_rows:
		print("-------->", *token_row)

In [44]:
# Load the sample questions: one question per line, blank lines skipped.
queries = []
with open("../possible-questions.txt", "r") as f:
    for line in f:
        # Remove only the line terminator. Slicing with [:-1] would also chop
        # the last real character of a final line that has no trailing newline.
        line = line.rstrip("\n")
        if line:
            queries.append(line)

In [45]:
def process_query(query):
    """Echo ``query`` and report its named entities and noun chunks.

    Prints, in order: the query itself, the entity listing produced by
    ``print_entities``, the list of noun-chunk texts spaCy extracts, and a
    blank separator line. Nothing is returned.
    """
    print(query)
    print_entities(query)
    # print_tokens(query)  # enable for a token-level breakdown
    noun_chunk_texts = [span.text for span in nlp(query).noun_chunks]
    print(noun_chunk_texts)
    print()

In [46]:
# Print the entity / noun-chunk report for every loaded question.
for q in queries:
    process_query(q)

total number of cases found in Afganistan?
----> Entities:
--------> Afganistan 31 41 GPE
['total number', 'cases', 'Afganistan']

total number of cases found in Colombia till july?
----> Entities:
--------> Colombia 31 39 GPE
--------> july 45 49 DATE
['total number', 'cases', 'Colombia', 'july']

total number of new cases found in France in april?
----> Entities:
--------> France 35 41 GPE
--------> april 45 50 DATE
['total number', 'new cases', 'France', 'april']

total number of new cases found in Greece between april to september?
----> Entities:
--------> Greece 35 41 GPE
--------> between april to september 42 68 DATE
['total number', 'new cases', 'Greece', 'april', 'september']

total number cases recovered in Hungary?
----> Entities:
--------> Hungary 32 39 GPE
['total number cases', 'Hungary']

total number of cases recovered in Iceland till May?
----> Entities:
--------> Iceland 35 42 GPE
--------> May 48 51 DATE
['total number', 'cases', 'Iceland', 'May']

total number of n

In [9]:
from nltk.corpus import wordnet

# Explore WordNet: for every synset of "recovered", print the synset name and
# its lemma objects, and gather all lemma names into `synonyms`.
synonyms = []

for syn in wordnet.synsets("recovered"):
    lemmas = syn.lemmas()
    print(syn.name())
    print(lemmas)
    synonyms.extend(lemma.name() for lemma in lemmas)

# De-duplicate for display only; `synonyms` itself keeps the repeats.
print(set(synonyms))

recover.v.01
[Lemma('recover.v.01.recover'), Lemma('recover.v.01.retrieve'), Lemma('recover.v.01.find'), Lemma('recover.v.01.regain')]
recuperate.v.04
[Lemma('recuperate.v.04.recuperate'), Lemma('recuperate.v.04.recover'), Lemma('recuperate.v.04.convalesce')]
recover.v.03
[Lemma('recover.v.03.recover'), Lemma('recover.v.03.go_back'), Lemma('recover.v.03.recuperate')]
recover.v.04
[Lemma('recover.v.04.recover'), Lemma('recover.v.04.recoup'), Lemma('recover.v.04.recuperate')]
reclaim.v.02
[Lemma('reclaim.v.02.reclaim'), Lemma('reclaim.v.02.recover')]
recover.v.06
[Lemma('recover.v.06.recover')]
cured.s.01
[Lemma('cured.s.01.cured'), Lemma('cured.s.01.healed'), Lemma('cured.s.01.recovered')]
recovered.s.02
[Lemma('recovered.s.02.recovered')]
{'reclaim', 'recoup', 'cured', 'convalesce', 'find', 'healed', 'retrieve', 'go_back', 'recovered', 'recover', 'regain', 'recuperate'}


In [10]:
from nltk.corpus import wordnet 

def caseTypeProbability(sent, case_types=("death", "recovery", "active", "confirm")):
    """Guess which case type a sentence refers to via WordNet similarity.

    Every token of ``sent`` is compared with every candidate case type: all
    pairs of their WordNet synsets are scored with Wu-Palmer similarity and
    the single best-scoring pair decides the result.

    Args:
        sent: The query text to analyse.
        case_types: Candidate case-type words, tried in the given order.

    Returns:
        A ``(case_type, word, similarity)`` tuple:
        * ``case_type`` - the winning candidate, or ``"total"`` when no pair
          produced a usable score;
        * ``word`` - the spaCy token that produced the best score, or ``""``
          when nothing matched;
        * ``similarity`` - the best score found (``0.0`` when nothing matched).

    NOTE(review): in the recorded run the generic token "cases" already scores
    about 0.77 against "death", so queries that never mention a case type are
    still labelled "death". Callers may want to apply a minimum-score cut-off.
    """
    # A token's synsets do not depend on the case type being tried, so look
    # them up once per token instead of once per (case type, token) pair.
    token_synsets = [
        (token, wordnet.synsets(str(token.text))) for token in nlp(sent)
    ]

    best_similarity = 0.0
    best_type = "total"
    best_word = ""

    for case in case_types:
        case_synsets = wordnet.synsets(case)
        for token, synsets in token_synsets:
            for case_synset, token_synset in product(case_synsets, synsets):
                score = case_synset.wup_similarity(token_synset)  # Wu-Palmer similarity
                # The score can be falsy (e.g. None) when the synsets cannot be
                # related; the strict comparison keeps the first best match on ties.
                if score and best_similarity < score:
                    best_similarity = score
                    best_type = case
                    best_word = token

    return best_type, best_word, best_similarity

In [11]:
# Print each question followed by the (case type, matched token, score)
# tuple that caseTypeProbability returns for it.
for query in queries:
    print(query)
    print(caseTypeProbability(query))

total number of cases found in Afganistan?
('death', cases, 0.7692307692307693)
total number of cases found in Colombia till july?
('death', cases, 0.7692307692307693)
total number of new cases found in France in april?
('death', cases, 0.7692307692307693)
total number of new cases found in Greece between april to september?
('death', cases, 0.7692307692307693)
total number cases recovered in Hungary?
('death', cases, 0.7692307692307693)
total number of cases recovered in Iceland till May?
('death', cases, 0.7692307692307693)
total number of new cases recovered in Indonesia in May?
('death', cases, 0.7692307692307693)
total number of new cases recovered in india from March to December?
('death', cases, 0.7692307692307693)
total number of deaths in india?
('death', deaths, 1.0)
total number of deaths in china till august?
('death', deaths, 1.0)
total number of deaths in US in august?
('death', deaths, 1.0)
total number of deaths in poland between august and december?
('death', deaths, 1

In [12]:
from spacy.kb import KnowledgeBase

# nlp = spacy.load('en_core_web_sm')
# Create an empty spaCy knowledge base that shares the pipeline's vocab and
# stores 3-dimensional entity vectors.
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

# adding entities
# (left commented out, so the knowledge base stays empty)
# kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0, 3, 5])
# kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3])
# kb.add_entity(entity="Q5301561", freq=12, entity_vector=[-2, 4, 2])

# # adding aliases
# kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.6, 0.1, 0.2])

print("Number of entities in KB:",kb.get_size_entities()) # 0 in the recorded run: the add_entity calls above are commented out
print("Number of aliases in KB:", kb.get_size_aliases()) # 0 in the recorded run: the add_alias call above is commented out

# Look up candidate entities for the alias "Douglas"; the recorded run prints
# no candidate lines because nothing was added to the knowledge base.
candidates = kb.get_candidates("Douglas")
for c in candidates:
    print(" ", c.entity_, c.prior_prob, c.entity_vector)

Number of entities in KB: 0
Number of aliases in KB: 0


In [3]:
def csv_to_table(path, csv_name, table_name, cursor=None):
    """Bulk-insert every data row of a CSV file into an existing table.

    The CSV's first row is treated as the header; each following row is
    inserted positionally, one bound parameter per header column.

    Args:
        path: Directory that contains the CSV file.
        csv_name: File name of the CSV inside ``path``.
        table_name: Name of the target table. It is spliced into the SQL text
            because identifiers cannot be bound as parameters, so only pass
            trusted, hard-coded names.
        cursor: DB-API cursor used for the insert. When omitted, the
            notebook-level ``cur`` is used.

    Raises:
        ValueError: If the CSV file has no header row (e.g. it is empty).
    """
    import os  # local import keeps the function self-contained in the notebook

    db_cursor = cur if cursor is None else cursor

    # Honour the `path` argument (the previous body ignored it and read the
    # global `dataset_path`). newline="" is what the csv module asks for so
    # that newlines embedded in quoted fields are parsed correctly.
    with open(os.path.join(path, csv_name), "r", newline="") as fin:
        reader = csv.DictReader(fin)
        rows = [tuple(record.values()) for record in reader]
        fieldnames = reader.fieldnames

    if not fieldnames:
        raise ValueError("CSV file has no header row: " + csv_name)

    placeholders = ", ".join("?" * len(fieldnames))
    db_cursor.executemany(
        "INSERT INTO " + table_name + " VALUES (" + placeholders + ");", rows
    )

# def csv_to_table(path, csv_name, table_name):
#     with open(dataset_path+csv_name,'r') as fin:
#         dr = csv.DictReader(fin)
#         to_db=[tuple(i.values()) for i in dr]
    
#     count=len(dr.fieldnames)
#     bindings="?, "*count

#     cur.executemany("INSERT INTO "+table_name+" VALUES ("+bindings[:-2]+");", to_db[:500])


In [4]:
# Rebuild the SQLite database from the CSV files, then write a SQL dump of it.
# The database path is repo-relative like every other path in this notebook
# (it sits next to dump.sql), so the cell no longer depends on one machine's
# absolute drive layout.
con = sqlite3.connect("../dataset/covid-19/mysql_database/covid19.db")

dataset_path="../dataset/covid-19/required_only/"

tables=["worldwide_aggregate","reference","timeseries","us"]

# (table name, CSV file, CREATE TABLE statement), listed in load order.
# NOTE(review): the reference table's "Popolation" column looks like a typo for
# "Population"; it is left untouched because renaming it changes the schema.
table_sources = [
    (
        tables[0],
        "worldwide-aggregate.csv",
        "create table worldwide_aggregate(Date Date NOT NULL, Confirmed BIGINT NOT NULL, Recovered BIGINT NOT NULL, Deaths BIGINT NOT NULL, Increase_rate FLOAT default NULL, PRIMARY KEY (Date));",
    ),
    (
        tables[3],
        "us_simplified.csv",
        "create table us(Date Date NOT NULL, Admin2 VARCHAR(100) NOT NULL, Province_State VARCHAR(100) NOT NULL, Confirmed BIGINT NOT NULL, Deaths BIGINT NOT NULL, Country_Region VARCHAR(100) NOT NULL, PRIMARY KEY (Date, Admin2, Province_State));",
    ),
    (
        tables[1],
        "reference.csv",
        "create table reference(UID INT NOT NULL, iso2 VARCHAR(20), iso3 VARCHAR(20), code3 INT, FIPS INT, Admin2 VARCHAR(100) NOT NULL, Province_State VARCHAR(100) NOT NULL, Country_Region VARCHAR(100) NOT NULL, Lat FLOAT NOT NULL, Long_ FLOAT NOT NULL, Combined_Key VARCHAR(100), Popolation BIGINT NOT NULL, PRIMARY KEY (UID));",
    ),
    (
        tables[2],
        "time-series-19-covid-combined.csv",
        "create table timeseries(Date Date NOT NULL, Country_Region VARCHAR(100) NOT NULL, Province_State VARCHAR(100), Confirmed BIGINT NOT NULL, Recovered BIGINT NOT NULL, Deaths BIGINT NOT NULL, PRIMARY KEY (Date, Country_Region, Province_State));",
    ),
]

try:
    # csv_to_table falls back to this notebook-level cursor.
    cur = con.cursor()

    # Start from a clean slate so the cell can be re-run safely.
    for table in tables:
        cur.execute("DROP TABLE IF EXISTS "+table+";")

    for table, csv_name, create_sql in table_sources:
        cur.execute(create_sql)
        csv_to_table(dataset_path, csv_name, table)

    con.commit()

    # Export the whole database as SQL text, one statement per line.
    with open('../dataset/covid-19/mysql_database/dump.sql','w') as fp:
        for line in con.iterdump():
            fp.write('%s\n' % line)
finally:
    # Always release the database file, even if a load step fails.
    con.close()

# con = sqlite3.connect(r"D:\My Study Folder\Others\NLP-Search-Engine-COVID-19-Dataset\dataset\covid-19\mysql_database\covid19-small.db")
# cur = con.cursor()

# dataset_path="../dataset/covid-19/required_only/"

# tables=[("worldwide_aggregate"),("reference"),("timeseries"),("us")]
# for table in tables:
#     cur.execute("DROP TABLE IF EXISTS "+table+";")

# cur.execute("create table worldwide_aggregate(Date Date NOT NULL, Confirmed BIGINT NOT NULL, Recovered BIGINT NOT NULL, Deaths BIGINT NOT NULL, Increase_rate FLOAT default NULL, PRIMARY KEY (Date));")

# csv_to_table(dataset_path,"worldwide-aggregate.csv", tables[0])

# cur.execute("create table us(Date Date NOT NULL, Admin2 VARCHAR(100) NOT NULL, Province_State VARCHAR(100) NOT NULL, Confirmed BIGINT NOT NULL, Deaths BIGINT NOT NULL, Country_Region VARCHAR(100) NOT NULL, PRIMARY KEY (Date, Admin2, Province_State));")

# csv_to_table(dataset_path,"us_simplified.csv", tables[3])

# cur.execute("create table reference(UID INT NOT NULL, iso2 VARCHAR(20), iso3 VARCHAR(20), code3 INT, FIPS INT, Admin2 VARCHAR(100) NOT NULL, Province_State VARCHAR(100) NOT NULL, Country_Region VARCHAR(100) NOT NULL, Lat FLOAT NOT NULL, Long_ FLOAT NOT NULL, Combined_Key VARCHAR(100), Popolation BIGINT NOT NULL, PRIMARY KEY (UID));")

# csv_to_table(dataset_path,"reference.csv", tables[1])

# cur.execute("create table timeseries(Date Date NOT NULL, Country_Region VARCHAR(100) NOT NULL, Province_State VARCHAR(100), Confirmed BIGINT NOT NULL, Recovered BIGINT NOT NULL, Deaths BIGINT NOT NULL, PRIMARY KEY (Date, Country_Region, Province_State));")

# csv_to_table(dataset_path,"time-series-19-covid-combined.csv", tables[2])

# con.commit()

# with open('../dataset/covid-19/mysql_database/dump-small.sql','w') as fp:
#     for line in con.iterdump():
#         fp.write('%s\n' % line)

# con.close()

In [12]:
# database_path="D:/My Study Folder/Others/NLP-Search-Engine-COVID-19-Dataset/dataset/covid-19/mysql_database/"

# with open(database_path+"dump.sql", 'r') as f:
#     lines=f.read()
#     lines=lines.replace('"','')
#     # print(lines[:1000])

#     tables=[("worldwide_aggregate"),("reference"),("timeseries"),("us")]

#     for table in tables:
#         lines=lines.replace(table,"`"+table+"`")
    
#     with open(database_path+"covid19-schema.sql", 'w') as out:
#         out.write(lines)

In [None]:
# sqlite3mysql -f "D:/My Study Folder/Others/NLP-Search-Engine-COVID-19-Dataset/dataset/covid-19/mysql_database/covid19.db" -u "root" -d "covid19"
# sqlite3mysql -f "D:/My Study Folder/Others/NLP-Search-Engine-COVID-19-Dataset/dataset/covid-19/mysql_database/covid19-small.db" -u "root" -d "covid19_small"

In [12]:
import json
from sutime import SUTime

# Create one SUTime tagger for the cell below, with both range-related options
# switched on (see the sutime package documentation for their exact effect).
sutime = SUTime(mark_time_ranges=True, include_range=True)

In [38]:
# Print each question followed by every temporal expression SUTime extracts
# from it, one parsed item per line.
# NOTE(review): no reference date is passed to parse(), and the recorded output
# resolves bare month names to different years (2020-07 vs 2021-04) -
# presumably relative to the run date; confirm before relying on the year.
for q in queries:
    # parsed=json.dumps(sutime.parse(q), sort_keys=True, indent=4)
    parsed=sutime.parse(q)
    # print(parsed)
    print(q)
    for item in parsed:
        print("--> ",item)
    print()
    # break

total number of cases found in Afganistan?

total number of cases found in Colombia till july?
-->  {'timex-value': '2020-07', 'start': 45, 'end': 49, 'text': 'july', 'type': 'DATE', 'value': '2020-07'}

total number of new cases found in France in april?
-->  {'timex-value': '2021-04', 'start': 45, 'end': 50, 'text': 'april', 'type': 'DATE', 'value': '2021-04'}

total number of new cases found in Greece between april to september?
-->  {'start': 50, 'end': 68, 'text': 'april to september', 'type': 'DURATION', 'value': {'end': 'XXXX-09', 'begin': 'XXXX-04'}}

total number cases recovered in Hungary?

total number of cases recovered in Iceland till May?
-->  {'timex-value': '2021-05', 'start': 48, 'end': 51, 'text': 'May', 'type': 'DATE', 'value': '2021-05'}

total number of new cases recovered in Indonesia in May?
-->  {'timex-value': '2021-05', 'start': 52, 'end': 55, 'text': 'May', 'type': 'DATE', 'value': '2021-05'}

total number of new cases recovered in india from March to Decembe