In [166]:
import nltk, re, string
from sklearn.preprocessing import normalize
from nltk.corpus import stopwords
# numpy is the package for matrix cacluation
import numpy as np  
import pandas as pd

stop_words = stopwords.words('english')

# Step 1. get tokens of each document as list
def get_doc_tokens(doc):
    tokens=[token.strip() \
            for token in nltk.word_tokenize(doc.lower()) \
            if token.strip() not in stop_words and\
               token.strip() not in string.punctuation]
    
    # you can add bigrams, collocations, stemming, 
    # or lemmatization here
    
    token_count={token:tokens.count(token) for token in set(tokens)}
    return token_count

def tfidf(docs):
    # step 2. process all documents to get list of token list
    docs_tokens={idx:get_doc_tokens(doc) \
             for idx,doc in enumerate(docs)}

    # step 3. get document-term matrix
    dtm=pd.DataFrame.from_dict(docs_tokens, orient="index" )
    dtm=dtm.fillna(0)
    dtm = dtm.sort_index(axis = 0)
      
    # step 4. get normalized term frequency (tf) matrix        
    tf=dtm.values
    doc_len=tf.sum(axis=1, keepdims=True)
    tf=np.divide(tf, doc_len)
    
    # step 5. get idf
    df=np.where(tf>0,1,0)
    #idf=np.log(np.divide(len(docs), \
    #    np.sum(df, axis=0)))+1

    smoothed_idf=np.log(np.divide(len(docs)+1, np.sum(df, axis=0)+1))+1    
    smoothed_tf_idf=tf*smoothed_idf
    
    return smoothed_tf_idf
    

# 16MAR

In [167]:
import json
import spacy
nlp = spacy.load("en_core_web_sm")

with open('qa.json', 'r') as fp:
    qa = json.load(fp)

In [168]:
article = qa['context']

In [169]:
list_article = []
article = nlp(article)
for sent in article.sents:
    list_article.append(sent.text)

In [170]:
list_article = [art.replace('\n',' ').strip() for art in list_article]

In [171]:
len(list_article)

154

In [172]:
[i for i in range(len(list_article)) if 'CDC Morbidity' in list_article[i]]

[66]

In [173]:
list_qs = []
for q in qa['qas']:
    list_qs.append(q['question'])

In [174]:
list_qs

['What age group has the highest rate of severe outcomes?',
 'How is COVID-19 spread?',
 'How many states in the U.S. have reported cases of COVID-19?',
 'When did the White House launch the "15 Days to Slow the Spread" program?',
 'What should mildly-ill patients do?',
 'What type of virus is SARS-CoV-2?',
 'What viruses are similar to the COVID-19 coronavirus?',
 'What are the phases of a pandemic?',
 'At which phase does the peak of the pandemic occur?',
 'People with which medical conditions have a higher rate of severe illness?',
 'What kind of test can diagnose COVID-19?',
 'In what species did the COVID-19 virus likely originate?',
 'What risk factors should be considered in addition to clinical symptoms?']

In [175]:
list_article = [i for i in list_article if i!='']

In [176]:
sent_qs_list = list_qs + list_article

In [177]:
# sent_qs_list = [i for i in sent_qs_list if i!='']

In [178]:
tfidf = tfidf(sent_qs_list)

In [179]:
tfidf.shape

(136, 560)

In [180]:
 len(list_qs)

13

In [181]:
tfidf_qs = tfidf[: len(list_qs)]

In [182]:
tfidf_sent = tfidf[ len(list_qs):]

In [183]:
tfidf_sent

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.52268337, 0.52268337,
        0.52268337]])

In [184]:
tfidf_qs

array([[0.80356144, 0.75561443, 0.80356144, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [185]:
from sklearn.metrics import pairwise_distances

In [186]:
pairwise_distances(tfidf_qs,tfidf_sent)

array([[2.5993851 , 2.32313347, 3.18918853, ..., 2.47394367, 3.59692904,
        2.33967934],
       [2.43848856, 2.14158182, 3.05946843, ..., 2.17808506, 3.48243049,
        2.07943819],
       [2.29334371, 1.9747333 , 2.94508845, ..., 2.10580887, 3.38238392,
        1.96559625],
       ...,
       [2.78421152, 2.52823654, 3.3415497 , ..., 2.29872716, 3.73268394,
        2.50986063],
       [2.55317911, 2.27131456, 3.15164151, ..., 2.37823704, 3.56368057,
        2.25837025],
       [2.54186365, 2.25858741, 3.14248173, ..., 2.41343432, 3.55558244,
        2.27560264]])

In [187]:
df = pd.DataFrame(pairwise_distances(tfidf_qs,tfidf_sent))

In [188]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,...,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122
0,2.599385,2.323133,3.189189,2.239595,2.46862,2.440661,2.002878,2.414856,2.520888,2.429484,2.256118,2.21127,2.384305,2.343948,2.518866,2.610747,2.819279,2.359125,2.396847,3.491806,2.422334,2.111215,2.091215,2.317088,2.532746,2.693861,2.387706,2.426793,2.176767,2.281734,2.387391,2.230569,2.369997,2.164868,2.405815,2.219672,2.95928,2.275572,2.474413,2.238734,2.237726,2.636907,3.093201,2.201054,2.18382,2.317546,2.325759,2.205138,2.412325,2.720589,4.703142,2.562271,2.239703,2.253458,2.061361,1.79735,2.794316,2.683557,2.762562,2.318553,...,2.428011,2.446555,2.437135,2.305706,2.312136,2.457126,2.496122,2.546213,3.278977,2.534727,2.081311,2.39363,2.617847,2.31689,3.072404,2.338767,2.242005,2.128834,2.345168,2.349735,2.180394,2.380925,2.212133,2.403304,2.096292,3.57979,2.189096,2.365089,2.60848,2.278119,2.677103,2.778156,2.314699,2.304764,2.808953,2.537561,2.249487,2.971109,2.409623,2.208703,2.258514,2.25199,2.036849,2.154447,2.282084,2.379726,2.245324,2.297099,2.501685,2.073554,2.313132,2.153337,2.53025,2.105846,3.180264,2.291054,2.288108,2.473944,3.596929,2.339679
1,2.438489,2.141582,3.059468,2.050661,2.23045,2.268535,1.853303,2.163676,2.354633,1.959283,2.068693,1.933825,2.207789,2.029224,2.247035,2.450596,2.671659,1.787716,2.221329,3.373741,2.248806,1.713993,1.908998,2.072969,2.367325,2.538959,2.124473,1.805534,1.927583,2.033374,1.824854,1.963737,2.19233,1.910437,2.133786,1.972324,2.819,1.901535,1.988544,2.049721,1.902241,2.478448,2.959276,2.008497,1.881706,1.969003,1.993004,1.749282,2.017167,2.255526,4.616164,2.295585,2.142583,2.010271,1.868068,1.843558,2.479712,2.166472,2.611737,1.952766,...,2.158781,2.274875,2.264741,2.05495,2.129647,2.286239,2.328099,2.381727,3.152953,2.369444,1.876507,2.217856,2.45816,2.061263,2.937531,2.158531,1.733056,1.743362,2.065166,2.070351,1.863775,2.529448,1.954951,2.471275,1.851908,3.464725,1.75276,2.08776,2.448181,2.023948,2.521172,2.628227,2.13243,2.121641,2.66076,2.267972,1.903445,2.831415,2.039695,2.016877,2.007283,1.99994,1.827068,1.957311,2.02841,2.104327,1.992431,2.113312,2.334063,1.801826,2.057037,1.956088,2.241835,1.866164,3.050164,2.038497,2.103536,2.178085,3.48243,2.079438
2,2.293344,1.974733,2.945088,1.875742,2.119868,2.111741,1.684771,1.94546,2.07537,1.963334,1.895439,1.810852,2.046346,1.335222,1.71428,2.159417,2.379981,1.676784,1.702144,3.270371,2.090532,1.707727,1.763049,1.945374,2.217527,2.399898,2.019415,2.050209,1.78051,1.903127,1.848645,1.83717,2.029657,1.764607,2.03694,1.831429,2.694431,1.918555,2.042726,1.748654,1.87351,2.335788,2.840865,1.753467,1.700311,1.968157,1.796355,1.721106,2.078926,2.429862,4.541164,2.214996,1.756255,1.800845,1.702108,1.598913,2.455186,2.266789,2.476766,1.969343,...,2.063108,2.11855,2.107664,1.929924,1.961783,2.130748,2.175602,2.232895,3.04209,2.21979,1.683591,2.057203,2.314249,1.941066,2.818206,1.993102,1.432927,1.629893,1.964941,1.970389,1.804645,2.389834,1.819097,2.328175,1.6869,3.364152,1.611686,1.882039,2.14005,1.89688,2.381072,2.494148,1.964804,1.95309,2.528407,2.186364,1.887542,2.707418,2.07579,1.838747,1.875224,1.867362,1.559618,1.672096,1.90164,2.00606,1.859317,1.944039,2.181983,1.649674,1.936578,1.771858,2.171681,1.659059,2.935422,1.912395,1.933407,2.105809,3.382384,1.965596
3,2.325104,2.011531,2.969887,1.914443,2.177933,2.146191,1.740804,2.116799,2.237003,2.073908,1.933747,1.881229,2.081878,2.035534,2.234725,2.337799,2.568588,1.999993,2.096231,3.292721,2.125326,1.355698,1.824956,2.004547,2.250358,2.430267,2.085773,2.070755,1.84055,1.963573,2.033258,1.903877,2.065477,1.826461,2.106479,1.834578,2.721515,1.807232,2.184498,1.913436,1.881771,2.366979,2.866565,1.86921,1.826155,1.970694,1.983219,1.837185,2.068977,2.397955,4.557286,2.283537,2.012597,1.930642,1.758125,1.721822,2.541163,2.418839,2.506204,1.968409,...,2.131793,2.152891,2.14218,1.991378,1.99882,2.164896,2.209056,2.265504,3.066105,2.252588,1.726605,2.092551,2.345727,2.004317,2.844112,2.029567,1.873186,1.757062,2.036939,2.042195,1.819189,2.420329,1.882243,2.359467,1.744634,3.385883,1.747263,2.059843,2.335268,1.959371,2.411678,2.523383,2.001784,1.990288,2.55725,2.255775,1.893198,2.734373,2.070874,1.878211,1.936541,1.928929,1.645543,1.733147,1.963979,2.076633,1.921142,1.981407,2.215341,1.717246,1.999972,1.812779,2.247548,1.756102,2.960302,1.974395,1.970976,2.183965,3.403999,2.030617
4,3.675964,3.486095,4.114223,3.43099,3.584689,3.565493,3.337215,3.547878,3.620882,3.557851,3.441799,3.412569,3.527155,3.5,3.619475,3.684007,3.834612,3.510182,3.535645,4.353001,3.552972,3.348603,3.381873,3.48207,3.629148,3.743367,3.304114,3.556014,3.390313,3.458644,3.529241,3.425106,3.517499,3.382686,3.54173,3.418019,3.938686,3.454582,3.588681,3.430429,3.429771,3.702592,4.040272,3.276334,3.279488,3.296518,3.487845,3.408598,3.546156,3.762647,5.373589,3.649814,3.48671,3.440055,3.346282,3.327352,3.816295,3.735958,3.793107,3.483044,...,3.556845,3.569529,3.56308,3.474506,3.478776,3.576783,3.603683,3.638559,4.184208,3.630531,3.32983,3.533465,3.689043,3.481938,4.024373,3.496533,3.432564,3.359739,3.243973,3.503878,3.392643,3.736923,3.413128,3.697794,3.339214,4.42389,3.398242,3.514193,3.682401,3.45626,3.731326,3.804479,3.48048,3.473881,3.827026,3.632509,3.437456,3.947581,3.544318,3.410906,3.44337,3.439094,3.302221,3.376026,3.458875,3.524061,3.275869,3.311571,3.607539,3.324986,3.479438,3.375317,3.627406,3.34522,4.107309,3.4648,3.462853,3.588357,4.437771,3.497143
5,3.070275,2.8402,3.583394,2.772287,2.960384,2.93711,2.655349,2.915702,3.004108,2.927829,2.69719,2.650656,2.890449,2.857251,3.002412,3.0799,3.258546,2.869714,2.900804,3.855191,2.921899,2.669648,2.711263,2.835258,3.014065,3.150663,2.893256,2.925596,2.721783,2.806439,2.892995,2.675856,2.878659,2.712276,2.908218,2.756217,3.380403,2.704532,2.965217,2.771591,2.492536,2.539694,3.498239,2.679734,2.727427,2.835632,2.842348,2.654694,2.913606,3.173546,4.978889,3.038918,2.840956,2.783498,2.666737,2.642944,3.236972,3.141857,3.209601,2.740794,...,2.926606,2.942009,2.93418,2.825963,2.831212,2.950806,2.983356,3.02539,3.663534,3.015731,2.567341,2.898146,3.085922,2.835096,3.479864,2.766694,2.774234,2.620738,2.858251,2.862,2.724685,3.143004,2.750149,3.096379,2.657862,3.935058,2.731653,2.874619,2.929974,2.8035,3.136347,3.223033,2.833306,2.825195,3.249617,3.018112,2.699144,3.390763,2.807715,2.747391,2.787593,2.78231,2.611234,2.703965,2.806723,2.886673,2.776917,2.818946,2.988012,2.639965,2.832025,2.703081,3.011968,2.665403,3.575453,2.733882,2.737667,2.964825,3.950657,2.85375
6,2.628789,2.355987,3.213199,2.142908,2.337426,2.471954,2.11351,2.41154,2.551196,2.460919,2.289933,2.207649,2.416327,2.316233,2.501188,2.640024,2.846412,2.34024,2.428704,3.513749,2.45386,2.132059,2.172954,2.322082,2.562914,2.722244,2.380394,2.400035,2.187684,2.286805,2.368731,2.230441,2.40221,2.174218,2.393639,2.228793,2.98514,2.309103,2.052227,2.132205,2.271815,2.665897,2.720114,2.066867,2.218737,2.350478,2.358576,2.205008,2.44398,2.483836,4.719457,2.544894,2.356898,2.262442,2.127135,2.101439,2.745469,2.550794,2.790246,2.351471,...,2.415947,2.269237,2.249686,2.308368,2.345143,2.488211,2.526727,2.576223,3.302335,2.564872,2.117919,2.425529,2.647046,2.316766,3.09732,2.371404,2.234217,2.139995,2.332676,2.337267,2.215366,2.713376,2.217362,2.659229,2.114478,3.601197,2.1919,2.352702,2.637782,2.280813,2.705662,2.805687,2.347671,2.337876,2.836185,2.520014,2.283401,2.996867,2.441313,2.243233,2.263636,2.257127,2.025955,2.189833,2.284774,2.367416,2.128758,2.33032,2.532223,2.081374,2.313008,2.18874,2.504597,2.125463,3.204341,2.293733,2.321457,2.447701,3.618235,2.336257
7,3.342256,3.132223,3.819004,2.914041,3.241598,3.220357,2.96563,3.200844,2.982064,3.211894,2.912155,3.050178,3.177858,3.147692,3.280024,3.3511,3.515993,3.159009,3.187279,4.075117,3.206489,2.978439,3.015796,3.127742,3.290695,3.416249,3.18041,3.209859,3.025258,3.101642,3.180174,3.064198,3.167137,3.016707,3.194028,3.056275,3.629216,3.097112,3.246012,3.070147,3.069412,3.371521,3.739219,3.04278,3.030337,3.128081,3.134171,3.045735,3.198934,3.437365,5.15106,3.313473,3.132908,3.0809,2.97583,2.954528,3.496008,2.446881,3.038348,3.128827,...,2.983635,2.965041,3.217685,2.965153,2.970155,2.857888,3.26259,2.520457,3.894299,3.29222,2.957317,3.18486,3.356635,3.127595,3.722034,3.143836,3.072533,2.990954,3.1486,3.152003,3.027869,3.409187,3.050804,3.366251,2.96788,4.150755,3.034141,3.163466,3.349334,3.098983,3.403051,3.483105,3.125972,3.118623,3.507719,3.294402,3.077997,3.638868,3.196898,3.048318,3.0846,3.079826,2.926196,3.009238,3.101899,3.174424,3.074955,3.112963,3.266848,2.951863,3.124812,3.008443,3.288774,2.974635,3.811555,3.108504,3.106334,3.245654,4.165545,3.144514
8,2.795412,2.540566,3.350888,2.36734,2.674249,2.648462,2.332084,2.624701,2.271184,2.375672,2.217672,2.438696,2.59662,2.559613,2.720701,2.80598,3.000977,2.573518,2.608142,3.640087,2.631583,2.348351,2.395554,2.535039,2.733556,2.883472,2.599743,2.635687,2.407455,2.502766,2.599453,2.456209,2.583488,2.396701,2.616385,2.446318,3.132868,2.497149,2.679598,2.463626,2.462711,2.830337,3.259665,2.429437,2.413834,2.535458,2.542967,2.433137,2.622372,2.908458,4.814257,2.760934,2.54141,2.477014,2.345042,2.317949,2.977537,2.33344,2.698563,2.536378,...,2.499849,2.497827,2.645213,2.429978,2.43608,2.493509,2.189927,2.595517,3.436454,2.735392,2.321504,2.605185,2.812588,2.534858,3.239937,2.554869,2.466599,2.364204,2.56073,2.564913,2.410735,2.875102,2.439479,2.824057,2.334945,3.72457,2.418608,2.279982,2.803871,2.49947,2.867823,2.962377,2.532856,2.523779,2.991278,2.738017,2.473402,3.144044,2.619887,2.436369,2.481614,2.475678,2.281727,2.387293,2.503084,2.592416,2.469616,2.516782,2.704802,2.314552,2.531423,2.386291,2.731243,2.343525,3.342395,2.511265,2.508578,2.679164,3.741046,2.555704
9,2.317361,2.002575,2.963829,1.905031,2.169665,2.1378,1.562648,2.10829,2.228953,2.12503,1.924429,1.87165,2.073226,2.026685,2.226667,2.330098,2.56158,2.044218,2.087639,3.287258,2.116852,1.752309,1.352677,1.995559,2.242356,2.422859,2.077137,2.121952,1.830758,1.90571,2.076774,1.796893,1.786422,1.714825,1.930662,1.88157,2.714902,1.9472,2.176254,1.860755,1.80136,2.359373,2.860288,1.859569,1.839138,1.996091,2.00562,1.864401,2.031568,2.452542,4.55334,2.275652,1.854452,1.816156,1.405198,1.651857,2.534079,2.411397,2.499021,1.870937,...,2.123345,2.144525,2.133773,1.982332,1.989807,2.156577,2.200905,2.257556,3.060237,2.244594,1.572327,2.083944,2.338051,1.927012,2.837785,2.020691,1.907864,1.729911,2.028095,2.033375,1.835069,1.876066,1.648533,2.351836,1.537659,3.38057,1.8454,2.051098,2.327558,1.808206,2.404213,2.516249,1.992785,1.902815,2.550211,2.247793,1.916651,2.727791,2.102294,1.868617,1.927237,1.919588,1.66196,1.804162,1.722285,2.067959,1.911763,1.972314,2.207212,1.706747,1.990964,1.802836,2.239536,1.745836,2.954223,1.96527,1.961835,2.17572,3.398714,2.021746


In [191]:
Ans_list = []
for i in range(len(df)):
    indx_highest_cs = df.iloc[i].argmin()
#     print(indx_highest_cs)
    Ans_list.append(['Question: ' + list_qs[i],'Answer: '+list_article[indx_highest_cs]])

In [192]:
Ans_list

[['Question: What age group has the highest rate of severe outcomes?',
  'Answer: A CDC Morbidity & Mortality Weekly Report that looked at severity of disease among COVID-19 cases in the United States by age group found that 80% of deaths were among adults 65 years and older with the highest percentage of severe outcomes occurring in people 85 years and older.'],
 ['Question: How is COVID-19 spread?',
  'Answer: On March 16, the White House announced a program called “15 Days to Slow the Spread,”pdf iconexternal icon which is a nationwide effort to slow the spread of COVID-19 through the implementation of social distancing at all levels of society.'],
 ['Question: How many states in the U.S. have reported cases of COVID-19?',
  'Answer: All 50 states have reported cases of COVID-19 to CDC.'],
 ['Question: When did the White House launch the "15 Days to Slow the Spread" program?',
  'Answer: On March 16, the White House announced a program called “15 Days to Slow the Spread,”pdf iconext

In [133]:
list_article[50]

'Severity'

In [None]:
import sklearn
from sklearn.metrics import pairwise_distances

def find_solutions(qs, article):
    
    
    return mylist
    # add your code

In [134]:
list_article

['CDC Summary 21 MAR 2020, https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/summary.html',
 'This is a rapidly evolving situation and CDC will provide updated information and guidance as it becomes available.',
 'Updated March 21,',
 '2020  CDC is responding to a pandemic of respiratory disease spreading from person-to-person caused by a novel (new) coronavirus.',
 'The disease has been named “coronavirus disease 2019” (abbreviated “COVID-19”).',
 'This situation poses a serious public health risk.',
 'The federal government is working closely with state, local, tribal, and territorial partners, as well as public health partners, to respond to this situation. COVID-19 can cause mild to severe illness; most severe illness occurs in older adults.',
 'Situation in U.S. Different parts of the country are seeing different levels of COVID-19 activity.',
 'The United States nationally is in the initiation phase of the pandemic.',
 'States in which community spread is occurring are in t

In [146]:
x = [1,2,3,4]

In [147]:
print(x)

[1, 2, 3, 4]


In [150]:
print(*x,sep=' ')

1 2 3 4


In [None]:
6:30 -7:30 IST
        
9:30 

In [152]:
15*75

1125