<a href="https://colab.research.google.com/github/Zidiefeng/covid19-challenge/blob/Round_2/text_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import json
from datetime import datetime
from datetime import timedelta
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer


In [0]:
data = fetch_20newsgroups()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [0]:
filepaths = data.filenames.astype(str)
filepaths

array(['/root/scikit_learn_data/20news_home/20news-bydate-train/rec.autos/102994',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51861',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51879',
       ...,
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.ibm.pc.hardware/60695',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38319',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/rec.motorcycles/104440'],
      dtype='<U86')

In [0]:
data.target

array([7, 4, 4, ..., 3, 1, 8])

In [0]:
keys = []
for path in filepaths:
    keys.append(os.path.split(path)[1])

keys[0:5]

['102994', '51861', '51879', '38242', '60880']

In [0]:
# Create a dataframe with metadata columns
metadata = pd.DataFrame(keys, columns = ['Document_ID'] )
metadata['Group'] = data.target
metadata['Text'] = data.data

In [0]:
# Get stats on dataset
print ("Rows:",len(metadata))
print ("Distinct IDs:",len(metadata['Document_ID'].drop_duplicates()))
print ("Distrinct Groups:",len(metadata['Group'].drop_duplicates()))

Rows: 11314
Distinct IDs: 9840
Distrinct Groups: 20


In [0]:
# Example of filename repetition
metadata.loc[[5392,5680]]

Unnamed: 0,Document_ID,Group,Text
5392,76139,6,From: jllee@acsu.buffalo.edu (Johnny L Lee)\nS...
5680,76139,17,From: hovig@uxa.cso.uiuc.edu (Hovig Heghinian)...


In [0]:
# One group for each ID
# Some of the IDs have been re-used within different groups. Making sure that no ID is repeated.
metadata['Document_ID'] = metadata['Document_ID'].astype(int)
dupes = metadata.Document_ID.duplicated(keep='first')
max_id = metadata.Document_ID.max() + 1
new_id = range(max_id, max_id + dupes.sum())
metadata.loc[dupes, 'Document_ID'] = new_id

print ("Rows:",len(metadata))
print ("Distinct IDs:",len(metadata['Document_ID'].drop_duplicates()))
print ("Distrinct Groups:",len(metadata['Group'].drop_duplicates()))

# Test
metadata.loc[[5392,5680]]

Rows: 11314
Distinct IDs: 11314
Distrinct Groups: 20


Unnamed: 0,Document_ID,Group,Text
5392,76139,6,From: jllee@acsu.buffalo.edu (Johnny L Lee)\nS...
5680,179489,17,From: hovig@uxa.cso.uiuc.edu (Hovig Heghinian)...


In [0]:
# Add a derived column for lengths of texts
lengths = []
for i in metadata['Text']:
    lengths.append(len(i))

metadata['Length'] = lengths
metadata.head()


Unnamed: 0,Document_ID,Group,Text,Length
0,102994,7,From: lerxst@wam.umd.edu (where's my thing)\nS...,721
1,51861,4,From: guykuo@carson.u.washington.edu (Guy Kuo)...,858
2,51879,4,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,1981
3,38242,1,From: jgreen@amber (Joe Green)\nSubject: Re: W...,815
4,60880,14,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,1120


In [0]:
# Assign dates to news articles
np.random.seed(123)
date_list = pd.date_range(datetime.date(datetime.now()) - timedelta(180), datetime.date(datetime.now()), freq='D')
dates = []
for i in metadata['Document_ID']:
    dates.append(np.random.choice(date_list, replace = True))
    
metadata['Date'] = dates
metadata.head()

Unnamed: 0,Document_ID,Group,Text,Length,Date
0,102994,7,From: lerxst@wam.umd.edu (where's my thing)\nS...,721,2020-03-24
1,51861,4,From: guykuo@carson.u.washington.edu (Guy Kuo)...,858,2020-04-10
2,51879,4,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,1981,2020-02-10
3,38242,1,From: jgreen@amber (Joe Green)\nSubject: Re: W...,815,2020-03-13
4,60880,14,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,1120,2019-12-23


In [0]:
# Final Metadata
metadata = metadata[['Document_ID', 'Group', 'Length', 'Date']]
metadata.head()

Unnamed: 0,Document_ID,Group,Length,Date
0,102994,7,721,2020-03-24
1,51861,4,858,2020-04-10
2,51879,4,1981,2020-02-10
3,38242,1,815,2020-03-13
4,60880,14,1120,2019-12-23


In [0]:
# Split data into existing corpus and test articles
corpus = data.data[0:11000]
corpus_keys = keys[0:11000]
print (len(corpus_keys))

test = data.data[11000:11314]
test_keys = keys[11000:11314]
print (len(test_keys))

11000
314


In [0]:
len(data.data)

11314

In [0]:
# Split data into existing corpus and test articles
corpus = data.data[0:11000]
corpus_keys = keys[0:11000]
print (len(corpus_keys))

test = data.data[11000:11314]
test_keys = keys[11000:11314]
print (len(test_keys))

11000
314


In [0]:
# Use scikit-learn tfidf vectorizer to build a TFIDF Matrix for the corpus
tfidf = TfidfVectorizer()
corpus_tfidf_matrix = tfidf.fit_transform(corpus)
corpus_tfidf_matrix.shape

(11000, 128684)

In [0]:
# Use scikit-learn tfidf vectorizer to build a TFIDF matrix for test articles.
test_tfidf_matrix = tfidf.transform(test)
test_tfidf_matrix.shape

(314, 128684)

In [0]:
test_tfidf_matrix[[1]]

<1x128684 sparse matrix of type '<class 'numpy.float64'>'
	with 81 stored elements in Compressed Sparse Row format>

In [0]:
# Calculate similarity scores using our trained TFIDF Matrix and test keys.
similarity_scores = cosine_similarity(corpus_keys, corpus_tfidf_matrix, test_keys, test_tfidf_matrix).rename(columns = {'KF1':'ID1', 'KF2':'ID2'})
similarity_scores = similarity_scores.sort_values('Similarity_Score',ascending = False)
similarity_scores[['ID1', 'ID2']] = similarity_scores[['ID1', 'ID2']].astype(int)
display(similarity_scores.head())

print ("Number of combination of similarity scores:", len(similarity_scores))

TypeError: ignored

In [0]:
corpus_keys[0]

'102994'

In [0]:
cosine_similarity(corpus_tfidf_matrix, test_tfidf_matrix).shape

(11000, 314)

In [0]:
cosine_similarity(corpus_tfidf_matrix, test_tfidf_matrix)

array([[0.0507121 , 0.05995096, 0.05243065, ..., 0.04418013, 0.0442273 ,
        0.03479337],
       [0.02801882, 0.03766755, 0.03485824, ..., 0.07320681, 0.0692839 ,
        0.02566053],
       [0.08634991, 0.09948231, 0.08247619, ..., 0.07539519, 0.06165261,
        0.02481964],
       ...,
       [0.02349242, 0.03490878, 0.06243049, ..., 0.03587715, 0.04580663,
        0.0177015 ],
       [0.07213005, 0.05449305, 0.05942875, ..., 0.04274488, 0.08231781,
        0.0273951 ],
       [0.05906101, 0.0703093 , 0.06023365, ..., 0.06072288, 0.04958232,
        0.01220954]])

In [0]:
corpus_tfidf_matrix[1][1]

IndexError: ignored

In [0]:
tfidf = TfidfVectorizer()
corpus_tfidf_matrix = tfidf.fit_transform(text)
corpus_tfidf_matrix.shape

(2505, 205359)

In [0]:
questions = [
  'Development of a point-of-care test and rapid bed-side tests',
  'Diagnosing SARS-COV-2 with Nucleic-acid based tech',
  'Diagnosing SARS-COV-2 with antibodies'
]
test_tfidf_matrix = tfidf.transform(questions)
test_tfidf_matrix.shape

(3, 205359)

In [0]:
sim_matrix = cosine_similarity(corpus_tfidf_matrix, test_tfidf_matrix)

In [0]:
sim_matrix.shape

(2505, 3)

In [0]:
lis = []
threshold = 0.25
for i in range(sim_matrix.shape[0]):
  for j in range(sim_matrix.shape[1]):
    if sim_matrix[i][j] > threshold:
      lis.append([i, j, sim_matrix[i][j]])

In [0]:
lis

[[401, 2, 0.2779109468122042],
 [502, 2, 0.28150386452310255],
 [1301, 2, 0.28777243232792077],
 [1342, 2, 0.25542596694844993],
 [1600, 2, 0.25643242336569444],
 [2456, 2, 0.2504504732192753]]

In [0]:
sim_matrix[i][j]

0.007292113072635005

In [0]:
data= []
data_bad=[]
with open("sm_df.txt", 'r') as f:
    for line in f:
        try: #print(line)
            data.append(json.loads(line))
        except:
            #print(line)
            print("no")

In [0]:
text=[]
for i in range(len(data)):
  tex=""
  for key in data[i]["text_dict"].keys():
    tex= tex+ data[i]["text_dict"][key]
  text.append(tex)

In [34]:
!pip install transformers
import torch
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=443.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1340675298.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [0]:
def answer_question(question, answer_text):
    '''
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer. Prints them out.
    '''
    # ======== Tokenize ========
    # Apply the tokenizer to the input text, treating them as a text-pair.
    input_ids = tokenizer.encode(question, answer_text,max_length=500
                                )

    # Report how long the input sequence is.
    #print('Query has {:,} tokens.\n'.format(len(input_ids)))

    # ======== Set Segment IDs ========
    # Search the input_ids for the first instance of the `[SEP]` token.
    sep_index = input_ids.index(tokenizer.sep_token_id)

    # The number of segment A tokens includes the [SEP] token istelf.
    num_seg_a = sep_index + 1

    # The remainder are segment B.
    num_seg_b = len(input_ids) - num_seg_a

    # Construct the list of 0s and 1s.
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # Run our example question through the model.
    start_scores, end_scores = model(torch.tensor([input_ids]), # The tokens representing our input text.
                                    token_type_ids=torch.tensor([segment_ids])) # The segment IDs to differentiate question from answer_text

    # ======== Reconstruct Answer ========
    # Find the tokens with the highest `start` and `end` scores.
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)
    
    
    # Get the string versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token.
    answer = tokens[answer_start]

    # Select the remaining answer tokens and join them with whitespace.
    for i in range(answer_start + 1, answer_end + 1):
        
        # If it's a subword token, then recombine it with the previous token.
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
        
        # Otherwise, add a space then the token.
        else:
            answer += ' ' + tokens[i]
            
    s_scores = start_scores.detach().numpy().flatten()
    e_scores = end_scores.detach().numpy().flatten()
    #print('score:'+(start_scores)+"; "+str(end_scores))
    #print('score:'+str(max(s_scores))+"; "+str(min(e_scores)))
    #print(str(tensor[torch.argmax(start_scores)]))
    #print('Answer: "' + answer + '"')
    #[answer,str(max(s_scores)),len(input_ids)]
    return answer

In [38]:
questions[2]

'Diagnosing SARS-COV-2 with antibodies'

In [39]:
answer_question(questions[2], text[1301])

'[CLS] diagnosing sars - cov - 2 with antibodies [SEP]'