# <center>HW 4: Text preprocessing</center>

In [None]:
import nltk, re, json, string
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
import numpy as np  
import pandas as pd
from nltk.corpus import stopwords
import spacy

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Q1: Regular Expression (2 points)

Suppose you have scraped the text shown below from an online source. You'd like to extract data using regular expression.

Define an **extract** function which:
- Takes a piece of text (in the format shown below) as an input
- Extracts data into a list of tuples using regular expression, e.g.  `[('BTC-USD','56,212.15','-58.16','-0.10%'), ('ETH-USD',  ...), ...]`
- Returns the list of tuples

In [None]:
text='''Symbol   Last Price  Change   % Change   Note
                  BTC-USD  56,212.15   -58.16   -0.10%   Bitcoin 
                  ETH-USD  1,787.79    -53.63   -2.91%   Ether
                  BNB-USD  1,101,290.51      +5.81    +2.04%   Binance
                  USDT-USD 1.0003      -0.0004  -0.04%   Tether
                  ADA-USD  1.1187      -0.0528  -4.51%   Cardano
      '''
text

'Symbol   Last Price  Change   % Change   Note\n                  BTC-USD  56,212.15   -58.16   -0.10%   Bitcoin \n                  ETH-USD  1,787.79    -53.63   -2.91%   Ether\n                  BNB-USD  1,101,290.51      +5.81    +2.04%   Binance\n                  USDT-USD 1.0003      -0.0004  -0.04%   Tether\n                  ADA-USD  1.1187      -0.0528  -4.51%   Cardano\n      '

In [None]:
# Define the function

def extract(text):
    """Pull quote rows out of a scraped price table.

    Every stretch of text that matches the row pattern yields one 4-tuple of
    strings: (symbol, last price, change, % change). Text that does not match
    the pattern is skipped.

    :param text: the scraped table as a single string
    :return: list of 4-tuples of strings, in order of appearance
    """
    # Groups: hyphenated symbol, price (digits , .), signed change, signed % change.
    row_pattern = re.compile(r"(\w+-\w+)\s+([0-9,.]+)\s+([0-9-.+]+)\s+([0-9-+%.]+)")

    return [match.groups() for match in row_pattern.finditer(text)]
    

In [None]:
# Test the function

extract(text)

[('BTC-USD', '56,212.15', '-58.16', '-0.10%'),
 ('ETH-USD', '1,787.79', '-53.63', '-2.91%'),
 ('BNB-USD', '1,101,290.51', '+5.81', '+2.04%'),
 ('USDT-USD', '1.0003', '-0.0004', '-0.04%'),
 ('ADA-USD', '1.1187', '-0.0528', '-4.51%')]

## Q2: Collocation (3 points)

Define a function `top_collocation(doc, K)` to find top-K collocations in specific patterns in a document as follows:
  - Takes a document (i.e. `doc`) and `K` as inputs
  - Find collocations as follows:
    - Tokenize the document and find POS tag of each token (hint: you can use NLTK word tokenizer or Spacy tokenizer).
    - Create bigrams from the tokens with POS tags.

    - Keep only bigrams matching the following patterns:
       - `Adj + Noun`: e.g. linear function
       - `Noun + Noun`: e.g. regression coefficient
    - Get frequency of each bigram (hint: you can use nltk.FreqDist)
    - Returns top K collocations by frequency

In [None]:
import nltk
# Fetch the NLTK resources the notebook relies on.
# Tokenizer models (used by nltk.word_tokenize / nltk.sent_tokenize).
nltk.download('punkt')
# Model behind nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')
# NOTE(review): the genesis corpus is not referenced in the visible cells — confirm it is needed.
nltk.download('genesis')
# Stop-word lists read through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

[nltk_data] Downloading package genesis to /root/nltk_data...
[nltk_data]   Unzipping corpora/genesis.zip.


True

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Define the function

def top_collocation(doc, K):
    """Return the K most frequent `Adj + Noun` / `Noun + Noun` bigrams in a document.

    The document is lower-cased, tokenized with a regular expression, POS-tagged,
    and turned into bigrams. A bigram is kept only when its first word is an
    adjective or a noun AND its second word is a noun.

    :param doc: the document text
    :param K: number of collocations to return
    :return: list of ((word1, word2), count) pairs, most frequent first
    """
    # Tokens are runs of word characters, optionally with inner hyphens (e.g. "covid-19").
    pattern = r'\w[\w\-]*\w'
    tokens = nltk.regexp_tokenize(doc.lower(), pattern)

    tagged_tokens = nltk.pos_tag(tokens)

    # 'JJ' also covers JJR/JJS and 'NN' covers NNS/NNP/NNPS via the prefix test.
    # Both conditions are combined with a single `and`: the previous chained
    # `or ... and ...` expression accepted any bigram that merely started with
    # an adjective, regardless of the second word's tag.
    phrases = [
        (first_word, second_word)
        for (first_word, first_tag), (second_word, second_tag) in nltk.bigrams(tagged_tokens)
        if first_tag.startswith(('JJ', 'NN')) and second_tag.startswith('NN')
    ]

    return nltk.FreqDist(phrases).most_common(K)

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# there (the qa.json path used in later cells) can be opened.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the QA file inside a context manager so the file handle is closed
# as soon as the JSON has been parsed.
with open("/content/drive/MyDrive/BIA_660_Web_Mining/assignments/Homework4/qa.json","r") as qa_file:
    data = json.load(qa_file)
article = data["context"]

top_collocation(article, 10)

[(('public', 'health'), 16),
 (('community', 'spread'), 9),
 (('united', 'states'), 7),
 (('higher', 'risk'), 4),
 (('covid-19', 'illness'), 4),
 (('elevated', 'risk'), 4),
 (('new', 'coronavirus'), 3),
 (('health', 'threat'), 3),
 (('new', 'virus'), 3),
 (('respiratory', 'disease'), 2)]

## Q3: Question and Answering (QA) System (5 points)

Develop a QA system which allows you to search for answers in an article. For example, the file `qa.json` contains a research article. This article can answer a number of questions about COVID-19. You will design a solution to automatically search for answers to these questions in this article.

`qa.json` is taken from https://github.com/deepset-ai/COVID-QA. This file contains a few questions, and answers to these questions have been located in the article. Let's define a QA system and check if your system can locate the right answers.

The following script helps you understand `qa.json`:

In [None]:
# Retrieve the article. The file is opened in a context manager so the handle
# is closed as soon as the JSON has been parsed.
with open("/content/drive/MyDrive/BIA_660_Web_Mining/assignments/Homework4/qa.json","r") as qa_file:
    data = json.load(qa_file)
article = data["context"]

# A long article. Just print the first 200 characters
print(article[0:200])

CDC Summary 21 MAR 2020,
https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/summary.html

This is a rapidly evolving situation and CDC will provide updated information and guidance as it becomes 


In [None]:
# Retrieve all the questions and answers
qas = data["qas"]

# Show the first question-answer pair. Each entry carries the question text, an id,
# a list of answers (answer text plus `answer_start`; this answer starts at the
# 6117th character) and an `is_impossible` flag.
print(qas[0])

# Collect just the question strings; these are the queries for the QA system
qs = [item["question"] for item in qas]
qs

{'question': 'What age group has the highest rate of severe outcomes?', 'id': 236, 'answers': [{'text': 'people 85 years and older', 'answer_start': 6117}], 'is_impossible': False}


['What age group has the highest rate of severe outcomes?',
 'How is COVID-19 spread?',
 'How many states in the U.S. have reported cases of COVID-19?',
 'When did the White House launch the "15 Days to Slow the Spread" program?',
 'What should mildly-ill patients do?',
 'What type of virus is SARS-CoV-2?',
 'What viruses are similar to the COVID-19 coronavirus?',
 'What are the phases of a pandemic?',
 'At which phase does the peak of the pandemic occur?',
 'People with which medical conditions have a higher rate of severe illness?',
 'What kind of test can diagnose COVID-19?',
 'In what species did the COVID-19 virus likely originate?',
 'What risk factors should be considered in addition to clinical symptoms?']

Next, follow the instructions below step by step to develop the QA system.

### Q3.1. Tokenizer

Define a function `tokenize(doc)`  as follows:
   - Take a piece of text (i.e. variable `doc`) as an input
   - Split the input text into unigrams
   - Clean up tokens as follows:
       - Lemmatize all unigrams
       - Remove all stop words
       - Remove all punctuations
       - Convert all unigrams to the lower case 
       - remove empty unigrams
   - Return the list of unigrams after all the processing. (Hint: you can use spacy package for this task. To test if a token is stop word or punctuation, check https://spacy.io/api/token#attributes)

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
pd.set_option('display.max_columns', None)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Define the function
import spacy
import en_core_web_sm
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS

# Cached spaCy pipeline. Loading the model is slow, so it is done once on
# first use instead of on every tokenize() call.
_NLP = None


def _get_nlp():
    """Return the shared English spaCy pipeline, loading it on first use."""
    global _NLP
    if _NLP is None:
        _NLP = en_core_web_sm.load()
    return _NLP


def tokenize(doc):
    """Split text into cleaned, lemmatized, lower-case unigrams.

    Stop words, punctuation, whitespace tokens and empty strings are dropped;
    every remaining token is replaced by its lower-cased lemma.

    :param doc: the text to tokenize
    :return: list of lower-case lemma strings, in document order
    """
    tokens = []

    for word in _get_nlp()(doc):
        # Whitespace runs (e.g. newlines) are neither stop words nor
        # punctuation, so they need their own check.
        if word.is_stop or word.is_punct or word.is_space:
            continue
        # Work on a copy of the lemma rather than assigning back to the token.
        lemma = word.lemma_.strip().lower()
        if lemma:
            tokens.append(lemma)

    return tokens

In [None]:
doc = 'Older people and people of all ages with severe chronic medical conditions — \
like heart disease, lung disease and diabetes, \
for example — seem to be at higher risk of developing serious COVID-19 illness.'

print(tokenize(doc))

['old', 'people', 'people', 'age', 'severe', 'chronic', 'medical', 'condition', 'like', 'heart', 'disease', 'lung', 'disease', 'diabetes', 'example', 'high', 'risk', 'develop', 'covid-19', 'illness']


### Q3.2. Compute TF-IDF Matrix

Define a function `compute_tfidf(docs)` as follows: 

- Take `docs`, a list of documents (e.g. a list of questions) as an input
- Tokenize each document in `docs` using the `tokenize` function defined in Q3.1. 
- Calculate tf_idf weights as shown in lecture notes (Hint: you can reuse the last code segment in NLP Lecture Notes (II))
- Return a smoothed normalized `tf_idf` array. (The result may differ a little bit depending on the tokenize function and packages you use.)

In [None]:
# Define the function

def get_doc_tokens(doc):
    """Count the content tokens of one document.

    The document is lower-cased and split with NLTK's word tokenizer; English
    stop words and punctuation tokens are discarded.

    :param doc: the document text
    :return: nltk.FreqDist mapping each remaining token to its count
    """
    # A set gives constant-time membership tests; the stop-word list would
    # otherwise be scanned once per token.
    stop_words = set(stopwords.words('english'))

    stripped = (token.strip() for token in nltk.word_tokenize(doc.lower()))
    # `in string.punctuation` is a substring test, so it removes single
    # punctuation characters and empty strings alike.
    tokens = [token for token in stripped
              if token not in stop_words and token not in string.punctuation]

    return nltk.FreqDist(tokens)
  
def compute_tfidf(docs):
    """Build a smoothed, L2-normalized TF-IDF matrix for a list of documents.

    Row i always corresponds to docs[i]; a document with no tokens left after
    cleaning becomes an all-zero row. Columns are the vocabulary terms.

    NOTE(review): the assignment text asks for the spaCy-based `tokenize`
    from Q3.1; this implementation tokenizes through `get_doc_tokens`.

    :param docs: list of document strings
    :return: numpy array of shape (len(docs), vocabulary size)
    """
    documents = {idx: get_doc_tokens(doc) for idx, doc in enumerate(docs)}

    dtm = pd.DataFrame.from_dict(documents, orient="index").fillna(0)
    # A document whose token count dict is empty can be left without a row by
    # from_dict; reindexing restores it as zeros and puts rows in input order,
    # so callers can rely on row i <-> docs[i].
    dtm = dtm.reindex(range(len(docs)), fill_value=0)

    tf_array = dtm.values.astype(float)
    doc_len = tf_array.sum(axis=1, keepdims=True)
    # Divide by 1 instead of 0 for empty documents; their row stays all zero.
    tf = np.divide(tf_array, np.where(doc_len == 0, 1, doc_len))

    # Document frequency: in how many documents each term occurs.
    df = np.where(tf > 0, 1, 0)

    smoothed_idf = np.log(np.divide(len(docs) + 1, np.sum(df, axis=0) + 1)) + 1
    smoothed_tf_idf = tf * smoothed_idf

    # Scale each row to unit length (zero rows are left untouched).
    norms = np.linalg.norm(smoothed_tf_idf, axis=1, keepdims=True)
    return smoothed_tf_idf / np.where(norms == 0, 1, norms)

In [None]:
# Test the function using three questions

np.set_printoptions(precision=2)
compute_tfidf(qs[0:3])

array([[0.28, 0.28, 0.28, 0.28, 0.28, 0.28, 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.64, 0.85, 0.  , 0.  , 0.  ,
        0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.21, 0.  , 0.28, 0.28, 0.28,
        0.28, 0.28]])

### Q3.3. Put Everything Together

Define a function `find_solutions(qs, article)` as follows: 

- Take two inputs:
    - `qs`: a list of questions (i.e. strings)
    - `article`: a document which may contain answers to the questions
- Segment the article into sentences (i.e. `sents`). You will locate the sentence which can answer a question.
- Concatenate the questions (`qs`) and sentences (`sents`) into a single list (i.e. `qs + sents`)
- Call the function `compute_tfidf` defined in Q3.2 with `qs + sents` to get a `TF-IDF` matrix. (Note, now `qs` and `sents` are converted to TF-IDF vectors in the same dimension. As a result, you can measure their similarities.) 
- Split the `TF-IDF` matrix into two sub matrices, one corresponding to `qs` and the other for `sents`. 
- Next, calculate the pairwise cosine similarity between the `qs` and `sents`. With $m$ questions and $n$ sentences, you should get a $m \times n$ matrix. (Hint: you can use `sklearn.metrics.pairwise_distances` with `metric='cosine'` to calculate pairwise distances between two matrices; cosine similarity is $1 - $ cosine distance.)
- Finally, the answer to each question is the sentence which has the `maximum similarity` to it. 
- Print out each question and its matched answer. Check if your QA system is able to find the right answer.(Depending on the packages you use, your answer might be a bit different from mine.)

In [None]:
# Define the function


import sklearn
def find_solutions(qs, article):
    """Match every question with the most similar sentence of the article.

    Questions and article sentences are vectorized together with
    `compute_tfidf`, so they share one vocabulary. The answer to a question is
    the sentence with the smallest cosine distance (i.e. the largest cosine
    similarity) to it.

    :param qs: list of question strings
    :param article: the document that may contain the answers
    :return: list of ["Question: ...", "Answer: ..."] pairs, one per question;
             empty when there are no questions or no sentences
    """
    sentences = nltk.sent_tokenize(article)
    if not qs or not sentences:
        return []

    tfidf = compute_tfidf(qs + sentences)
    question_vectors = tfidf[:len(qs)]
    sentence_vectors = tfidf[len(qs):]

    # The metric must be stated explicitly: pairwise_distances defaults to
    # Euclidean distance, which ranks sentences differently from cosine
    # similarity when the vectors are not unit length.
    distances = pairwise_distances(question_vectors, sentence_vectors, metric='cosine')
    best_sentence = np.argmin(distances, axis=1)

    return [["Question: " + question, "Answer: " + sentences[idx]]
            for question, idx in zip(qs, best_sentence)]
                                      

In [None]:
# Test the system

find_solutions(qs, article)

[['Question: What age group has the highest rate of severe outcomes?',
  'Answer: A CDC Morbidity & Mortality Weekly Report that looked at severity of disease among COVID-19 cases in the United States by age group found that 80% of deaths were among adults 65 years and older with the highest percentage of severe outcomes occurring in people 85 years and older.'],
 ['Question: How is COVID-19 spread?',
  'Answer: CDC Recommends\nEveryone can do their part to help us respond to this emerging public health threat:\nOn March 16, the White House announced a program called “15 Days to Slow the Spread,”pdf iconexternal icon which is a nationwide effort to slow the spread of COVID-19 through the implementation of social distancing at all levels of society.'],
 ['Question: How many states in the U.S. have reported cases of COVID-19?',
  'Answer: All 50 states have reported cases of COVID-19 to CDC.'],
 ['Question: When did the White House launch the "15 Days to Slow the Spread" program?',
  'An