## Assignment 2 
### Jiaxuan  a1876484

### 1. Reading dataset and pre-processing

In [1]:
import json
import os
import csv
import numpy as np
import pandas as pd
import random
import string
from collections import defaultdict
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import preprocess_string
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import bigrams
from nltk import pos_tag
from collections import Counter

def preprocess_text(text):
    """
    Preprocess the text input for both dataset articles and queries.

    The text is tokenized with NLTK, lower-cased, stripped of English stop
    words, punctuation tokens and purely numeric tokens, and the remaining
    tokens are lemmatized.

    text: Text in string.

    Returns the cleaned tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer() # define lemmatizer
    stop_words = set(stopwords.words('english')) # use stop_words
    punctuation = set(string.punctuation)

    cleaned = []
    for token in word_tokenize(text):
        token = token.lower()
        if token in stop_words: # eliminate the stopwords
            continue
        # eliminate punctuation, including multi-character tokens such as
        # "``", "''" or "..." that word_tokenize emits; a plain substring
        # test against string.punctuation lets those through
        if all(ch in punctuation for ch in token):
            continue
        # eliminate standalone numbers, keep words like covid-19
        if token.isnumeric():
            continue
        cleaned.append(lemmatizer.lemmatize(token)) # lemmatize rest words
    return " ".join(cleaned) # return a string after preprocess


def read_and_process_data(file_path, num_samples = 1000):
    """
    Randomly load and process the dataset and save in a dataframe in the end.

    file_path: Path to the folder containing the JSON files
    num_samples: Number of samples to read from the folder (default: 1000).
                 If the folder holds fewer files, all of them are read.

    Returns a dataframe with the columns 'paper_id' and 'paragraphs', where
    'paragraphs' is the preprocessed body text of the article.
    """
    files = os.listdir(file_path) # get all the file names in the folder
    # never ask random.sample for more files than exist (it would raise ValueError)
    sample = random.sample(files, min(num_samples, len(files)))

    rows = []
    for file in sample: # read files, extract paper_id and texts in bodytext
        # os.path.join works whether or not file_path ends with a separator
        with open(os.path.join(file_path, file), encoding='utf-8') as f:
            data = json.load(f)
        # join paragraphs with a space so that the last word of one paragraph
        # is not fused with the first word of the next one
        bodytext = ' '.join(section['text'] for section in data['body_text'])
        rows.append([data['paper_id'], bodytext])

    # build the dataframe once instead of appending row by row
    df_sample = pd.DataFrame(rows, columns=['paper_id', 'paragraphs'])
    df_sample["paragraphs"] = df_sample["paragraphs"].apply(preprocess_text)# preprocess all the paragraphs.
    return df_sample
# Here we decided to use a dataframe to store the preprocessed text of each article, because we will use word2vec to transform the text into word embeddings,
# and the "distance" between those embeddings is what the retrieval step relies on.

### 2. Named Entity Recognition and Knowledge Base

In [2]:
def create_knowledge_base(articles):
    """
    Build an alias knowledge base from dependency patterns in the articles.

    Two patterns are collected: "X also/sometimes known/called/named Y" and
    noun pairs attached to a coordinating "or". Keys and values are stored as
    lower-cased lemmas.

    articles: Articles after pre-process. Either the dataframe returned by
              read_and_process_data (its 'paragraphs' column is used) or an
              iterable of (paper_id, text) pairs.

    Returns a dict mapping a term to the list of its aliases.
    """
    # local import: spacy is not imported in the notebook's import cell
    import spacy

    nlp = spacy.load("en_core_sci_sm") # more accurate extraction using spaCy's medical model
    kb = {}

    def add_to_kb(key: str, value: str): # define adding the detected words to the dictionary
        if key not in kb:
            kb[key] = [value]
        elif value not in kb[key]:
            kb[key].append(value)

    # Iterating a dataframe directly yields its column labels, not its rows,
    # so pick the text column explicitly when one is available.
    if hasattr(articles, "columns") and "paragraphs" in articles.columns:
        texts = articles["paragraphs"]
    else:
        texts = (paragraphs for _, paragraphs in articles)

    link_deps = {"conj", "appos"}
    noun_tags = {"NOUN", "PROPN"}

    for paragraphs in texts:
        doc = nlp(paragraphs)
        for token in doc:
            # check for aliases using "also/sometimes known as" pattern
            if token.lower_ in {"also", "sometimes"}:
                conj = token.head
                if conj.lower_ in {"known","called","named"}:
                    subject = conj.head
                    obj = [t for t in conj.rights if t.dep_ in {"dobj", "attr"}]
                    if obj: # the verb may have no object; skip instead of raising IndexError
                        # add the word after lemmatizer and lower
                        add_to_kb(subject.lemma_.lower(), obj[0].lemma_.lower())

            # check for aliases using "or" or "cc" pattern
            if token.lower_ == "or" and token.dep_ == "cc":
                # token.lefts / token.rights are generators; materialize them so
                # the right-hand items are still available for every left item
                left = [t for t in token.lefts
                        if t.dep_ in link_deps and t.pos_ in noun_tags]
                right = [t for t in token.rights
                         if t.dep_ in link_deps and t.pos_ in noun_tags]
                for left_item in left:
                    for right_item in right:
                        # add the word after lemmatizer and lower
                        add_to_kb(left_item.lemma_.lower(), right_item.lemma_.lower())

    return kb

# Here I used the simpler method of looking at conjunctions to find aliases,
# but to be more precise I added a part-of-speech check so that only nouns are added to the KB dictionary.

### 3. Indexing method

In [63]:
# 3. Article embeddings
def average_word_vectors(words, model, vocabulary, num_features):
    """
    Average the embeddings of the in-vocabulary words of a token list.

    words: The input list of words
    model: Word2Vec model; `model.wv[word]` must return the word's vector
    vocabulary: Collection of words known to the model
    num_features: Length of the returned vector (the model's embedding size)

    Returns the mean vector of the known words, or an all-zero float32 vector
    of length num_features when none of the words is in the vocabulary.
    """
    # keep only the words the model can embed
    known_words = [word for word in words if word in vocabulary]

    # sum the word vectors one after another, starting from zeros
    total = np.zeros((num_features,), dtype="float32")
    for word in known_words:
        total = total + model.wv[word]

    # nothing to average: hand back the zero vector unchanged
    if not known_words:
        return total
    return total / len(known_words)

def create_article_embeddings(processed_data, embedding_size=200):
    """
    Create article embeddings using Word2Vec.

    A previously saved model ("word2vec.model" in the working directory) is
    loaded when it exists; otherwise a model is trained on the given articles
    and saved to that path.

    processed_data: DataFrame containing the processed articles
    embedding_size: Size of the word embeddings used when a new model has to
                    be trained (default: 200). A loaded model keeps the size
                    it was trained with.

    Returns (model, article_embeddings), where article_embeddings is a list
    with one averaged word vector per article.
    """
    model_path = "word2vec.model"
    if os.path.exists(model_path):
        model = Word2Vec.load(model_path)
    else:
        # no saved model available: train on the articles at hand
        model = Word2Vec([paragraph.split() for paragraph in processed_data["paragraphs"]],
                         vector_size=embedding_size, min_count=3, window=5, workers=4)
        model.save(model_path)

    # build the vocabulary set once instead of once per article
    vocabulary = set(model.wv.index_to_key)
    # use the model's real dimensionality so the vectors always fit
    num_features = model.wv.vector_size

    # compute the average embeddings for each article
    article_embeddings = [average_word_vectors(article.split(), model, vocabulary, num_features)
                          for article in processed_data["paragraphs"]]
    return model, article_embeddings
# Here I use Inverted Index to implement article indexing. In average_word_vectors() we define how text is converted into an averaged word vector,
# then in create_article_embeddings() we obtain the model and compute the word vector of every article, to be compared later with the word vectors of the query words.
# I used the content of 20,000 files as the training data for my model; the size of the model is about 650MB.

### 4. Text matching utility

In [86]:
# 4. Text matching utility

    
def match_articles(query, model,processed_data, article_embeddings, kb, top_k=10,
                   dataset_folder="C:/Users/17820/Downloads/pmc_json/"):
    """
    Match articles based on the given query.

    query: The input query (question)
    model: Word2Vec model trained on the paragraphs
    processed_data: The dataframe of dataset to find the paper_id
    article_embeddings: Precomputed article embeddings
    kb: Knowledge Base containing named entities and their associated terms
    top_k: Number of top matches to return (default: 10)
    dataset_folder: Folder holding the '<paper_id>.xml.json' article files

    Returns (snippet, paper_ids, similarities): for each of the top matches,
    the most query-like original paragraph ('' when the article has no body
    text), its paper_id and the article's cosine similarity to the query.
    All three lists are empty when there is nothing to rank.
    """
    keyword_tags = {'NN','NNP','NNS','NNPS','JJ','JJR','JJS','VB','VBG','VBD','VBN','VBP','VBZ'}
    noun_tags = {'NN','NNP','NNS','NNPS'}

    # preprocess the query, then keep its nouns, verbs and adjectives as key words
    query = preprocess_text(query)
    tagged_query = pos_tag(word_tokenize(query))
    query_entities = [word for word, tag in tagged_query if tag in keyword_tags]

    # add the aliases stored in the KB for every noun of the query
    for word, tag in tagged_query:
        if word in kb and tag in noun_tags:
            query_entities += kb[word]

    # build these once; they are needed for the query and for every paragraph
    vocabulary = set(model.wv.index_to_key)
    num_features = model.wv.vector_size

    # compute the query embedding after combined associated names in kb
    query_embedding = average_word_vectors(query_entities, model, vocabulary, num_features)

    # nothing to rank (note that [-0:] would select every article, not none)
    if top_k <= 0 or len(article_embeddings) == 0:
        return [], [], []

    # calculate the cosine similarity between the query embedding and the article embeddings
    cos_similarities  = cosine_similarity([query_embedding], article_embeddings)[0]

    # get the indices of the top_k most similar articles, best first
    top_n_indices = np.argsort(cos_similarities)[-top_k:][::-1]

    # paper_ids and similarity scores of the top articles for output
    paper_ids = [processed_data.iloc[index]['paper_id'] for index in top_n_indices]
    similarities = [cos_similarities[index] for index in top_n_indices]

    # extract the snippet from each article
    snippet = []
    for paper_id in paper_ids:
        # read the file again to get the original, unprocessed paragraphs
        file = os.path.join(dataset_folder, paper_id + '.xml.json')
        with open(file ,encoding = 'utf-8') as f:
            data = json.load(f)
        body_texts = [section['text'] for section in data['body_text']]

        if not body_texts:
            # keep the output lists aligned even when an article has no body text
            snippet.append('')
            continue

        # preprocess every paragraph the same way as the query and embed it
        snippet_embeddings = [average_word_vectors(preprocess_text(text).split(), model, vocabulary, num_features)
                              for text in body_texts]

        # calculate again the cosine similarity between the query embedding and the snippet embeddings
        cos_similarities_snippet = cosine_similarity([query_embedding], snippet_embeddings)[0]

        # the last index of the ascending argsort is the most similar paragraph
        best_index = np.argsort(cos_similarities_snippet)[-1]

        # add the original version of the most related paragraph
        snippet.append(body_texts[best_index])

    return snippet,paper_ids,similarities
    
_SPACY_PIPELINES = {}


def _load_spacy_pipeline(name):
    """Load the named spaCy pipeline once and reuse it on later calls."""
    if name not in _SPACY_PIPELINES:
        # local import: spacy is not imported in the notebook's import cell
        import spacy
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]


def extract_answer(query, snippet):
    """
    Extract a specific answer from the snippet based on the question word.

    The first wh-word of the query decides what is looked for:
    who/what/which -> first PERSON or ORG entity, when -> first DATE or TIME
    entity, where -> first GPE, LOC or FAC entity, how/why -> first sentence
    that contains a dependency root.

    query: The input query (question)
    snippet: The snippet from the most relevant article

    Returns the answer text, or None when the query has no supported question
    word or the snippet contains nothing suitable.
    """
    entity_labels = {
        "who": ("PERSON", "ORG"),
        "what": ("PERSON", "ORG"),
        "which": ("PERSON", "ORG"),
        "when": ("DATE", "TIME"),
        "where": ("GPE", "LOC", "FAC"),
    }
    sentence_words = ("how", "why")

    # the first wh-tagged token of the query is the question word
    question_word = ""
    for word, pos in pos_tag(word_tokenize(query)):
        if pos in ["WDT", "WP", "WP$", "WRB"]:
            question_word = word.lower()
            break

    # unsupported question: the answer is None, no need to parse the snippet
    if question_word not in entity_labels and question_word not in sentence_words:
        return None

    nlp = _load_spacy_pipeline("en_core_web_sm")
    doc = nlp(snippet)

    if question_word in entity_labels:
        wanted = entity_labels[question_word]
        for ent in doc.ents:
            if ent.label_ in wanted:
                return ent.text
    else:
        for sent in doc.sents:
            if any(token.dep_ == "ROOT" for token in sent):
                return sent.text

    return None
    
    

# In the match_articles() function I combined finding the article in the dataframe with finding the text inside the article; this is also the most important step of the whole system.
# First we extract the noun, verb and adjective parts of the query. Here I chose pos_tag rather than NER because NER cannot identify the key words in the question well:
# they may be common nouns (face mask) or superlatives (largest), so extracting them all captures the semantics better.
# Then for each noun keyword I searched the KB for aliases and added them to the tokens of the query, which helps to improve the accuracy of the search and increases the weight of keywords.
# For the similarity comparison I used cosine similarity between the embedding of the query and the embedding of each article to get the best-matching articles,
# and similarly, for each paragraph in an article, I used this method to find the most relevant snippet; the snippets and their scores (similarity) are the output.

In [7]:
import textwrap
import torch
import pandas as pd
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
#device_available = torch.cuda.is_available()
device_available = False
from IPython.core.display import display, HTML
import seaborn as sns
import matplotlib.pyplot as plt

# BERT-large question-answering model from the SQuAD-fine-tuned checkpoint named below.
# This module-level `model` is the one answer_question_dict() calls.
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
# Move the model to the GPU only when device_available is set (it is hard-coded to False above).
if device_available:
    model.cuda()

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

def get_dataset(csv_path):
    """
    Load the metadata CSV and return its usable abstracts as a list of dicts.

    Rows without authors or abstract, rows repeating an earlier abstract and
    rows whose abstract is the literal 'Unknown' are dropped.

    csv_path: Path to the metadata CSV file

    Returns a list of dicts with the keys 'abstract', 'title', 'authors',
    'url' (taken from the 'doi' column) and 'publish_time'.
    """
    frame = pd.read_csv(csv_path)
    frame = frame.dropna(subset=['authors', 'abstract'])
    frame = frame.drop_duplicates(subset='abstract')
    frame = frame[frame['abstract'] != 'Unknown']

    records = []
    for _, entry in frame.iterrows():
        abstract = entry['abstract']
        # skip anything that is still empty or missing
        if not abstract or pd.isna(abstract):
            continue
        records.append({
            'abstract': abstract,
            'title': entry['title'],
            'authors': entry['authors'],
            'url': entry['doi'],
            'publish_time': entry['publish_time'],
        })
    return records

# Text wrapper limited to 80 columns. NOTE(review): not referenced in the visible part of the file.
wrapper = textwrap.TextWrapper(width=80) 

# Abstract records loaded once at module level; answer_question_dict() reads this global.
corpus = get_dataset('metadata.csv')
def answer_question_dict(question, keyword=None, show_visualization=False):
    '''
    Run the BERT question-answering model over the abstracts of the
    module-level `corpus` and collect the best-scoring answer spans.

    Uses the module-level `model`, `tokenizer`, `corpus` and
    `device_available`.

    question: The question string.
    keyword: Optional string; when given, only abstracts that contain it
             (case-sensitive substring match) are scored.
    show_visualization: Accepted for backward compatibility; not used here.

    Returns a DataFrame indexed by confidence score and sorted by descending
    'Confidence', holding at most 10 answers with the columns 'Answer',
    'Confidence', 'Title', 'Abstract', 'Published', 's_scores', 'e_scores',
    'token_labels', 'visualization_start' and 'visualization_end'.
    '''
    # Best answers found so far, keyed by their (rounded) score.
    answers = {}
    min_score = 0

    for answer_option in corpus:
        abstract = answer_option['abstract']
        if keyword and keyword not in abstract:
            continue

        # ======== Tokenize ========
        # Apply the tokenizer to the input text, treating them as a text-pair.
        input_ids = tokenizer.encode(question, abstract, max_length=512)

        # ======== Set Segment IDs ========
        # Segment A is the question up to and including the first [SEP] token,
        # segment B is the remainder (the abstract).
        num_seg_a = input_ids.index(tokenizer.sep_token_id) + 1
        num_seg_b = len(input_ids) - num_seg_a
        segment_ids = [0]*num_seg_a + [1]*num_seg_b

        # There should be a segment_id for every input token.
        assert len(segment_ids) == len(input_ids)

        # ======== Evaluate ========
        input_ids_tensor = torch.tensor([input_ids])
        segment_ids_tensor = torch.tensor([segment_ids])
        if device_available:
            input_ids_tensor = input_ids_tensor.to('cuda:0')
            segment_ids_tensor = segment_ids_tensor.to('cuda:0')

        # Inference only: no_grad avoids building the autograd graph.
        with torch.no_grad():
            outputs = model(input_ids_tensor, # The tokens representing our input text.
                            token_type_ids=segment_ids_tensor) # The segment IDs to differentiate question from answer_text
        # Index the result instead of tuple-unpacking it: this works for plain
        # tuples as well as for ModelOutput objects, whose iteration yields
        # field names rather than tensors.
        start_scores, end_scores = outputs[0], outputs[1]

        # only review answers with score above threshold
        score = round(torch.max(start_scores).item(), 3)
        if score <= min_score or score <= 0:
            continue

        # ======== Reconstruct Answer ========
        # Find the tokens with the highest `start` and `end` scores.
        answer_start = int(torch.argmax(start_scores))
        answer_end = int(torch.argmax(end_scores))

        # Get the string versions of the input tokens.
        tokens = tokenizer.convert_ids_to_tokens(input_ids)

        # Start with the first token, then append the remaining answer tokens.
        answer = tokens[answer_start]
        for i in range(answer_start + 1, answer_end + 1):
            if tokens[i][0:2] == '##':
                # Subword token: recombine it with the previous token.
                answer += tokens[i][2:]
            else:
                # Otherwise, add a space then the token.
                answer += ' ' + tokens[i]

        # ======== Add Answer to best answers list ========
        # Once a few answers are stored, the lowest stored score becomes the
        # threshold; at 10 answers the weakest one is evicted.
        if len(answers)>4:
            min_score = min(answers)
        if len(answers)==10:
            answers.pop(min_score)

        # Variables needed for graphs
        visualization_start = max(answer_start-20,0)
        visualization_end = min((answer_end+1)+20,len(tokens))
        s_scores = start_scores.cpu().detach().numpy().flatten()
        e_scores = end_scores.cpu().detach().numpy().flatten()

        # The tokens serve as x-axis labels; appending the token index makes
        # every label unique.
        token_labels = ['{:} - {:>2}'.format(token, i) for (i, token) in enumerate(tokens)]

        title_link = '<a href="https://doi.org/'+str(answer_option['url'])+'" target="_blank">' + str(answer_option['title']) +'</a>'
        answers[score] = [answer, score, title_link, abstract, answer_option['publish_time'],
                          s_scores, e_scores, token_labels, visualization_start, visualization_end]

    # Return dataframe with relevant data
    df_columns = ['Answer', 'Confidence', 'Title', 'Abstract', 'Published', 's_scores', 'e_scores', 'token_labels', 'visualization_start', 'visualization_end']
    df = pd.DataFrame.from_dict(answers, orient='index',columns = df_columns)
    df.sort_values(by=['Confidence'], inplace=True, ascending=False)
    return df

  from IPython.core.display import display, HTML


In [8]:
import pickle

def load_or_run_answer_question_dict(question,keyword, cache_dir="/kaggle/input/kaggle"):
    """
    Return the answers for a question, using a pickle file as cache.

    The cache file is '<cache_dir>/<question>.pickle', with spaces and
    question marks in the question replaced by underscores. On a cache miss
    the answers are computed with answer_question_dict() and written to that
    file; if the file cannot be written, a warning is printed and the freshly
    computed result is still returned.

    question: The question string.
    keyword: Keyword filter forwarded to answer_question_dict().
    cache_dir: Folder holding the pickle cache files.
    """
    pickle_name = question.replace(' ','_').replace('?','_')
    path_to_file = f"{cache_dir}/{pickle_name}.pickle"
    print(path_to_file)
    try:
        # NOTE: unpickling executes code stored in the file; only point
        # cache_dir at files you created yourself.
        with open(path_to_file, "rb") as cached:
            return pickle.load(cached)
    except OSError:
        # cache miss: compute the answers now
        df = answer_question_dict(question, keyword)

    try:
        with open(path_to_file, "wb") as cached:
            pickle.dump(df, cached)
    except OSError as err:
        # e.g. a read-only input folder: keep the result, just skip caching
        print(f"Could not write cache file {path_to_file}: {err}")
    return df

### 6. Simple user interface

In [5]:
# Predefined questions. Each entry pairs a 'question' string with an optional
# 'keyword' (None when no keyword is set).
# NOTE(review): presumably meant as (question, keyword) arguments for
# load_or_run_answer_question_dict(); no caller is visible in this file.
questions = [{'question':"Is smoking a risk factor?",'keyword':None},
             {'question':"Is a pre-existing pulmonary disease a risk factor?",'keyword':None},
             {'question':"Do co-existing conditions make the virus more transmissible?",'keyword':None},
             {'question':"Is being a pregnant woman a risk factor?",'keyword':'pregnant'},
             {'question':"Is being a neonate a risk factor?",'keyword':'neonate'},
             {'question':"Are there differences in risk factors associated to socio-economic factors?",'keyword':None},
             {'question':"How does the transmission happen?",'keyword':'transmission'},
             {'question':"What is the reproductive rate?",'keyword':None},
             {'question':"What is the incubation period?",'keyword':None},
             {'question':"What are the modes of transmission?",'keyword':None},
             {'question':"What are the enviromental factors?",'keyword':None},
             {'question':"How long is the serial interval?",'keyword':None},
             {'question':"What is the severity of disease among high risk groups and patients?",'keyword':None},
             {'question':"What is the risk of death among high risk groups and patients?",'keyword':None},
             {'question':"What is the susceptibility of populations?",'keyword':None},
             {'question':"What are the public health mitigation measures that could be effective for control?",'keyword':None}]

In [87]:
def main():
    """
    Run the interactive question-answering loop.

    Loads and preprocesses a random sample of articles, builds the article
    embeddings, then repeatedly reads a question from standard input and
    prints the answer that most of the best-matching snippets agree on.
    Typing 'quit' ends the loop.
    """
    # Load data and preprocess
    dataset_folder = "C:/Users/17820/Downloads/pmc_json/"
    processed_data = read_and_process_data(dataset_folder)
    # The knowledge base is left empty here, so no aliases are added to the query.
    kb = {}
    model, article_embeddings = create_article_embeddings(processed_data)

    # I built a simple user input interface, so that you can freely select the question input and get the result until you type 'quit'
    while True:
        query = input("Enter your question (or 'quit' to exit): \n")
        if query.strip().lower() == 'quit':
            break

        snippet,paper_ids,similarities = match_articles(query, model, processed_data,article_embeddings, kb)

        # keep only the snippets that produced an answer, so None never wins the vote
        candidates = [ans for ans in (extract_answer(query, s) for s in snippet) if ans is not None]
        if candidates:
            # majority vote; on a tie the answer from the best-ranked snippet wins
            answer = Counter(candidates).most_common(1)[0][0]
            print(f"answer is {answer}")
        else:
            print("No answer found.")
        print('------------------------------------------------------------------------------')

if __name__ == "__main__":
    main()
    
# For my custom questions, the answer system gave some fairly relevant answers. In terms of improving performance,
# using more data, more accurate data cleaning and pre-trained models are expected to improve the accuracy and score.

Enter your question (or 'quit' to exit): 
Where the first case of COVID-19 was found?
answer is China
------------------------------------------------------------------------------
Enter your question (or 'quit' to exit): 
When the first case of COVID-19 was found?
answer is 26 March 2020
------------------------------------------------------------------------------


KeyboardInterrupt: Interrupted by user

### 7. References

In [None]:
#TREC-COVID Qrels 1, Retrieved from https://ir.nist.gov/covidSubmit/data.html#cumu

In [1]:
import torch

# Message shown when the optional dependency below is missing. It must be
# defined before the try block; otherwise the except branch itself fails with
# a NameError instead of the intended ImportError.
INSTALL_MSG = "This notebook requires the `transformers` package. Install it with: pip install transformers"

try:
    import transformers
    from transformers import BartTokenizer, BartForConditionalGeneration
except ImportError as err:
    # keep the original failure attached as the cause
    raise ImportError(INSTALL_MSG) from err
from IPython.display import display, Markdown

# Use the GPU when torch can see one, otherwise fall back to the CPU.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Sample news article kept as one string constant. The literal is hard-wrapped
# in the source, so .replace('\n','') strips the newlines to restore the
# running text. NOTE(review): presumably input for the BART summarization
# demo; it is not used in the visible part of the file.
LONG_BORING_TENNIS_ARTICLE = """
 Andy Murray  came close to giving himself some extra preparation time for his w
edding next week before ensuring that he still has unfinished tennis business to
 attend to. The world No 4 is into the semi-finals of the Miami Open, but not be
fore getting a scare from 21 year-old Austrian Dominic Thiem, who pushed him to 
4-4 in the second set before going down 3-6 6-4, 6-1 in an hour and three quarte
rs. Murray was awaiting the winner from the last eight match between Tomas Berdy
ch and Argentina's Juan Monaco. Prior to this tournament Thiem lost in the secon
d round of a Challenger event to soon-to-be new Brit Aljaz Bedene. Andy Murray p
umps his first after defeating Dominic Thiem to reach the Miami Open semi finals
 . Muray throws his sweatband into the crowd after completing a 3-6, 6-4, 6-1 vi
ctory in Florida . Murray shakes hands with Thiem who he described as a 'strong 
guy' after the game . And Murray has a fairly simple message for any of his fell
ow British tennis players who might be agitated about his imminent arrival into 
the home ranks: don't complain. Instead the British No 1 believes his colleagues
 should use the assimilation of the world number 83, originally from Slovenia, a
s motivation to better themselves. At present any grumbles are happening in priv
ate, and Bedene's present ineligibility for the Davis Cup team has made it less 
of an issue, although that could change if his appeal to play is allowed by the 
International Tennis Federation. Murray thinks anyone questioning the move, now 
it has become official, would be better working on getting their ranking closer 
to his. 'If he was 500 in the world they wouldn't be that fussed about it but ob
viously he threatens their position a bit,' said the 27 year-old Scot. ' and he'
s obviously the British number two, comfortably. 'So they can complain but the b
est thing to do is use it in the right way and accept it for what it is, and try
 to use it as motivation whether they agree with it or not. He's British now so 
they've just got to deal with it. Murray stretches for a return after starting h
is quarter final match slowly on the show court . Thiem held nothing back as he 
raced through the opening set, winning it 6-3 with a single break . The young Au
strian is considered to be one of the hottest prospects on the ATP Tour . 'I wou
ld hope that all the guys who are below him now like James (Ward) , Kyle (Edmund
) , Liam (Broady) they will use it as motivation. If he becomes eligible for Dav
is Cup then those guys are going to have to prove themselves. 'It can only be se
en as a positive for those guys using it to try to get better. He's a good playe
r but so are James and Kyle and Liam has improved. Aljaz is there, he's on the t
our every week, the other guys aren't quite there yet.' For the first time Murra
y, who has an encyclopaedic knowledge of the top 100, gave his opinion of Bedene
: 'He's a good player with a very good serve. He's a legitimate top 100 player, 
when he plays Challengers he's there or thereabouts, when he plays on the main t
our he wins matches, it's not like he turns up and always loses in the first rou
nd. Murray's fiancee was once again watching from the stands shaded by a huge br
immed hat . Kim Sears flashes her enormous diamond engagement ring while watchin
g her beau on court . 'He had a bad injury last year (wrist) but has recovered w
ell. I would imagine he would keep moving up the rankings although I don't know 
exactly how high he can go. I've practised with him a couple of times, I haven't
 seen him play loads, but when you serve as well as he does it helps. I would im
agine he' s going to be comfortably in the top 70 or 80 in the world for a while
.' It is understood the Lawn Tennis Association will give background support to 
his case regarding the Davis Cup but have made it clear that the onus is on him 
to lead the way. An official statement said: 'To have another player in the men'
s top 100 is clearly a positive thing for British tennis and so we very much wel
come Aljaz's change in citizenship.' The last comparable switch came twenty year
s ago when Greg Rusedski arrived from Canada. It was by no means universally pop
ular but, like Bedene, he pledged that he was in for the long haul and, in fairn
ess to him, he proved true to his word. Loising the first set shocked Murray int
o life as he raced to a commanding lead in the second . The No 3 seed sent over 
a few glaring looks towards his team before winning the second set . Murray had 
to put such matters aside as he tackled the unusually talented Thiem, a delight 
to watch. Coached by Boris Becker's veteran mentor Gunter Bresnik, he slightly r
esembles Andy Roddick and hits with similar power but more elegance. His single 
handed backhand is a thing of rare beauty. However, he has had a mediocre season
 coming into this event and there was little to forewarn of his glorious shotmak
ing that seemed to catch Murray unawares early on. The world No 4 looked to have
 worked him out in the second, but then suffered one of his periopdic mental lap
ses and let him back in from 4-1 before closing it out with a break. After break
ing him for 3-1 in the decider the Austrian whirlwind burnt itself out. 'He's a 
strong guy who hits the ball hard and it became a very physical match,' said Mur
ray. Murray was presented with a celebratory cake after winning his 500th match 
in the previous round .
""".replace('\n','')

In [3]:
from transformers import pipeline

# Build a Hugging Face pipeline for the "summarization" task using the
# facebook/bart-large-cnn checkpoint. The resulting `summarizer` callable is
# reused by the later cells in this notebook.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Sample news article used as a first smoke test of the summarizer.
ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
2010 marriage license application, according to court documents.
Prosecutors said the marriages were part of an immigration scam.
On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.
After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective
Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.
All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.
Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.
Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s
Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.
"""
# Summarize the article with a requested length between 30 and 130 and with
# sampling turned off (do_sample=False), then print the raw pipeline result
# (a list holding one dict with a 'summary_text' key, as the recorded output
# of this cell shows).
print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))
[{'summary_text': 'Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and 2002. She is believed to still be married to four men.'}]

2023-04-18 20:08:50.803729: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

[{'summary_text': 'Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and 2002. She is believed to still be married to four men.'}]


[{'summary_text': 'Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and 2002. She is believed to still be married to four men.'}]

In [6]:
# Toy input that repeats one fact in several phrasings, to see what the
# summarizer keeps when the text is highly redundant.
sentence = (
    "Covid started in china, covid started in wuhan, covid in china, "
    "covid started in 2019, china might be origin of covid"
)
# min_length and max_length are both 10, pinning the requested summary length.
print(summarizer(sentence, min_length=10, max_length=10, do_sample=False))

[{'summary_text': 'Covid started in china'}]


In [9]:
# Multi-paragraph COVID-19 text used to try the summarizer on scientific
# prose. NOTE(review): the stray quote/comma fragments inside the text
# (e.g. ".', Here" and '.", \'Different') look like residue from pasting the
# printed form of a list of paragraphs -- confirm whether they are intended.
art = """It is anticipated that COVID-19 will spread further outside of Tokyo, 
therefore, the preparation for the COVID-19 emergency described above will be useful for 
those regions where the number of infected patients is still low.', Here we describe our 
experience of COVID-19 in five young infants. In the pandemic context, infants younger than 
3 months with isolated fever should be tested for SARS-CoV-2. Although infants might initially 
present signs of severe infection, our experience is that the youngest children tolerate and 
rapidly improve from COVID-19, in contrast to adults admitted to hospital with COVID-19. 
However, because little is known about SARS-CoV-2 infection in infants,4, 6 close monitoring 
is required for at least 2 weeks after the diagnosis. All of the infants' parents showed mild 
signs of viral infection (ie, rhinitis, or cough or fever, or both, for <1 week), which could
be related to undiagnosed COVID-19.", 'Different viral agents are associated with an increased 
risk of more severe disease course and respiratory complications in immunocompromised patients.
1–3 The recent outbreak of severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) 
disease 2019 (COVID-19) responsible for a severe acute respiratory syndrome (SARS) represents 
a source of concern for the management of patients with inflammatory rheumatic diseases. 
Lombardy is the region in Northern Italy with the highest incidence of COVID-19 cases, 
with more than 33 000 confirmed patients and 1250 requiring admission to the intensive 
care unit within 1 month. Since the first reports of COVID-19 cases in Italy, we have circulated 
a survey with a 2-week follow-up contact to patients with chronic arthritis treated with 
biological disease-modifying antirheumatic drugs (bDMARDs) or targeted synthetic 
disease-modifying antirheumatic drugs (tsDMARDs) followed up at our biological outpatient 
clinic in Pavia, Lombardy. The survey investigated the patients’ health conditions, the presence 
of contacts with subjects known to be affected by COVID-19 and management of the DMARDs 
during the first few weeks of pandemic. All patients had provided their informed consent
for the use of personal and clinical data for scientific purposes, and no patient refused 
to participate."""
# Request a short summary (length between 10 and 30, sampling off) and print
# the raw pipeline result.
print(summarizer(art, max_length=30, min_length=10, do_sample=False))

[{'summary_text': 'In the pandemic context, infants younger than 3 months with isolated fever should be tested for SARS-CoV-2.'}]


In [11]:
# Query text for this experiment.
# NOTE(review): `question` is not passed to the summarizer in this cell; only
# `sentence` below is summarized, so the summary is not conditioned on it.
question = "Can the virus be found in nasal discharge, sputum, urine, fecal matter, or blood"
# Three lowercased sentences, one per line, used as the text to summarize.
# NOTE(review): presumably retrieved candidates for `question` -- confirm.
sentence = """clinical samples from all patients including nasopharyngeal swab (nps) / sputum, urine, and feces were collected for serial virus rna testing by standard qrt-pcr assay.
methods matched sputum samples, ops, blood cultures, serum, and urine samples were taken from patients (> 18 years) with cap and tested for the presence of possible respiratory pathogens using bacterial cultures, pcr for 17 viruses and five bacteria and urinary antigen testing.
background : the world health organization alert for the h1n1 influenza pandemic led to the implementation of certain measures regarding admission of patients with flu-like symptoms.
"""

# Request a summary with length between 10 and 20 (sampling off) and print
# the raw pipeline result.
print(summarizer(sentence, max_length=20, min_length=10, do_sample=False))

[{'summary_text': ' clinical samples from all patients including nasopharyngeal swab (nps)'}]


In [14]:
# Long lowercased passage that joins text about SARS-CoV-2 prevalence/testing
# with a MERS-CoV vaccine abstract, used to try the summarizer on a longer,
# mixed-topic input.
# NOTE(review): the text looks like already-preprocessed corpus output --
# confirm where it was taken from.
test="""was 2. 81 % has lower severity and mortality than sars but is much more transmissive
and affects more elderly individuals than youth and more men than women sars-cov-2 is highly
contagious and often leads to severe viral pneumonia with respiratory failure and death in the
elderly and subjects with pre-existing conditions sero prevalence of sars-cov-2 antibodies may 
help in more accurate estimations of the total number of cases adults represent the population 
with the highest infection rate at 4 % prevalence we save 74 % at 1 % we save 91 % and at 
1 % we save 97 % of tests. we analyze the number of times each sample is used and show the 
method is still efficient if we resort to testing a case individually if the sample is running
low. abstract in addition we recommend clinical screening to filter out individuals with symptoms
and show this leaves us with a population with lower prevalence the true number of people currently
infected with the virus, divided by the total population size.middle east respiratory syndrome
coronavirus (mers-cov) can cause severe and fatal acute respiratory disease in humans 
and remains endemic in the middle east since first being identified in 2012. there 
are currently no approved vaccines or therapies available for mers-cov. in this study, we 
evaluated parainfluenza virus 5 (piv5)-based vaccine expressing the mers-cov envelope spike 
protein (piv5 / mers-s) in a human dpp4 knockin c57bl / 6 congenic mouse model (hdpp4 ki). 
following a single-dose intranasal immunization, piv5-mers-s induced neutralizing antibody 
and robust t cell responses in hdpp4 ki mice. a single intranasal administration of 10 (4) 
pfu piv5-mers-s provided complete protection against a lethal challenge with mouse-adapted 
mers-cov (mersma6. 1. 2) and improved virus clearance in the lung. in comparison, single-dose 
intramuscular immunization with 10 (6) pfu uv-inactivated mersma6. 1. 2 mixed with imject
alum provided protection to only 25 % of immunized mice. intriguingly, an influx of eosinophils
was observed only in the lungs of mice immunized with inactivated mers-cov, suggestive of 
a hypersensitivity-type response. overall, our study indicated that piv5-mers-s is a promising
effective vaccine candidate against mers-cov infection. importance mers-cov causes lethal
infection in humans, and there is no vaccine. our work demonstrates that piv5 is a promising
vector for developing a mers vaccine. furthermore, success of piv5-based mers vaccine can
be employed to develop a vaccine for emerging covs such as sars-cov-2, which causes covid-19"""
# Request a summary with length between 10 and 40 (sampling off) and print
# the raw pipeline result.
print(summarizer(test, max_length=40, min_length=10, do_sample=False))

[{'summary_text': 'Middle east respiratory syndrome (mers-cov) can cause severe and fatal acute respiratory disease in humans. There are currently no approved vaccines or therapies available for mers-Cov.'}]
