# Learning to use spacy with their amazing tutorial on https://course.spacy.io/chapter1

In [4]:
#importing a json file
import json

with open('nytimes_articles.json') as json_file:
    nytimes_articles = json.load(json_file)

In [5]:
# Import the English language class
from spacy.lang.en import English

# Create the nlp object
nlp = English()

In [6]:
# Created by processing a string of text with the nlp object
doc = nlp("Hello world!")

# Iterate over tokens in a Doc
for token in doc:
    print(token.text)

Hello
world
!


In [7]:
doc = nlp("Hello world!")

# Index into the Doc to get a single Token
token = doc[1]

# Get the token text via the .text attribute
print(token.text)

world


In [8]:
doc = nlp("Hello world!")

# A slice from the Doc is a Span object
span = doc[1:3]

# Get the span text via the .text attribute
print(span.text)

world!


In [9]:
doc = nlp("It costs $5.")
print('Index:   ', [token.i for token in doc])
print('Text:    ', [token.text for token in doc])

print('is_alpha:', [token.is_alpha for token in doc])
print('is_punct:', [token.is_punct for token in doc])
print('like_num:', [token.like_num for token in doc])

Index:    [0, 1, 2, 3, 4]
Text:     ['It', 'costs', '$', '5', '.']
is_alpha: [True, True, False, False, False]
is_punct: [False, False, False, False, True]
like_num: [False, False, False, True, False]


In [10]:
# Import the English language class
from spacy.lang.en import English

# Create the nlp object
nlp = English()

# Process a text
doc = nlp("This is a sentence.")

# Print the document text
print(doc.text)

This is a sentence.


In [None]:
from spacy.lang.en import English

nlp = English()

# Process the text
doc = nlp("I like tree kangaroos and narwhals.")

# Select the first token
first_token = doc[0]

# Print the first token's text
print(first_token.text)

In [None]:
# Import the English language class and create the nlp object
from spacy.lang.en import English

nlp = English()

# Process the text
doc = nlp("I like tree kangaroos and narwhals.")

# A slice of the Doc for "tree kangaroos"
tree_kangaroos = doc[2:4]
print(tree_kangaroos.text)

# A slice of the Doc for "tree kangaroos and narwhals" (without the ".")
tree_kangaroos_and_narwhals = doc[2:6]
print(tree_kangaroos_and_narwhals.text)

In [None]:
# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Iterate over the tokens in the doc
for token in doc:
    # Check if the token resembles a number
    if token.like_num:
        # Get the next token in the document
        next_token = doc[token.i + 1]
        # Check if the next token's text equals '%'
        if next_token.text == "%":
            print("Percentage found:", token.text)

In [None]:
Match exact token texts

[{'TEXT': 'iPhone'}, {'TEXT': 'X'}]
Match lexical attributes
[{'LOWER': 'iphone'}, {'LOWER': 'x'}]
Match any token attributes
[{'LEMMA': 'buy'}, {'POS': 'NOUN'}]

In [None]:
import spacy
# Import the Matcher
from spacy.matcher import Matcher

# Load a model and create the nlp object
nlp = spacy.load('en_core_web_sm')

# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
pattern = [{'TEXT': 'iPhone'}, {'TEXT': 'X'}]
matcher.add('IPHONE_PATTERN', None, pattern)

# Process some text
doc = nlp("New iPhone X release date leaked")

# Call the matcher on the doc
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

In [None]:
pattern = [
    {'IS_DIGIT': True},
    {'LOWER': 'fifa'},
    {'LOWER': 'world'},
    {'LOWER': 'cup'},
    {'IS_PUNCT': True}
]
doc = nlp("2018 FIFA World Cup: France won!")

In [None]:
matcher.add('IPHONE_PATTERN', None, pattern)
matches = matcher(doc)
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

In [None]:
pattern = [
    {'LEMMA': 'love', 'POS': 'VERB'},
    {'POS': 'NOUN'}
]
doc = nlp("I loved dogs but now I love cats more.")
matcher.add('IPHONE_PATTERN', None, pattern)
matches = matcher(doc)
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

In [None]:
#the
pattern = [
    {'LEMMA': 'buy'},
    {'POS': 'DET', 'OP': '?'},  # optional: match 0 or 1 times
    {'POS': 'NOUN'}
]
doc = nlp("I bought a car")
matcher.add('IPHONE_PATTERN', None, pattern)
matches = matcher(doc)
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

In [117]:

import spacy

# Import the Matcher
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("New iPhone X release date iphone X leaked as Apple reveals pre-orders by mistake")

# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"lower": "iphone"}, {"lower": "x"}]

# Add the pattern to the matcher
matcher.add("IPHONE_X_PATTERN", None, pattern)

# Use the matcher on the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])

Matches: ['iPhone X', 'iphone X']


In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

In [None]:
matches

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Write a pattern that matches a form of "download" plus proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
    

In [3]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# Write a pattern for adjective plus one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
    
    

Total matches found: 5
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice
Match found: optional voice responses


# Using Spacy for coronavirus stuff

In [57]:
doc=nlp(nytimes_articles['2020/01/08 | https://www.nytimes.com/2020/01/08/health/china-pneumonia-outbreak-virus.html'])

In [62]:
doc=nytimes_articles['2020/01/08 | https://www.nytimes.com/2020/01/08/health/china-pneumonia-outbreak-virus.html']


In [58]:
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
animals = ["coronavirus", "virus", "disease", ]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)
# Write a pattern for adjective plus one or two nouns
# pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]
# Add the pattern to the matcher and apply the matcher to the doc
# matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Iterate over the matches and print the span text
for match_id, start, end in matches:
    span = doc[start:end]  # Matched span
    sent = span.sent
    print("Match found:", span.text)
    print("Sentence: ", sent)
    
    

animal_patterns: [coronavirus, virus, disease]
Total matches found: 24
Match found: coronavirus
Sentence:  -summary">The new coronavirus doesn’t appear to be readily spread by humans, but researchers caution that more study is needed.</p
Match found: virus
Sentence:  Chinese researchers say they have identified a new virus behind an illness that has infected dozens of people across Asia, setting off fears in a region that was struck by a deadly epidemic 17 years ago.</p>, <
Match found: virus
Sentence:  p class="css-exrw3m evys1bk0">There is no evidence that the new virus is readily spread by humans, which would make it particularly dangerous, and it has not been tied to any deaths.
Match found: virus
Sentence:  have “initially identified” the new virus, a coronavirus, as the pathogen behind a <a class="css-1g7m0tk" href="https://www.nytimes.com/2020/01/06/world/asia/china-SARS-pneumonialike.html" title="">mysterious, pneumonialike illness that has sickened 59 people in the city of Wuh

In [61]:
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
animals = ["coronavirus", "virus", "disease", ]
animal_patterns = list(nlp.pipe(animals))
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)
# Write a pattern for adjective plus one or two nouns
# pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]
# Add the pattern to the matcher and apply the matcher to the doc
# matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
# Iterate over the matches and print the span text
for match_id, start, end in matches:
    span = doc[start:end]  # Matched span
    sent = span.sent
    animals = cities
    animal_patterns = list(nlp.pipe(animals))
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("ANIMAL", None, *animal_patterns)
    # Write a pattern for adjective plus one or two nouns
    # pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]
    # Add the pattern to the matcher and apply the matcher to the doc
    # matcher.add("ADJ_NOUN_PATTERN", None, pattern)
    matches = matcher(doc)
    print("Match found:", span.text)
    print("Sentence: ", sent)

Match found: coronavirus
Sentence:  -summary">The new coronavirus doesn’t appear to be readily spread by humans, but researchers caution that more study is needed.</p
Match found: virus
Sentence:  Chinese researchers say they have identified a new virus behind an illness that has infected dozens of people across Asia, setting off fears in a region that was struck by a deadly epidemic 17 years ago.</p>, <
Match found: virus
Sentence:  p class="css-exrw3m evys1bk0">There is no evidence that the new virus is readily spread by humans, which would make it particularly dangerous, and it has not been tied to any deaths.
Match found: virus
Sentence:  have “initially identified” the new virus, a coronavirus, as the pathogen behind a <a class="css-1g7m0tk" href="https://www.nytimes.com/2020/01/06/world/asia/china-SARS-pneumonialike.html" title="">mysterious, pneumonialike illness that has sickened 59 people in the city of Wuhan</a> and caused a panic in the central Chinese region,
Match found: c

KeyboardInterrupt: 

In [38]:
import json

with open('cities_full.json') as json_file:
    cities_and_countries = json.load(json_file)
    

In [40]:
cities_and_countries

{'Beijing': ['CN'],
 'Lhasa': ['CN'],
 'Alashankou': ['CN'],
 'Yining/Qulja': ['CN'],
 'Aksu': ['CN'],
 'Altay': ['CN'],
 'Alxa Left Banner': ['CN'],
 'Amdo': ['CN'],
 'Anguo': ['CN'],
 'Ankang': ['CN'],
 'Anji': ['CN'],
 'Anlu': ['CN'],
 'Anping': ['CN'],
 'Anning': ['CN', 'CN'],
 'Anqiu': ['CN'],
 'Anqing': ['CN'],
 'Anren': ['CN'],
 'Anshun': ['CN'],
 'Anshan': ['CN'],
 'Anxiang': ['CN'],
 'Anyang': ['CN', 'CN'],
 'Artush': ['CN'],
 'Baicheng': ['CN', 'CN'],
 'Badong': ['CN'],
 'Bainang': ['CN'],
 'Baiquan': ['CN'],
 'Baingoin': ['CN'],
 'Baiyin': ['CN'],
 'Baixiang': ['CN'],
 'Baishui': ['CN'],
 'Baofeng': ['CN'],
 'Baiyu': ['CN'],
 'Baoan': ['CN'],
 'Baoshan District': ['CN'],
 'Baotou': ['CN'],
 'Baojing': ['CN'],
 'Baoji': ['CN'],
 'Baoqing': ['CN'],
 'Bazhou': ['CN'],
 'Yuzhong': ['CN', 'CN'],
 "Bei'an": ['CN'],
 'Beihai': ['CN'],
 'Bayannur': ['CN'],
 'Benxi': ['CN'],
 'Benxi Manchu Autonomous County': ['CN'],
 'Beizhen': ['CN'],
 'Bengbu': ['CN'],
 'Beipiao': ['CN'],
 'Bin Co

In [132]:
#turning cities into list of just the cities without the country
cities = []
for key in cities_and_countries.keys():
    cities.append(key.lower())

In [133]:
cities

['beijing',
 'lhasa',
 'alashankou',
 'yining/qulja',
 'aksu',
 'altay',
 'alxa left banner',
 'amdo',
 'anguo',
 'ankang',
 'anji',
 'anlu',
 'anping',
 'anning',
 'anqiu',
 'anqing',
 'anren',
 'anshun',
 'anshan',
 'anxiang',
 'anyang',
 'artush',
 'baicheng',
 'badong',
 'bainang',
 'baiquan',
 'baingoin',
 'baiyin',
 'baixiang',
 'baishui',
 'baofeng',
 'baiyu',
 'baoan',
 'baoshan district',
 'baotou',
 'baojing',
 'baoji',
 'baoqing',
 'bazhou',
 'yuzhong',
 "bei'an",
 'beihai',
 'bayannur',
 'benxi',
 'benxi manchu autonomous county',
 'beizhen',
 'bengbu',
 'beipiao',
 'bin county',
 'binzhou',
 'bortala',
 "bo'ai",
 'biyang',
 'biru',
 'boxing',
 'botou',
 'baise',
 'boli',
 'cang',
 'caoxian',
 'cangzhou',
 'boye',
 'burqin',
 '兰陵县',
 'cangnan',
 'changge',
 'changji/sanji',
 'changle',
 'changchun',
 'changdao',
 'changde',
 'changshou',
 '长沙县',
 'changting',
 'changtai',
 'changning',
 'changning district',
 'changli',
 'changshan',
 '长清区',
 'changping',
 'chaoyang',
 "che

In [49]:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
animals = ["coronavirus", "virus", "disease", ]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)
# Write a pattern for adjective plus one or two nouns
# pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]
# Add the pattern to the matcher and apply the matcher to the doc
# matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Iterate over the matches and print the span text
for match_id, start, end in matches:
    span = doc[start:end]  # Matched span
    sent = span.sent
    print("Match found:", span.text)
    print("Sentence: ", sent)

animal_patterns: [coronavirus, virus, disease]
Total matches found: 24
Match found: coronavirus
Sentence:  -summary">The new coronavirus doesn’t appear to be readily spread by humans, but researchers caution that more study is needed.</p
Match found: virus
Sentence:  Chinese researchers say they have identified a new virus behind an illness that has infected dozens of people across Asia, setting off fears in a region that was struck by a deadly epidemic 17 years ago.</p>, <
Match found: virus
Sentence:  p class="css-exrw3m evys1bk0">There is no evidence that the new virus is readily spread by humans, which would make it particularly dangerous, and it has not been tied to any deaths.
Match found: virus
Sentence:  have “initially identified” the new virus, a coronavirus, as the pathogen behind a <a class="css-1g7m0tk" href="https://www.nytimes.com/2020/01/06/world/asia/china-SARS-pneumonialike.html" title="">mysterious, pneumonialike illness that has sickened 59 people in the city of Wuh

In [51]:
matches

[(6303828839600189595, 52, 53),
 (6303828839600189595, 159, 160),
 (6303828839600189595, 203, 204),
 (6303828839600189595, 271, 272),
 (6303828839600189595, 274, 275),
 (6303828839600189595, 337, 338),
 (6303828839600189595, 363, 364),
 (6303828839600189595, 466, 467),
 (6303828839600189595, 526, 527),
 (6303828839600189595, 530, 531),
 (6303828839600189595, 626, 627),
 (6303828839600189595, 638, 639),
 (6303828839600189595, 713, 714),
 (6303828839600189595, 730, 731),
 (6303828839600189595, 781, 782),
 (6303828839600189595, 926, 927),
 (6303828839600189595, 1089, 1090),
 (6303828839600189595, 1112, 1113),
 (6303828839600189595, 1146, 1147),
 (6303828839600189595, 1168, 1169),
 (6303828839600189595, 1172, 1173),
 (6303828839600189595, 1364, 1365),
 (6303828839600189595, 1401, 1402),
 (6303828839600189595, 1765, 1766)]

In [162]:
def create_matcher(syns, label):
    disease_patterns = list(nlp.pipe(syns))
    # print("disease_patterns:", disease_patterns)
    matcher = PhraseMatcher(nlp.vocab, attr = "LOWER")
    matcher.add(label, None, *disease_patterns)
    return matcher

In [154]:
def get_sentences_from_doc(doc):
    if not doc:
        return []
    sents = []
    for s in doc.sents:
        sents.append(s.text)
    return sents

In [167]:
def match_doc_with_sentences(doc, matcher):
    """
    Runs a matcher against a doc and returns a new doc just for the matched sentences
    :param doc:
    :return:
    """
    debug = True
    matched_sentences = []
    matched_sents_keys = {}

    # Add the pattern to the matcher and apply the matcher to the doc
    # matcher.add("ADJ_NOUN_PATTERN", None, pattern)
    matches = matcher(doc)
    if debug:
        print("Total matches found:", len(matches))

    # Iterate over the matches and print the span text
    for match_id, start, end in matches:
        span = doc[start:end]  # Matched span
        sent = span.sent
        #if sent in matched_sents_keys:
            #continue
        #matched_sents_keys[sent] = True
        matched_sentences.append(sent.text)
        if debug:
            print("Match found:", span.text)
            wrapper = textwrap.TextWrapper(width=100)
            word_list = wrapper.wrap(text=sent.text)
            for element in word_list:
                print(element)

    if len(matched_sentences) > 0:
        new_doc = nlp(" ".join(matched_sentences))
        return new_doc
    else:
        return None


In [170]:
match_counts = {}

In [199]:
def match_doc_with_counts(doc, matcher):
    """
    Runs a matcher against a doc and returns a new doc just for the matched sentences
    :param doc:
    :return:
    """
    global match_counts
    debug = True
    
    
    
    matched_sents_keys = {}

    # Add the pattern to the matcher and apply the matcher to the doc
    # matcher.add("ADJ_NOUN_PATTERN", None, pattern)
    matches = matcher(doc)
    if debug:
        print("Total matches found:", len(matches))

    # Iterate over the matches and print the span text
    for match_id, start, end in matches:
        span = doc[start:end]  # Matched span
        sent = span.sent
        if span.text.lower() in match_counts:
            match_counts[f'{span.text.lower()}']+=1
        else:
            match_counts[f'{span.text.lower()}']=1
        print (f'this is match_counts: {match_counts}')
        #if sent in matched_sents_keys:
            #continue
        #matched_sents_keys[sent] = True
        if debug:
            print("Match found:", span.text)
            wrapper = textwrap.TextWrapper(width=100)
            word_list = wrapper.wrap(text=sent.text)
            for element in word_list:
                print(element)


In [186]:
def search_article(article):
    # print(first_art)

    doc = nlp(article)

    #disease_syns = [{"lower":"coronavirus"}, {"lower":"virus"}, {"lower":"disease"}]
    disease_syns = ["coronavirus", "virus", "disease"]
    disease_matcher = create_matcher(disease_syns, "DISEASE")

    death_syns = cities
    death_matcher = create_matcher(death_syns, "DEATHS")

    # Write a pattern for adjective plus one or two nouns
    # pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

    docs_match_disease = match_doc_with_sentences(doc, disease_matcher)
    docs_match_disease_and_death = match_doc_with_counts(docs_match_disease, death_matcher)

    if False and docs_match_disease:
        print(f"Sentences only matching the disease")
        sents = get_sentences_from_doc(docs_match_disease)
        for s in sents:
            print(f"\t{s}")
    #print(f'{docs_match_disease_and_death}')
    if docs_match_disease_and_death:
        print(f"Sentences matching the disease & death")
        sents = get_sentences_from_doc(docs_match_disease_and_death)
        return sents
    return None


In [149]:
cities

['beijing',
 'lhasa',
 'alashankou',
 'yining/qulja',
 'aksu',
 'altay',
 'alxa left banner',
 'amdo',
 'anguo',
 'ankang',
 'anji',
 'anlu',
 'anping',
 'anning',
 'anqiu',
 'anqing',
 'anren',
 'anshun',
 'anshan',
 'anxiang',
 'anyang',
 'artush',
 'baicheng',
 'badong',
 'bainang',
 'baiquan',
 'baingoin',
 'baiyin',
 'baixiang',
 'baishui',
 'baofeng',
 'baiyu',
 'baoan',
 'baoshan district',
 'baotou',
 'baojing',
 'baoji',
 'baoqing',
 'bazhou',
 'yuzhong',
 "bei'an",
 'beihai',
 'bayannur',
 'benxi',
 'benxi manchu autonomous county',
 'beizhen',
 'bengbu',
 'beipiao',
 'bin county',
 'binzhou',
 'bortala',
 "bo'ai",
 'biyang',
 'biru',
 'boxing',
 'botou',
 'baise',
 'boli',
 'cang',
 'caoxian',
 'cangzhou',
 'boye',
 'burqin',
 '兰陵县',
 'cangnan',
 'changge',
 'changji/sanji',
 'changle',
 'changchun',
 'changdao',
 'changde',
 'changshou',
 '长沙县',
 'changting',
 'changtai',
 'changning',
 'changning district',
 'changli',
 'changshan',
 '长清区',
 'changping',
 'chaoyang',
 "che

In [130]:
death_syns=cities

In [106]:
import textwrap

In [202]:
x = 'Coronavirus shanghai Wuhan wuhan'
answer=search_article(x)

Total matches found: 1
Match found: Coronavirus
Coronavirus shanghai Wuhan wuhan
Total matches found: 3
this is match_counts: {'shanghai': 2, 'wuhan': 2}
Match found: shanghai
Coronavirus shanghai Wuhan wuhan
this is match_counts: {'shanghai': 2, 'wuhan': 3}
Match found: Wuhan
Coronavirus shanghai Wuhan wuhan
this is match_counts: {'shanghai': 2, 'wuhan': 4}
Match found: wuhan
Coronavirus shanghai Wuhan wuhan


In [200]:
match_counts = {}

In [109]:
answer

['coronavirus Wuhan']

In [87]:
all_matches={}
i=0
for thing in nytimes_articles:
    print(f'{i}')
    text=search_article(nytimes_articles[f'{thing}'])
    all_matches[f'{thing}'] = text
    i+=1

0
Sentences matching the disease & death
1
Sentences matching the disease & death
2
Sentences matching the disease & death
3
Sentences matching the disease & death
4
5
Sentences matching the disease & death
6
Sentences matching the disease & death
7
8
Sentences matching the disease & death
9
Sentences matching the disease & death
10
11
Sentences matching the disease & death
12
Sentences matching the disease & death
13
Sentences matching the disease & death
14
Sentences matching the disease & death
15
16
Sentences matching the disease & death
17
18
19
Sentences matching the disease & death
20
Sentences matching the disease & death
21
Sentences matching the disease & death
22
Sentences matching the disease & death
23
Sentences matching the disease & death
24
Sentences matching the disease & death
25
Sentences matching the disease & death
26
27
28
Sentences matching the disease & death
29
30
Sentences matching the disease & death
31
Sentences matching the disease & death
32
Sentences matc

KeyboardInterrupt: 

In [74]:
len(nytimes_articles)


1519

In [97]:
len(all_matches['2020/01/10 | https://www.nytimes.com/2020/01/10/world/asia/china-virus-wuhan-death.html'])

3

In [None]:
pattern = [{"lower": "iphone"}, {"lower": "x"}]