In [2]:
import re
from nltk import word_tokenize, sent_tokenize, ngrams, pos_tag, RegexpParser
from collections import Counter
import requests

## **1. Read URL**

In [30]:
nbc_url = 'https://www.cnbc.com/2019/01/17/netflix-price-hike-helps-disney-upcoming-streaming-service-analyst.html'
nbc = requests.get(nbc_url).text

## **2. Extract Text from the Article**

In [31]:
from bs4 import BeautifulSoup
from bs4.element import Comment

In [32]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)

text = text_from_html(nbc)
print(text[0:1000])

Skip Navigation SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Life Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress 2020 Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows The News with Shepard Smith Entertainment Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealt

## **3. Use re (regular expression) package**

**a) Find all matches of $ amounts in the article**

In [33]:
count = re.findall('\$(.+?) ', text)
print("Matches of dollar amount: ",len(count))
print("Dollar amount matches: ", count)

Matches of dollar amount:  2
Dollar amount matches:  ['325.', '351']


**b) Substitute all numbers with # character and print the output**

In [34]:
sub = re.sub(r'[0-9]','#',text)
print(sub[0:1000])

Skip Navigation SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Life Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress #### Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows The News with Shepard Smith Entertainment Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealt

**c) Count (using regular expressions) ”Netflix” and “Disney” mentions**

In [35]:
print('Mentions of Netflix: '+str(len(re.findall('Netflix',text, re.IGNORECASE))))
print('Mentions of Disney: '+str(len(re.findall('Disney',text, re.IGNORECASE))))

Mentions of Netflix: 13
Mentions of Disney: 7


## **4. Use NTLK and/or Spacy (Links to an external site.) tokenization features**

In [36]:
import spacy
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aimeetran/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

**a) Tokenize sentences and words**

In [37]:
#Tokenize sentence
sentences = sent_tokenize(text)
first5 = sentences[0:5]
first5

['Skip Navigation SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Life Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress 2020 Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows The News with Shepard Smith Entertainment Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wea

In [38]:
#Tokenize words
words = word_tokenize(text)
words[0:10]

['Skip',
 'Navigation',
 'SIGN',
 'IN',
 'Pro',
 'Watchlist',
 'Make',
 'It',
 'Select',
 'USA']

**b) Remove all English stop words**

In [39]:
#Remove stop word
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))

In [40]:
tokens_without_sw = [word for word in words if not word in sw]
print(tokens_without_sw[:100])

['Skip', 'Navigation', 'SIGN', 'IN', 'Pro', 'Watchlist', 'Make', 'It', 'Select', 'USA', 'INTL', 'Markets', 'Pre-Markets', 'U.S.', 'Markets', 'Currencies', 'Cryptocurrency', 'Futures', '&', 'Commodities', 'Bonds', 'Funds', '&', 'ETFs', 'Watchlist', 'Business', 'Economy', 'Finance', 'Health', '&', 'Science', 'Media', 'Real', 'Estate', 'Energy', 'Transportation', 'Industrials', 'Retail', 'Wealth', 'Life', 'Small', 'Business', 'Investing', 'Invest', 'In', 'You', 'Personal', 'Finance', 'Financial', 'Advisors', 'Trading', 'Nation', 'Options', 'Action', 'ETF', 'Street', 'Buffett', 'Archive', 'Earnings', 'Trader', 'Talk', 'Tech', 'Cybersecurity', 'Enterprise', 'Internet', 'Media', 'Mobile', 'Social', 'Media', 'Venture', 'Capital', 'Tech', 'Guide', 'Politics', 'White', 'House', 'Policy', 'Defense', 'Congress', '2020', 'Elections', 'CNBC', 'TV', 'Live', 'TV', 'Live', 'Audio', 'Latest', 'Video', 'Top', 'Video', 'CEO', 'Interviews', 'Business', 'Day', 'Shows', 'The', 'News', 'Shepard', 'Smith']


In [41]:
removed_sw = len(text)-len(tokens_without_sw)
print('Number of stopwords removed:',removed_sw)

Number of stopwords removed: 5293


**c) List and count n-grams for any given input n**

In [42]:
n= int(input('n-gram: '))
for item in ngrams(words,n):
    print(item)


n-gram: 2
('Skip', 'Navigation')
('Navigation', 'SIGN')
('SIGN', 'IN')
('IN', 'Pro')
('Pro', 'Watchlist')
('Watchlist', 'Make')
('Make', 'It')
('It', 'Select')
('Select', 'USA')
('USA', 'INTL')
('INTL', 'Markets')
('Markets', 'Pre-Markets')
('Pre-Markets', 'U.S.')
('U.S.', 'Markets')
('Markets', 'Currencies')
('Currencies', 'Cryptocurrency')
('Cryptocurrency', 'Futures')
('Futures', '&')
('&', 'Commodities')
('Commodities', 'Bonds')
('Bonds', 'Funds')
('Funds', '&')
('&', 'ETFs')
('ETFs', 'Watchlist')
('Watchlist', 'Business')
('Business', 'Economy')
('Economy', 'Finance')
('Finance', 'Health')
('Health', '&')
('&', 'Science')
('Science', 'Media')
('Media', 'Real')
('Real', 'Estate')
('Estate', 'Energy')
('Energy', 'Transportation')
('Transportation', 'Industrials')
('Industrials', 'Retail')
('Retail', 'Wealth')
('Wealth', 'Life')
('Life', 'Small')
('Small', 'Business')
('Business', 'Investing')
('Investing', 'Invest')
('Invest', 'In')
('In', 'You')
('You', 'Personal')
('Personal', 'Fi

**d) Lemmatize and deduplicate unigrams into a vocabulary of terms**


In [43]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

lem_text = [wordnet_lemmatizer.lemmatize(word) for word in tokens_without_sw]
lem_text[0:20]

['Skip',
 'Navigation',
 'SIGN',
 'IN',
 'Pro',
 'Watchlist',
 'Make',
 'It',
 'Select',
 'USA',
 'INTL',
 'Markets',
 'Pre-Markets',
 'U.S.',
 'Markets',
 'Currencies',
 'Cryptocurrency',
 'Futures',
 '&',
 'Commodities']

**e) Print bigrams and trigrams in the first 5 sentences**

In [44]:
# Make it a continuous string
first5 = (' '.join(word for word in first5)) 

#Tokens of first 5 sentences
first5_tokenized = word_tokenize(first5)

In [51]:
#Bigram
bigrams = Counter(ngrams(first5_tokenized, 2))
print("Number of bigrams in first 5 sentences: ",len(bigrams))
print("\n List of first 10 bigrams:")
list(bigrams.items())[:10]

Number of bigrams in first 5 sentences:  289

 List of first 10 bigrams:


[(('Skip', 'Navigation'), 1),
 (('Navigation', 'SIGN'), 1),
 (('SIGN', 'IN'), 1),
 (('IN', 'Pro'), 1),
 (('Pro', 'Watchlist'), 1),
 (('Watchlist', 'Make'), 1),
 (('Make', 'It'), 1),
 (('It', 'Select'), 1),
 (('Select', 'USA'), 1),
 (('USA', 'INTL'), 1)]

In [52]:
#Trigram
trigrams = Counter(ngrams(first5_tokenized, 3))
print("Number of bigrams in first 5 sentences: ",len(trigrams))
print("\n List of first 10 sentence trigrams:")
list(trigrams.items())[:10]

Number of bigrams in first 5 sentences:  301

 List of first 10 sentence trigrams:


[(('Skip', 'Navigation', 'SIGN'), 1),
 (('Navigation', 'SIGN', 'IN'), 1),
 (('SIGN', 'IN', 'Pro'), 1),
 (('IN', 'Pro', 'Watchlist'), 1),
 (('Pro', 'Watchlist', 'Make'), 1),
 (('Watchlist', 'Make', 'It'), 1),
 (('Make', 'It', 'Select'), 1),
 (('It', 'Select', 'USA'), 1),
 (('Select', 'USA', 'INTL'), 1),
 (('USA', 'INTL', 'Markets'), 1)]

**f) Print POS tags in the first 5 sentences**

In [53]:
sentence_pos = pos_tag(first5_tokenized)
sentence_pos[0:20]

[('Skip', 'NNP'),
 ('Navigation', 'NNP'),
 ('SIGN', 'NNP'),
 ('IN', 'NNP'),
 ('Pro', 'NNP'),
 ('Watchlist', 'NNP'),
 ('Make', 'NNP'),
 ('It', 'PRP'),
 ('Select', 'NNP'),
 ('USA', 'NNP'),
 ('INTL', 'NNP'),
 ('Markets', 'NNP'),
 ('Pre-Markets', 'NNP'),
 ('U.S.', 'NNP'),
 ('Markets', 'NNP'),
 ('Currencies', 'NNP'),
 ('Cryptocurrency', 'NNP'),
 ('Futures', 'NNP'),
 ('&', 'CC'),
 ('Commodities', 'NNP')]

In [145]:
grammar = "NP: {<DT>?<JJ>*<NNP>}"
cp = RegexpParser(grammar)
cp.parse(sentence_pos)[:20]

[Tree('NP', [('Skip', 'NNP')]),
 Tree('NP', [('Navigation', 'NNP')]),
 Tree('NP', [('SIGN', 'NNP')]),
 Tree('NP', [('IN', 'NNP')]),
 Tree('NP', [('Pro', 'NNP')]),
 Tree('NP', [('Watchlist', 'NNP')]),
 Tree('NP', [('Make', 'NNP')]),
 ('It', 'PRP'),
 Tree('NP', [('Select', 'NNP')]),
 Tree('NP', [('USA', 'NNP')]),
 Tree('NP', [('INTL', 'NNP')]),
 Tree('NP', [('Markets', 'NNP')]),
 Tree('NP', [('Pre-Markets', 'NNP')]),
 Tree('NP', [('U.S.', 'NNP')]),
 Tree('NP', [('Markets', 'NNP')]),
 Tree('NP', [('Currencies', 'NNP')]),
 Tree('NP', [('Cryptocurrency', 'NNP')]),
 Tree('NP', [('Futures', 'NNP')]),
 ('&', 'CC'),
 Tree('NP', [('Commodities', 'NNP')])]