## Import Packages

In [1]:
import requests as req
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import json
import logging
import datetime
from lxml import etree

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('wordnet')
import matplotlib.pyplot as plt
import spacy
import re

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mattheus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Define Functions

In [2]:
def scrape_reuters():
    headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}
    url ="https://www.reuters.com/technology/do-you-want-tesla-accept-doge-musk-asks-twitter-2021-05-11/"
    res = req.get(url, headers=headers)
    soup = BeautifulSoup(res.content, "html.parser")
    dom = etree.HTML(str(soup))
    content_list = dom.xpath('//*[contains(@class,"paywall-article")]//*')

    to_join = []
    for i in range(len(content_list)):
        text = str(content_list[i].text).strip()
        # print(text)
        if text != "None" and text != "read more":
            to_join.append(text)

    full_content = ' '.join(to_join)
    
    return full_content

## Topic Modeling

In [3]:
content = scrape_reuters()

In [4]:
content

'May 11 (Reuters) - Billionaire Elon Musk and cryptocurrency aficionado on Tuesday asked The question, in the form of a Twitter poll, comes days after he called dogecoin a "hustle," sending the meme-inspired cryptocurrency\'s price reeling after a 700% rally in a month. A proponent of digital currencies, the Tesla Inc (TSLA.O) Musk is one of the world\'s richest people and owner of several futuristic companies, including SpaceX and Neuralink. He has used his candid Twitter feed to give his opinion on cryptocurrencies, including bitcoin, most times affecting their price. In March, Musk said U.S. customers could purchase Tesla vehicles with bitcoin, marking a significant step forward for the cryptocurrency\'s use in commerce. The electric-car maker had bought $1.5 billion worth of bitcoin earlier this year, propelling its prices to record highs. Based on current prices, one would need nearly 80,000 dogecoins or 0.7 bitcoin to buy the cheapest Tesla Model 3 car. 1/2 SpaceX owner and Tesla

In [26]:
nlp = spacy.load('en_core_web_sm') # we first load the English language model
parsed_content = nlp(content)
print (parsed_content)

May 11 (Reuters) - Billionaire Elon Musk and cryptocurrency aficionado on Tuesday asked The question, in the form of a Twitter poll, comes days after he called dogecoin a "hustle," sending the meme-inspired cryptocurrency's price reeling after a 700% rally in a month. A proponent of digital currencies, the Tesla Inc (TSLA.O) Musk is one of the world's richest people and owner of several futuristic companies, including SpaceX and Neuralink. He has used his candid Twitter feed to give his opinion on cryptocurrencies, including bitcoin, most times affecting their price. In March, Musk said U.S. customers could purchase Tesla vehicles with bitcoin, marking a significant step forward for the cryptocurrency's use in commerce. The electric-car maker had bought $1.5 billion worth of bitcoin earlier this year, propelling its prices to record highs. Based on current prices, one would need nearly 80,000 dogecoins or 0.7 bitcoin to buy the cheapest Tesla Model 3 car. 1/2 SpaceX owner and Tesla CEO

In [28]:
# take out individual tokens from parsed review
token_text = [token.text for token in parsed_content]
print(token_text)

['May', '11', '(', 'Reuters', ')', '-', 'Billionaire', 'Elon', 'Musk', 'and', 'cryptocurrency', 'aficionado', 'on', 'Tuesday', 'asked', 'The', 'question', ',', 'in', 'the', 'form', 'of', 'a', 'Twitter', 'poll', ',', 'comes', 'days', 'after', 'he', 'called', 'dogecoin', 'a', '"', 'hustle', ',', '"', 'sending', 'the', 'meme', '-', 'inspired', 'cryptocurrency', "'s", 'price', 'reeling', 'after', 'a', '700', '%', 'rally', 'in', 'a', 'month', '.', 'A', 'proponent', 'of', 'digital', 'currencies', ',', 'the', 'Tesla', 'Inc', '(', 'TSLA.O', ')', 'Musk', 'is', 'one', 'of', 'the', 'world', "'s", 'richest', 'people', 'and', 'owner', 'of', 'several', 'futuristic', 'companies', ',', 'including', 'SpaceX', 'and', 'Neuralink', '.', 'He', 'has', 'used', 'his', 'candid', 'Twitter', 'feed', 'to', 'give', 'his', 'opinion', 'on', 'cryptocurrencies', ',', 'including', 'bitcoin', ',', 'most', 'times', 'affecting', 'their', 'price', '.', 'In', 'March', ',', 'Musk', 'said', 'U.S.', 'customers', 'could', 'purc

In [29]:
# lemmatize words
token_lemmas = [token.lemma_ for token in parsed_content]
print(token_lemmas)

['May', '11', '(', 'Reuters', ')', '-', 'Billionaire', 'Elon', 'Musk', 'and', 'cryptocurrency', 'aficionado', 'on', 'Tuesday', 'ask', 'the', 'question', ',', 'in', 'the', 'form', 'of', 'a', 'Twitter', 'poll', ',', 'come', 'day', 'after', 'he', 'call', 'dogecoin', 'a', '"', 'hustle', ',', '"', 'send', 'the', 'meme', '-', 'inspire', 'cryptocurrency', "'s", 'price', 'reeling', 'after', 'a', '700', '%', 'rally', 'in', 'a', 'month', '.', 'a', 'proponent', 'of', 'digital', 'currency', ',', 'the', 'Tesla', 'Inc', '(', 'TSLA.O', ')', 'Musk', 'be', 'one', 'of', 'the', 'world', "'s", 'rich', 'people', 'and', 'owner', 'of', 'several', 'futuristic', 'company', ',', 'include', 'spacex', 'and', 'Neuralink', '.', 'he', 'have', 'use', 'his', 'candid', 'Twitter', 'feed', 'to', 'give', 'his', 'opinion', 'on', 'cryptocurrencie', ',', 'include', 'bitcoin', ',', 'most', 'time', 'affect', 'their', 'price', '.', 'in', 'March', ',', 'Musk', 'say', 'U.S.', 'customer', 'could', 'purchase', 'Tesla', 'vehicle', '

In [30]:
# check if token is a stop word
token_stop = [token.is_stop for token in parsed_content]
for t in range(len(token_lemmas)):
    if token_stop[t]:
        print("Token '" + token_lemmas[t] + "' is a stopword")
    else:
        print("Token '" + token_lemmas[t] + "' is not a stopword")

Token 'May' is a stopword
Token '11' is not a stopword
Token '(' is not a stopword
Token 'Reuters' is not a stopword
Token ')' is not a stopword
Token '-' is not a stopword
Token 'Billionaire' is not a stopword
Token 'Elon' is not a stopword
Token 'Musk' is not a stopword
Token 'and' is a stopword
Token 'cryptocurrency' is not a stopword
Token 'aficionado' is not a stopword
Token 'on' is a stopword
Token 'Tuesday' is not a stopword
Token 'ask' is not a stopword
Token 'the' is a stopword
Token 'question' is not a stopword
Token ',' is not a stopword
Token 'in' is a stopword
Token 'the' is a stopword
Token 'form' is not a stopword
Token 'of' is a stopword
Token 'a' is a stopword
Token 'Twitter' is not a stopword
Token 'poll' is not a stopword
Token ',' is not a stopword
Token 'come' is not a stopword
Token 'day' is not a stopword
Token 'after' is a stopword
Token 'he' is a stopword
Token 'call' is not a stopword
Token 'dogecoin' is not a stopword
Token 'a' is a stopword
Token '"' is not 

In [31]:
# assemble results for inspection
pd.DataFrame(zip(token_text, token_lemmas, token_stop), columns=['Original Text', 'Stemmed Text', 'stopwords']).head(10)

Unnamed: 0,Original Text,Stemmed Text,stopwords
0,May,May,True
1,11,11,False
2,(,(,False
3,Reuters,Reuters,False
4,),),False
5,-,-,False
6,Billionaire,Billionaire,False
7,Elon,Elon,False
8,Musk,Musk,False
9,and,and,True
