# News Headlines Sentiment

Use the news api to pull the latest news articles for bitcoin and ethereum and create a DataFrame of sentiment scores for each coin. 

Use descriptive statistics to answer the following questions:
1. Which coin had the highest mean positive score?
2. Which coin had the highest negative score?
3. Which coin had the highest positive score?

In [71]:
# Initial imports
import os
import pandas as pd
from dotenv import load_dotenv
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from newsapi.newsapi_client import NewsApiClient

analyzer = SentimentIntensityAnalyzer()


%matplotlib inline

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [63]:
# Read your api key environment variable
load_dotenv()

news_api_key = os.getenv("news_api_key")

#help(NewsApiClient)



In [44]:
# Create a newsapi client
#help(NewsApiClient)

newsapi = NewsApiClient(news_api_key)

#help(newsapi)

test = newsapi.get_everything(q='bitcoin')

#type(test)

help(newsapi.get_everything)



Help on method get_everything in module newsapi.newsapi_client:

get_everything(q=None, sources=None, domains=None, exclude_domains=None, from_param=None, to=None, language=None, sort_by=None, page=None, page_size=None) method of newsapi.newsapi_client.NewsApiClient instance
        Search through millions of articles from over 5,000 large and small news sources and blogs.
    
        Optional parameters:
            (str) q - return headlines w/ specified coin! Valid values are:
                        'bitcoin', 'trump', 'tesla', 'ethereum', etc
    
            (str) sources - return headlines of news sources! some Valid values are:
                        'bbc-news', 'the-verge', 'abc-news', 'crypto coins news',
                        'ary news','associated press','wired','aftenposten','australian financial review','axios',
                        'bbc news','bild','blasting news','bloomberg','business insider','engadget','google news',
                        'hacker news','info

In [100]:
# Fetch the Bitcoin news articles
bitcoin_headlines = newsapi.get_everything(q='bitcoin',
                                           language="en",
                                           page_size=100,
                                           sort_by="relevancy")



bitcoin_headlines['articles'][0]

len(bitcoin_headlines['articles'])

100

In [97]:
# Fetch the Ethereum news articles
ethereum_headlines = newsapi.get_everything(q='ethereum',
                                            language="en",
                                            page_size=100,
                                            sort_by="relevancy")


# Confirm output type of sentiment analyzer
test_sentiment = analyzer.polarity_scores(ethereum_headlines['articles'][0]['content'])

test_sentiment

type(ethereum_headlines['articles'][0]['content'])

len(ethereum_headlines['articles'])

100

In [105]:
# Create the Bitcoin sentiment scores DataFrame

bitcoin_sentiment = []

for article in bitcoin_headlines["articles"]:
    
    try:
    
        text = article["content"]
        date = article["publishedAt"][:10]
    
        sentiment = analyzer.polarity_scores(text)
    
        compound = sentiment["compound"]
        positive = sentiment["pos"]
        negative = sentiment["neg"]
        neutral = sentiment["neu"]


        bitcoin_sentiment.append({
            "date": date,
            "text": text,
            "compound": compound,
            "positive": positive,
            "negative": negative,
            "neutral": neutral
            })
        
    except AttributeError:
        
        pass

type(bitcoin_sentiment[0]['text'])

# I first tried to create the above function with out the 'try...except...pass' structure.  I kept getting the error
# "AttributeError: 'NoneType' object has no attribute 'encode'".  What exactly does this mean?  Were some article 
# texts NoneTypes instead of strings?  In any case, the 'try' seemed to fix it.  (Used in class exercise.)


bitcoin_sentiment

bitcoin_df = pd.DataFrame(bitcoin_sentiment)

columns = ["date", "text", "compound", "positive", "negative", "neutral"]

# Rearrange columns
bitcoin_df = bitcoin_df[columns]

bitcoin_df.head()

Unnamed: 0,date,text,compound,positive,negative,neutral
0,2020-09-10,Two alleged crypto traders in Singapore appare...,-0.6908,0.0,0.16,0.84
1,2020-09-08,"By Alexis Akwagyiram, Tom Wilson\r\n* Monthly ...",0.0,0.0,0.0,1.0
2,2020-08-23,“The COVID-19 pandemic has resulted in a mass ...,0.2732,0.063,0.0,0.937
3,2020-09-08,"LAGOS/LONDON (Reuters) - Four months ago, Abol...",0.0,0.0,0.0,1.0
4,2020-09-08,"LAGOS/LONDON (Reuters) - Four months ago, Abol...",0.0,0.0,0.0,1.0


In [107]:
# Create the ethereum sentiment scores DataFrame

ethereum_sentiment = []

for article in ethereum_headlines["articles"]:
    
    try:
    
        text = article["content"]
        date = article["publishedAt"][:10]
    
        sentiment = analyzer.polarity_scores(text)
    
        compound = sentiment["compound"]
        positive = sentiment["pos"]
        negative = sentiment["neg"]
        neutral = sentiment["neu"]


        ethereum_sentiment.append({
            "date": date,
            "text": text,
            "compound": compound,
            "positive": positive,
            "negative": negative,
            "neutral": neutral
            })
        
    except AttributeError:
        
        pass

type(ethereum_sentiment[0]['text'])


ethereum_sentiment

ethereum_df = pd.DataFrame(ethereum_sentiment)


# Rearrange columns

columns = ["date", "text", "compound", "positive", "negative", "neutral"]
ethereum_df = ethereum_df[columns]

ethereum_df.head()


Unnamed: 0,date,text,compound,positive,negative,neutral
0,2020-09-02,If you've interacted with cryptocurrencies in ...,0.7506,0.209,0.0,0.791
1,2020-09-07,TL;DR: The Complete Stock and Cryptocurrency I...,0.0,0.0,0.0,1.0
2,2020-09-18,"September\r\n18, 2020\r\n6 min read\r\nOpinion...",0.0,0.0,0.0,1.0
3,2020-08-26,LONDON (Reuters) - It sounds like a surefire b...,0.7579,0.181,0.0,0.819
4,2020-08-25,NEW YORK (Reuters) - Brooklyn-based technology...,0.0,0.0,0.0,1.0


In [94]:
# Describe the Bitcoin Sentiment
bitcoin_df.describe()

Unnamed: 0,compound,positive,negative,neutral
count,95.0,95.0,95.0,95.0
mean,0.161913,0.079758,0.042032,0.8782
std,0.476811,0.069281,0.070663,0.090157
min,-0.886,0.0,0.0,0.588
25%,0.0,0.014,0.0,0.8275
50%,0.2732,0.086,0.0,0.91
75%,0.507,0.107,0.0885,0.9195
max,0.9231,0.326,0.303,1.0


In [96]:
# Describe the Ethereum Sentiment
ethereum_df.describe()

Unnamed: 0,compound,positive,negative,neutral
count,95.0,95.0,95.0,95.0
mean,0.145749,0.070958,0.036653,0.892389
std,0.40861,0.069543,0.067799,0.088831
min,-0.91,0.0,0.0,0.607
25%,0.0,0.0,0.0,0.8395
50%,0.1027,0.075,0.0,0.905
75%,0.4718,0.1105,0.0605,0.963
max,0.8519,0.311,0.347,1.0


### Questions:

Q: Which coin had the highest mean positive score?

A: 

Q: Which coin had the highest compound score?

A: 

Q. Which coin had the highest positive score?

A: 

---

# Tokenizer

In this section, you will use NLTK and Python to tokenize the text for each coin. Be sure to:
1. Lowercase each word
2. Remove Punctuation
3. Remove Stopwords

In [119]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

lemmatizer = WordNetLemmatizer()

In [115]:
# Expand the default stopwords list if necessary

type(ethereum_df['text'])
ethereum_df.dtypes

help(word_tokenize)

Help on function word_tokenize in module nltk.tokenize:

word_tokenize(text, language='english', preserve_line=False)
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).
    
    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: An option to keep the preserve the sentence and not sentence tokenize it.
    :type preserve_line: bool



In [128]:
# Complete the tokenizer function
def tokenizer(text):
    """Tokenizes text."""       
    
    # Remove the punctuation
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', text)
    
    # Create a list of the words
    words = word_tokenize(re_clean)
    
    # Lemmatize Words into root words
    lem = [lemmatizer.lemmatize(word) for word in words]
    
    # Remove the stop words
    # Convert the words to lowercase
    sw = set(stopwords.words('english'))
    tokens = [word.lower() for word in lem if word.lower() not in sw]
        
    return tokens


# Tested function on simple text
# hello = "Hello!  My name is Andrew.  What is yours?"
# tokenizer(hello)

# Tested on one ethereum article
tokenizer(ethereum_df['text'][0])

['youve',
 'interacted',
 'cryptocurrencies',
 'past',
 'couple',
 'year',
 'good',
 'chance',
 'youve',
 'used',
 'metamask',
 'cryptocurrency',
 'wallet',
 'form',
 'browser',
 'extension',
 'support',
 'ether',
 'char']

In [135]:
# Create a new tokens column for bitcoin

bitcoin_df['tokens'] = [tokenizer(text) for text in bitcoin_df['text']]


# Rearrange columns

columns = ["date", "compound", "positive", "negative", "neutral", "text", "tokens"]
bitcoin_df = bitcoin_df[columns]

bitcoin_df.head()

Unnamed: 0,date,compound,positive,negative,neutral,text,tokens
0,2020-09-10,-0.6908,0.0,0.16,0.84,Two alleged crypto traders in Singapore appare...,"[two, alleged, crypto, trader, singapore, appa..."
1,2020-09-08,0.0,0.0,0.0,1.0,"By Alexis Akwagyiram, Tom Wilson\r\n* Monthly ...","[alexis, akwagyiram, tom, wilson, monthly, cry..."
2,2020-08-23,0.2732,0.063,0.0,0.937,“The COVID-19 pandemic has resulted in a mass ...,"[covid, pandemic, ha, resulted, mass, shift, w..."
3,2020-09-08,0.0,0.0,0.0,1.0,"LAGOS/LONDON (Reuters) - Four months ago, Abol...","[lagoslondon, reuters, four, month, ago, abola..."
4,2020-09-08,0.0,0.0,0.0,1.0,"LAGOS/LONDON (Reuters) - Four months ago, Abol...","[lagoslondon, reuters, four, month, ago, abola..."


In [136]:
# Create a new tokens column for ethereum

ethereum_df['tokens'] = [tokenizer(text) for text in ethereum_df['text']]


# Rearrange columns

columns = ["date", "compound", "positive", "negative", "neutral", "text", "tokens"]
ethereum_df = ethereum_df[columns]

ethereum_df.head()

Unnamed: 0,date,compound,positive,negative,neutral,text,tokens
0,2020-09-02,0.7506,0.209,0.0,0.791,If you've interacted with cryptocurrencies in ...,"[youve, interacted, cryptocurrencies, past, co..."
1,2020-09-07,0.0,0.0,0.0,1.0,TL;DR: The Complete Stock and Cryptocurrency I...,"[tldr, complete, stock, cryptocurrency, invest..."
2,2020-09-18,0.0,0.0,0.0,1.0,"September\r\n18, 2020\r\n6 min read\r\nOpinion...","[september, min, readopinions, expressed, entr..."
3,2020-08-26,0.7579,0.181,0.0,0.819,LONDON (Reuters) - It sounds like a surefire b...,"[london, reuters, sound, like, surefire, bet, ..."
4,2020-08-25,0.0,0.0,0.0,1.0,NEW YORK (Reuters) - Brooklyn-based technology...,"[new, york, reuters, brooklynbased, technology..."


---

# NGrams and Frequency Analysis

In this section you will look at the ngrams and word frequency for each coin. 

1. Use NLTK to produce the n-grams for N = 2. 
2. List the top 10 words for each coin. 

In [137]:
from collections import Counter
from nltk import ngrams

In [160]:
# Generate the Bitcoin N-grams where N=2

bitcoin_article_tokens = [token for token in bitcoin_df['tokens']]

bitcoin_words_tokenized = []

for articles in bitcoin_article_tokens:
    
    for word in articles:
        
        bitcoin_words_tokenized.append(word)
        
    
bitcoin_words_tokenized

bigram_counts_bitcoin = Counter(ngrams(bitcoin_words_tokenized, n=2)) 

print(dict(bigram_counts_bitcoin))

{('two', 'alleged'): 1, ('alleged', 'crypto'): 1, ('crypto', 'trader'): 1, ('trader', 'singapore'): 1, ('singapore', 'apparently'): 1, ('apparently', 'came'): 1, ('came', 'foolproof'): 1, ('foolproof', 'plan'): 1, ('plan', 'rather'): 1, ('rather', 'convert'): 1, ('convert', 'customer'): 1, ('customer', 'singapore'): 1, ('singapore', 'dollar'): 1, ('dollar', 'bitcoin'): 1, ('bitcoin', 'would'): 1, ('would', 'simply'): 1, ('simply', 'rob'): 1, ('rob', 'victim'): 1, ('victim', 'came'): 1, ('came', 'char'): 1, ('char', 'alexis'): 1, ('alexis', 'akwagyiram'): 1, ('akwagyiram', 'tom'): 1, ('tom', 'wilson'): 1, ('wilson', 'monthly'): 1, ('monthly', 'crypto'): 1, ('crypto', 'transfer'): 1, ('transfer', 'africa'): 1, ('africa', 'rise'): 1, ('rise', 'year'): 1, ('year', 'much'): 1, ('much', 'activity'): 1, ('activity', 'nigeria'): 1, ('nigeria', 'south'): 1, ('south', 'africa'): 1, ('africa', 'kenya'): 1, ('kenya', 'driven'): 1, ('driven', 'small'): 1, ('small', 'business'): 1, ('business', 'mig

In [161]:
# Generate the Ethereum N-grams where N=2

ethereum_article_tokens = [token for token in ethereum_df['tokens']]

ethereum_words_tokenized = []

for articles in ethereum_article_tokens:
    
    for word in articles:
        
        ethereum_words_tokenized.append(word)
        
    
ethereum_words_tokenized

bigram_counts_ethereum = Counter(ngrams(ethereum_words_tokenized, n=2)) 

print(dict(bigram_counts_ethereum))

{('youve', 'interacted'): 1, ('interacted', 'cryptocurrencies'): 1, ('cryptocurrencies', 'past'): 1, ('past', 'couple'): 1, ('couple', 'year'): 1, ('year', 'good'): 1, ('good', 'chance'): 1, ('chance', 'youve'): 1, ('youve', 'used'): 1, ('used', 'metamask'): 1, ('metamask', 'cryptocurrency'): 1, ('cryptocurrency', 'wallet'): 1, ('wallet', 'form'): 1, ('form', 'browser'): 1, ('browser', 'extension'): 1, ('extension', 'support'): 1, ('support', 'ether'): 1, ('ether', 'char'): 3, ('char', 'tldr'): 2, ('tldr', 'complete'): 1, ('complete', 'stock'): 1, ('stock', 'cryptocurrency'): 1, ('cryptocurrency', 'investment'): 1, ('investment', 'toolkit'): 1, ('toolkit', 'bundle'): 1, ('bundle', 'sale'): 1, ('sale', 'sept'): 1, ('sept', 'saving'): 1, ('saving', 'list'): 1, ('list', 'pricethe'): 1, ('pricethe', 'world'): 1, ('world', 'finance'): 1, ('finance', 'ha'): 1, ('ha', 'always'): 1, ('always', 'complicated'): 1, ('complicated', 'th'): 1, ('th', 'char'): 3, ('char', 'september'): 2, ('september

In [18]:
# Use the token_count function to generate the top 10 words from each coin
def token_count(tokens, N=10):
    """Returns the top N tokens from the frequency count"""
    return Counter(tokens).most_common(N)

In [19]:
# Get the top 10 words for Bitcoin
# YOUR CODE HERE!

In [20]:
# Get the top 10 words for Ethereum
# YOUR CODE HERE!

# Word Clouds

In this section, you will generate word clouds for each coin to summarize the news for each coin

In [21]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [20.0, 10.0]

In [22]:
# Generate the Bitcoin word cloud
# YOUR CODE HERE!

In [23]:
# Generate the Ethereum word cloud
# YOUR CODE HERE!

# Named Entity Recognition

In this section, you will build a named entity recognition model for both coins and visualize the tags using SpaCy.

In [24]:
import spacy
from spacy import displacy

In [25]:
# Optional - download a language model for SpaCy
# !python -m spacy download en_core_web_sm

In [26]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

## Bitcoin NER

In [27]:
# Concatenate all of the bitcoin text together
# YOUR CODE HERE!

In [28]:
# Run the NER processor on all of the text
# YOUR CODE HERE!

# Add a title to the document
# YOUR CODE HERE!

In [29]:
# Render the visualization
# YOUR CODE HERE!

In [30]:
# List all Entities
# YOUR CODE HERE!

---

## Ethereum NER

In [31]:
# Concatenate all of the bitcoin text together
# YOUR CODE HERE!

In [32]:
# Run the NER processor on all of the text
# YOUR CODE HERE!

# Add a title to the document
# YOUR CODE HERE!

In [33]:
# Render the visualization
# YOUR CODE HERE!

In [34]:
# List all Entities
# YOUR CODE HERE!