# Unit 12 - Tales from the Crypto

---


## 1. Sentiment Analysis

In [1]:
# Initializing imports
import os
import pandas as pd
from newsapi import NewsApiClient
from dotenv import load_dotenv
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

%matplotlib inline


ImportError: cannot import name 'NewsApiClient' from 'newsapi' (/Users/abraraman/opt/anaconda3/lib/python3.8/site-packages/newsapi/__init__.py)

In [None]:
# Load variables from a local .env file into the process environment so that
# the News API key can be read from os.environ in the next cell

load_dotenv()

In [None]:
# Creating a newsapi client.
# The key is read from the NEWS_API_KEY environment variable; the lookup raises
# KeyError if that variable is not set.
newsapi = NewsApiClient(api_key=os.environ["NEWS_API_KEY"])

In [None]:
# Query the News API for English-language articles matching "bitcoin";
# all other request parameters are left at the client's defaults
bitcoin_news = newsapi.get_everything(q="bitcoin", language="en")

In [None]:
# Query the News API for English-language articles matching "ethereum";
# all other request parameters are left at the client's defaults
eth_news = newsapi.get_everything(q="ethereum", language="en")

In [None]:
# Creating the Bitcoin sentiment scores DataFrame

# Final column order. Passing it to the DataFrame constructor (instead of
# re-indexing afterwards) keeps the frame well-formed - empty but with these
# columns - when no article could be scored.
cols = ["date", "text", "compound", "positive", "negative", "neutral"]

bitcoin_sentiments = []

for article in bitcoin_news["articles"]:
    text = article["content"]

    # Articles without string content (e.g. a null `content` field) cannot be
    # scored. Skip them explicitly rather than catching AttributeError, which
    # would also hide unrelated bugs.
    if not isinstance(text, str):
        continue

    sentiment = analyzer.polarity_scores(text)

    bitcoin_sentiments.append({
        "text": text,
        # Keep only the YYYY-MM-DD part of the timestamp string
        "date": article["publishedAt"][:10],
        "compound": sentiment["compound"],
        "positive": sentiment["pos"],
        "negative": sentiment["neg"],
        "neutral": sentiment["neu"],
    })

bitcoin_df = pd.DataFrame(bitcoin_sentiments, columns=cols)

bitcoin_df.head()

In [None]:
# Creating the Ethereum sentiment scores DataFrame

# Final column order. Passing it to the DataFrame constructor (instead of
# re-indexing afterwards) keeps the frame well-formed - empty but with these
# columns - when no article could be scored.
cols = ["date", "text", "compound", "positive", "negative", "neutral"]

eth_sentiments = []

for article in eth_news["articles"]:
    text = article["content"]

    # Articles without string content (e.g. a null `content` field) cannot be
    # scored. Skip them explicitly rather than catching AttributeError, which
    # would also hide unrelated bugs.
    if not isinstance(text, str):
        continue

    sentiment = analyzer.polarity_scores(text)

    eth_sentiments.append({
        "text": text,
        # Keep only the YYYY-MM-DD part of the timestamp string
        "date": article["publishedAt"][:10],
        "compound": sentiment["compound"],
        "positive": sentiment["pos"],
        "negative": sentiment["neg"],
        "neutral": sentiment["neu"],
    })

eth_df = pd.DataFrame(eth_sentiments, columns=cols)

eth_df.head()

In [None]:
# Summary statistics for the Bitcoin sentiment scores; the means and maxima
# shown here are the basis for the answers in the Questions section below
bitcoin_df.describe()

In [None]:
# Summary statistics for the Ethereum sentiment scores; the means and maxima
# shown here are the basis for the answers in the Questions section below
eth_df.describe()

### Questions:

Q: Which coin had the highest mean positive score?

A: Ethereum had the highest mean positive score.

Q: Which coin had the highest compound score?

A: Ethereum had the highest compound score.

Q: Which coin had the highest positive score?

A: Ethereum had the highest positive score.


---

## 2. Natural Language Processing
---
###   Tokenizer

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [None]:
# Instantiating the lemmatizer
lemmatizer = WordNetLemmatizer()

# Creating a list of stopwords (NLTK's English list is all lowercase)
stopwords_list = stopwords.words('english')

# Expanding the default stopwords list with corpus-specific noise words.
# These must be lowercase: the tokenizer lowercases each token before checking
# it against the stop-word list, so capitalised entries would never match.
expanded_list = ["chars", "data", "another", "photo", "joe", "reuters", "reutersdado"]

stop_words = stopwords_list + expanded_list

In [None]:
# Completing the tokenizer function
def tokenizer(text):
    """Tokenizes text into lowercase, lemmatized words with stop words removed.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lowercased, lemmatized tokens in their original order. A token is
        dropped when either the word itself or its lemma appears
        (case-insensitively) in the notebook-level ``stop_words`` list.
    """
    # Stop words are matched case-insensitively, so normalise the list once
    stpwrds = {word.lower() for word in stop_words}

    # Keep only letters and whitespace. Tabs and line breaks are preserved
    # (not just plain spaces) so that words on adjacent lines are not fused
    # into a single token.
    re_clean = re.sub(r"[^a-zA-Z\s]", "", text)

    tokens = []
    for word in word_tokenize(re_clean):
        # Lowercase first: the WordNet lookup is case-sensitive, so capitalised
        # forms such as "Coins" would otherwise be left unlemmatized
        word = word.lower()

        # Filter stop words before lemmatizing; the default (noun) lemmatizer
        # turns e.g. "was" into "wa", which would slip past the stop list
        if word in stpwrds:
            continue

        lemma = lemmatizer.lemmatize(word)

        # Filter again in case the lemma itself is a stop word
        if lemma not in stpwrds:
            tokens.append(lemma)

    return tokens

In [None]:
# Tokenize every Bitcoin article and store the result in a "tokens" column
bitcoin_df["tokens"] = bitcoin_df["text"].apply(tokenizer)
bitcoin_df.head()

In [None]:
# Tokenize every Ethereum article and store the result in a "tokens" column
eth_df["tokens"] = eth_df["text"].apply(tokenizer)
eth_df.head()

---

### NGrams and Frequency Analysis

In [None]:
from collections import Counter
from nltk import ngrams

In [None]:
# Generating the Bitcoin N-grams where N=2

# Join the articles with a space. Without a separator the last word of one
# article is glued to the first word of the next, creating bogus tokens.
bitcoin_string = bitcoin_df.text.str.cat(sep=" ")
bitcoin_token = tokenizer(bitcoin_string)

bitcoin_bigram_counts = Counter(ngrams(bitcoin_token, n=2))
bitcoin_bigram_counts.most_common(10)

In [None]:
# Generating the Ethereum N-grams where N=2

# Join the articles with a space. Without a separator the last word of one
# article is glued to the first word of the next, creating bogus tokens.
eth_string = eth_df.text.str.cat(sep=" ")
eth_token = tokenizer(eth_string)

# Reuse the token list computed above instead of tokenizing the corpus twice
eth_bigram_counts = Counter(ngrams(eth_token, n=2))
eth_bigram_counts.most_common(10)

In [None]:
# Helper returning the most frequent words for a given coin
def token_count(tokens, N=3):
    """Count how often each token occurs and return the ``N`` most frequent.

    Parameters
    ----------
    tokens : iterable
        Tokens to count.
    N : int, optional
        Number of (token, count) pairs to return; defaults to 3.

    Returns
    -------
    list of tuple
        ``(token, count)`` pairs ordered from most to least frequent.
    """
    frequencies = Counter(tokens)
    top_tokens = frequencies.most_common(N)
    return top_tokens

In [None]:
# Using token_count to get the top 10 words for Bitcoin
# (N is passed explicitly because the function's default is 3)
token_count(bitcoin_token, N=10)

In [None]:
# Using token_count to get the top 10 words for Ethereum
# (N is passed explicitly because the function's default is 3)
token_count(eth_token, N=10)

---

### Word Clouds


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib as mpl

# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases no longer accept the old names. Use whichever spelling this
# installation provides, falling back to the default style if neither exists.
_whitegrid_style = next(
    (
        name
        for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if name in plt.style.available
    ),
    "default",
)
plt.style.use(_whitegrid_style)

# Large default canvas so the word clouds below are readable
mpl.rcParams['figure.figsize'] = [20.0, 10.0]

In [None]:
# Generating the Bitcoin word cloud
bit_cloud = " ".join(bitcoin_token)

# Word placement is randomised; a fixed seed makes the figure reproducible
# across "Restart & Run All"
wc = WordCloud(random_state=42).generate(bit_cloud)

plt.imshow(wc)
plt.title("Bitcoin Word Cloud", fontsize=30)
plt.axis("off")  # pixel coordinates carry no meaning for a word cloud
plt.show()

In [None]:
# Generating the Ethereum word cloud
eth_cloud = " ".join(eth_token)

# Word placement is randomised; a fixed seed makes the figure reproducible
# across "Restart & Run All"
wc = WordCloud(random_state=42).generate(eth_cloud)

plt.imshow(wc)
plt.title("Ethereum Word Cloud", fontsize=30)
plt.axis("off")  # pixel coordinates carry no meaning for a word cloud
plt.show()

---
## 3. Named Entity Recognition


In [None]:
import spacy
from spacy import displacy

In [None]:
# Loading the spaCy model.
# 'en_core_web_sm' must already be installed in the environment; this notebook
# does not download it.
nlp = spacy.load('en_core_web_sm')

---
### Bitcoin NER

In [None]:
# Process the concatenated Bitcoin article text with the spaCy pipeline
btc_nlp = nlp(bitcoin_string)

# Store a title in the document's user data
btc_nlp.user_data["title"] = "Bitcoin NER"


In [None]:
# Rendering the visualization: style='ent' selects displaCy's named-entity
# view of the Bitcoin document
displacy.render(btc_nlp, style='ent')


In [None]:
# Collect the text of every entity span found in the Bitcoin document
btc_entities = [span.text for span in btc_nlp.ents]
btc_entities

---

### Ethereum NER

In [None]:
# Process the concatenated Ethereum article text with the spaCy pipeline
eth_nlp = nlp(eth_string)

# Store a title in the document's user data
eth_nlp.user_data["title"] = "Ethereum NER"

In [None]:
# Rendering the visualization: style='ent' selects displaCy's named-entity
# view of the Ethereum document
displacy.render(eth_nlp, style='ent')

In [None]:
# Collect the text of every entity span found in the Ethereum document
eth_entities = [span.text for span in eth_nlp.ents]
eth_entities

---