# Installation

In [None]:
!pip install ccxt
!pip install newsapi-python
!pip install colab-env -qU

# Imports

In [2]:
import os
import re
import pandas as pd

In [3]:
from dotenv import load_dotenv
from newsapi import NewsApiClient
from string import punctuation
from collections import Counter
from wordcloud import WordCloud

In [4]:
import matplotlib.pyplot as plt
import matplotlib as mpl

In [5]:
import spacy
from spacy import displacy

In [6]:
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize



# Environment Setup

In [12]:
import colab_env
from colab_env import envvar_handler

Mounted at /content/gdrive


In [13]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [40]:
# Downloads
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [8]:
%matplotlib inline

In [66]:
# Get API key
news_api_key = os.getenv("NEWS_API_KEY")
type(news_api_key)

str

# Testing

In [65]:
# Create a newsapi client
# newsapi = NewsApiClient(api_key = news_api_key)

In [64]:
# Initialize the VADER sentiment analyzer
# analyzer = SentimentIntensityAnalyzer()

In [63]:
# Fetch the Bitcoin news articles
# btc_headlines = newsapi.get_everything(
#     q="Bitcoin", 
#     language="en", 
#     sort_by="relevancy",
#     page_size=100)

In [62]:
# Print total articles
# print(f"Total articles about Bitcoin: {btc_headlines['totalResults']}")

In [61]:
# Create the Bitcoin sentiment scores DataFrame
# btc_df = get_sentiment_df(btc_headlines["articles"])
# btc_df.head()

In [60]:
# btc_df.shape

In [59]:
# Describe the Bitcoin Sentiment
# btc_df.describe()

In [58]:
# Create a new tokens column for bitcoin
# btc_token_list = [tokenizer(text) for text in btc_df['Text'].tolist()]

# # Make sure same length as btc_df
# print(f'{len(btc_token_list)} should equal {len(btc_df)}')

In [57]:
# Append tokens column to btc_df
# btc_df['Tokens'] = btc_token_list
# btc_df.head()

# Functions

In [67]:
def get_sentiment_df(articles):

  # Initialize the VADER sentiment analyzer
  analyzer = SentimentIntensityAnalyzer()

  sentiments = []

  # Compute sentiment for each article
  for article in articles:
    try:
      text = article["content"]
      sentiment = analyzer.polarity_scores(text)
      compound = sentiment["compound"]
      neg = sentiment["neg"]
      neu = sentiment["neu"]
      pos = sentiment["pos"]            

      sentiments.append({
        "Compound": compound,
        "Negative": neg,
        "Neutral": neu,
        "Positive": pos,
        "Text": text })

    except AttributeError:
      pass

  # Create DataFrame
  df = pd.DataFrame(sentiments)

  # Reorder DataFrame columns
  cols = ["Compound", "Negative", "Neutral", "Positive", "Text"]
  df = df[cols]

  return df

In [68]:
# Complete the tokenizer function
def tokenizer(text):
  """Tokenizes text."""

  # Init regex
  regex = re.compile("[^a-zA-Z ]")

  # Substitute everything that is NOT a letter with empty string
  text_clean = regex.sub('', text)

  # Tokenize words
  words = word_tokenize(text_clean)

  # Init lemmatizer
  lemmatizer = WordNetLemmatizer()

  # Lemmatize Words into root words (this should happen before stopwording)
  lem_words = [lemmatizer.lemmatize(word) for word in words]

  # Init stop words
  sw = set(stopwords.words('english'))

  # Custom extended stopwords 
  sw_extended = {'said', 'sent', 'found', 'including', 'today', 'announced', 'week', 'basically', 'also'}

  # Replace stop words and lowercase
  tokens = [word.lower() for word in lem_words if word.lower() not in sw.union(sw_extended)]

  return tokens

In [69]:
def get_news_sentiment_df(crypto_name, api_key):

  # Create a newsapi client
  newsapi = NewsApiClient(api_key = api_key)

  # Initialize the VADER sentiment analyzer
  analyzer = SentimentIntensityAnalyzer()

  # Call API for latest headlines
  headlines = newsapi.get_everything(
    q = crypto_name, 
    language = "en", 
    sort_by = "relevancy",
    page_size = 100)

  # Return None if no headlines
  if len(headlines) == 0: return None

  # Log
  print(f"Fetched {headlines['totalResults']} articles about {crypto_name}")

  # Get sentiment
  results_df = get_sentiment_df(headlines["articles"])

  # Create a new tokens column for bitcoin
  token_list = [tokenizer(text) for text in results_df['Text'].tolist()]

  # Append tokens column to btc_df
  results_df['Tokens'] = token_list

  return results_df

# Usage - get news headlines and perform sentiment analysis

In [70]:
# Get latest headlines and sentiment analysis
btc_news_sentiment_df = get_news_sentiment_df('Bitcoin', news_api_key)

Fetched 7893 articles about Bitcoin


In [71]:
btc_news_sentiment_df.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,Text,Tokens
0,-0.5574,0.11,0.89,0.0,One of the strictest crackdowns worldwide\r\nP...,"[one, strictest, crackdown, worldwidephoto, mi..."
1,-0.5106,0.142,0.858,0.0,The hacker behind last years big Twitter hack\...,"[hacker, behind, last, year, big, twitter, hac..."
2,0.6369,0.0,0.887,0.113,Some things are best left a mystery at least a...,"[thing, best, left, mystery, least, far, coinb..."
3,0.0,0.0,0.0,0.0,,[]
4,0.8316,0.0,0.754,0.246,TL;DR: Enter the The Complete Bitcoin (BTC) In...,"[tldr, enter, complete, bitcoin, btc, investme..."


In [72]:
btc_news_sentiment_df.describe()

Unnamed: 0,Compound,Negative,Neutral,Positive
count,98.0,98.0,98.0,98.0
mean,0.042906,0.026296,0.925449,0.038071
std,0.322431,0.057029,0.126542,0.056157
min,-0.9062,0.0,0.0,0.0
25%,0.0,0.0,0.89925,0.0
50%,0.0,0.0,0.9525,0.0
75%,0.1591,0.0,1.0,0.07425
max,0.8316,0.326,1.0,0.246
