In [1]:
# Initial imports
import os
import pandas as pd
from dotenv import load_dotenv
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...


In [2]:
# Load environment variables and read the News API key enviroment variable
load_dotenv("C:/Users/adria/.env.txt")
api_key = os.getenv("NEWS_API_KEY")

In [4]:
# Create a newsapi client
from newsapi import NewsApiClient
newsapi = NewsApiClient(api_key=api_key)

In [7]:
# Fetch all the news about Bitcoin
bitcoin_headlines = newsapi.get_everything(
    q="Bitcoin",
    language="en",
    page_size=100, # the number of results to return per page
    sort_by="relevancy"
)

In [9]:
# Print total articles
print(f"Total articles about Bitcoin: {bitcoin_headlines['totalResults']}")

Total articles about Bitcoin: 7224


In [10]:
# Create the Bitcoin sentiment scores DataFrame
bitcoin_sentiments = []

for article in bitcoin_headlines["articles"]:
    try:
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        bitcoin_sentiments.append({
            "text": text,
            "date": date,
            "compound": compound,
            "positive": pos,
            "negative": neg,
            "neutral": neu
            
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
bitcoin_df = pd.DataFrame(bitcoin_sentiments)

# Reorder DataFrame columns
cols = ["date", "text", "compound", "positive", "negative", "neutral"]
bitcoin_df = bitcoin_df[cols]

bitcoin_df.head()

Unnamed: 0,date,text,compound,positive,negative,neutral
0,2022-08-03,It was 8:45 in the morning of June 13 when Bil...,0.5574,0.119,0.0,0.881
1,2022-08-02,"Tools to trace cryptocurrencies have, over jus...",0.0,0.0,0.0,1.0
2,2022-08-18,Cryptocurrencies are often criticized for bein...,-0.5584,0.068,0.17,0.763
3,2022-07-27,"July 27 (Reuters) - Bitcoin rose 6.85% to $22,...",0.0,0.0,0.0,1.0
4,2022-08-02,Posted \r\nFrom Bitcoin highs to blockchain br...,-0.296,0.0,0.086,0.914


In [11]:
# Get descriptive stats from the DataFrame
bitcoin_df.describe()

Unnamed: 0,compound,positive,negative,neutral
count,100.0,100.0,100.0,100.0
mean,0.041377,0.06555,0.05531,0.87915
std,0.463937,0.070761,0.072972,0.088562
min,-0.9081,0.0,0.0,0.628
25%,-0.296,0.0,0.0,0.81175
50%,0.0,0.059,0.0,0.875
75%,0.376275,0.1125,0.10975,0.951
max,0.9246,0.372,0.307,1.0


In [12]:
# Fetch all the news about Ethereum
ethereum_headlines = newsapi.get_everything(
    q="Ethereum",
    language="en",
    page_size=100, # the number of results to return per page
    sort_by="relevancy"
)

In [14]:
# Print total articles
print(f"Total articles about Ethereum: {ethereum_headlines['totalResults']}")

Total articles about Ethereum: 4960


In [15]:
# Create the Ethereum sentiment scores DataFrame
ethereum_sentiments = []

for article in ethereum_headlines["articles"]:
    try:
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        ethereum_sentiments.append({
            "text": text,
            "date": date,
            "compound": compound,
            "positive": pos,
            "negative": neg,
            "neutral": neu
            
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
ethereum_df = pd.DataFrame(ethereum_sentiments)

# Reorder DataFrame columns
cols = ["date", "text", "compound", "positive", "negative", "neutral"]
ethereum_df = ethereum_df[cols]

ethereum_df.head()

Unnamed: 0,date,text,compound,positive,negative,neutral
0,2022-08-18,Cryptocurrencies are often criticized for bein...,-0.5584,0.068,0.17,0.763
1,2022-08-04,The non-fungible token\r\n (NFT) market has fa...,-0.0217,0.048,0.051,0.901
2,2022-08-02,"It's a day of the week ending in the letter ""y...",-0.2732,0.059,0.115,0.827
3,2022-08-11,Developers have picked a number of so-called t...,-0.6124,0.036,0.145,0.82
4,2022-08-08,"BANGKOK, Aug 8 (Reuters) - Crypto exchange Zip...",0.0,0.0,0.0,1.0


In [16]:
# Get descriptive stats from the DataFrame
ethereum_df.describe()

Unnamed: 0,compound,positive,negative,neutral
count,100.0,100.0,100.0,100.0
mean,0.166374,0.07627,0.03844,0.88533
std,0.441933,0.069327,0.060473,0.082893
min,-0.8519,0.0,0.0,0.679
25%,-0.005425,0.0,0.0,0.8235
50%,0.2143,0.0645,0.0,0.8865
75%,0.5346,0.1295,0.05625,0.943
max,0.8402,0.249,0.243,1.0


In [17]:
# Which coin had the highest mean positive score
# Ethereum has the highest mean positive score.

# Which coin had the highest compound score?
# Bitcoin has the highest compound score.

# Which coin had the highest positive score?
# Bitcoin has the highest positive score.

In [24]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import reuters, stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [25]:
# Instantiate the lemmatizer
lemmatizer = WordNetLemmatizer()

In [26]:
# Code to download corpora
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
# Complete the tokenizer function
def tokenizer(text):
    """Tokenizes text."""
    # Define extra stop words
    sw = set(stopwords.words('english'))
    
    # Remove the punctuation from text
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', text)
   
    # Create a tokenized list of the words
    words = word_tokenize(re_clean)
    
    # Lemmatize words into root words
    lem = [lemmatizer.lemmatize(word) for word in words]

   
    # Convert the words to lowercase
    output = [word.lower() for word in lem if word.lower() not in sw]
    
    # Remove the stop words
    
    
    return tokens

In [28]:
from collections import Counter
from nltk import ngrams

In [36]:
def bigram_counter(text): 
    # Combine all articles in corpus into one large string
    big_string = ' '.join(text)
    processed = process_text(big_string)
    bigrams = ngrams(processed, n=2)
    top_10 = dict(Counter(bigrams).most_common(10))
    return pd.DataFrame(list(top_10.items()), columns=['bigram', 'count'])

In [37]:
# Run the bigram_counter function
bigram_counter(text)

NameError: name 'process_text' is not defined