In [93]:
# Initial imports
import os
import pandas as pd
import requests
import datetime
from datetime import datetime, timedelta, date
from dateutil.parser import parse
import alpaca_trade_api as tradeapi
import matplotlib.pyplot as plt

#Twitter API imports
import tweepy as tw

# NLP & Sentiment imports
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from dotenv import load_dotenv
load_dotenv()

True

In [94]:
# Twitter credentials, read from environment variables (None when a variable is unset)
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
consumer_key = os.environ.get("TWITTER_API_KEY")
consumer_secret = os.environ.get("TWITTER_SECRET_KEY")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET")

# Alpaca key pair, read the same way
alpaca_api_key = os.environ.get("ALPACA_API_KEY")
alpaca_secret_key = os.environ.get("ALPACA_SECRET_KEY")

# Shared Alpaca REST client used by daily_returns()
alpaca_api = tradeapi.REST(
    alpaca_api_key,
    alpaca_secret_key,
    api_version='v2',
)

In [95]:
# Authenticate against Twitter with the consumer key pair and the user access token
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
twitter_api = tw.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# Test authentication.
# The result of verify_credentials() is inspected as well as any exception:
# a falsy return value must not be reported as a successful login.
# `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt
# propagate and keeps the underlying reason visible in the output.
try:
    verified_user = twitter_api.verify_credentials()
except Exception as err:
    print(f"Error during authentication: {err}")
else:
    if verified_user:
        print("Authentication OK")
    else:
        print("Error during authentication")

Authentication OK


In [96]:
# Download/Update the VADER lexicon via NLTK's downloader
# (prints a status message; a no-op when the package is already up to date)
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer once at module level;
# twitter_sentiment() reuses this instance to score every tweet
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Kris/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [135]:
############################################################
"""
    Cleans the tweets and removes the included url
"""
############################################################

def remove_url(txt):
    """Strip URLs and all non-alphanumeric characters from a text string.

    Two kinds of text are deleted:

    * URLs of the form ``scheme://...`` (up to the next whitespace), and
    * every single character that is not an ASCII letter, a digit, a space
      or a tab -- i.e. punctuation, emoji and the ``#``/``@``/``$`` prefixes.

    The remaining words are re-joined with single spaces, so leading,
    trailing and repeated whitespace is collapsed.

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove urls.

    Returns
    -------
    string
        The cleaned text; an empty string when nothing is left.
    """
    # Raw string literal: "\w", "\/" and "\S" are not valid Python string
    # escapes and trigger Deprecation/SyntaxWarnings in a normal literal.
    # The compiled pattern is unchanged.
    pattern = r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)"

    return " ".join(re.sub(pattern, "", txt).split())



############################################################
"""
    Sentiment calculation based on compound score
"""
############################################################

def get_normalized(score):
    """
    Collapse a compound score into a discrete sentiment label.

    Returns 1 for scores of 0.05 or more (positive), -1 for scores of
    -0.05 or less (negative) and 0 for anything in between (neutral).
    """
    if score >= 0.05:
        return 1
    if score <= -0.05:
        return -1
    return 0

############################################################
"""
    Function that pulls stock data from a given ticker and timeframe.
"""
############################################################

def daily_returns(ticker, timeframe, start="2020-12-01 00:00", end=None):
    """Fetch historical price bars for a ticker from the Alpaca API.

    Despite the name, no returns are computed here: the function hands back
    the raw bar data and leaves any percentage-change calculation to the
    caller.

    Parameters
    ----------
    ticker : string
        Symbol to request (passed straight through to ``get_barset``).
    timeframe : string
        Bar size understood by Alpaca, e.g. ``"15Min"``.
    start : string or datetime-like, optional
        Beginning of the window. Naive values are interpreted as
        America/New_York wall time. Defaults to ``"2020-12-01 00:00"``,
        the value that used to be hard-coded.
    end : string or datetime-like, optional
        End of the window, interpreted like ``start``. Defaults to "now".

    Returns
    -------
    The ``.df`` attribute of the bar set returned by ``get_barset``
    (presumably a DataFrame with symbol/field column levels -- verify
    against the caller).
    """

    def _eastern_iso(value):
        # Normalise any datetime-like to an ISO-8601 string in Eastern time;
        # naive inputs are treated as Eastern wall time, aware ones converted.
        stamp = pd.Timestamp(value)
        if stamp.tzinfo is None:
            stamp = stamp.tz_localize("America/New_York")
        else:
            stamp = stamp.tz_convert("America/New_York")
        return stamp.isoformat()

    past_date = _eastern_iso(start)
    if end is None:
        current_date = pd.Timestamp.now(tz="America/New_York").isoformat()
    else:
        current_date = _eastern_iso(end)

    # NOTE(review): limit=None defers to the client/API default bar cap --
    # confirm the requested window is not being silently truncated.
    df = alpaca_api.get_barset(
        ticker,
        timeframe,
        limit=None,
        start=past_date,
        end=current_date,
        after=None,
        until=None,
    ).df

    return df



############################################################
"""
    Twitter: Scrape Tweets and Analyze Sentiment
"""
############################################################

def twitter_sentiment(search_words, date_since, items):
    """Search Twitter and score each returned tweet with the VADER analyzer.

    Parameters
    ----------
    search_words : string
        Search query; a ``-filter:retweets`` clause is appended to it.
    date_since : string
        Passed to the search as ``since`` (e.g. ``"2020-10-01"``).
    items : int
        Maximum number of tweets to pull from the cursor.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns ``date`` (ISO-8601 string in
        America/New_York time), ``tweet`` (cleaned text), ``compound``,
        ``positive``, ``negative`` and ``neutral``. The columns are present
        even when the search returns nothing.
    """
    columns = ["date", "tweet", "compound", "positive", "negative", "neutral"]
    records = []

    # Exclude retweets so the same text is not scored repeatedly
    query = search_words + " -filter:retweets"

    # Fetch tweets/hashtags matching the query
    tweets = tw.Cursor(
        twitter_api.search,
        q=query,
        lang="en",
        since=date_since,
    ).items(items)

    for tweet in tweets:
        # Clean the text of the tweet
        tweet_clean = remove_url(tweet.text)

        # Twitter reports created_at in UTC (tweepy 3.x exposes it as a naive
        # datetime). Passing tz= to pd.Timestamp would merely *label* that
        # UTC wall time as Eastern, shifting every tweet by 4-5 hours, so
        # localise to UTC first and then convert.
        created = pd.Timestamp(tweet.created_at)
        if created.tzinfo is None:
            created = created.tz_localize("UTC")
        tweet_date = created.tz_convert("America/New_York").isoformat()

        try:
            sentiment = analyzer.polarity_scores(tweet_clean)
        except AttributeError:
            # Best effort, as before: skip a tweet the analyzer cannot handle
            continue

        records.append({
            "date": tweet_date,
            "tweet": tweet_clean,
            "compound": sentiment["compound"],
            "positive": sentiment["pos"],
            "negative": sentiment["neg"],
            "neutral": sentiment["neu"],
        })

    # Explicit columns keep downstream column access working on empty results
    return pd.DataFrame(records, columns=columns)


In [184]:
# tweepy variables
search_words = "aapl"
date_since = "2020-10-01"
items = 5000  # large pulls hit the Twitter rate limit and sleep for minutes at a time

# Call the twitter sentiment function and return a dataframe (one row per tweet)
twitter_df = twitter_sentiment(search_words, date_since, items)

# Reduce each timestamp to an Eastern-time "MM-DD-YYYY HH" bucket key.
# Parsing with utc=True copes with ISO strings whose UTC offsets differ
# (e.g. across a daylight-saving change); converting back to Eastern keeps the
# wall-clock hour the strings carried.
twitter_df["date"] = (
    pd.to_datetime(twitter_df["date"], utc=True)
    .dt.tz_convert("America/New_York")
    .dt.strftime("%m-%d-%Y %H")
)

# Average the sentiment scores per hourly bucket (the variable name says
# "daily" but the key includes the hour). numeric_only=True leaves out the
# free-text `tweet` column, which cannot be averaged and makes recent pandas
# versions raise.
avg_daily_sentiment = twitter_df.groupby("date").mean(numeric_only=True)
# TODO: fix the average for the negative sentiment score -- it is not producing
# a negative result properly for the normalizing step.
avg_daily_sentiment

Rate limit reached. Sleeping for: 166
Rate limit reached. Sleeping for: 855


KeyboardInterrupt: 

In [None]:
# Get descriptive statistics for the per-tweet sentiment DataFrame
# (by default describe() summarises the numeric score columns only)
twitter_df.describe()

In [None]:
# Line chart of the four per-tweet sentiment scores, plotted against the row index
twitter_df.plot.line(
    y=["compound", "positive", "negative", "neutral"],
    title="Twitter Sentiment",
    figsize=(20, 10),
    grid=True,
);


In [None]:
# Label each bucket's mean compound score as -1 (negative), 0 (neutral) or 1 (positive)
avg_daily_sentiment["normalized"] = avg_daily_sentiment["compound"].apply(get_normalized)
avg_daily_sentiment

In [None]:
# Alpaca request parameters
# NOTE(review): the tweet search in this notebook uses "aapl" while prices are
# pulled for MSFT -- confirm this pairing is intended.
ticker = "MSFT"
timeframe = "15Min"

# Pull the price bars, flatten the outer (symbol) column level and keep only
# the remaining price column by dropping open/high/low/volume
daily_df = (
    daily_returns(ticker, timeframe)
    .droplevel(axis=1, level=0)
    .drop(columns=["open", "high", "low", "volume"])
)

# Bar-over-bar percentage change; the first row has no predecessor and is dropped.
# reset_index() turns the `time` index into a regular column.
returns_df = daily_df.pct_change().dropna().reset_index()

# Reduce each bar timestamp to the same "MM-DD-YYYY HH" key used for the tweets
# TODO: confirm these keys carry the expected dates (should this return 01-14?)
returns_df['time'] = returns_df['time'].apply(
    lambda stamp: pd.Timestamp(stamp).strftime('%m-%d-%Y %H')
)

# Mean of the per-bar returns within each hourly bucket
avg_hourly_returns = returns_df.groupby("time").mean()
avg_hourly_returns


In [None]:
# Index-align the hourly returns with the hourly sentiment averages and keep
# only the buckets that have a value in every column
combined_df = avg_hourly_returns.join(avg_daily_sentiment).dropna()

display(combined_df)

In [None]:
# Correlate the twitter sentiment to returns: pairwise correlation matrix of
# the combined frame, rendered with a colour gradient over the cell values
combined_df.corr().style.background_gradient()