In [1]:
# Dependencies to get stock info
import yfinance as yf
import pandas as pd
import numpy as np
import datetime

In [2]:
# Get S&P500 companies' tickers
wiki_df = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
tickers_list = wiki_df[0]['Symbol'].values.tolist()
names_list = wiki_df[0]['Security'].values.tolist()

# Store ticker and company name in dictionary for future reference
ticker_to_name = dict()
for i in range(len(tickers_list)):
    ticker_to_name[tickers_list[i]] = names_list[i]

In [3]:
# Download data for tickers
df = yf.download(tickers=tickers_list, interval='1d', period='1d')
df = df.drop(['Open', 'Close', 'High', 'Low', 'Adj Close'], axis=1)
df.head()

[*********************100%***********************]  505 of 505 completed

2 Failed downloads:
- BRK.B: No data found, symbol may be delisted
- BF.B: None


Unnamed: 0_level_0,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,XEL,XLNX,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2021-12-10,1455418,27784995,701323,110986702,6939781,1229059,261100,4122889,1925417,2739345,...,2754795,1638427,17442455,1867243,988329,1228156,1646754,217942,530279,1120745


In [4]:
# Get top 50 companies by trade volume
df.transpose()
df = df.sort_values('2021-12-10', axis=1, ascending=False)
tickers = df.columns.to_list()
tickers = list(map(lambda x: x[1], tickers))[:50]

In [5]:
# Calculate percentage change in stock price over the week
def get_percentage_change(ticker):
    df = yf.download(tickers=ticker, interval='5d', period='5d')
    
    try:
        start = df['Adj Close'][0]
        end = df['Adj Close'][1]
        change = ((end - start) / start) * 100
    except:
        change = np.nan
    finally:
        return change

In [6]:
# Create dataframe with tickers and percentage change in price
df = pd.DataFrame(tickers, columns=['ticker'])
df['% change'] = df['ticker'].apply(get_percentage_change)

# Clean data
df = df[df['% change'].notna()]

df

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Unnamed: 0,ticker,% change
0,F,11.602505
1,AAPL,8.547054
2,NVDA,0.536011
3,ORCL,15.392393
4,AMD,-0.366744
5,T,-1.890037
6,C,-2.895076
7,BAC,0.83805
8,MSFT,5.012418
9,CCL,3.711676


In [7]:
# Dependencies for sentiment analysis
from textblob import TextBlob
import tweepy
import nltk
import re
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [8]:
# Authentication for twitter API
consumerKey = 'AdRvt7c9vNn80ONT26cvcRU2x'
consumerSecret = '6c8uVj2N3yeFKMvsKXxsXEaMbaRYa6yNxXkIrsN2fPEaocMgOV'
accessToken = '1469595408913481730-TNdBl7a5q2d19ee8cD3TebY0OWi9EG'
accessTokenSecret = 'KcZP0J893afMER9GWWSyyHoiT2dVh8c1p2Hq5Jdh50Pi2'
auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
api = tweepy.API(auth)

In [9]:
# Get english language tweets associated with given ticker for past week
def get_tweets(ticker, count):
    tweets = api.search_tweets(q=ticker, count=count, lang='en', until=datetime.date(2021, 12, 11))
    tweets_text = [tweet.text for tweet in tweets]
    return tweets_text

# Calculate sentiment for particular stock given dataframe containing tweet data
def get_stock_sentiment(df):
    num_positive = len(df[df['sentiment'] == 'positive'])
    num_negative = len(df[df['sentiment'] == 'negative'])
    num_neutral = len(df[df['sentiment'] == 'neutral'])
    
    if num_neutral > max(num_positive, num_negative):
        return 'neutral'
    elif num_positive > num_negative:
        return 'positive'
    else:
        return 'negative'

In [21]:
def get_sentiment(ticker):
    # Populate tweets in dataframe and remove duplicates
    company_name = ticker_to_name[ticker]
    tweets1 = pd.DataFrame(get_tweets(company_name, 500)) # get tweets with company name
    tweets2 = pd.DataFrame(get_tweets(ticker, 500)) # get tweets with company ticker
    tweets = pd.concat([tweets1, tweets2])
    tweets.drop_duplicates(inplace=True)

    # Create new dataframe with extra column of text
    df = pd.DataFrame(tweets)
    df['text'] = df[0]

    # Clean new column of text
    remove_rt = lambda x: re.sub('RT @\w+: ', ' ', x)
    clean_tweet = lambda x: re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", x)
    df['text'] = df.text.map(remove_rt).map(clean_tweet)
    df['text'] = df.text.str.lower()

    # Calculate parameters of tweets
    df[['polarity', 'subjectivity']] = df['text'].apply(lambda x: pd.Series(TextBlob(x).sentiment))

    for idx, text in df['text'].iteritems():
        score = SentimentIntensityAnalyzer().polarity_scores(text)
        negative = score['neg']
        neutral = score['neu']
        positive = score['pos']
        compound = score['compound']
    
        df.loc[idx, 'negative'] = negative
        df.loc[idx, 'neutral'] = neutral
        df.loc[idx, 'positive'] = positive
        df.loc[idx, 'compound'] = compound
    
        # Determine sentiment using compound parameter
        if compound >= 0.05:
            df.loc[idx, 'sentiment'] = 'positive'
        elif compound <= -0.05:
            df.loc[idx, 'sentiment'] = 'negative'
        else:
            df.loc[idx, 'sentiment'] = 'neutral'
    
    return get_stock_sentiment(df)

In [None]:
df['sentiment'] = df['ticker'].apply(get_sentiment)
df