# Sentiment Analysis for Stock Market Prediction #

Reference: https://towardsdatascience.com/sentiment-analysis-for-stock-price-prediction-in-python-bed40c65d178

### Imports and Packages ###

In [1]:
# Standard library: date handling for the stock price lookup
import datetime as dt

# Third-party: date parsing, sentiment model, text cleaning, dataframes,
# HTTP requests and the Yahoo finance reader
import dateutil.parser as dparser
import flair
import nltk
import pandas as pd
import pandas_datareader as pdr
import requests
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Pretrained flair sentiment classifier used to label each tweet
sentiment_model = flair.models.TextClassifier.load('en-sentiment')

# Token normalizers used by the text cleaning step
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


2021-10-27 17:50:30,803 loading file /Users/aj/.flair/models/sentiment-en-mix-distillbert_4.pt


### Model Properties ###

In [2]:
import os

# Twitter Properties
TWITTER_SEARCH_URI = 'https://api.twitter.com/1.1/search/tweets.json'  # Twitter API search URI
# Token for twitter API. Read from the environment so the credential is never
# stored in the notebook; export TWITTER_BEARER_TOKEN before starting the kernel.
# NOTE(review): a token was previously committed here in plain text - it should be revoked.
TWITTER_BEARER_TOKEN = os.environ.get('TWITTER_BEARER_TOKEN', '')
TWITTER_MAX_SENTIMENT = 100   # Maximum number of sentiments to fetch from twitter API

# Dataframe Properties (column names of the tweet dataset)
AUTHOR_NAME = 'Author_Name'
FOLLOWERS_COUNT = 'Followers_Count'
USER_MENTIONS = 'User_Mentions'         # Contains comma separated full name of the mentioned user
HASH_TAGS = 'Hash_Tags'                 # Contains comma separated hashtag texts
FRIENDS_COUNT = 'Friends_Count'
TEXT = 'Text'
RETWEET_COUNT = 'Retweet_Count'
CREATED_AT = 'Created_At'
SENTIMENT = 'Sentiment'
SENTIMENT_PROBABILITY = 'Sentiment_Probability'
STOCK_PRICE = 'Stock_Price'

# Stock Properties
STOCK_NAME = 'tesla'    # Stock name to run model (used as the twitter search query)
STOCK_TICKER = 'TSLA'   # Ticker symbol used for the stock price lookup

## 1. Extract Raw Data ##
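Call the Twitter search API to fetch recent English-language tweets that mention the stock; their sentiment is derived from the tweet text in a later step.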

In [3]:
# Parameters to pass to twitter API
params = {
    'q': STOCK_NAME,
    'tweet_mode': 'extended',
    'lang': 'en',
    'count': TWITTER_MAX_SENTIMENT
}

# Call twitter API; the timeout keeps a stalled connection from hanging the notebook
response = requests.get(
    TWITTER_SEARCH_URI,
    params=params,
    headers={
        'authorization': 'Bearer '+ TWITTER_BEARER_TOKEN
    },
    timeout=30,
)

# Fail here on an HTTP error status instead of later, when the dataset cell
# looks up the 'statuses' key in the response
response.raise_for_status()

# Get response in json
response_json = response.json()

## 2. Dataset Creation ##

In [4]:
# Column layout of the tweet dataset, one column per extracted feature
dataset_columns = [
    AUTHOR_NAME,
    FOLLOWERS_COUNT,
    USER_MENTIONS,
    HASH_TAGS,
    FRIENDS_COUNT,
    TEXT,
    RETWEET_COUNT,
    CREATED_AT,
]

# Start from an empty dataframe that carries only the column headers
df = pd.DataFrame(columns=dataset_columns)

In [5]:
# Converting twitter response json to dataset selecting required features.
# All rows are collected first and the dataframe is built once:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copies the whole frame on every call.
records = [
    {
        AUTHOR_NAME: status['user']['name'],
        FOLLOWERS_COUNT: status['user']['followers_count'],
        # joined user mentions with comma separated value
        USER_MENTIONS: ','.join(um['name'] for um in status['entities']['user_mentions']),
        # joined hashtags with comma separated value
        HASH_TAGS: ','.join(ht['text'] for ht in status['entities']['hashtags']),
        FRIENDS_COUNT: status['user']['friends_count'],
        TEXT: status['full_text'],
        RETWEET_COUNT: status['retweet_count'],
        CREATED_AT: status['created_at'],
    }
    for status in response_json['statuses']
]

# Passing `columns` keeps the column order fixed and still yields the expected
# (empty) frame when the search returned no statuses
df = pd.DataFrame(records, columns=[
    AUTHOR_NAME,
    FOLLOWERS_COUNT,
    USER_MENTIONS,
    HASH_TAGS,
    FRIENDS_COUNT,
    TEXT,
    RETWEET_COUNT,
    CREATED_AT,
])

In [6]:
# Preview the first five records (head() returns 5 rows by default)
df.head()

Unnamed: 0,Author_Name,Followers_Count,User_Mentions,Hash_Tags,Friends_Count,Text,Retweet_Count,Created_At
0,Bills Folly,552,Newsweek,,504,"RT @Newsweek: ""Eventually, they run out of oth...",10,Wed Oct 27 21:50:32 +0000 2021
1,Nani?!,0,"Elon Musk,Tesla Raj,Elon Musk",shibatothemoon,2,@elonmusk @tesla_raj Mr @elonmusk we are wait...,0,Wed Oct 27 21:50:31 +0000 2021
2,Anand #BLM Sharma (आनंद),1131,"Uber,Hertz,Tesla",,3225,RT @Uber: Proud to partner with @Hertz as they...,165,Wed Oct 27 21:50:30 +0000 2021
3,Luke nicholson🎃,52,Daz Black,,126,RT @daz_black: Pick up my Tesla in two weeks! ...,4,Wed Oct 27 21:50:30 +0000 2021
4,DIRTYOUWHORZ,0,"Elon Musk,Tesla Raj",,0,@elonmusk @tesla_raj Daycare For kids on the m...,0,Wed Oct 27 21:50:29 +0000 2021


## 3. Feature Engineering ##

### I. Applying text cleaning methods ###

In [7]:
# Function to clean text by applying several text cleaning methods
def clean_prepare_text(text):
    """Clean a raw text and return it as a single-space separated token string.

    Steps, in order: line breaks become spaces, the punctuation signs
    ? : ! , ; " are removed, the text is lowercased, English stopwords are
    dropped, the contraction suffixes 've / 's / 'm / 't are stripped, and
    every remaining token is stemmed and then lemmatized.

    Args:
        text: The raw text to clean (str).

    Returns:
        The cleaned text; tokens are separated by exactly one space and there
        is no leading or trailing whitespace.
    """
    # 1. Clean the text - turn line breaks into spaces. Runs of whitespace are
    #    collapsed by the split() calls below, which also avoids empty tokens.
    filtered_text = text.replace("\r", " ").replace("\n", " ")

    # 2. Remove all the punctuations
    for punct_sign in "?:!,;\"":
        filtered_text = filtered_text.replace(punct_sign, '')

    # 3. Convert to lowercase
    filtered_text = filtered_text.lower()

    # 4. Remove stopwords. The stopword collection is built once per call
    #    instead of calling stopwords.words() again for every token.
    english_stopwords = set(stopwords.words('english'))
    tokens = [token for token in filtered_text.split() if token not in english_stopwords]

    # 5. Remove meaningless contraction suffixes
    filtered_text = " ".join(tokens)
    filtered_text = filtered_text.replace("\'ve", '').replace("\'s", '').replace("\'m", '').replace("\'t", '')

    # 6. Stemming followed by lemmatization of each token
    tokens = [lemmatizer.lemmatize(stemmer.stem(token)) for token in filtered_text.split()]

    return " ".join(tokens)


In [8]:
# Replace every tweet text with its cleaned form for the following steps
df[TEXT] = df[TEXT].map(clean_prepare_text)

### II. Creating Sentiment Feature by using pretrained 'flair' Model ###

In [9]:
# Label every cleaned text with the pretrained sentiment model
sentiments = []
sentiment_probabilities = []

for text in df[TEXT]:
    sentence = flair.data.Sentence(text)
    sentiment_model.predict(sentence)

    if sentence.labels:
        top_label = sentence.labels[0]
        sentiment = top_label.value      # 'POSITIVE' or 'NEGATIVE'
        probability = top_label.score    # numerical value 0-1
    else:
        # No label was attached - presumably because cleaning left no tokens
        # (TODO confirm against flair's predict behaviour). Record a missing
        # value instead of failing with IndexError on labels[0].
        sentiment = None
        probability = float('nan')

    sentiments.append(sentiment)
    sentiment_probabilities.append(probability)

df[SENTIMENT] = sentiments
df[SENTIMENT_PROBABILITY] = sentiment_probabilities

### III. Creating Stock Price Feature by using yahoo finance module ###

In [24]:
# Creating dictionary of date and stock price for particular stock
stock_price_dict = {}

for created_at in df[CREATED_AT]:
    parsed_date = dparser.parse(created_at, fuzzy=True)
    # Keep only year/month/day so all tweets of one day share a single lookup.
    # Built from parsed_date: the previous code read `date` before it was ever
    # assigned, which raises NameError on a fresh kernel.
    date = dt.datetime(parsed_date.year, parsed_date.month, parsed_date.day)
    if date not in stock_price_dict:
        close_prices = pdr.get_data_yahoo(STOCK_TICKER, start=date, end=date)['Close']
        # NOTE(review): presumably no row is returned for a non-trading day,
        # in which case this lookup fails - confirm and decide on a fallback.
        stock_price_dict[date] = close_prices.iloc[0]

In [26]:
# Look up the stock price of each tweet's creation day
stock_prices = []
for created_at in df[CREATED_AT]:
    parsed_date = dparser.parse(created_at, fuzzy=True)
    # Same day-level key as used when filling stock_price_dict. Built from
    # parsed_date: the previous code read `date` before assigning it, so it
    # depended on a value left over in the kernel.
    date = dt.datetime(parsed_date.year, parsed_date.month, parsed_date.day)
    stock_prices.append(stock_price_dict[date])

df[STOCK_PRICE] = stock_prices

In [27]:
# Preview the first five records with the new feature columns (head() returns 5 rows by default)
df.head()

Unnamed: 0,Author_Name,Followers_Count,User_Mentions,Hash_Tags,Friends_Count,Text,Retweet_Count,Created_At,Sentiment,Sentiment_Probability,Stock_Price
0,Bills Folly,552,Newsweek,,504,rt @newsweek eventu run peopl money come tesla...,10,Wed Oct 27 21:50:32 +0000 2021,NEGATIVE,0.980521,1037.859985
1,Nani?!,0,"Elon Musk,Tesla Raj,Elon Musk",shibatothemoon,2,@elonmusk @tesla_raj mr @elonmusk wait go moon...,0,Wed Oct 27 21:50:31 +0000 2021,POSITIVE,0.833236,1037.859985
2,Anand #BLM Sharma (आनंद),1131,"Uber,Hertz,Tesla",,3225,rt @uber proud partner @hertz make 50000 @tesl...,165,Wed Oct 27 21:50:30 +0000 2021,POSITIVE,0.995043,1037.859985
3,Luke nicholson🎃,52,Daz Black,,126,rt @daz_black pick tesla two week excit,4,Wed Oct 27 21:50:30 +0000 2021,NEGATIVE,0.991044,1037.859985
4,DIRTYOUWHORZ,0,"Elon Musk,Tesla Raj",,0,@elonmusk @tesla_raj daycar kid minibu school ...,0,Wed Oct 27 21:50:29 +0000 2021,POSITIVE,0.567728,1037.859985


## 4. Train the Model ##

## 5. Evaluate the Model ## 

## 6. Predict the Stock Price ##

## 7. Apply Abstractive Summarization ##