# Sentiment Analysis for Stock Market Prediction #

Reference: https://towardsdatascience.com/sentiment-analysis-for-stock-price-prediction-in-python-bed40c65d178

### Imports and Packages ###

In [1]:
# For flair module
import flair
sentiment_model = flair.models.TextClassifier.load('en-sentiment')

# For text cleaning and preparation
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

# For dealing with dataframe
import pandas as pd

# For making request to API
import requests

# For calling yahoo finance to get stock price 
import pandas_datareader as pdr
import datetime as dt
import dateutil.parser as dparser


2021-10-27 18:28:09,783 loading file /Users/aj/.flair/models/sentiment-en-mix-distillbert_4.pt


### Model Properties ###

In [2]:
# Twitter Properties
import os         # stdlib; imported here so this cell is self-contained
import warnings   # stdlib; used only to flag a missing credential

TWITTER_SEARCH_URI = 'https://api.twitter.com/1.1/search/tweets.json'  # Twitter API search URI

# The bearer token is a secret and must never be stored in the notebook.
# Export it before starting the kernel, e.g.:
#     export TWITTER_BEARER_TOKEN='...'
# NOTE(security): a token was previously committed in this cell; treat it as
# compromised and revoke it in the Twitter developer portal.
TWITTER_BEARER_TOKEN = os.environ.get('TWITTER_BEARER_TOKEN', '')   # Token for twitter API
if not TWITTER_BEARER_TOKEN:
    warnings.warn(
        'TWITTER_BEARER_TOKEN environment variable is not set; '
        'requests to the Twitter API will be rejected.'
    )

TWITTER_MAX_SENTIMENT = 100   # Maximum number of tweets to fetch from twitter API in one search call

# Dataframe Properties (column names used throughout the notebook)
AUTHOR_NAME = 'Author_Name'
FOLLOWERS_COUNT = 'Followers_Count'
USER_MENTIONS = 'User_Mentions'         # Contains comma separated full name of the mentioned user
HASH_TAGS = 'Hash_Tags'                 # Contains comma separated hashtag texts
FRIENDS_COUNT = 'Friends_Count'
TEXT = 'Text'
RETWEET_COUNT = 'Retweet_Count'
CREATED_AT = 'Created_At'
SENTIMENT = 'Sentiment'
SENTIMENT_PROBABILITY = 'Sentiment_Probability'
STOCK_PRICE = 'Stock_Price'

# Stock Properties
STOCK_NAME = 'tesla'    # Search term sent to Twitter for the stock under study
STOCK_TICKER = 'TSLA'   # Ticker symbol used for the Yahoo Finance price lookup

## 1. Extract Raw Data ##
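Call the Twitter search API to fetch recent English-language tweets that mention the stock; these raw tweets are the input for the sentiment analysis below.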

In [3]:
# Parameters to pass to the Twitter search API
params = {
    'q': STOCK_NAME,                 # search term
    'tweet_mode': 'extended',        # return the untruncated tweet in 'full_text'
    'lang': 'en',
    'count': TWITTER_MAX_SENTIMENT   # maximum number of tweets per call
}

# Call twitter API. The timeout keeps a stalled connection from hanging the
# kernel indefinitely (requests has no default timeout).
response = requests.get(
    TWITTER_SEARCH_URI,
    params=params,
    headers={
        'authorization': 'Bearer ' + TWITTER_BEARER_TOKEN
    },
    timeout=30,
)

# Stop here on an HTTP error (bad token, rate limit, ...) instead of failing
# later with a confusing KeyError on the missing 'statuses' key.
response.raise_for_status()

# Get response in json
response_json = response.json()

## 2. Dataset Creation ##

### I. Converting twitter response json to dataset ###

In [4]:
# Empty dataframe whose columns are the features extracted from each tweet
feature_columns = [
    AUTHOR_NAME,
    FOLLOWERS_COUNT,
    USER_MENTIONS,
    HASH_TAGS,
    FRIENDS_COUNT,
    TEXT,
    RETWEET_COUNT,
    CREATED_AT,
]
df = pd.DataFrame(columns=feature_columns)

In [5]:
# Converting twitter response json to dataset selecting required features.
#
# The rows are collected in a list and the dataframe is built once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. Building the frame from scratch also makes this cell safe to
# re-run without duplicating rows.
tweet_records = [
    {
        AUTHOR_NAME: status['user']['name'],
        FOLLOWERS_COUNT: status['user']['followers_count'],
        # joined user mentions with comma separated value
        USER_MENTIONS: ','.join(um['name'] for um in status['entities']['user_mentions']),
        # joined hashtags with comma separated value
        HASH_TAGS: ','.join(ht['text'] for ht in status['entities']['hashtags']),
        FRIENDS_COUNT: status['user']['friends_count'],
        TEXT: status['full_text'],
        RETWEET_COUNT: status['retweet_count'],
        CREATED_AT: status['created_at'],
    }
    for status in response_json['statuses']
]

# Explicit columns keep the schema stable even when the search returns no tweets.
df = pd.DataFrame(tweet_records, columns=[
    AUTHOR_NAME,
    FOLLOWERS_COUNT,
    USER_MENTIONS,
    HASH_TAGS,
    FRIENDS_COUNT,
    TEXT,
    RETWEET_COUNT,
    CREATED_AT,
])

### II. Creating Stock Price label by using yahoo finance module ###

In [7]:
# Creating dictionary of date and stock price for particular stock.
# Keys are midnight datetimes (one per distinct tweet day); values are the
# closing price used as the label for every tweet created on that day.
stock_price_dict = {}

# Tweets are also posted on weekends and market holidays, when a query for
# that single day returns no rows. Looking back a few calendar days lets
# such dates fall back to the most recent available close.
PRICE_LOOKBACK_DAYS = 7

for created_at in df[CREATED_AT]:
    parsed_date = dparser.parse(created_at, fuzzy=True)
    # Drop the time of day so all tweets from one day share a cache key
    date = dt.datetime(parsed_date.year, parsed_date.month, parsed_date.day)
    if date in stock_price_dict:
        continue    # already fetched for this day

    price_history = pdr.get_data_yahoo(
        STOCK_TICKER,
        start=date - dt.timedelta(days=PRICE_LOOKBACK_DAYS),
        end=date,
    )
    closes = price_history['Close'].dropna().sort_index()
    if closes.empty:
        raise ValueError(
            'No closing price available for {} on or before {:%Y-%m-%d}'.format(STOCK_TICKER, date)
        )
    # .iloc is explicit positional access; the last row is the latest close
    # on or before the tweet date.
    stock_price_dict[date] = closes.iloc[-1]

In [8]:
# Label every tweet with the stock price cached for the day it was created
stock_prices = [
    stock_price_dict[dt.datetime(parsed.year, parsed.month, parsed.day)]
    for parsed in (
        dparser.parse(created_at, fuzzy=True) for created_at in df[CREATED_AT]
    )
]

df[STOCK_PRICE] = stock_prices

In [9]:
# Preview the first five rows of the labelled dataset (head() defaults to 5 rows)
df.head()

Unnamed: 0,Author_Name,Followers_Count,User_Mentions,Hash_Tags,Friends_Count,Text,Retweet_Count,Created_At,Stock_Price
0,Joseph L,35,"Derry Doberman,Joseph L,SwiftOnSecurity,Tesla,...",,384,RT @DerryDoberman: @JoeLibuszowski @SwiftOnSec...,1,Wed Oct 27 22:28:10 +0000 2021,1037.859985
1,Sarbjeet Johal,25605,"Sarbjeet Johal,Market Rebellion,Tesla,General ...",,17519,"RT @sarbjeetjohal: @MarketRebels Today, @Tesla...",1,Wed Oct 27 22:28:08 +0000 2021,1037.859985
2,Jorge A. Sanchez,456,CleanTechnica,,1226,RT @cleantechnica: Tesla’s Horde Of Megapacks ...,9,Wed Oct 27 22:28:08 +0000 2021,1037.859985
3,8of10doesTheSopranos,3,"𝗧𝗲𝘀𝗹𝗮 𝗙𝗮𝗰𝘁𝘀 🔋,Charlie Schulze,Paul Krugman",,104,@truth_tesla @schulzecharlie2 @paulkrugman Yo ...,0,Wed Oct 27 22:28:08 +0000 2021,1037.859985
4,Arturo Ibarra,286,"Elon Musk,Jay in Shanghai 🇨🇳",,544,RT @elonmusk: @JayinShanghai That math is inco...,271,Wed Oct 27 22:28:06 +0000 2021,1037.859985


## 3. Feature Engineering ##

### I. Applying text cleaning methods ###

In [10]:
# Function to clean text by applying several text cleaning methods
def clean_prepare_text(text):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: strip selected punctuation, lowercase, split on
    whitespace, drop English stopwords, remove contraction fragments,
    then stem and lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces. Empty when nothing survives
        the cleaning.
    """
    # 1. Remove the selected punctuation characters
    filtered_text = text
    for punct_sign in '?:!,;"':
        filtered_text = filtered_text.replace(punct_sign, '')

    # 2. Convert to lowercase (the stopword list is lowercase)
    filtered_text = filtered_text.lower()

    # 3. Tokenise. split() without arguments treats any run of whitespace
    #    (including \r and \n) as one separator, so no empty tokens are
    #    produced by repeated spaces or by the punctuation removed above.
    raw_tokens = filtered_text.split()

    # 4. Remove stopwords. The list is loaded once per call and held in a set,
    #    rather than being reloaded for every token.
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = [token for token in raw_tokens if token not in english_stopwords]

    cleaned_tokens = []
    for token in kept_tokens:
        # 5. Remove meaningless contraction fragments
        for fragment in ("'ve", "'s", "'m", "'t"):
            token = token.replace(fragment, '')
        if not token:
            continue    # the token consisted only of a fragment

        # 6.a. Stemming, then 6.b. lemmatization
        cleaned_tokens.append(lemmatizer.lemmatize(stemmer.stem(token)))

    return " ".join(cleaned_tokens)


In [11]:
# Replace each tweet's text with its cleaned form for further processing
df[TEXT] = df[TEXT].map(clean_prepare_text)

### II. Creating Sentiment Feature by using pretrained 'flair' Model ###

In [12]:
# Score every cleaned tweet with the pretrained flair sentiment classifier
sentiments = []                 # 'POSITIVE' / 'NEGATIVE', or None when no label was produced
sentiment_probabilities = []    # classifier confidence in 0-1, or NaN when no label was produced

for text in df[TEXT]:
    sentence = flair.data.Sentence(text)
    sentiment_model.predict(sentence)   # attaches the predicted label(s) to `sentence`

    if sentence.labels:
        top_label = sentence.labels[0]
        sentiments.append(top_label.value)                # 'POSITIVE' or 'NEGATIVE'
        sentiment_probabilities.append(top_label.score)   # numerical value 0-1
    else:
        # Cleaning can leave an empty text (e.g. a tweet made only of
        # stopwords), for which no label is attached. Record a missing value
        # instead of raising IndexError on labels[0].
        sentiments.append(None)
        sentiment_probabilities.append(float('nan'))

df[SENTIMENT] = sentiments
df[SENTIMENT_PROBABILITY] = sentiment_probabilities

In [13]:
# Preview the first five rows, now including the sentiment columns (head() defaults to 5 rows)
df.head()

Unnamed: 0,Author_Name,Followers_Count,User_Mentions,Hash_Tags,Friends_Count,Text,Retweet_Count,Created_At,Stock_Price,Sentiment,Sentiment_Probability
0,Joseph L,35,"Derry Doberman,Joseph L,SwiftOnSecurity,Tesla,...",,384,rt @derrydoberman @joelibuszowski @swiftonsecu...,1,Wed Oct 27 22:28:10 +0000 2021,1037.859985,POSITIVE,0.918251
1,Sarbjeet Johal,25605,"Sarbjeet Johal,Market Rebellion,Tesla,General ...",,17519,rt @sarbjeetjoh @marketrebel today @tesla ~70%...,1,Wed Oct 27 22:28:08 +0000 2021,1037.859985,POSITIVE,0.904611
2,Jorge A. Sanchez,456,CleanTechnica,,1226,rt @cleantechnica tesla’ hord megapack begin e...,9,Wed Oct 27 22:28:08 +0000 2021,1037.859985,NEGATIVE,0.792723
3,8of10doesTheSopranos,3,"𝗧𝗲𝘀𝗹𝗮 𝗙𝗮𝗰𝘁𝘀 🔋,Charlie Schulze,Paul Krugman",,104,@truth_tesla @schulzecharlie2 @paulkrugman yo ...,0,Wed Oct 27 22:28:08 +0000 2021,1037.859985,NEGATIVE,0.770787
4,Arturo Ibarra,286,"Elon Musk,Jay in Shanghai 🇨🇳",,544,rt @elonmusk @jayinshanghai math incorrect sti...,271,Wed Oct 27 22:28:06 +0000 2021,1037.859985,NEGATIVE,0.998797


## 4. Train the Model ##

## 5. Evaluate the Model ## 

## 6. Predict the Stock Price ##

## 7. Apply Abstractive Summarization ##