In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd
import numpy as np
import emoji, tldextract, xgboost, textblob, string, ekphrasis, nltk, re, gensim

# NLTK resources required further down in the notebook.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# pos_tag (used by get_word_and_tag) needs the perceptron tagger model; without
# it a fresh environment fails with LookupError. Older NLTK releases look for
# 'averaged_perceptron_tagger', newer ones for the '_eng' variant, so fetch both.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from xgboost.sklearn import XGBClassifier

from sklearn.svm import NuSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from mlxtend.classifier import StackingCVClassifier

from ekphrasis.classes.spellcorrect import SpellCorrector
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

from gensim.models import Word2Vec

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from langdetect import detect
from gensim.models.doc2vec import TaggedDocument
from datetime import datetime, timedelta

# Stand-alone spell corrector built on the "english" corpus.
# NOTE(review): `sp` is not referenced by any cell visible in this notebook;
# confirm whether it is still needed, since constructing it loads corpus statistics.
sp = SpellCorrector(corpus="english") 

Using TensorFlow backend.
  from pandas import Panel


Reading english - 1grams ...


In [21]:
# Load the media sample, keep only the columns used below, drop incomplete
# rows and expose the document body under the name 'text'.
media = (
    pd.read_csv('media_sample.csv', index_col=0)
    .loc[:, ['id', 'content', 'created_datetime']]
    .dropna()
    .rename(columns={'content': 'text'})
)

In [4]:
#Detect language to filter English-only documents
def _detect_language(doc, default='en'):
    """
    doc: Text of a single document
    default: Language code returned when detection fails
    returns: Language code reported by langdetect, or `default` on failure.
    """
    try:
        return detect(doc)
    except Exception:
        # Best-effort: undetectable documents (e.g. no alphabetic content) are
        # kept as English. `Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit still stop a long-running cell.
        return default

# NOTE(review): langdetect is documented as non-deterministic unless
# DetectorFactory.seed is set; consider seeding it for reproducible filtering.
media['lang'] = [_detect_language(doc) for doc in media.text]
# .copy() so that later cells add columns to an independent frame rather than
# to a slice of the unfiltered one (avoids SettingWithCopyWarning).
media = media[media.lang == 'en'].copy()

## Part 1: Text preprocessing


In [5]:
# Token categories that are replaced by placeholder tags (e.g. <url>, <number>).
normalized_terms = ['url', 'email', 'percent', 'money', 'phone', 'time', 'date', 'number']
# Lower-casing tokenizer aimed at social-media text.
social_tokenizer = SocialTokenizer(lowercase=True)

# ekphrasis pipeline: normalisation, hashtag/contraction unpacking,
# correction of elongated words and emoticon replacement, all using the
# "twitter" corpus statistics for segmentation and spell correction.
text_processor = TextPreProcessor(
    dicts=[emoticons],
    tokenizer=social_tokenizer.tokenize,
    spell_correct_elong=True,
    unpack_contractions=True,
    unpack_hashtags=True,
    corrector="twitter",
    segmenter="twitter",
    fix_html=True,
    normalize=normalized_terms,
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [6]:
# English stopword list from NLTK; clean_tweet drops every token found in it.
stopwords = nltk.corpus.stopwords.words('english')
# Bound WordNet lemmatize method; clean_tweet calls it as rooter(word, pos).
rooter = nltk.stem.WordNetLemmatizer().lemmatize
# Characters treated as punctuation by clean_tweet. '<', '>', '#' and '@' are
# not listed, so placeholder tokens such as <url> or <number> stay intact.
punctuation = '!"$%&\'()*+,-./:;=?[\\]^_`{|}~•'

def get_word_and_tag(tokens):
    """
    tokens: List of word strings
    returns: List of (word, pos) tuples where pos is 'n' for tags starting
             with "NN", 'v' for tags starting with "VB" and 'a' for anything else.
    """
    def to_wordnet_pos(tag):
        # Collapse the tagger's fine-grained tag into the single-letter code
        # expected by the lemmatizer.
        if tag.startswith("NN"):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    return [(word, to_wordnet_pos(tag)) for word, tag in pos_tag(tokens)]

def clean_tweet(tweet):
    """
    tweet: Text of a single document
    returns: Lower-cased text with emojis spelled out, punctuation and
             stopwords removed and every remaining token lemmatized,
             joined by single spaces.
    """
    tweet = tweet.lower() # lower case
    tweet = emoji.demojize(tweet) #emojis to text
    # re.escape keeps every listed character literal inside the character
    # class; un-escaped, the `\]` pair is read as an escaped ']' and a literal
    # backslash is never stripped.
    tweet = re.sub('[' + re.escape(punctuation) + ']+', ' ', tweet) # remove punctuation
    # split() with no argument breaks on any whitespace (tabs, newlines, runs
    # of spaces) and never yields empty tokens, matching the tokenization used
    # by get_vocab and get_sentiment_from_text; split(' ') would leave e.g.
    # "the\nmarket" as one token and let the stopword through.
    tokens = [word for word in tweet.split() if word not in stopwords] # remove stopwords

    tokens = [rooter(word, tag) for word, tag in get_word_and_tag(tokens)] # apply word rooter with POS tagging
    return ' '.join(tokens)

In [10]:
# Two-stage normalisation: the ekphrasis pipeline tokenizes/normalizes each
# raw document, then clean_tweet strips punctuation and stopwords and lemmatizes.
media['corrected_text'] = (
    media.text
    .apply(lambda doc: " ".join(text_processor.pre_process_doc(doc)))
    .apply(clean_tweet)
)

In [11]:
#Create time-series DataFrame
media['datetime'] = pd.to_datetime(media['created_datetime'])
# The documents are not stored in chronological order, and label-based range
# slicing (media_ts[start:end]) on an unsorted DatetimeIndex is rejected by
# current pandas when the bounds are not exact index values. Sorting selects
# the same rows and keeps the slicing valid.
media_ts = media.set_index('datetime', drop = False).sort_index()

## Part 2: Index data preprocessing

In [13]:
# Daily history of the S&P 500 Information Technology index.
index = pd.read_csv('S&P 500 Information Technology Historical Data.csv')
# Shift each date to 09:30 -- the time that the stock market opens.
index['datetime'] = pd.to_datetime(index['Date']) + timedelta(hours=9.5)
index = (
    index
    .set_index('datetime', drop = False)
    .drop(columns=['Date', 'Price', 'Open', 'High', 'Low', 'Vol.'])
    # Sort first, then slice in chronological order: the result no longer
    # depends on the CSV being stored newest-first.
    .sort_index(ascending=True)
    .loc['2020-01-27':'2020-04-24']
    .copy()
)
# 'Change %' holds strings such as '-2.36%'; strip the sign and parse as float.
index['change'] = index['Change %'].str.rstrip('%').astype(float)
index

Unnamed: 0_level_0,Change %,datetime,change
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-27 09:30:00,-2.36%,2020-01-27 09:30:00,-2.36
2020-01-28 09:30:00,1.87%,2020-01-28 09:30:00,1.87
2020-01-29 09:30:00,0.21%,2020-01-29 09:30:00,0.21
2020-01-30 09:30:00,0.88%,2020-01-30 09:30:00,0.88
2020-01-31 09:30:00,-2.72%,2020-01-31 09:30:00,-2.72
...,...,...,...
2020-04-20 09:30:00,-1.78%,2020-04-20 09:30:00,-1.78
2020-04-21 09:30:00,-4.10%,2020-04-21 09:30:00,-4.10
2020-04-22 09:30:00,3.87%,2020-04-22 09:30:00,3.87
2020-04-23 09:30:00,-0.69%,2020-04-23 09:30:00,-0.69


## Part 3: Calculation of sentiment

* Build vocabulary
* Run algorithm
* Accumulate Word sentiment
* Calculate Article sentiment

In [12]:
def get_range(date, n):
    """
    date: Date of stock price
    n: Number of days to look back from `date`
    returns: Tuple (start, end) of strings describing the one-day window
             that ends `n` days before `date`.
    """
    window_start = date - timedelta(days = n+1)
    window_end = date - timedelta(days = n)
    return str(window_start), str(window_end)

In [14]:
def get_vocab(l):
    """
    l: Iterable of whitespace-separated documents (strings)
    returns: List of the distinct tokens found in the corpus, in no particular order.
    """
    vocabulary = set()
    for document in l:
        vocabulary.update(document.split())
    return list(vocabulary)

In [15]:
word_list = get_vocab(media.corrected_text)

# Word dictionaries, one entry per vocabulary word:
#   d    -> list of the stock market changes associated with the word
#   av_d -> average of those changes (0 until it is computed)
d = {token: [] for token in word_list}
av_d = dict.fromkeys(word_list, 0)

In [17]:
n = 3
# Lag in days between a document and the market open it is matched with.
# For every trading day, the documents published after the previous cut-off
# and up to `n` days before that day's open are collected, and the day's
# change is recorded for each distinct word they contain.
# NOTE(review): the original description read "documents published within
# n days before the opening"; the window actually used ends n days before the
# open (the first element returned by get_range is not used) -- confirm intent.

prev = str(min(media.datetime)) #Date of first document

for _, row in index.iterrows():
    change = row['change']
    _, window_end = get_range(row['datetime'], n)
    sl = media_ts[prev:window_end] #Subset documents based on date range

    for word in get_vocab(sl.corrected_text): #Vocabulary of the subset
        d[word].append(change)

    prev = window_end #Next window starts where this one ended

#Calculate average fluctuations n days after publishing an article which contains the word
for word, changes in d.items():
    # Words that never fell inside any window (e.g. only seen after the last
    # cut-off) have no observations; np.mean([]) would give NaN and turn every
    # document score containing them into NaN. Keep them neutral instead,
    # matching the initial value of av_d.
    av_d[word] = np.mean(changes) if changes else 0

In [18]:
def get_sentiment_from_text(t, lexicon=None):
    """
    t: Text of document
    lexicon: Optional mapping word -> sentiment score; defaults to the
             notebook-level dictionary av_d
    returns: Sum of the sentiment scores of the whitespace-separated tokens
             of the document (a total, not an average, so longer documents
             can reach larger magnitudes). Tokens missing from the lexicon
             count as neutral (0) instead of raising KeyError.
    """
    if lexicon is None:
        lexicon = av_d

    sentiment = 0
    for token in t.split():
        sentiment += lexicon.get(token, 0)

    return sentiment

In [19]:
# Score every cleaned document with the word-level lexicon.
media['sentiment_index'] = media.corrected_text.apply(get_sentiment_from_text)

In [20]:
media

Unnamed: 0,id,text,created_datetime,lang,corrected_text,datetime,sentiment_index
1,46534906195,"FarmerHarv wrote: ↑ Jan 26th, 2020 11:04 am Wh...",2020-01-30 21:59:00,en,farmerharv write ↑ <date> <time> market impact...,2020-01-30 21:59:00,5.394955
2,46534908104,10 am Eastern start today in case folks want t...,2020-02-21 15:26:29,en,<number> eastern start today case folk want op...,2020-02-21 15:26:29,-6.780057
3,46534908442,"Beijing, Jan 31 The World Health Organization ...",2020-01-31 02:40:55,en,beijing <date> world health organization decla...,2020-01-31 02:40:55,-15.527924
4,46534909270,Quote: Originally Posted by Mikala43 Has anyon...,2020-01-31 14:48:00,en,quote originally post mikala43 anyone see resp...,2020-01-31 14:48:00,4.063406
5,46534911681,I wanted to touch base on how the lack of expe...,2020-02-22 17:20:38,en,want touch base lack expertise professionalism...,2020-02-22 17:20:38,-50.752430
...,...,...,...,...,...,...,...
9993,48223491438,"In this QUAH Sal, Adam, &amp; Justin answer th...",2020-04-16 11:55:32,en,quah sal adam justin answer question “ somethi...,2020-04-16 11:55:32,-6.290081
9994,48223499707,"Ex-footballer Mark Lawrenson, 62, claimed the ...",2020-04-19 09:48:06,en,ex footballer mark lawrenson <number> claim br...,2020-04-19 09:48:06,-9.857453
9995,48223759008,"Update from Slovenia 1335 cases (+5), 77 death...",2020-04-20 20:03:43,en,update slovenia <number> case <number> <number...,2020-04-20 20:03:43,6.167458
9996,48223762895,The Tories have backed themselves into a corne...,2020-04-20 15:09:30,en,tory back corner inaction negligence manage ro...,2020-04-20 15:09:30,8.602135
