In [1]:
import pandas as pd
import numpy as np
from transformers import pipeline
from modules.tweet_data import read_raw_data, clean_sentiment
from modules.spacy import spacy_twitter_model
import datetime as dt
import time
import pickle

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): later cells write through chained indexing
# (e.g. `sentiment.sentiment.iloc[...] = ...`); with the warning disabled, a write that
# lands on a temporary copy would go unnoticed — consider re-enabling it.
pd.options.mode.chained_assignment = None  # default='warn'

In [11]:
# Build the Hugging Face sentiment classifier used to score every tweet.
# The checkpoint is named explicitly instead of relying on the pipeline's implicit
# default for 'sentiment-analysis', so the scores stay reproducible if the library
# ever changes that default.
# https://huggingface.co/transformers/_modules/transformers/pipelines.html#TextClassificationPipeline
# device=0 places the model on the first CUDA GPU; pass device=-1 to run on CPU.
sentiment_model = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    device=0,
)

In [3]:
# Load the raw tweets, then run the tweet text through clean_sentiment.
tweet_df = read_raw_data()
# Optional: restrict to a short date window while experimenting.
#tweet_df = tweet_df['2020-02-01':'2020-02-10']
tweet_df['tweet'] = clean_sentiment(tweet_df['tweet'])

Reading data
Filtering link
Filtering piclink
Filtering hashtag
Filtering email
Filtering add


In [15]:
# Result frame: one row per tweet, sharing tweet_df's index and row order.
sentiment = tweet_df[['tweet_id', 'handle']].copy()
# Placeholder columns filled in by the scoring loop.
# `sentiment` holds the integer label (+1 / -1); `score` holds the model's float
# confidence, so it is seeded with 0.0 rather than 0 — writing floats into an
# integer column forces a dtype change that newer pandas versions warn about.
sentiment['sentiment'] = 0
sentiment['score'] = 0.0

In [16]:
# Score every tweet with `sentiment_model`, in fixed-size batches.
batch_size = 16
n_tweets = len(tweet_df)

# Results are written positionally, so the result frame must line up with tweet_df.
if len(sentiment) != n_tweets:
    raise ValueError(
        "sentiment has {} rows but tweet_df has {}".format(len(sentiment), n_tweets)
    )

# Collect results in preallocated arrays and assign the columns once at the end.
# The previous `sentiment.sentiment.iloc[...] = ...` form is chained assignment:
# pandas may apply the write to a temporary copy and leave `sentiment` unchanged.
# If the loop is interrupted, the partial results remain available in these arrays.
sentiment_labels = np.zeros(n_tweets, dtype=np.int64)
sentiment_scores = np.zeros(n_tweets, dtype=np.float64)

t = time.time()
for i in range(0, n_tweets, batch_size):
    if i % 10000 == 0:
        print("{}/{}, time {:.2f}".format(i, n_tweets, (time.time() - t) / 60), end="\r")
    sentiment_dict = sentiment_model(tweet_df.tweet.iloc[i:i + batch_size].to_list())
    # Map the model label to a sign: POSITIVE -> +1, anything else -> -1.
    sentiment_labels[i:i + batch_size] = [1 if d['label'] == 'POSITIVE' else -1 for d in sentiment_dict]
    sentiment_scores[i:i + batch_size] = [d['score'] for d in sentiment_dict]

sentiment['sentiment'] = sentiment_labels
sentiment['score'] = sentiment_scores

# Final line reports completion (total/total) rather than the last batch start,
# and no longer depends on the loop variable existing when tweet_df is empty.
print("{}/{}, time {:.2f}".format(n_tweets, n_tweets, (time.time() - t) / 60), end="\r")

35109344/35109352, time 471.11

In [17]:
# Persist the scored frame. The context manager closes (and flushes) the file
# handle, which the bare open(...) call never did.
with open("sentiment.p", "wb") as sentiment_file:
    pickle.dump(sentiment, sentiment_file)

Sentiment stats:

Average sentiment
Average sentiment by person
Average sentiment by non-fintwit accounts?
Average sentiment by fintwit
Average sentiment by trader

When fitting to the market, can we find the most predictive Twitter accounts?

In [153]:
# Load the pre-computed sentiment scores for the sample window.
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
with open("sentiment_2020-02-01_2020-02-10.p", "rb") as sentiment_file:
    sentiment = pickle.load(sentiment_file)

# Lookup table mapping each handle to its class / subclass.
classi = pd.read_csv('data/handle_class.csv')

# Attach class and subclass to every tweet. `join(on='handle')` matches the 'handle'
# column against the lookup's index and keeps `sentiment`'s own index, which the
# date binning further down reads via `sentiment.index`.
# The earlier `pd.merge(..., on=['handle'], right_index=True)` combined `on` with
# `right_index`, a combination current pandas rejects with a MergeError.
# how='inner' keeps only tweets whose handle appears in the lookup, as merge did.
sentiment = sentiment.join(
    classi[['class', 'handle', 'subclass']].set_index('handle'),
    on='handle',
    how='inner',
)
sentiment.head()

Unnamed: 0_level_0,tweet_id,handle,sentiment,score,class,subclass
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-02-01 00:00:00,1223395537186983936,RT_com,1,0.999453,news,feed
2020-02-01 00:15:00,1223399311746224135,RT_com,-1,0.997467,news,feed
2020-02-01 00:23:26,1223401436358356992,RT_com,-1,0.984975,news,feed
2020-02-01 00:30:00,1223403087081033729,RT_com,-1,0.501654,news,feed
2020-02-01 00:45:00,1223406861493272578,RT_com,-1,0.934397,news,feed


In [19]:
# ES price bars: keep only the rows whose Time is '15:00', stamp each one at 15:45
# on its date, and index the closes by that timestamp.
es = (
    pd.read_csv('data/ES.csv')
    .loc[lambda bars: bars.Time == '15:00', ['Date', 'Close']]
    .assign(Date=lambda bars: pd.to_datetime(bars.Date + ' ' + '15:45'))
    .set_index('Date')
)

# 'Post' holds the following row's percentage change in Close. np.roll wraps the
# first element around to the last row; that element is the leading value
# produced by pct_change().
es['Post'] = np.roll(es['Close'].pct_change(), -1)

# Timestamps inside the study window; used as bin edges for the daily features.
date_range = es.loc['2020-01-31':'2020-02-10'].index
date_range

In [154]:
def final_score(x):
    """Return the mean of sentiment * score over the rows of ``x``.

    ``x`` must expose ``sentiment`` and ``score`` columns (e.g. one groupby
    group of the sentiment frame).
    """
    signed_confidence = x.sentiment.mul(x.score)
    return signed_confidence.mean()

# One feature row per entry of date_range after the first; the full date_range is
# passed to pd.cut below as the bin edges.
sentiment_features = pd.DataFrame(index = date_range[1:])
# Assign every tweet (by its index timestamp) to a bin between consecutive date_range entries.
date_bins = pd.cut(sentiment.index, bins=date_range)

# Daily average: mean signed score (see final_score) over all tweets in each bin.
# NOTE(review): the groupby results are keyed by the pd.cut bins while
# sentiment_features is indexed by date_range[1:]; the assignments in this cell
# rely on pandas aligning the two — confirm on the pandas version in use.
sentiment_features['daily_average'] = sentiment.groupby(date_bins).apply(final_score)

# Average by user class: one 'daily_average_<class>' column per value of 'class'.
class_averages = sentiment.groupby([date_bins,'class']).apply(final_score).unstack().add_prefix('daily_average_')
sentiment_features[class_averages.columns] = class_averages
# Drop the temporary frame once its columns have been copied over.
del class_averages

# Average for a single subclass: [:, 'trader'] selects the 'trader' entries of the
# second grouping key (subclass) from the grouped result.
sentiment_features['daily_average_trader'] = sentiment.groupby([date_bins,'subclass']).apply(final_score)[:,'trader']

sentiment_features

Unnamed: 0_level_0,daily_average,daily_average_fintwit,daily_average_news,daily_average_politics,daily_average_trader
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-03 15:45:00,-0.221281,-0.225047,-0.285019,-0.133503,-0.211787
2020-02-04 15:45:00,-0.25987,-0.253511,-0.313242,-0.203983,-0.231847
2020-02-05 15:45:00,-0.252943,-0.264692,-0.28522,-0.166692,-0.230644
2020-02-06 15:45:00,-0.262527,-0.270511,-0.30581,-0.179714,-0.244877
2020-02-07 15:45:00,-0.256294,-0.251116,-0.335806,-0.164688,-0.192704
2020-02-10 15:45:00,-0.197401,-0.181056,-0.234058,-0.187706,-0.137451


In [193]:
# Per-handle daily score: one row per date bin, one column per handle.
user_features = (
    sentiment
    .groupby([date_bins, 'handle'])
    .apply(final_score)
    .unstack(level='handle')
)
# Replace the bin labels with the timestamps used by sentiment_features
# (positional: relies on both having the same number of rows in the same order).
user_features.index = sentiment_features.index

In [214]:
# Display the per-handle feature matrix built in the previous cell.
user_features

handle,17catspaw,20committee,2yrflipper,4xForecaster,4xguy,50Pips,5_min_macro,64tetrahedron,77cyko,ABartonMacro,...,vtg2,wagner_farms,waltergmurphy,washingtonpost,wesbury,wikileaks,yesandnotyes,zatapatique,zerobeta,zlj517
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-02-03 15:45:00,0.959037,,-0.192121,-0.742036,,0.128771,-0.939472,-0.197316,0.036971,-0.461473,...,0.305758,0.105436,0.997557,-0.181076,-0.251108,-0.997513,,-0.331154,0.748121,0.509497
2020-02-04 15:45:00,-0.023877,0.439851,-0.19927,0.000916,,0.16541,,-0.439504,0.06825,-0.488484,...,-0.416616,-0.377885,,-0.249937,0.263029,,,,0.767505,-0.008247
2020-02-05 15:45:00,-0.969098,-0.999745,-0.430338,,-0.932774,-0.027754,,-0.256301,-0.016244,-0.506156,...,-0.591793,0.808213,-0.00033,-0.2911,-0.597055,,,,,0.505707
2020-02-06 15:45:00,-0.994794,0.998223,-0.463977,0.748121,,-0.21478,,-0.298961,-0.223173,-0.295644,...,0.943217,-0.83176,,-0.33522,-0.444097,0.718143,0.999558,,,0.650885
2020-02-07 15:45:00,-0.453243,0.992693,-0.025119,-0.995841,,-0.126773,0.938604,-0.315821,0.239096,-0.572562,...,0.017046,0.072457,,-0.415263,0.037315,,,,,0.468835
2020-02-10 15:45:00,-0.572317,-0.487331,-0.911606,-0.998495,,-0.518924,-0.999212,0.033597,0.076276,-0.44784,...,-0.01768,-0.093669,,-0.216706,-0.907143,-0.338289,,,0.000793,0.445845


In [216]:
import statsmodels.api as sm

# Regress the ES 'Post' column on one account's daily sentiment score.
# Design matrix: intercept plus the '20committee' column.
X = sm.add_constant(user_features[['20committee']])
# Target: 'Post' values at the same timestamps as the feature rows.
y = es.loc[user_features.index, 'Post']

# Fit and summarize the OLS model; rows with missing values are dropped.
mod = sm.OLS(y, X, missing='drop')
res = mod.fit()
res.summary()



0,1,2,3
Dep. Variable:,Post,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.333
Method:,Least Squares,F-statistic:,0.00084
Date:,"Mon, 27 Jul 2020",Prob (F-statistic):,0.979
Time:,16:19:33,Log-Likelihood:,18.75
No. Observations:,5,AIC:,-33.5
Df Residuals:,3,BIC:,-34.28
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0037,0.003,1.108,0.349,-0.007,0.014
20committee,-0.0001,0.004,-0.029,0.979,-0.013,0.013

0,1,2,3
Omnibus:,,Durbin-Watson:,2.114
Prob(Omnibus):,,Jarque-Bera (JB):,0.243
Skew:,-0.205,Prob(JB):,0.886
Kurtosis:,2.002,Cond. No.,1.35


In [None]:
def twitter_tokenizer(data,
                      model=None,
                      urls=True,
                      stop_words=False,
                      lowercase=True,
                      alpha_only=True,
                      hashtags=True,
                      lemma=False):
    """
    Full tokenizer with flags for processing steps

    Parameters:
        data: string
            String to be tokenized
        model: Spacy model
            Ideally, an output from the method spacy_twitter_model() from modules.spacy.
            When omitted, the notebook-level variable `nlp` is looked up at call time.
        urls: bool
            If True, remove URLs and Twitter picture links
        stop_words: bool
            If True, removes stop words
        lowercase: bool
            If True, turns all tokens to lowercase
        alpha_only: bool
            If True, removes all non-alpha tokens (hashtag tokens are kept
            when hashtags is False)
        hashtags: bool
            If True, removes hashtags
        lemma: bool
            If True, lemmatizes words

    Returns:
        list of str: the surviving tokens, in document order

    Raises:
        NameError: if no model is passed and no global `nlp` is defined
    """
    if model is None:
        # Resolve the default lazily: a `model=nlp` default is evaluated when the
        # function is defined and fails if `nlp` does not exist yet.
        model = globals().get('nlp')
        if model is None:
            raise NameError(
                "no spaCy model available: pass model=... or define a global `nlp`"
            )

    # token collector
    tokens = []
    for token in model(data):
        # remove URLs and Twitter picture links -- only when requested.
        # (`a or b & urls` parsed as `a or (b & urls)`, ignoring the flag for URLs.)
        if urls and (token.like_url or token._.is_piclink):
            continue
        # remove stopwords
        if stop_words and token.is_stop:
            continue
        if alpha_only:
            # Drop non-alpha tokens; a hashtag token survives only when hashtags
            # are to be kept. (`not x & alpha_only` parsed as `not (x & alpha_only)`,
            # which dropped every token when alpha_only was False.)
            if not token.is_alpha and (hashtags or not token._.is_hashtag):
                continue
        elif hashtags and token._.is_hashtag:
            # Non-alpha tokens are allowed, but hashtags are still removed on request.
            continue
        # lemmatize or take the raw text
        text = token.lemma_ if lemma else token.text
        # turn to lowercase
        if lowercase:
            text = text.lower()
        tokens.append(text)
    return tokens

# Re-score every tweet after running it through twitter_tokenizer, in fixed-size batches.
batch_size = 16
n_tweets = len(tweet_df)

# Results are written positionally, so `sentiment` must still line up row-for-row
# with tweet_df. Fail fast here instead of after the whole (long) run.
if len(sentiment) != n_tweets:
    raise ValueError(
        "sentiment has {} rows but tweet_df has {}; rebuild it from tweet_df first".format(
            len(sentiment), n_tweets)
    )

# Collect results in preallocated arrays and assign the columns once at the end.
# The previous `sentiment.sentiment.iloc[...] = ...` form is chained assignment:
# pandas may apply the write to a temporary copy and leave `sentiment` unchanged.
tokenized_labels = np.zeros(n_tweets, dtype=np.int64)
tokenized_scores = np.zeros(n_tweets, dtype=np.float64)

t = time.time()
for i in range(0, n_tweets, batch_size):
    # i advances in steps of batch_size, so this fires only when i is a multiple of both.
    if i % 1000 == 0:
        print("{}/{}, time {:.2f}".format(i, n_tweets, (time.time() - t) / 60), end="\r")
    # The model takes plain strings, so the kept tokens are re-joined with spaces.
    tokenized = [' '.join(twitter_tokenizer(tweet)) for tweet in tweet_df.tweet.iloc[i:i + batch_size]]
    sentiment_dict = sentiment_model(tokenized)
    # Map the model label to a sign: POSITIVE -> +1, anything else -> -1.
    tokenized_labels[i:i + batch_size] = [1 if d['label'] == 'POSITIVE' else -1 for d in sentiment_dict]
    tokenized_scores[i:i + batch_size] = [d['score'] for d in sentiment_dict]

sentiment['sentiment'] = tokenized_labels
sentiment['score'] = tokenized_scores

# Final line reports completion (total/total) rather than the last batch start,
# and no longer depends on the loop variable existing when tweet_df is empty.
print("{}/{}, time {:.2f}".format(n_tweets, n_tweets, (time.time() - t) / 60), end="\r")