In [1]:
# Libraries.
import pandas as pd 
import numpy as np      
import datetime

In [2]:
# UPDATE: due to the sheer ammount of time and computing required for this, I won't be doing it anymore.
# Goals:
# 1. sentiment, retweets, etc. regression on stock price.
# 2. Neural network classification of tweet influence on
# [high increase, low increase, no change, low decrease, high decrease] - this corresponds to stock
# price movements.
# 3. Neural network regression of tweet influence on stock price.

In [3]:
# Importing the datasets.
tweets = pd.read_csv('tweets.csv')
stocks = pd.read_csv('snp500.csv')

In [4]:
# Fixing variable formats.
stocks['Gmt time'] = stocks['Gmt time'].astype('datetime64')
stocks = stocks.rename(index=str, columns={'Gmt time': 'created_at'})

tweets['text'] = tweets['text'].astype('str')
tweets['created_at'] = tweets['created_at'].astype('datetime64')
tweets['source'] = tweets['source'].astype('str')

# Setting indices as datetimes and dropping now irrelevant columns.
stocks = stocks.set_index(pd.DatetimeIndex(stocks['created_at'])).iloc[:, 1:] 
tweets = tweets.set_index(pd.DatetimeIndex(tweets['created_at'])).drop(labels='created_at', axis=1)
tweets.index = tweets.index.round(freq='min')

# Subsetting tweets and stocks, which occured during opening hours.
stocks = stocks.between_time('15:29', '21:01')
tweets = tweets.between_time('15:29', '21:01')

In [5]:
# VADER sentiment analysis.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Getting the polarity scores of tweets
scores = []
for tweet in tweets['text']:
    scores.append(sid.polarity_scores(tweet))
    
# Converting to pandas dataframe and joining with the final dataframe.
scores = pd.DataFrame(scores)
scores.index = tweets.index
tweets = tweets.join(scores)

In [6]:
tweets.sort_values(by='compound', ascending=False).head(1)

Unnamed: 0_level_0,source,text,retweet_count,favorite_count,is_retweet,id_str,compound,neg,neu,pos
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-12-09 18:20:00,Twitter for iPhone,It was my great honor to celebrate the opening...,14844,66079,False,939560154269405184,0.9769,0.041,0.568,0.39


In [7]:
'''
# This dataset has ALL the STOCK data, and boolean variable which indicates whether there was a tweet
# at that time.(166517 rows.)
stocks_full = pd.merge(stocks, tweets, on='created_at', how='left')
stocks_full['tweet_present'] = ~stocks_full['text'].isnull()
'''

# This dataset has ONLY the STOCK data when stocks MATCH TWEET TIME. This dataset has stock
# value differences between 5 min, 20 min, 60 min, 120 min, 180 min and  1440 min (24 hrs), 
# 2880 min (48 hrs) intervals. (Only 691 rows!)
tweets_full = stocks
for i in [5, 20, 60, 120, 180, 1440, 2880]:
    tweets_full = tweets_full.join(stocks.diff(periods=i), rsuffix=str(i))

# Dropping irrelevant values and merging with tweet data.
tweets_full = tweets_full.dropna()
tweets_full = tweets.merge(tweets_full, how='inner', on='created_at')

In [8]:
# UPDATE: Note the update above.
# I need to train the data on ALL of trump's tweets to get the correct sentiment.
# 1. This is also unsupervised learning.
# 2. Pre-trained methods can be used to analyse sentiment.

# Identifying Companies, etc. in tweets. Making it a categorical variable (then one-hot encoding it).
# ['Amazon', 'Facebook', etc. etc.]

In [9]:
tweets_full.head(1)

Unnamed: 0_level_0,source,text,retweet_count,favorite_count,is_retweet,id_str,compound,neg,neu,pos,...,Open1440,High1440,Low1440,Close1440,Volume1440,Open2880,High2880,Low2880,Close2880,Volume2880
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-02-04 20:44:00,Twitter for Android,What is our country coming to when a judge can...,33272,148698,False,827981079042805761,-0.6908,0.233,0.689,0.079,...,0.06,0.039,0.062,0.041,-100.0,1.823,1.823,1.823,1.823,0.0


In [10]:
# Splitting into train-test datasets.
X = tweets_full[['retweet_count', 'favorite_count', 'is_retweet', 'compound']].values
y = tweets_full['High5'].values # Or High 60, 1440, etc. (this should be done as time series).

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [11]:
# Fitting a basic Kernel SVR.
from sklearn.svm import SVR
regressor = SVR()
regressor.fit(
    X=X_train,
    y=y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [12]:
# Returns SVR results.
from sklearn.metrics import mean_squared_error

y_pred = regressor.predict(X_test)
def standardise(series):
    return np.array([val - series.std(ddof=0) for val in series])

results = pd.DataFrame({
    'Actual value': y_test,
    'SVR predicted value': y_pred,
    'Standardised difference': standardise(y_test) - standardise(y_pred)
})

print(results.sample(5)) # Sample, because head() is uninformative.
print('\n')
print(results.describe().iloc[:, :2])
print('\n')
print('RMSE: %s' % mean_squared_error(y_test, y_pred))

    Actual value  SVR predicted value  Standardised difference
72           0.0             0.000771                -0.204283
96           0.0             0.000771                -0.204283
54           0.0             0.000771                -0.204283
25           0.0             0.000771                -0.204283
71           0.0             0.000771                -0.204283


       Actual value  SVR predicted value
count    146.000000           146.000000
mean       0.026027             0.003116
std        0.221898             0.017686
min       -0.599000            -0.069423
25%        0.000000             0.000771
50%        0.000000             0.000771
75%        0.010750             0.000771
max        2.051000             0.130129


RMSE: 0.04845077226634818
