## Config

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from punisher.common import *
from punisher.utils.dates import utc_to_epoch, epoch_to_utc
import re
from textblob import TextBlob
REDDIT_DIR = Path(cfg.DATA_DIR, 'reddit')

In [None]:
def plot_price(df, field, asset, ex_id):
    """Plot one OHLCV column of ``df`` over its full range.

    The column is looked up via ``ohlcv_feed.get_col_name`` from the price
    ``field``, the asset's symbol and the exchange id.
    """
    punisher.utils.charts.plot_range(
        df,
        start=None,
        end=None,
        column_name=ohlcv_feed.get_col_name(field, asset.symbol, ex_id),
    )

def color_y_axis(ax, color):
    """Apply ``color`` to every y-axis tick label of ``ax``; returns None."""
    for tick_label in ax.get_yticklabels():
        tick_label.set_color(color)

def plot_price_and_sentiment(df, sentiment_field, price_field):
    """Draw price and sentiment against ``df['utc']`` on twin y-axes.

    The price series is drawn in red on the left axis and the sentiment
    series in blue on the right axis; both share the time axis.
    """
    fig, price_ax = plt.subplots(figsize=(24,18))
    sentiment_ax = price_ax.twinx()
    times = df['utc']

    # Left axis: price
    price_ax.plot(times, df[price_field], color='r')
    price_ax.set_xlabel('time (utc)')
    price_ax.set_ylabel(price_field)
    color_y_axis(price_ax, 'r')

    # Right axis: sentiment
    sentiment_ax.plot(times, df[sentiment_field], color='blue')
    sentiment_ax.set_ylabel(sentiment_field)
    color_y_axis(sentiment_ax, 'b')

    plt.xticks(rotation=45)
    plt.title(price_field + " vs " + sentiment_field)
    price_ax.grid(True)
    plt.show()
    
def get_tweet_counts(df, freq='D'):
    """Count rows of ``df`` per ``freq`` bucket of its 'date' column.

    Returns a frame indexed by 'epoch' (``utc_to_epoch`` of each bucket
    label) with columns 'tweets' (row count) and 'utc' (bucket label).
    """
    counts = (
        df.groupby(pd.Grouper(key='date', freq=freq))
        .size()
        .to_frame('tweets')
    )
    counts['utc'] = counts.index
    counts['epoch'] = [utc_to_epoch(stamp) for stamp in counts['utc']]
    return counts.set_index('epoch')

def get_tweet_sentiment(df, freq='D'):
    """Average the 'sentiment' column per time bucket of the 'date' column.

    Args:
        df: DataFrame with a 'date' column usable as a ``pd.Grouper`` key
            and a numeric 'sentiment' column.
        freq: pandas offset alias selecting the bucket size (e.g. 'D', 'H').

    Returns:
        DataFrame indexed by 'epoch' (``utc_to_epoch`` of each bucket label)
        with columns 'sentiment' (bucket mean) and 'utc' (bucket label).
    """
    df = df[['date', 'sentiment']]
    # Use the caller's `freq`; the bucket size used to be hard-coded to 'H',
    # which silently ignored the argument.
    df = df.groupby(pd.Grouper(key='date', freq=freq)).mean()
    df['epoch'] = [utc_to_epoch(d) for d in df.index]
    df['utc'] = df.index
    # The mean() result already carries the 'sentiment' column name, so no
    # rename is needed.
    return df.set_index('epoch')

def plot_tweets(df, freq='D'):
    """Plot the number of tweets per ``freq`` bucket over time.

    ``freq`` is a pandas offset alias, see
    http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    """
    counts = get_tweet_counts(df, freq=freq)
    plt.figure(figsize=(24,18))
    plt.plot(counts['utc'], counts['tweets'])
    plt.title("Tweet Count")
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.show()

## Price

In [None]:
# https://coinmarketcap.com/api/
exchange_ids = [ex_cfg.GDAX]#, ex_cfg.BINANCE]#, ex_cfg.POLONIEX]
symbols = ['BTC/USD']#,'BTC/USDT']
assets = [Asset.from_symbol(sym) for sym in symbols]
start = datetime.datetime(year=2016, month=1, day=1)
timeframe = Timeframe.ONE_DAY
downloaded_df = ohlcv_feed.load_multiple_assets(
    exchange_ids, assets, timeframe, start, end=None)

In [None]:
columns = ['open_BTC/USD_gdax', 'close_BTC/USD_gdax', 'utc']#,'close_BTC/USDT_binance']
price_df = downloaded_df.copy()[columns]
#df = df.rename(mapper={columns[i]:exchange_ids[i] for i in range(len(columns)-1)}, axis='columns')
price_df.sort_values(by='utc').head()

In [None]:
# Restrict the price frame to [start, end) and plot every column against time.
start = datetime.datetime(year=2016, month=1, day=1, hour=0)
end = datetime.datetime(year=2018, month=2, day=15, hour=0)
#times = [datetime(year=2018, month=2, day=1, hour=10, minute=i) for i in range(60)] + [datetime(year=2018, month=2, day=1, hour=11, minute=i) for i in range(60)]
plot_df = price_df.copy()
# Build both halves of the mask from the frame being filtered; the upper bound
# used to be computed on `price_df`, which only worked through index alignment.
plot_df = plot_df[(plot_df['utc'] >= start) & (plot_df['utc'] < end)]
plot_df.plot(x='utc', figsize=(24,18), grid=True)

## Reddit Client

In [None]:
subreddit_name = 'bitcoin'
start = datetime.datetime(year=2016, month=1, day=1, hour=0)
end = datetime.datetime(year=2018, month=2, day=15, hour=0)
top_n_comments = 10
subreddit = reddit_client.get_subreddit(subreddit_name)

In [None]:
# Fetch from Reddit
#subs = reddit_client.get_submissions(subreddit_name, start, end, top_n_comments)
#subs[0]

In [None]:
# Save locally
# reddit_client.save_submissions(subs, subreddit_name, start)

In [None]:
# Load from file
# subs = reddit_client.load_submissions(subreddit_name, start)

In [None]:
import heapq
n_top_subs = 25
def load_top_n_subs(subreddit, start, n_top):
    """Load the stored submissions for ``start`` and keep the ``n_top`` best.

    Submissions are ranked by their 'score' entry, highest first.
    """
    def by_score(submission):
        return submission['score']

    submissions = reddit_client.load_submissions(subreddit, start)
    return heapq.nlargest(n_top, submissions, key=by_score)

def get_sub_titles_df(subreddit, start, end, n_top_subs):
    """Build one row per day with the titles of that day's top submissions.

    Args:
        subreddit: subreddit name passed through to ``load_top_n_subs``.
        start: first day (inclusive), a ``datetime.datetime``.
        end: last day (exclusive), a ``datetime.datetime``.
        n_top_subs: number of title columns ('T1' .. 'T<n_top_subs>').

    Returns:
        DataFrame indexed by 'epoch' with a 'utc' column and ``n_top_subs``
        title columns. Days with fewer submissions have ``None`` in the
        unused title columns.
    """
    title_cols = ['T{:d}'.format(i) for i in range(1, n_top_subs + 1)]
    columns = ['epoch', 'utc'] + title_cols
    rows = []
    one_day = datetime.timedelta(days=1)
    cur_start = start
    while cur_start < end:
        subs = load_top_n_subs(subreddit, cur_start, n_top_subs)
        titles = [sub['title'] for sub in subs]
        # Pad short days so every row has exactly n_top_subs titles; otherwise
        # the row widths are ragged and may not match `columns`.
        titles += [None] * (n_top_subs - len(titles))
        rows.append([utc_to_epoch(cur_start), cur_start] + titles)
        cur_start += one_day
    df = pd.DataFrame(rows, columns=columns)
    return df.set_index('epoch')

In [None]:
# top_subs = load_top_n_subs(subreddit_name, start, n_top_subs)

## Data Preprocessing

In [None]:
# Load reddit headlines
n_top_subs = 50
subs_df = get_sub_titles_df(
    subreddit_name, start, end, n_top_subs
)
subs_df = subs_df.fillna(value="")
#subs_df.isnull().sum()

In [None]:
# Join price and reddit dataframes
subs_columns = ['T{:d}'.format(i) for i in range(1, n_top_subs+1)]
df = pd.concat([price_df, subs_df[subs_columns]], axis=1)
df['label'] = df['close_BTC/USD_gdax'] > df['open_BTC/USD_gdax']
df.sort_index(inplace=True)
df.head(2)

In [None]:
## Cleanup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

def clean_text(text):
    """Normalise a title for bag-of-words / n-gram features.

    Steps, in order: digit runs become the token "number"; '#' characters
    and newlines are dropped; cashtags ($xyz), e-mail addresses, @mentions
    and http(s) URLs are removed; finally everything except ASCII letters
    and spaces is stripped.

    Args:
        text: the raw string.

    Returns:
        The cleaned string (case is left unchanged).
    """
    text = re.sub(r"[0-9]+", "number", text)
    text = re.sub(r"#", "", text)
    text = re.sub(r"\n", "", text)
    # '$' must be escaped: unescaped it is the end-of-string anchor, so the
    # cashtag pattern could never match.
    text = re.sub(r"\$[^\s]+", "", text)
    # Remove e-mail addresses before @mentions; otherwise the mention pattern
    # eats the "@domain" part first and the address is never recognised.
    text = re.sub(r"[^\s]+@[^\s]+", "", text)
    text = re.sub(r"@[^\s]+", "", text)
    text = re.sub(r"(http|https)://[^\s]*", "", text)
    text = re.sub(r"[^a-z A-Z]+", "", text)
    return text

def cleanup_titles(df, columns):
    """Lower-case and ``clean_text`` each listed column of ``df`` in place.

    Returns the same (mutated) ``df`` for convenience.
    """
    for title_col in columns:
        df[title_col] = df[title_col].str.lower().apply(clean_text)
    return df

In [None]:
df = cleanup_titles(df, subs_columns)
df.head(1)

In [None]:
# Create frequency table for title
example = df.iloc[0]['T1']
example = CountVectorizer().build_tokenizer()(example)
pd.DataFrame([[x, example.count(x)] for x in set(example)], columns = ['Word', 'Count']).head()

In [None]:
# Train/Test Split
split_date = datetime.datetime(year=2017, month=11, day=1)
train = df[df['utc'] < split_date]
test = df[df['utc'] >= split_date]

In [None]:
subs_columns = ['T{:d}'.format(i) for i in range(1, n_top_subs+1)]
def concat_titles(df):
    """Return one space-joined string of all title columns per row of ``df``.

    The title columns are taken from the notebook-level ``subs_columns``.
    """
    return [
        ' '.join(str(title) for title in df.iloc[pos][subs_columns])
        for pos in range(len(df.index))
    ]

def create_ngram_matrix(df, vectorizer, fit=True):
    """Vectorize the concatenated titles of ``df``.

    With ``fit`` truthy the vectorizer is fitted on the titles first
    (``fit_transform``); otherwise the already-fitted vocabulary is reused
    (``transform``). Returns ``(matrix, vectorizer)``.

    http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.fit_transform
    """
    documents = concat_titles(df)
    vectorize = vectorizer.fit_transform if fit else vectorizer.transform
    return vectorize(documents), vectorizer

def get_coefficients(vectorizer, model):
    """Pair each vocabulary term with its model coefficient.

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
            or the older ``get_feature_names()``.
        model: fitted model exposing a 2-D ``coef_``; the first row is used.

    Returns:
        DataFrame with columns 'Word' and 'Coefficient', sorted by
        coefficient descending, ties broken by word ascending.
    """
    # Newer scikit-learn only provides get_feature_names_out(); fall back to
    # the legacy accessor for older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = list(vectorizer.get_feature_names_out())
    else:
        words = vectorizer.get_feature_names()
    coeffs = model.coef_.tolist()[0]
    coeffdf = pd.DataFrame({'Word': words,
                            'Coefficient': coeffs})
    return coeffdf.sort_values(['Coefficient', 'Word'], ascending=[False, True])

In [None]:
vectorizer = CountVectorizer()
train_input, vectorizer = create_ngram_matrix(train, vectorizer)
train_target = train['label'].values
test_input, _ = create_ngram_matrix(test, vectorizer, fit=False)
test_target = test['label'].values
train_input.shape, test_input.shape

## Bag of Words

* Place all titles into a "bag" and count the frequency of each word
* http://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html#example-logistic-regression-bag-of-words-classifier

In [None]:
model = LogisticRegression()
model = model.fit(train_input, train_target)
preds = model.predict(test_input)

In [None]:
print("Accuracy", np.sum(preds == test_target)/len(preds))
pd.crosstab(test_target, preds, rownames=["Actual"], colnames=["Predicted"])

In [None]:
coeffdf = get_coefficients(vectorizer, model)
coeffdf.head(10)

In [None]:
# Negative coefficients mean a word is associated with a lower price
coeffdf.tail(10)

## N-Grams

* Similar to bag-of-words (frequencies), but we count the frequency of word sequences
* Bag of words = 1-gram (unigram) model

In [None]:
vectorizer = CountVectorizer(ngram_range=(2,3))
train_input, vectorizer = create_ngram_matrix(train, vectorizer)
train_target = train['label'].values
test_input, _ = create_ngram_matrix(test, vectorizer, fit=False)
test_target = test['label'].values
train_input.shape, test_input.shape

In [None]:
model = LogisticRegression()
model = model.fit(train_input, train_target)
preds = model.predict(test_input)

In [None]:
print("Accuracy", np.sum(preds == test_target)/len(preds))
pd.crosstab(test_target, preds, rownames=["Actual"], colnames=["Predicted"])

In [None]:
coeffdf = get_coefficients(vectorizer, model)
coeffdf.head(10)

In [None]:
coeffdf.tail(10)

## Enhancements?

* Measure the distance and matching from some previously defined n-grams related to stock market
* topic modeling LDA, Naive Bayes (https://www.kaggle.com/rahulvks/topic-modeling-using-gensim)
* Remove common/uncommon words

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer(
    min_df=0.03, max_df=0.97, 
    max_features = 200000, ngram_range = (3, 3))
    
train_input, vectorizer = create_ngram_matrix(train, vectorizer)
train_target = train['label'].values
test_input, _ = create_ngram_matrix(test, vectorizer, fit=False)
test_target = test['label'].values
train_input.shape, test_input.shape

In [None]:
# Logistic Regression
model = LogisticRegression()
model = model.fit(train_input, train_target)
preds = model.predict(test_input)
print("Accuracy", np.sum(preds == test_target)/len(preds))
coeffdf = get_coefficients(vectorizer, model)
print(coeffdf.head(10))
print(coeffdf.tail(10))
pd.crosstab(test_target, preds, rownames=["Actual"], colnames=["Predicted"])

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model = model.fit(train_input, train_target)
preds = model.predict(test_input)
print("Accuracy", np.sum(preds == test_target)/len(preds))
coeffdf = get_coefficients(vectorizer, model).sort_values(['Coefficient', 'Word'], ascending=[0,1])
print(coeffdf.head(10))
print(coeffdf.tail(10))
pd.crosstab(test_target, preds, rownames=["Actual"], colnames=["Predicted"])

## Random Forest

In [None]:
#from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.ensemble import RandomForestClassifier#, GradientBoostingClassifier

vectorizer = TfidfVectorizer(
    min_df=0.03, max_df=0.2, 
    max_features = 200000, ngram_range = (3, 3))
    
train_input, vectorizer = create_ngram_matrix(train, vectorizer)
train_target = train['label'].values
test_input, _ = create_ngram_matrix(test, vectorizer, fit=False)
test_target = test['label'].values
train_input.shape, test_input.shape

In [None]:
model = RandomForestClassifier()
model = model.fit(train_input, train_target)
preds = model.predict(test_input)
print("Accuracy", np.sum(preds == test_target)/len(preds))
pd.crosstab(test_target, preds, rownames=["Actual"], colnames=["Predicted"])

## Gradient Boosting Machine

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

vectorizer = TfidfVectorizer(
    min_df=0.02, max_df=0.175, 
    max_features = 200000, ngram_range = (2, 2))
    
train_input, vectorizer = create_ngram_matrix(train, vectorizer)
train_target = train['label'].values
test_input, _ = create_ngram_matrix(test, vectorizer, fit=False)
test_target = test['label'].values
train_input.shape, test_input.shape

In [None]:
model = GradientBoostingClassifier()
model = model.fit(train_input, train_target)
preds = model.predict(test_input)
print("Accuracy", np.sum(preds == test_target)/len(preds))
pd.crosstab(test_target, preds, rownames=["Actual"], colnames=["Predicted"])

## SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

vectorizer = TfidfVectorizer(
    min_df=0.03, max_df=0.2, 
    max_features = 200000, ngram_range = (3, 3))
    
train_input, vectorizer = create_ngram_matrix(train, vectorizer)
train_target = train['label'].values
test_input, _ = create_ngram_matrix(test, vectorizer, fit=False)
test_target = test['label'].values
train_input.shape, test_input.shape

In [None]:
model = SGDClassifier(loss='modified_huber', max_iter=3, random_state=0, shuffle=True)
model = model.fit(train_input, train_target)
preds = model.predict(test_input)
print("Accuracy", np.sum(preds == test_target)/len(preds))
pd.crosstab(test_target, preds, rownames=["Actual"], colnames=["Predicted"])

## SVM

In [None]:
from sklearn import svm

from sklearn.linear_model import SGDClassifier

vectorizer = TfidfVectorizer(
    min_df=0.03, max_df=0.3, 
    max_features = 200000, ngram_range = (3, 3))
    
train_input, vectorizer = create_ngram_matrix(train, vectorizer)
train_target = train['label'].values
test_input, _ = create_ngram_matrix(test, vectorizer, fit=False)
test_target = test['label'].values
train_input.shape, test_input.shape

In [None]:
model = svm.SVC()
model.fit(train_input, train_target)
preds = model.predict(test_input)
print("Accuracy", np.sum(preds == test_target)/len(preds))
pd.crosstab(test_target, preds, rownames=["Actual"], colnames=["Predicted"])

## MLP

* http://pytorch.org/docs/0.3.0/sparse.html

In [449]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.nn.functional as F

In [450]:
vectorizer = TfidfVectorizer(
    min_df=0.04, max_df=0.3, 
    max_features = 200000, ngram_range = (2, 2))
    
train_input, vectorizer = create_ngram_matrix(train, vectorizer)
train_target = train['label'].values
test_input, _ = create_ngram_matrix(test, vectorizer, fit=False)
test_target = test['label'].values

In [451]:
train_input = torch.from_numpy(train_input.toarray().astype('float32'))
test_input = torch.from_numpy(test_input.toarray().astype('float32'))
train_target = torch.from_numpy(np.expand_dims(train_target.astype('float32'),axis=1))
test_target = torch.from_numpy(np.expand_dims(test_target.astype('float32'),axis=1))
train_input.size(), test_input.size(), train_target.size(), test_target.size()

(torch.Size([670, 839]),
 torch.Size([106, 839]),
 torch.Size([670, 1]),
 torch.Size([106, 1]))

In [452]:
batch_size = 32
trn_dataset = TensorDataset(train_input, train_target)
tst_dataset = TensorDataset(test_input, test_target)
trn_loader = DataLoader(trn_dataset, batch_size, shuffle=True)
tst_loader = DataLoader(tst_dataset, batch_size, shuffle=False)

In [497]:
from sklearn import metrics as scipy_metrics

def get_predictions(probs, thresholds):
    """Binarise scores: 1 where ``probs >= thresholds``, else 0.

    Args:
        probs: array-like of scores.
        thresholds: scalar or array broadcastable against ``probs``.

    Returns:
        ``uint8`` ndarray of the same shape as ``probs``; the input is not
        modified.
    """
    # A single comparison against the original values. The previous two-step
    # in-place masking re-tested entries it had already set to 1, so any
    # threshold above 1 turned every prediction into 0.
    return (np.asarray(probs) >= thresholds).astype('uint8')

def get_accuracy(preds, targets):
    """Fraction of positions where flattened ``preds`` equals ``targets``."""
    matches = preds.ravel() == targets.ravel()
    return matches.sum() / targets.size

def get_recall(preds, targets):
    """Recall of flattened ``preds`` against flattened ``targets``."""
    y_true = targets.flatten()
    y_pred = preds.flatten()
    return scipy_metrics.recall_score(y_true, y_pred)


def get_precision(preds, targets):
    """Precision of flattened ``preds`` against flattened ``targets``."""
    y_true = targets.flatten()
    y_pred = preds.flatten()
    return scipy_metrics.precision_score(y_true, y_pred)

def run(model, optim, criterion, trn_loader, val_loader, n_epochs, n_classes):
    """Train for ``n_epochs`` epochs, printing train/validation loss per epoch.

    Each epoch calls ``train_model`` on ``trn_loader`` and then ``predict``
    on ``val_loader``; only the losses are reported.
    """
    for epoch in range(n_epochs):
        trn_loss = train_model(model, trn_loader, optim, criterion, epoch, n_epochs)
        val_loss, _, _ = predict(model, val_loader, criterion, n_classes)
        # Both losses use four decimals ('{:4f}' was a width, not a precision).
        print("Epoch {:d} Loss - Trn: {:.4f} | Val: {:.4f}".format(
            epoch, trn_loss, val_loss))
        
def train_model(model, dataloader, optimizer, criterion, epoch, n_epochs):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: the ``nn.Module`` to train (switched to train mode).
        dataloader: sized iterable of ``(inputs, targets)`` batches.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: callable ``criterion(output, targets)`` returning the loss.
        epoch, n_epochs: accepted for interface compatibility; not used here.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()

    loss_data = 0
    n_batches = len(dataloader)
    for inputs, targets in dataloader:
        inputs = Variable(inputs)
        targets = Variable(targets)

        ## Forward Pass
        output = model(inputs)

        ## Clear Gradients (before backward, so stale grads never accumulate)
        model.zero_grad()

        # Metrics
        loss = criterion(output, targets)
        # `.item()` is the supported scalar accessor; `loss.data[0]` fails on
        # the 0-dim loss tensors of current PyTorch and is kept only as a
        # fallback for releases that lack `.item()`.
        loss_data += loss.item() if hasattr(loss, 'item') else loss.data[0]

        ## Backprop
        loss.backward()
        optimizer.step()
    return loss_data / n_batches

def predict(model, loader, criterion, n_classes):
    """Evaluate ``model`` over ``loader`` and collect outputs and targets.

    Args:
        model: the ``nn.Module`` to evaluate (switched to eval mode).
        loader: sized iterable of ``(inputs, targets)`` batches.
        criterion: callable ``criterion(output, targets)`` returning the loss.
        n_classes: width of the model output / target rows.

    Returns:
        ``(loss, probs, labels)``: the mean per-batch loss, and two ndarrays
        with ``n_classes`` columns holding the stacked model outputs and the
        stacked targets.
    """
    model.eval()

    loss = 0
    # Seed each stack with an empty float64 block so the result keeps its
    # (0, n_classes) shape and dtype even when the loader yields nothing.
    prob_parts = [np.empty((0, n_classes))]
    label_parts = [np.empty((0, n_classes))]

    for inputs, targets in loader:
        inputs = Variable(inputs)
        targets = Variable(targets)

        output = model(inputs)
        batch_loss = criterion(output, targets)
        # `.item()` is the supported scalar accessor; `.data[0]` fails on the
        # 0-dim loss tensors of current PyTorch and is kept as a fallback.
        loss += batch_loss.item() if hasattr(batch_loss, 'item') else batch_loss.data[0]
        prob_parts.append(output.data.numpy())
        label_parts.append(targets.data.numpy())

    # Stack once at the end instead of re-copying the arrays every batch.
    probs = np.vstack(prob_parts)
    labels = np.vstack(label_parts)

    loss /= len(loader)
    return loss, probs, labels

In [498]:
class MLP(nn.Module):
    """Two-hidden-layer perceptron with a sigmoid output.

    Layout: n_features -> 256 -> 128 -> n_classes, with ReLU and dropout
    (p=0.5) after each hidden layer and a sigmoid on the output.
    """

    def __init__(self, n_features, n_classes):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_features, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, n_classes),
            nn.Sigmoid()
        )

    def forward(self, inp):
        """Return the sigmoid outputs for a batch ``inp``."""
        # Call the module itself rather than its .forward() so that hooks
        # registered on `self.layers` are executed.
        return self.layers(inp)

In [499]:
n_classes = 1
n_features = train_input.shape[1]
model = MLP(n_features, n_classes)
optim = torch.optim.Adam(model.parameters())
criterion = nn.BCELoss()
out = model(Variable(train_input))
model

RuntimeError: Variable data has to be a tensor, but got numpy.ndarray

In [500]:
# inputs,targets = next(iter(trn_loader))
# inputs.size(),targets.size()

In [461]:
n_epochs = 5
run(model, optim, criterion, trn_loader, tst_loader, n_epochs, n_classes)

Epoch 0 Loss - Trn: 0.6927 | Val: 0.691096
Epoch 1 Loss - Trn: 0.6777 | Val: 0.695715
Epoch 2 Loss - Trn: 0.6636 | Val: 0.696614
Epoch 3 Loss - Trn: 0.6336 | Val: 0.685428
Epoch 4 Loss - Trn: 0.5777 | Val: 0.671541


In [464]:
# Evaluate the trained MLP on the test loader and report metrics at a 0.5 cut.
# predict() takes no `thresh` argument; thresholding happens in get_predictions().
loss, probs, labels = predict(model, tst_loader, criterion, n_classes)
preds = get_predictions(probs, 0.5)
assert test_target.shape == probs.shape
print("Accuracy", get_accuracy(preds, labels))
print("Precision", get_precision(preds, labels))
print("Recall", get_recall(preds, labels))
pd.crosstab(labels.flatten(), preds.flatten(), rownames=["Actual"], colnames=["Predicted"])

Accuracy 0.556603773585
Precision 0.558139534884
Recall 0.842105263158


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,11,38
1.0,9,48


## Word Vectors/Embeddings

* http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
* Word embeddings are a representation of the *semantics* of a word, efficiently encoding semantic information that might be relevant to the task at hand.

In [None]:
word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings (i.e. 5 neurons, randomly initialized)
lookup_tensor = torch.LongTensor([word_to_ix["hello"]])
hello_embed = embeds(Variable(lookup_tensor))
print(hello_embed)

In [None]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

# we should tokenize the input, but we will ignore that for now
# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]

# print the first 3, just so you can see what they look like
print(trigrams[:3])

In [None]:
vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}

In [None]:
class NGramLanguageModeler(nn.Module):
    """Predict the next word from the embeddings of the preceding context.

    The context word embeddings are flattened into a single row, passed
    through a 128-unit ReLU layer and projected onto the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super().__init__()
        flat_context_dim = context_size * embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(flat_context_dim, 128)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary, shape (1, vocab_size)."""
        flat_context = self.embeddings(inputs).view((1, -1))
        hidden = self.relu(self.linear1(flat_context))
        return F.log_softmax(self.linear2(hidden), dim=1)

In [None]:
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [None]:
# Train the n-gram model for 10 epochs, one (context, target) pair per step,
# recording the summed loss of every epoch in `losses`.
for epoch in range(10):
    # One-element tensor used as the running loss total for this epoch
    total_loss = torch.Tensor([0])
    for context, target in trigrams:
        # Reset gradients accumulated by the previous step
        model.zero_grad()
        
        # Turn the words into integer indices
        context_idxs = [word_to_ix[w] for w in context]
        context_var = Variable(torch.LongTensor(context_idxs))

        # Forward pass: log-probabilities over the vocabulary for the next word
        log_probs = model(context_var)
        # Loss against the index of the true next word
        loss = loss_function(log_probs, Variable(
            torch.LongTensor([word_to_ix[target]])))

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.data
    # Note: a tensor (not a Python float) is appended for each epoch
    losses.append(total_loss)

## LSTM

* https://github.com/pytorch/text
* http://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#lstm-s-in-pytorch
* https://github.com/eladhoffer/seq2seq.pytorch/blob/master/seq2seq/tools/tokenizer.py
* https://spacy.io/usage/spacy-101

### PyTorch Intro

In [None]:
# Pass through one at a time
lstm = nn.LSTM(3,3) #input dim and output dim = 3
inputs = [Variable(torch.randn(1,3)) for _ in range(5)]
hidden = Variable(torch.randn(1,1,3)), Variable(torch.randn(1,1,3))

In [None]:
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

In [None]:
# Pass all inputs at once
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = Variable(torch.randn(1,1,3)), Variable(torch.randn(1,1,3))
out, hidden = lstm(inputs, hidden)

### Prepare Input

* https://www.kaggle.com/lseiyjg/use-news-to-predict-stock-markets

In [501]:
# Construction 1
import torchtext
from spacy.lang.en import English
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils

vocab_size = 10000
embedding_dim = 200
hidden_dim = 128
batch_size = 32
n_classes = 2

In [502]:
train_titles = concat_titles(train)
train_labels = train['label']
test_titles = concat_titles(test)
test_labels = test['label']
len(train_titles),len(test_titles)

(670, 106)

In [503]:
# tokenizer = English().Defaults.create_tokenizer()
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_titles)
sequences_train = tokenizer.texts_to_sequences(train_titles)
sequences_test = tokenizer.texts_to_sequences(test_titles)

In [504]:
print('Pad sequences (samples x time)')
train_input = sequence.pad_sequences(sequences_train, maxlen=embedding_dim)
test_input = sequence.pad_sequences(sequences_test, maxlen=embedding_dim)

train_target = np_utils.to_categorical(train_labels, n_classes)
test_target = np_utils.to_categorical(test_labels, n_classes)
print('train shape:', train_input.shape, train_target.shape)
print('test shape:', test_input.shape, test_target.shape)

Pad sequences (samples x time)
train shape: (670, 200) (670, 2)
test shape: (106, 200) (106, 2)


In [505]:
trn_inp = torch.FloatTensor(train_input)#.view(train_input.shape[0], 1, -1)
tst_inp = torch.FloatTensor(test_input)#.view(test_input.shape[0], 1, -1)
trn_targ = torch.FloatTensor(train_target)
tst_targ = torch.FloatTensor(test_target)

In [506]:
batch_size = 1
trn_dataset = TensorDataset(trn_inp, trn_targ)
tst_dataset = TensorDataset(tst_inp, tst_targ)
trn_loader = DataLoader(trn_dataset, batch_size, shuffle=True)
tst_loader = DataLoader(tst_dataset, batch_size, shuffle=False)

### LSTM

In [509]:
def train_model(model, dataloader, optimizer, criterion, epoch, n_epochs):
    """Run one training epoch of a stateful (LSTM) model; return the mean loss.

    Before every batch the model's ``hidden`` attribute is replaced by
    ``model.init_hidden(len(inputs))`` so no state leaks between batches.

    Args:
        model: ``nn.Module`` exposing ``init_hidden(batch_size)``.
        dataloader: sized iterable of ``(inputs, targets)`` batches.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: callable ``criterion(output, targets)`` returning the loss.
        epoch, n_epochs: accepted for interface compatibility; not used here.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()

    loss_data = 0
    n_batches = len(dataloader)
    for inputs, targets in dataloader:
        inputs = Variable(inputs)
        targets = Variable(targets)

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden(len(inputs))

        ## Forward Pass
        output = model(inputs)

        ## Clear Gradients (before backward, so stale grads never accumulate)
        model.zero_grad()

        # Metrics
        loss = criterion(output, targets)
        # `.item()` is the supported scalar accessor; `loss.data[0]` fails on
        # the 0-dim loss tensors of current PyTorch and is kept only as a
        # fallback for releases that lack `.item()`.
        loss_data += loss.item() if hasattr(loss, 'item') else loss.data[0]

        ## Backprop
        loss.backward()
        optimizer.step()
    return loss_data / n_batches

def run(model, optim, criterion, trn_loader, val_loader, n_epochs, n_classes):
    """Train for ``n_epochs`` epochs, printing train/validation loss per epoch.

    Each epoch calls ``train_model`` on ``trn_loader`` and then ``predict``
    on ``val_loader``; only the losses are reported.
    """
    for epoch in range(n_epochs):
        trn_loss = train_model(model, trn_loader, optim, criterion, epoch, n_epochs)
        val_loss, _, _ = predict(model, val_loader, criterion, n_classes)
        # Both losses use four decimals ('{:4f}' was a width, not a precision).
        print("Epoch {:d} Loss - Trn: {:.4f} | Val: {:.4f}".format(
            epoch, trn_loss, val_loss))

In [510]:
class LSTMModel(nn.Module):
    """Single-layer LSTM followed by a linear layer and a softmax.

    The embedding layer is commented out, so ``vocab_size`` is accepted but
    unused and the LSTM consumes the raw input rows directly
    (``input_size=embedding_dim``).
    """
    # https://gist.github.com/spro/ef26915065225df65c1187562eca7ec4
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_classes):
        super().__init__()
        #self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # NOTE(review): dropout is requested together with num_layers=1;
        # confirm against the nn.LSTM docs whether it has any effect here.
        self.lstm = nn.LSTM(
            input_size=embedding_dim, 
            hidden_size=hidden_dim, 
            num_layers=1,
            dropout=0.5)
        self.linear2 = nn.Linear(hidden_dim, n_classes)
        self.softmax = nn.Softmax(dim=1)
        
        self.hidden_dim = hidden_dim
        # NOTE(review): `batch_size` is not a constructor parameter; it is read
        # from the enclosing (notebook-level) scope when the model is built.
        self.hidden = self.init_hidden(batch_size)
        

    def init_hidden(self, batch_size):
        """Return a fresh all-zero (h, c) state pair for ``batch_size`` rows."""
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (Variable(torch.zeros(1, batch_size, self.hidden_dim)),
                Variable(torch.zeros(1, batch_size, self.hidden_dim)))

    def forward(self, inputs):
        """Return softmax class scores, one row per input row.

        Also overwrites ``self.hidden`` with the state returned by the LSTM.
        """
        #out = self.embeddings(inputs)
        #print(out.size())
        # Reshape to (1, len(inputs), -1) — presumably a single time step with
        # each whole input row as the feature vector; TODO confirm intent.
        out, self.hidden = self.lstm(inputs.view(1, len(inputs), -1), self.hidden)
        out = self.linear2(out.view(len(inputs), -1))
        out = self.softmax(out)
        return out

In [528]:
model = LSTMModel(tokenizer.num_words, embedding_dim, hidden_dim, n_classes)
optim = torch.optim.Adam(model.parameters())
criterion = F.binary_cross_entropy
model

LSTMModel(
  (lstm): LSTM(200, 128, dropout=0.5)
  (linear2): Linear(in_features=128, out_features=2, bias=True)
  (softmax): Softmax()
)

In [529]:
inputs,targets = next(iter(trn_loader))
inputs.size(),targets.size()

(torch.Size([1, 200]), torch.Size([1, 2]))

In [530]:
model(Variable(inputs))

Variable containing:
 0.4788  0.5212
[torch.FloatTensor of size 1x2]

In [531]:
n_epochs = 3
run(model, optim, criterion, trn_loader, tst_loader, n_epochs, n_classes)

Epoch 0 Loss - Trn: 0.6936 | Val: 0.714858
Epoch 1 Loss - Trn: 0.6875 | Val: 0.723402
Epoch 2 Loss - Trn: 0.6811 | Val: 0.708694


In [533]:
loss, probs, labels = predict(model, tst_loader, criterion, n_classes)
preds = np.argmax(probs, axis=1)
labels = np.argmax(labels, axis=1)
print("Accuracy", get_accuracy(preds, labels))
print("Precision", get_precision(preds, labels))
print("Recall", get_recall(preds, labels))
pd.crosstab(labels.flatten(), preds.flatten(), rownames=["Actual"], colnames=["Predicted"])

Accuracy 0.490566037736
Precision 0.516483516484
Recall 0.824561403509


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,44
1,10,47


## CNN

## Future Price Predictions

* https://www.kaggle.com/katerynad/bernoulli-naive-bayes-auc-59

## Technical Indicators

* https://www.kaggle.com/hiteshp/money-money-share-market-study/notebook
* https://www.kaggle.com/hiteshp/money-money-share-market-study-2

## HMM

* http://amunategui.github.io/markov-chains/index.html
* https://www.kaggle.com/hiteshp/hidden-markov-model-predicting-stock-market

## Sentiment

* https://github.com/abdulfatir/twitter-sentiment-analysis
* http://textblob.readthedocs.io/en/dev/
* https://github.com/fnielsen/afinn
* https://github.com/anfederico/Stocktalk/blob/master/stocktalk/scripts/streaming.py
* http://www.nltk.org/
* http://textblob.readthedocs.io/en/dev/quickstart.html#translation-and-language-detection
* https://github.com/juvaroka/tweetwise
* https://www.kaggle.com/shreyams/stock-price-prediction-94-xgboost (*)

In [None]:
def get_sentiment(text):
    """Return the TextBlob polarity score of ``text``."""
    return TextBlob(text).polarity
    
def add_sentiment_score(df, columns):
    """Add a 'sentiment_<col>' column to ``df`` for every listed text column.

    Each new column holds ``get_sentiment`` of the corresponding text;
    ``df`` is modified in place and nothing is returned.
    """
    for text_col in columns:
        df['sentiment_'+text_col] = [
            get_sentiment(entry) for entry in df[text_col].tolist()
        ]

In [None]:
# Average Sentiment
add_sentiment_score(df, columns=subs_columns)
sentiment_cols = ['sentiment_'+col for col in subs_columns]
df['sentiment'] = df[sentiment_cols].mean(axis=1)

In [None]:
# Price Change
open_col = ohlcv_feed.get_col_name('open', assets[0].symbol, exchange_ids[0])
close_col = ohlcv_feed.get_col_name('close', assets[0].symbol, exchange_ids[0])
df['price_delta'] = (df[close_col] - df[open_col])/df[open_col]

In [None]:
df.head(2)

In [None]:
# Plot Price Change vs Sentiment
plot_price_and_sentiment(df, 'sentiment', 'price_delta')

In [None]:
# Correlation
# http://www.dummies.com/education/math/statistics/how-to-interpret-a-correlation-coefficient-r/
# Most statisticians like to see correlations beyond at least +0.5 or –0.5 before getting too excited about them
print(np.corrcoef(df['price_delta'], df['sentiment']))
print(np.correlate(df['price_delta'], df['sentiment'])[0])
plt.matshow(df[['price_delta','sentiment']].corr())
# scatter_matrix lives in pandas.plotting; the top-level pd.scatter_matrix
# alias is not available in current pandas.
pd.plotting.scatter_matrix(df[['price_delta','sentiment']], 
                           alpha = 0.3, figsize = (14,8), diagonal = 'kde');