In [2]:
#
import matplotlib.pyplot as plt
import numpy as np
import time
import pandas as pd
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential

In [7]:
# Load the tweet-count table; the first CSV column is used as the row index.
df_all = pd.read_csv('./Hashtag_Tweet_Count_Hourly_normamlized.csv', index_col=0)
# Drop the first remaining column by position. `.iloc` replaces `.ix`, which
# was deprecated in pandas 0.20 and removed in pandas 1.0.
df = df_all.iloc[:, 1:]
print('Raw Data shape:', df.shape)

Raw Data shape: (1687, 466)
2017071515    0.0
2017071516    0.0
2017071517    0.0
2017071518    0.0
2017071519    0.0
2017071520    0.0
2017071521    0.0
2017071522    0.0
2017071523    0.0
2017071600    0.0
2017071601    0.0
2017071602    0.0
2017071603    0.0
2017071604    0.0
2017071605    0.0
2017071606    0.0
2017071607    0.0
2017071608    0.0
2017071609    0.0
2017071610    0.0
2017071611    0.0
2017071612    0.0
2017071613    0.0
2017071614    0.0
2017071615    0.0
2017071616    0.0
2017071617    0.0
2017071618    0.0
2017071619    0.0
2017071620    0.0
             ... 
2017080219    0.0
2017080220    0.0
2017080221    0.0
2017080222    0.0
2017080223    0.0
2017080300    0.0
2017080301    0.0
2017080302    0.0
2017080303    0.0
2017080304    0.0
2017080305    0.0
2017080306    0.0
2017080307    0.0
2017080308    0.0
2017080309    0.0
2017080310    0.0
2017080311    0.0
2017080312    0.0
2017080313    0.0
2017080314    0.0
2017080315    0.0
2017080316    0.0
2017080317    0.0


In [8]:
# Count the non-zero entries of every row, then flag the rows in which more
# than 10% of the columns are non-zero (mostly-zero rows are filtered out).
rows = df.ne(0).sum(axis=1)
rows_filtered = rows.gt(0.1 * df.shape[1])

In [9]:
# Keep only the rows selected by the boolean mask and report the new shape.
df_filtered = df.loc[rows_filtered]
print(df_filtered.shape)

(1214, 466)


In [13]:
from sklearn.preprocessing import normalize
# define a function to convert a vector of time series into a 2D matrix
def convertDataToSequence(df, seq_len, normalise_window):
    """Slice a wide table into sliding windows and split them into train/test sets.

    Every window spans ``seq_len + 1`` consecutive columns of ``df``. After the
    windows are stacked, each sample is one row of one window: its first
    ``seq_len`` values are the model input and its last value is the target.
    Samples are stored window-major, so the final 10% (the test set) come from
    the right-most windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are sliced by position, left to right.
    seq_len : int
        Number of input steps per sample.
    normalise_window : bool
        When true, the windows are passed through ``normalise_windows`` before
        they are stacked.

    Returns
    -------
    list
        ``[x_train, y_train, x_test, y_test]``. The ``x`` arrays have shape
        ``(samples, seq_len, 1)``; the ``y`` arrays have shape ``(samples,)``.

    Raises
    ------
    ValueError
        If ``df`` has too few columns to produce a single window.

    Notes
    -----
    The training samples are shuffled with ``np.random.shuffle``; seed NumPy's
    global random state beforehand if a reproducible order is needed.
    """
    sequence_length = seq_len + 1
    # NOTE(review): the window ending on the very last column is not generated
    # (the range stops one start position short). Kept as-is so that sample
    # counts stay identical to earlier runs.
    n_windows = len(df.columns) - sequence_length
    if n_windows <= 0:
        raise ValueError(
            'Need more than %d columns to build a window, got %d.'
            % (sequence_length, len(df.columns)))

    result = []
    for index in range(n_windows):
        # Positional column slice; `.iloc` replaces the removed `.ix` indexer.
        result.append(np.array(df.iloc[:, index:index + sequence_length]))
    print(len(result))
    if normalise_window:
        result = normalise_windows(result)

    result = np.array(result)
    print('result:', result.shape)
    # Collapse (windows, rows, steps) into (windows * rows, steps).
    result = np.reshape(result, (result.shape[0]*result.shape[1], result.shape[2]))
    print('result:', result.shape)

    # First 90% of the samples are used for training, the rest for testing.
    row = int(round(0.9 * result.shape[0]))
    train = result[:row, :]
    print('train:', train.shape)
    # Shuffles whole samples in place; the test slice below is not touched.
    np.random.shuffle(train)
    x_train = train[:, :-1]
    y_train = train[:, -1]
    x_test = result[row:, :-1]
    y_test = result[row:, -1]
    print('x_train:', x_train.shape, 'y_train:', y_train.shape)
    print('x_test:', x_test.shape, 'y_test:', y_test.shape)

    # Add a trailing feature axis: (samples, steps) -> (samples, steps, 1).
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

    return [x_train, y_train, x_test, y_test]

def normalise_windows(window_data):
    """Return a list containing the L1-normalised version of every window.

    Each element of ``window_data`` is handed to ``normalize`` with
    ``axis=1`` and ``norm='l1'``; the results are collected in input order.
    """
    return [normalize(window, axis=1, norm='l1') for window in window_data]
def build_model(layers):
    """Build and compile a two-layer stacked LSTM followed by a linear Dense output.

    Parameters
    ----------
    layers : sequence of int
        Four sizes. ``layers[0]`` is the number of input features per step,
        ``layers[1]`` is used both as the number of time steps in the input
        shape and as the size of the first LSTM, ``layers[2]`` is the size of
        the second LSTM and ``layers[3]`` is the size of the Dense output.

    Returns
    -------
    The compiled ``Sequential`` model (loss ``"mse"``, optimizer ``"rmsprop"``).
    """
    model = Sequential()

    # Layer sizes are passed positionally, exactly as the second LSTM already
    # did, instead of through the Keras 1 `output_dim` keyword (renamed to
    # `units` in Keras 2). The positional form is accepted by both versions.
    model.add(LSTM(
        layers[1],
        input_shape=(layers[1], layers[0]),
        return_sequences=True))
    model.add(Dropout(0.2))

    model.add(LSTM(
        layers[2],
        return_sequences=False))
    model.add(Dropout(0.2))

    model.add(Dense(layers[3]))
    model.add(Activation("linear"))

    # Time only the compile step so its cost is visible in the notebook output.
    start = time.time()
    model.compile(loss="mse", optimizer="rmsprop")
    print("> Compilation Time : ", time.time() - start)
    return model

def predict_point_by_point(model, data):
    """Run one batch prediction over ``data`` and return it as a flat 1-D array.

    Every input window consists of true data, so each output value is a
    single-step-ahead prediction.
    """
    raw_output = model.predict(data)
    flat_length = raw_output.size
    return np.reshape(raw_output, (flat_length,))

def predict_sequence_full(model, data, window_size):
    """Roll one window forward, feeding every prediction back in as input.

    Starts from ``data[0]``. At each step the model predicts one value, the
    oldest entry of the window is dropped and the prediction is inserted at
    position ``window_size - 1``. This is repeated ``len(data)`` times.

    Parameters
    ----------
    model : object exposing ``predict(batch)``
    data : array of windows; only ``data[0]`` and ``len(data)`` are used
    window_size : int
        Length of one window (position at which the new value is inserted).

    Returns
    -------
    list
        ``len(data)`` predictions; an empty list when ``data`` is empty.
    """
    if len(data) == 0:
        # Nothing to seed the rolling window with.
        return []

    curr_frame = data[0]
    predicted = []
    for _ in range(len(data)):
        # `np.newaxis` adds the batch axis. It is referenced through `np` so
        # the function does not rely on a `from numpy import newaxis` that is
        # only executed in a later notebook cell.
        predicted.append(model.predict(curr_frame[np.newaxis, :, :])[0, 0])
        curr_frame = curr_frame[1:]
        curr_frame = np.insert(curr_frame, [window_size-1], predicted[-1], axis=0)
    return predicted

def predict_sequences_multiple(model, data, window_size, prediction_len):
    """Predict consecutive runs of ``prediction_len`` steps.

    For run ``i`` the window ``data[i * prediction_len]`` is used as the seed.
    The model then predicts ``prediction_len`` values, each one being fed back
    into the window (oldest entry dropped, new value inserted at position
    ``window_size - 1``). ``int(len(data) / prediction_len)`` runs are made.

    Parameters
    ----------
    model : object exposing ``predict(batch)``
    data : array of windows
    window_size : int
        Length of one window.
    prediction_len : int
        Number of steps predicted per run and stride between seed windows.

    Returns
    -------
    list of list
        One list of ``prediction_len`` predictions per run.
    """
    prediction_seqs = []
    for i in range(int(len(data)/prediction_len)):
        curr_frame = data[i*prediction_len]
        predicted = []
        for _ in range(prediction_len):
            # `np.newaxis` adds the batch axis. It is referenced through `np`
            # so the function does not rely on a `from numpy import newaxis`
            # that is only executed in a later notebook cell.
            predicted.append(model.predict(curr_frame[np.newaxis, :, :])[0, 0])
            curr_frame = curr_frame[1:]
            curr_frame = np.insert(curr_frame, [window_size-1], predicted[-1], axis=0)
        prediction_seqs.append(predicted)
    return prediction_seqs

In [14]:
import time
import matplotlib.pyplot as plt
from numpy import newaxis

def plot_results(predicted_data, true_data, fileName):
    """Plot the true series and the prediction on one axis, show it, then save it.

    The figure is written to ``fileName`` with a tight bounding box after it
    has been displayed.
    """
    figure = plt.figure(facecolor='white')
    axes = figure.add_subplot(111)
    # True data is drawn first so the prediction is layered on top of it.
    for series, caption in ((true_data, 'True Data'), (predicted_data, 'Prediction')):
        axes.plot(series, label=caption)
    axes.legend()
    plt.show()
    figure.savefig(fileName, bbox_inches='tight')

def plot_results_multiple(predicted_data, true_data, prediction_len):
    """Plot the true series together with every multi-step prediction run.

    Parameters
    ----------
    predicted_data : sequence of sequences
        One sequence of predicted values per run.
    true_data : sequence
        The reference series.
    prediction_len : int
        Horizontal distance between the start points of consecutive runs.
    """
    fig = plt.figure(facecolor='white')
    ax = fig.add_subplot(111)
    ax.plot(true_data, label='True Data')
    for i, data in enumerate(predicted_data):
        # Pad with None so that run i starts at x = i * prediction_len.
        padding = [None] * (i * prediction_len)
        # Only the first run carries the legend label; labels that start with
        # an underscore are skipped by the legend, which keeps it from being
        # flooded with one identical "Prediction" entry per run.
        label = 'Prediction' if i == 0 else '_nolegend_'
        ax.plot(padding + list(data), label=label)
    # Build the legend once, after all lines exist, instead of on every pass.
    ax.legend()
    plt.show()

In [None]:
global_start_time = time.time()
epochs  = 100
seq_len = 24

print('> Loading data... ')

# Third argument toggles per-window normalisation (currently off).
X_train, y_train, X_test, y_test = convertDataToSequence(df_filtered, seq_len, False) #True

print('> Data Loaded. Compiling...')

# The window length is taken from `seq_len` instead of a second hard-coded 24,
# so the model input shape always matches the generated sequences.
# build_model also uses this value as the size of its first LSTM layer.
model = build_model([1, seq_len, 128, 1])

# NOTE(review): `nb_epoch` is the Keras 1 spelling; Keras 2 renamed it to
# `epochs`. Switch the keyword once the installed Keras version is confirmed.
model.fit(
    X_train,
    y_train,
    batch_size=512,
    nb_epoch=epochs,
    validation_split=0.05)

# Alternative multi-step prediction modes, kept for reference:
#predictions = predict_sequences_multiple(model, X_test, seq_len, 50)
#predicted = predict_sequence_full(model, X_test, seq_len)
predicted = predict_point_by_point(model, X_test)

# The reported duration covers data preparation, compilation, training and prediction.
print('Training duration (s) : ', time.time() - global_start_time)
#plot_results_multiple(predictions, y_test, 50)
plot_results(predicted, y_test, 'output_prediction.jpg')
np.savetxt('output_result.txt', predicted)

> Loading data... 
441
result: (441, 1214, 25)
result: (535374, 25)
train: (481837, 25)
x_train: (481837, 24) y_train: (481837,)
x_test: (53537, 24) y_test: (53537,)
> Data Loaded. Compiling...




> Compilation Time :  0.027101993560791016
Train on 457745 samples, validate on 24092 samples
Epoch 1/100
Epoch 4/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
 92160/457745 [=====>........................] - ETA: 65s - loss: 8.1024e-04

In [78]:
# Score the trained model on the held-out samples and report the loss it
# returns, together with the number of test samples.
test_mse = model.evaluate(X_test, y_test, verbose=1)
print(
    '\nThe mean squared error (MSE) on the test data set is %.6f over %d test samples.'
    % (test_mse, len(y_test))
)

The mean squared error (MSE) on the test data set is 0.000108 over 3183 test samples.
