In [1]:
import pandas as pd
import os

path = './Data/'

filename = os.path.join(path,'SN_d_tot_V2.0.csv')

In [2]:
names = ['year','month','day','dec_year','sn_value','sn_error','obs_num']

In [3]:
df = pd.read_csv(filename, sep=';',
                header=None, names=names,
                na_values=['-1'],
                index_col = False)

In [4]:
print("start:")
print(df[0:10])

start:
   year  month  day  dec_year  sn_value  sn_error  obs_num
0  1818      1    1  1818.001        -1       NaN        0
1  1818      1    2  1818.004        -1       NaN        0
2  1818      1    3  1818.007        -1       NaN        0
3  1818      1    4  1818.010        -1       NaN        0
4  1818      1    5  1818.012        -1       NaN        0
5  1818      1    6  1818.015        -1       NaN        0
6  1818      1    7  1818.018        -1       NaN        0
7  1818      1    8  1818.021        65      10.2        1
8  1818      1    9  1818.023        -1       NaN        0
9  1818      1   10  1818.026        -1       NaN        0


In [5]:
start_id = max(df[df['obs_num']==0].index.tolist())+1
print(start_id)
df = df[start_id:]

11314


In [6]:
df_train = df[df['year']<2000]
df_test = df[df['year']>=2000]

sports_train = df_train['sn_value'].tolist()
sports_test = df_test['sn_value'].tolist()

print("training: {}".format(len(sports_train)))
print("testing: {}".format(len(sports_test)))

training: 55160
testing: 6787


In [7]:
import numpy as np

def to_sequences(seq_size, obs):
    x = []
    y = []
    
    for i in range(len(obs)-seq_size-1):
        window = obs[i:(i+seq_size)]
        after_window = obs[i+seq_size]
        window = [[x] for x in window]
        x.append(window)
        y.append(after_window)
    return np.array(x),np.array(y)

In [8]:
seq_size = 10
x_train, y_train = to_sequences(seq_size, sports_train)
x_test, y_test = to_sequences(seq_size, sports_test)

In [9]:
x_train[1:4]

array([[[240],
        [275],
        [352],
        [268],
        [285],
        [343],
        [340],
        [238],
        [287],
        [294]],

       [[275],
        [352],
        [268],
        [285],
        [343],
        [340],
        [238],
        [287],
        [294],
        [342]],

       [[352],
        [268],
        [285],
        [343],
        [340],
        [238],
        [287],
        [294],
        [342],
        [287]]])

In [10]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
from keras.callbacks import EarlyStopping
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
model = Sequential()
model.add(LSTM(64,
              dropout=0.0,
              recurrent_dropout=0.0,
              input_shape=(None,1)))
model.add(Dense(32))
model.add(Dense(1))
model.compile(loss='mean_squared_error',
             optimizer='adam')
monitor = EarlyStopping(monitor='val_loss',
                       min_delta = 1e-3,
                       patience=5,
                       verbose=1,
                       mode='auto')

model.fit(x_train, y_train, validation_data = (x_test,y_test),
         callbacks=[monitor],
         verbose=2,
         epochs=1000)

Train on 55149 samples, validate on 6776 samples
Epoch 1/1000
 - 38s - loss: 1388.8479 - val_loss: 203.0044
Epoch 2/1000
 - 37s - loss: 519.4890 - val_loss: 203.8726
Epoch 3/1000
 - 34s - loss: 511.4188 - val_loss: 206.6642
Epoch 4/1000
 - 33s - loss: 510.4086 - val_loss: 212.4187
Epoch 5/1000
 - 33s - loss: 509.0526 - val_loss: 201.3940
Epoch 6/1000
 - 33s - loss: 509.6792 - val_loss: 203.3017
Epoch 7/1000
 - 33s - loss: 503.2708 - val_loss: 209.6514
Epoch 8/1000
 - 35s - loss: 501.8529 - val_loss: 202.0593
Epoch 9/1000
 - 38s - loss: 502.3665 - val_loss: 211.7105
Epoch 10/1000
 - 38s - loss: 503.8213 - val_loss: 203.6000
Epoch 00010: early stopping


<keras.callbacks.History at 0x14b29f856d8>

In [12]:
from sklearn import metrics

pred = model.predict(x_test)
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("score:{}".format(score))

score:14.26884752035647
