In [8]:
import numpy as np
import pandas as pd
import os
import sys
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Put the parent of the current working directory on the import path so the
# `foobar.*` imports further down in this cell can be resolved.
foobar_path = os.path.abspath(os.pardir)
if sys.path.count(foobar_path) == 0:
    sys.path.append(foobar_path)

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from foobar.trainer.lstm_trainer import train_model
from foobar.ml_preprocessing.timeseries_preprocessing import wind

In [35]:
# File names of the two processed datasets (presumably the object names used
# in S3 -- they are not read in this cell; verify against their other uses).
S3_FILE_NAME_WIDE = 'wide.csv'
S3_FILE_NAME_GAMESTOP = 'gme.csv'

# Local CSV copies, given relative to the notebook's working directory.
LOCAL_FILE_PATH_WIDE = '../foobar/data/processed/wide.csv'
LOCAL_FILE_PATH_GAMESTOP = '../foobar/data/processed/gme.csv'

# Read both CSV files, wide frame first and GameStop frame second.
df_wide, df_gamestop = map(
    pd.read_csv,
    (LOCAL_FILE_PATH_WIDE, LOCAL_FILE_PATH_GAMESTOP),
)

# Preview the first 50 rows of the hour/closeprice columns of each frame:
# the wide frame is printed as plain text, the GameStop frame is the cell's
# displayed result.
print(df_wide.loc[:, ['hour', 'closeprice']].head(50))
df_gamestop.loc[:, ['hour', 'closeprice']].head(50)

                   hour  closeprice
0   2020-04-09 00:00:00       4.000
1   2020-04-09 01:00:00       4.000
2   2020-04-09 02:00:00       4.000
3   2020-04-09 03:00:00       4.000
4   2020-04-09 04:00:00       4.000
5   2020-04-09 05:00:00       4.000
6   2020-04-09 06:00:00       4.000
7   2020-04-09 07:00:00       4.000
8   2020-04-09 08:00:00       4.000
9   2020-04-09 09:00:00       4.000
10  2020-04-09 10:00:00       4.000
11  2020-04-09 11:00:00       4.000
12  2020-04-09 12:00:00       4.000
13  2020-04-09 13:00:00       4.000
14  2020-04-09 14:00:00       4.000
15  2020-04-09 15:00:00       4.000
16  2020-04-09 16:00:00       4.000
17  2020-04-09 17:00:00       4.000
18  2020-04-09 18:00:00       4.000
19  2020-04-09 19:00:00       4.000
20  2020-04-09 20:00:00       4.000
21  2020-04-09 21:00:00       4.000
22  2020-04-09 22:00:00       3.950
23  2020-04-09 23:00:00       3.950
24  2020-04-13 00:00:00       4.160
25  2020-04-13 01:00:00       4.160
26  2020-04-13 02:00:00     

Unnamed: 0,hour,closeprice
0,2020-04-09 13:00:00,3.7
1,2020-04-09 14:00:00,3.86
2,2020-04-09 15:00:00,4.06
3,2020-04-09 16:00:00,4.12
4,2020-04-09 17:00:00,4.03
5,2020-04-09 18:00:00,3.91
6,2020-04-09 19:00:00,3.89
7,2020-04-09 20:00:00,3.82
8,2020-04-09 21:00:00,4.0
9,2020-04-09 22:00:00,3.95


In [18]:
    # Column-name lists for the two datasets (presumably the model input
    # features -- confirm against where these lists are consumed).
    feature_set_gamestop = [
        'openprice', 'highprice', 'lowprice', 'volume', 'closeprice',
    ]

    # Every df_wide column except 'hour', 'id' and 'prediction'.
    feature_set_wide = [
        # avg_all_* / cnt_all_* columns
        'avg_all_post_pos', 'avg_all_post_neg', 'avg_all_post_neu',
        'cnt_all_user', 'cnt_all_tag', 'cnt_all_post', 'cnt_all_comments',
        # avg_gme_* / cnt_gme_* columns
        'avg_gme_post_pos', 'avg_gme_post_neg', 'avg_gme_post_neu',
        'cnt_gme_user', 'cnt_gme_tag', 'cnt_gme_post', 'cnt_gme_comments',
        # price and volume columns
        'volume', 'openprice', 'closeprice', 'highprice', 'lowprice',
    ]

    # Show the columns actually present in the wide frame for comparison.
    df_wide.columns

Index(['hour', 'avg_all_post_pos', 'avg_all_post_neg', 'avg_all_post_neu',
       'cnt_all_user', 'cnt_all_tag', 'cnt_all_post', 'cnt_all_comments',
       'avg_gme_post_pos', 'avg_gme_post_neg', 'avg_gme_post_neu',
       'cnt_gme_user', 'cnt_gme_tag', 'cnt_gme_post', 'cnt_gme_comments', 'id',
       'volume', 'openprice', 'closeprice', 'highprice', 'lowprice',
       'prediction'],
      dtype='object')

In [30]:
# Parse the 'hour' strings into timestamps and use them as the index (named
# 'datetime'); the original 'hour' column is kept as a regular column.
df_wide = (
    df_wide
    .assign(datetime=pd.to_datetime(df_wide['hour'], format='%Y-%m-%d %H:%M:%S'))
    .set_index('datetime')
)

# Keep only the rows whose timestamp falls in calendar year 2020.
is_2020 = df_wide.index.year == 2020
df_temp = df_wide.loc[is_2020]
df_temp

Unnamed: 0_level_0,hour,avg_all_post_pos,avg_all_post_neg,avg_all_post_neu,cnt_all_user,cnt_all_tag,cnt_all_post,cnt_all_comments,avg_gme_post_pos,avg_gme_post_neg,...,cnt_gme_tag,cnt_gme_post,cnt_gme_comments,id,volume,openprice,closeprice,highprice,lowprice,prediction
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-09 00:00:00,2020-04-09 00:00:00,0.097898,0.090536,0.807021,1201.0,190.0,3500.0,3477.0,0.000000,0.000000,...,0.0,0.0,0.0,1.0,1379.0,3.94,4.00,4.00,3.94,-1.0
2020-04-09 01:00:00,2020-04-09 01:00:00,0.099473,0.087383,0.808619,1202.0,189.0,3540.0,3519.0,0.000000,0.000000,...,0.0,0.0,0.0,1.0,1379.0,3.94,4.00,4.00,3.94,-1.0
2020-04-09 02:00:00,2020-04-09 02:00:00,0.099551,0.087438,0.805654,1107.0,185.0,3258.0,3235.0,0.000000,0.000000,...,0.0,0.0,0.0,1.0,1379.0,3.94,4.00,4.00,3.94,-1.0
2020-04-09 03:00:00,2020-04-09 03:00:00,0.099052,0.091373,0.804884,945.0,183.0,2781.0,2767.0,0.000000,0.000000,...,0.0,0.0,0.0,1.0,1379.0,3.94,4.00,4.00,3.94,-1.0
2020-04-09 04:00:00,2020-04-09 04:00:00,0.093404,0.080233,0.821032,687.0,153.0,1874.0,1858.0,0.000000,0.000000,...,0.0,0.0,0.0,1.0,1379.0,3.94,4.00,4.00,3.94,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31 19:00:00,2020-12-31 19:00:00,0.109813,0.079860,0.810343,91.0,48.0,149.0,130.0,0.110126,0.087626,...,1.0,8.0,6.0,1601.0,792064.0,19.47,18.99,19.47,18.91,-1.0
2020-12-31 20:00:00,2020-12-31 20:00:00,0.113935,0.081334,0.796540,168.0,51.0,243.0,228.0,0.125801,0.000001,...,1.0,5.0,4.0,1602.0,1081315.0,18.98,18.81,19.03,18.80,-1.0
2020-12-31 21:00:00,2020-12-31 21:00:00,0.127015,0.069499,0.800059,183.0,74.0,291.0,281.0,0.142701,0.080701,...,1.0,10.0,8.0,1603.0,1270488.0,18.84,18.80,18.95,18.29,-1.0
2020-12-31 22:00:00,2020-12-31 22:00:00,0.119612,0.076276,0.804085,147.0,63.0,262.0,249.0,0.105401,0.038201,...,1.0,5.0,3.0,1603.0,1270488.0,18.84,18.80,18.95,18.29,-1.0
