# Google Drive 마운트

In [None]:
# Mount Google Drive inside the Colab runtime so files under MyDrive are
# reachable below /content/gdrive (this cell only works in Google Colab).
from google.colab import drive
drive.mount('/content/gdrive')

# 모듈 임포트

In [None]:
import os
import time
from datetime import datetime
import missingno as msno
import requests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, Lambda, Input
from tensorflow.keras.losses import Huber
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


# 데이터 다운로드

In [None]:
# Copy the zipped Bitstamp 1-minute CSV from the mounted Drive into the current working directory.
!cp /content/gdrive/MyDrive/tmp/bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv.zip ./

## 데이터 압축 풀기

In [None]:
!unzip bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv.zip

## 데이터 불러오기

In [None]:
# Load the extracted CSV into a DataFrame and preview the first rows.
df = pd.read_csv('bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv')
df.head()

# 결측치 확인

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Visualise where values are missing (missingno nullity matrix).
msno.matrix(df)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Total number of rows.
print(len(df))

## 결측치 분포 파악

In [None]:
# Compare the time span covered by the data with the number of rows.
print(df.Timestamp.min(), df.Timestamp.max())
diff = df.Timestamp.max() - df.Timestamp.min()
print(diff)
# Number of 60-unit steps between the first and the last timestamp
# (presumably seconds, i.e. one step per minute — TODO confirm the unit).
count_with_60_interval = diff / 60

# The span divided by 60 differs from the actual row count,
# so some timestamps are missing somewhere in the middle of the series.
print(count_with_60_interval)
print(len(df))


In [None]:
# Count missing `Open` values in 100 equal-population Timestamp bins.
# Bin k covers the [k%, (k+1)%) quantile range, so null_counts[k] is exactly
# the k-th percentile slice. (Starting the lower edge at 0 made bin 0 always
# empty, shifted every index by one and never counted the 99%-100% slice.)
quantile_edges = df.Timestamp.quantile(np.linspace(0.0, 1.0, 101)).to_numpy()
open_is_null = df.Open.isnull()

null_counts = []
for k in range(100):
    lower_edge, upper_edge = quantile_edges[k], quantile_edges[k + 1]
    if k == 99:
        # Close the last bin on the right so the maximum timestamp is included.
        in_bin = (df.Timestamp >= lower_edge) & (df.Timestamp <= upper_edge)
    else:
        in_bin = (df.Timestamp >= lower_edge) & (df.Timestamp < upper_edge)
    null_counts.append(int(open_is_null[in_bin].sum()))



In [None]:
# Per-bin null counts expressed as a percentage of ALL rows (the denominator
# is len(df), not the size of one bin).
null_counts = np.array(null_counts)
plt.plot(null_counts/len(df)*100.)
plt.show()

# Same curve restricted to the bins from index 80 onwards.
plt.plot(null_counts[80:]/len(df)*100.)
plt.show()

전체 데이터 중 80% 이후 구간에서는 구간별 null 개수가 전체 행 수의 최대 약 0.035% 수준이다. (그래프의 분모는 구간 크기가 아니라 `len(df)`이므로, 구간 내부 비율로 환산하면 그 100배인 약 3.5%이다.)

이 데이터 만을 가지고 하자. 그리고 결측치는 앞의 값으로 채우자.

# 대상 데이터 추림

In [None]:
# Keep only the rows after the 80% mark, where missing values are rare.
# `.copy()` makes the subset an independent DataFrame, so later column
# assignments / in-place edits do not operate on a view of the full frame
# (which triggers SettingWithCopyWarning).
index_of_80_percent = int( len(df) * 0.8 )
df = df[df.index>index_of_80_percent].copy()
print(len(df))

In [None]:
# Count missing `Open` values in 100 equal-population Timestamp bins of the
# reduced frame. Bin k covers the [k%, (k+1)%) quantile range. (Starting the
# lower edge at 0 made bin 0 always empty, shifted every index by one and
# never counted the 99%-100% slice.)
quantile_edges = df.Timestamp.quantile(np.linspace(0.0, 1.0, 101)).to_numpy()
open_is_null = df.Open.isnull()

null_counts = []
for k in range(100):
    lower_edge, upper_edge = quantile_edges[k], quantile_edges[k + 1]
    if k == 99:
        # Close the last bin on the right so the maximum timestamp is included.
        in_bin = (df.Timestamp >= lower_edge) & (df.Timestamp <= upper_edge)
    else:
        in_bin = (df.Timestamp >= lower_edge) & (df.Timestamp < upper_edge)
    null_counts.append(int(open_is_null[in_bin].sum()))

In [None]:
# Per-bin null counts as a percentage of all rows in the reduced frame
# (denominator is len(df), not the bin size).
null_counts = np.array(null_counts)
plt.plot(null_counts/len(df)*100.)
plt.show()

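1개 구간은 약 1만 행이다. 그래프의 y축은 `null_count / len(df) * 100`(전체 행 대비 %)이므로, 최대값 0.07은 전체 약 100만 행의 0.07%, 즉 한 구간(약 1만 행) 안에서 약 700행(약 7%)에 해당한다.
대부분의 구간에서는 비율이 훨씬 낮으므로 앞의 값으로 채워서 진행하되, 결측이 길게 연속된 구간이 있는지는 아래 그래프로 확인한다.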

In [None]:
# Draw the Open series as 100 thin consecutive strips to eyeball gaps and jumps.
interval = int(len(df)/100)
for i in range(100):
    print(i)
    start, stop = i*interval, (i+1)*interval
    plt.figure(figsize=(20,1))
    # Positional slice of the i-th strip.
    plt.plot(df.Open.iloc[start:stop])
    plt.xticks([])
    plt.show()

# 결측치 처리

In [None]:
# Missing values per column in the reduced frame.
print(df.isnull().sum())

In [None]:
# Forward-fill gaps with the previous row's values.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')`, and
# rebinding `df` avoids mutating a filtered slice in place.
# Note: NaNs before the first valid row are not filled by a forward fill.
df = df.ffill()

In [None]:
# Draw the nullity matrix again to check what is still missing.
msno.matrix(df)

In [None]:
# Preview the first rows.
df.head()

## Timestamp 컬럼 변환
Unix time => datetime

In [None]:
# Interpret Timestamp as seconds since the Unix epoch and convert it to datetime.
df['Timestamp'] = pd.to_datetime(df['Timestamp'],unit='s')

In [None]:
# First rows, to check the converted Timestamp column.
df.head()

In [None]:
# Last rows, to see where the series ends.
df.tail()

# 불필요한 컬럼 삭제

In [None]:
# Remove the columns the model does not use: only Timestamp is dropped, both
# volume columns stay in as features.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second run no longer raises KeyError because the
# column is already gone.
df = df.drop(columns=['Timestamp'], errors='ignore')

In [None]:
# Preview the remaining feature columns.
df.head()

# 데이터 분포 보기

In [None]:
# One 1000-bin histogram per column to inspect the value distributions.
for column_name in df.columns:
    print(column_name)
    plt.hist(df[column_name], bins=1000)
    plt.show()

# 컬럼 로그 변환

In [None]:
# Apply log(1 + x) element-wise to every column (presumably to reduce skew).
# NOTE(review): this rebinds `df`, so running the cell twice applies the log twice.
df = np.log1p(df)

In [None]:
# Histograms again, now on the log-transformed columns.
for column_name, column_values in df.items():
    print(column_name)
    plt.hist(column_values, bins=1000)
    plt.show()

# MinMaxScaler
역변환 : inverse_scaled_data = scaler.inverse_transform(scaled_data)

In [None]:
# Min-max scaler with default settings; kept in `scaler` so the scaling can be inverted later.
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the whole frame and get the scaled values as an array.
# NOTE(review): fitting before the train/test split lets test-set min/max
# influence the training data.
scaled = scaler.fit_transform(df)
print(scaled)

# Data Sampling

## 파라미터 설정

In [None]:
SEQUENCE_LENGTH = 16 * 60   # rows per input window. TODO: confirm why the factor is 60 (presumably 16 hours of 1-minute rows)
OFFSET = 16 * 60            # rows between the last input row and the target row. TODO: same question
BATCH_SIZE = 32
SAMPLING_COUNT = 10000      # number of random windows to sample

In [None]:
# `numbers` is just another name for the scaled array used by the sampling cell.
numbers = scaled
print(numbers)

In [None]:
# Randomly sample SAMPLING_COUNT (input window, target row) pairs.
#   x = rows [i, i + SEQUENCE_LENGTH)
#   y = the full row that lies OFFSET rows after the last row of x
# randint's exclusive upper bound already keeps the target index inside
# `numbers`, so the former per-sample bounds check could never trigger and
# has been removed.
# NOTE(review): np.random is not seeded, so the sample differs on every run.
max_start = len(numbers) - SEQUENCE_LENGTH - OFFSET
if max_start <= 0:
    raise ValueError("not enough rows for SEQUENCE_LENGTH + OFFSET")

raw_x = []
raw_y = []

for j in range(SAMPLING_COUNT):
  i = np.random.randint(max_start)
  raw_x.append(numbers[i:i + SEQUENCE_LENGTH, :])
  raw_y.append(numbers[i + SEQUENCE_LENGTH + OFFSET - 1, :])
# Example with SEQUENCE_LENGTH = 5 and OFFSET = 3:
#              0         1
#              01234567890123456789
# numbers    = 12345678901234567890
#                 i = 3
#                 <-x->  y
#   x = [3:8]          = 45678
#   y = [3 + 5 + 3 - 1] = [10] = 1



In [None]:
# Show the first two sampled (window, target) pairs as a sanity check.
for sample_idx in (0, 1):
    if sample_idx:
        print()
    print(raw_x[sample_idx])
    print(raw_y[sample_idx])

# 데이터셋 나누기

In [None]:
# Stack the sampled windows: x is (samples, SEQUENCE_LENGTH, features),
# y holds one target row per sample.
x = np.array(raw_x)
y = np.array(raw_y)
print(x.shape)
print(y.shape)

# 90/10 shuffled split. random_state pins the shuffle so the split, and every
# metric computed on it, is reproducible between runs. (train_test_split is
# already imported with the other imports at the top of the notebook.)
# NOTE(review): the windows are drawn at random from a single series and can
# overlap, so a shuffled split may put near-identical data in train and test;
# a chronological split would give a more honest estimate.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.1, shuffle=True, random_state=42)


print("Training Data:")
print(train_x.shape)
print(train_y.shape)

print("Test Data:")
print(test_x.shape)
print(test_y.shape)


In [None]:
# Inspect the first training sample.
print("train_x[0]\n", train_x[0])
print()
print("train_y[0]\n", train_y[0])

# Columns 0 (blue) and 1 (red) of the input window as dots, with the matching
# target value of each column drawn at x = window length.
# NOTE(review): the sampling cell takes the target OFFSET rows after the
# window, so the marker's x position is schematic rather than to scale.
plt.plot(train_x[0].T[0], '.', color='blue')
plt.plot([len(train_x[0])], train_y[0,0], 'x', color='blue')
plt.plot(train_x[0].T[1], '.', color='red')
plt.plot([len(train_x[0])], train_y[0,1], 'o', color='red')

plt.show()

In [None]:
# Derive the model's input and output sizes from the training arrays.
print(train_x.shape)
print(train_y.shape)
input_shape = train_x.shape[1:]     # per-sample shape (drops the sample axis)
output_shape = train_y.shape[-1]    # size of the last axis of the targets
print(input_shape)
print(output_shape)

# 모델 생성 및 학습

In [None]:
from tensorflow import keras
from tensorflow.keras.layers import Dense, RepeatVector, Flatten
from tensorflow.keras.layers import Bidirectional, LSTM, GRU

# LSTM encoder -> small dense head -> one linear output per target column.
model = keras.Sequential([
    LSTM(128, input_shape=input_shape),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(output_shape),
])
model.compile(loss="mse", optimizer="adam", metrics=["mape"])
model.summary()

# Fit for 10 epochs, holding out 10% of the training samples for validation.
history = model.fit(
    train_x,
    train_y,
    batch_size=BATCH_SIZE,
    epochs=10,
    validation_split=0.1,
    verbose=1,
)

# Training and validation loss per epoch.
for curve_key in ('loss', 'val_loss'):
    plt.plot(history.history[curve_key])
plt.show()

# Loss and MAPE on the held-out test set.
loss, mape = model.evaluate(test_x, test_y)
print("loss=", loss)
print("mape=", mape)

# Predictions for the test windows, with size-1 axes squeezed away.
y_ = model.predict(test_x).squeeze()

In [None]:
# Scatter true vs. predicted values, one figure per target column.
# The column axis must be indexed ([:, i]); indexing [i] compared the i-th
# SAMPLE's values instead of the i-th target column across all test samples.
for i in range(output_shape):
    plt.scatter(test_y[:,i], y_[:,i])
    plt.show()

## 결과 보기

In [None]:
# Truth (blue) vs. prediction (red) for the first test sample, one point per target column.
plt.plot(test_y[0], 'b.')
plt.plot(y_[0], 'r.')
plt.show()

# Same comparison without the last column.
plt.plot(test_y[0,:-1], 'b.')
plt.plot(y_[0,:-1], 'r.')
plt.show()

# 역정규화 및 지수 변환

In [None]:
# Undo the preprocessing in reverse order: first the min-max scaling,
# then the log1p transform (expm1 is its inverse).
denomalized_test_y = scaler.inverse_transform(test_y)
denomalized_y_ = scaler.inverse_transform(y_)
delogged_test_y = np.expm1(denomalized_test_y)
delogged_y = np.expm1(denomalized_y_)

In [None]:
# First test sample after undoing scaling and log: truth (blue circles) vs. prediction (red dots).
plt.plot(delogged_test_y[0], 'bo')
plt.plot(delogged_y[0], 'r.')
plt.show()

# Same comparison without the last column.
plt.plot(delogged_test_y[0,:-1], 'bo')
plt.plot(delogged_y[0,:-1], 'r.')
plt.show()

# Target을 Weighted_Price 하나로 한정한 실험

## 데이터 불러오기

In [None]:
# Reload the raw CSV so the preprocessing starts again from the original data.
df = pd.read_csv('bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv')
df.head()

# 결측치 확인

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Visualise where values are missing (missingno nullity matrix).
msno.matrix(df)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Total number of rows.
print(len(df))

## 결측치 분포 파악

In [None]:
# Compare the time span covered by the data with the number of rows.
print(df.Timestamp.min(), df.Timestamp.max())
diff = df.Timestamp.max() - df.Timestamp.min()
print(diff)
# Number of 60-unit steps between the first and the last timestamp
# (presumably seconds, i.e. one step per minute — TODO confirm the unit).
count_with_60_interval = diff / 60

# The span divided by 60 differs from the actual row count,
# so some timestamps are missing somewhere in the middle of the series.
print(count_with_60_interval)
print(len(df))


In [None]:
# Count missing `Open` values in 100 equal-population Timestamp bins.
# Bin k covers the [k%, (k+1)%) quantile range, so null_counts[k] is exactly
# the k-th percentile slice. (Starting the lower edge at 0 made bin 0 always
# empty, shifted every index by one and never counted the 99%-100% slice.)
quantile_edges = df.Timestamp.quantile(np.linspace(0.0, 1.0, 101)).to_numpy()
open_is_null = df.Open.isnull()

null_counts = []
for k in range(100):
    lower_edge, upper_edge = quantile_edges[k], quantile_edges[k + 1]
    if k == 99:
        # Close the last bin on the right so the maximum timestamp is included.
        in_bin = (df.Timestamp >= lower_edge) & (df.Timestamp <= upper_edge)
    else:
        in_bin = (df.Timestamp >= lower_edge) & (df.Timestamp < upper_edge)
    null_counts.append(int(open_is_null[in_bin].sum()))



In [None]:
# Per-bin null counts expressed as a percentage of ALL rows (the denominator
# is len(df), not the size of one bin).
null_counts = np.array(null_counts)
plt.plot(null_counts/len(df)*100.)
plt.show()

# Same curve restricted to the bins from index 80 onwards.
plt.plot(null_counts[80:]/len(df)*100.)
plt.show()

전체 데이터 중 80% 이후 구간에서는 구간별 null 개수가 전체 행 수의 최대 약 0.035% 수준이다. (그래프의 분모는 구간 크기가 아니라 `len(df)`이므로, 구간 내부 비율로 환산하면 그 100배인 약 3.5%이다.)

이 데이터 만을 가지고 하자. 그리고 결측치는 앞의 값으로 채우자.

# 대상 데이터 추림

In [None]:
# Keep only the rows after the 80% mark, where missing values are rare.
# `.copy()` makes the subset an independent DataFrame, so later column
# assignments / in-place edits do not operate on a view of the full frame
# (which triggers SettingWithCopyWarning).
index_of_80_percent = int( len(df) * 0.8 )
df = df[df.index>index_of_80_percent].copy()
print(len(df))

In [None]:
# Count missing `Open` values in 100 equal-population Timestamp bins of the
# reduced frame. Bin k covers the [k%, (k+1)%) quantile range. (Starting the
# lower edge at 0 made bin 0 always empty, shifted every index by one and
# never counted the 99%-100% slice.)
quantile_edges = df.Timestamp.quantile(np.linspace(0.0, 1.0, 101)).to_numpy()
open_is_null = df.Open.isnull()

null_counts = []
for k in range(100):
    lower_edge, upper_edge = quantile_edges[k], quantile_edges[k + 1]
    if k == 99:
        # Close the last bin on the right so the maximum timestamp is included.
        in_bin = (df.Timestamp >= lower_edge) & (df.Timestamp <= upper_edge)
    else:
        in_bin = (df.Timestamp >= lower_edge) & (df.Timestamp < upper_edge)
    null_counts.append(int(open_is_null[in_bin].sum()))

In [None]:
# Per-bin null counts as a percentage of all rows in the reduced frame
# (denominator is len(df), not the bin size).
null_counts = np.array(null_counts)
plt.plot(null_counts/len(df)*100.)
plt.show()

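1개 구간은 약 1만 행이다. 그래프의 y축은 `null_count / len(df) * 100`(전체 행 대비 %)이므로, 최대값 0.07은 전체 약 100만 행의 0.07%, 즉 한 구간(약 1만 행) 안에서 약 700행(약 7%)에 해당한다.
대부분의 구간에서는 비율이 훨씬 낮으므로 앞의 값으로 채워서 진행하되, 결측이 길게 연속된 구간이 있는지는 아래 그래프로 확인한다.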

In [None]:
# Draw the Open series as 100 thin consecutive strips to eyeball gaps and jumps.
interval = int(len(df)/100)
for i in range(100):
    print(i)
    start, stop = i*interval, (i+1)*interval
    plt.figure(figsize=(20,1))
    # Positional slice of the i-th strip.
    plt.plot(df.Open.iloc[start:stop])
    plt.xticks([])
    plt.show()

# 결측치 처리

In [None]:
# Missing values per column in the reduced frame.
print(df.isnull().sum())

In [None]:
# Forward-fill gaps with the previous row's values.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')`, and
# rebinding `df` avoids mutating a filtered slice in place.
# Note: NaNs before the first valid row are not filled by a forward fill.
df = df.ffill()

In [None]:
# Draw the nullity matrix again to check what is still missing.
msno.matrix(df)

In [None]:
# Preview the first rows.
df.head()

## Timestamp 컬럼 변환
Unix time => datetime

In [None]:
# Interpret Timestamp as seconds since the Unix epoch and convert it to datetime.
df['Timestamp'] = pd.to_datetime(df['Timestamp'],unit='s')

In [None]:
# First rows, to check the converted Timestamp column.
df.head()

In [None]:
# Last rows, to see where the series ends.
df.tail()

# 불필요한 컬럼 삭제

In [None]:
# Remove the columns the model does not use: only Timestamp is dropped, both
# volume columns stay in as features.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second run no longer raises KeyError because the
# column is already gone.
df = df.drop(columns=['Timestamp'], errors='ignore')

In [None]:
# Preview the remaining feature columns.
df.head()

# 데이터 분포 보기

In [None]:
# One 1000-bin histogram per column to inspect the value distributions.
for column_name in df.columns:
    print(column_name)
    plt.hist(df[column_name], bins=1000)
    plt.show()

# 컬럼 로그 변환

In [None]:
# Apply log(1 + x) element-wise to every column (presumably to reduce skew).
# NOTE(review): this rebinds `df`, so running the cell twice applies the log twice.
df = np.log1p(df)

In [None]:
# Histograms again, now on the log-transformed columns.
for column_name, column_values in df.items():
    print(column_name)
    plt.hist(column_values, bins=1000)
    plt.show()

# MinMaxScaler
역변환 : inverse_scaled_data = scaler.inverse_transform(scaled_data)

In [None]:
# Min-max scaler with default settings; kept in `scaler` so the scaling can be inverted later.
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the whole frame and get the scaled values as an array.
# NOTE(review): fitting before the train/test split lets test-set min/max
# influence the training data.
scaled = scaler.fit_transform(df)
print(scaled)

# Data Sampling

## 파라미터 설정

In [None]:
SEQUENCE_LENGTH = 16 * 60   # rows per input window. TODO: confirm why the factor is 60 (presumably 16 hours of 1-minute rows)
OFFSET = 16 * 60            # rows between the last input row and the target row. TODO: same question
BATCH_SIZE = 32
SAMPLING_COUNT = 10000      # number of random windows to sample

In [None]:
# `numbers` is just another name for the scaled array used by the sampling cell.
numbers = scaled
print(numbers)

In [None]:
# Randomly sample SAMPLING_COUNT (input window, scalar target) pairs.
#   x = rows [i, i + SEQUENCE_LENGTH), all columns
#   y = the LAST column of the row that lies OFFSET rows after the end of x
# randint's exclusive upper bound already keeps the target index inside
# `numbers`, so the former per-sample bounds check could never trigger and
# has been removed.
# NOTE(review): np.random is not seeded, so the sample differs on every run.
max_start = len(numbers) - SEQUENCE_LENGTH - OFFSET
if max_start <= 0:
    raise ValueError("not enough rows for SEQUENCE_LENGTH + OFFSET")

raw_x = []
raw_y = []

for j in range(SAMPLING_COUNT):
  i = np.random.randint(max_start)
  raw_x.append(numbers[i:i + SEQUENCE_LENGTH, :])
  raw_y.append(numbers[i + SEQUENCE_LENGTH + OFFSET - 1, -1])
# Example with SEQUENCE_LENGTH = 5 and OFFSET = 3 (one column shown):
#              0         1
#              01234567890123456789
# numbers    = 12345678901234567890
#                 i = 3
#                 <-x->  y
#   x = [3:8]          = 45678
#   y = [3 + 5 + 3 - 1] = [10] = 1



In [None]:
# Show the first two sampled (window, target) pairs as a sanity check.
for sample_idx in (0, 1):
    if sample_idx:
        print()
    print(raw_x[sample_idx])
    print(raw_y[sample_idx])

# 데이터셋 나누기

In [None]:
# Stack the sampled windows: x is (samples, SEQUENCE_LENGTH, features),
# y holds one target per sample.
x = np.array(raw_x)
y = np.array(raw_y)
print(x.shape)
print(y.shape)

# 90/10 shuffled split. random_state pins the shuffle so the split, and every
# metric computed on it, is reproducible between runs. (train_test_split is
# already imported with the other imports at the top of the notebook.)
# NOTE(review): the windows are drawn at random from a single series and can
# overlap, so a shuffled split may put near-identical data in train and test;
# a chronological split would give a more honest estimate.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.1, shuffle=True, random_state=42)


print("Training Data:")
print(train_x.shape)
print(train_y.shape)

print("Test Data:")
print(test_x.shape)
print(test_y.shape)


In [None]:
# Inspect the first training sample.
print("train_x[0]\n", train_x[0])
print()
print("train_y[0]\n", train_y[0])

# The sampling cell takes the target from the LAST feature column, so plot
# that column of the first window together with that same sample's target.
# (Indexing train_y[1] picked the target of a different sample.)
first_window = train_x[0]
first_target = np.ravel(train_y[0])[-1]   # works for scalar and 1-element targets
plt.plot(first_window[:, -1], '.', color='blue')
# The target lies OFFSET rows after the window; the x position is schematic.
plt.plot([len(first_window)], [first_target], 'x', color='red')

plt.show()

In [None]:
# Derive the model's input and output sizes from the training arrays.
print(train_x.shape)
print(train_y.shape)
input_shape = train_x.shape[1:]     # per-sample shape (drops the sample axis)
# With one scalar target per sample train_y is 1-D, and shape[-1] would then be
# the NUMBER OF SAMPLES rather than the number of outputs. Use a single
# output unit in that case.
output_shape = 1 if train_y.ndim == 1 else train_y.shape[-1]
print(input_shape)
print(output_shape)

# 모델 생성 및 학습

In [None]:
from tensorflow import keras
from tensorflow.keras.layers import Dense, RepeatVector, Flatten
from tensorflow.keras.layers import Bidirectional, LSTM, GRU

# LSTM encoder followed by a small dense head with `output_shape` linear outputs.
model = keras.Sequential()
model.add(LSTM(128, input_shape=input_shape))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(output_shape))
model.compile(loss="mse", optimizer="adam", metrics=["mape"])
model.summary()

# Fit for 10 epochs, holding out 10% of the training samples for validation.
history = model.fit(train_x, train_y, epochs=10, verbose=1, validation_split=0.1, batch_size=BATCH_SIZE)


# Training and validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.show()



# Loss and MAPE on the held-out test set.
# NOTE(review): the targets are min-max scaled, so values near 0 can inflate MAPE — confirm the metric is meaningful here.
loss, mape = model.evaluate(test_x, test_y)
print("loss=", loss)
print("mape=", mape)



# Predictions for the test windows, with size-1 axes squeezed away.
y_ = model.predict(test_x).squeeze()

In [None]:
# Scatter true vs. predicted values, one figure per target column.
# With a single scalar target test_y / y_ are 1-D, so `[:, i]` raised
# IndexError; view both as (samples, columns) first so the same loop works
# for 1-D and 2-D targets.
true_2d = np.reshape(test_y, (len(test_y), -1))
pred_2d = np.reshape(y_, (len(test_y), -1))
for i in range(true_2d.shape[1]):
    plt.scatter(true_2d[:, i], pred_2d[:, i])
    plt.show()

## 결과 보기

In [None]:
# There is one scalar target per sample, so per-sample column slicing
# (`test_y[0, :-1]`) raised IndexError. Compare truth (blue) and prediction
# (red) across the first test samples instead.
n_show = 100
true_1d = np.reshape(test_y, (len(test_y), -1))[:, 0]
pred_1d = np.reshape(y_, (len(test_y), -1))[:, 0]
plt.plot(true_1d[:n_show], 'b.')
plt.plot(pred_1d[:n_show], 'r.')
plt.show()

# 역정규화 및 지수 변환

In [None]:
# The scaler was fitted on ALL feature columns, but the target is only the
# last one, so `scaler.inverse_transform` cannot take the 1-D target.
# Invert that single column by hand with its own parameters
# (MinMaxScaler: X_scaled = X * scale_ + min_), then undo log1p with expm1.
target_scale = scaler.scale_[-1]
target_min = scaler.min_[-1]
test_y_1d = np.reshape(test_y, (len(test_y), -1))[:, 0]
y_pred_1d = np.reshape(y_, (len(test_y), -1))[:, 0]
denomalized_test_y = (test_y_1d - target_min) / target_scale
denomalized_y_ = (y_pred_1d - target_min) / target_scale
delogged_test_y = np.expm1(denomalized_test_y)
delogged_y = np.expm1(denomalized_y_)

In [None]:
# First 100 test samples after undoing scaling and log:
# truth as blue circles, prediction as red dots.
plt.plot(delogged_test_y[:100], 'bo')
plt.plot(delogged_y[:100], 'r.')
plt.show()

# Disabled: per-sample column plot carried over from the multi-target experiment.
# plt.plot(delogged_test_y[0,:-1], 'bo')
# plt.plot(delogged_y[0,:-1], 'r.')
# plt.show()