data from https://www.kaggle.com/mattiuzc/commodity-futures-price-history

# 데이터 압축 풀기

In [None]:
!wget https://github.com/dhrim/deep_learning_data/raw/master/commodity.zip

In [None]:
!rm -rf commodity
!unzip commodity.zip

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Silver 예측

## 파라미터 설정

In [None]:
SEQUENCE_LENGTH = 10
OFFSET = 1
BATCH_SIZE = 32

## 데이터 로딩

In [None]:
df = pd.read_csv('commodity/Silver.csv')
df.head()

In [None]:
print(len(df))

## 결측치 처리

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df = df.dropna()

In [None]:
df.isnull().sum()

In [None]:
df.info()

## 정렬 순서 확인

### 날짜순서 정렬 확인

In [None]:
df.sort_values(by=['Date'])

In [None]:
sorted_index = df.sort_values(by=['Date']).index

In [None]:
# Check whether the rows are already in Date order.
# sorted_index is a permutation of df.index, so the signed differences always
# sum to zero whatever the row order; count the positions that differ instead
# (0 means the frame is already sorted).
print(sorted_index)
print(sorted_index - df.index)
print((sorted_index != df.index).sum())

### Close 와 Adj Close 컬럼 비교

In [None]:
# Compare the Close and Adj Close columns row by row.
close_gap = df['Close'] - df['Adj Close']
print(close_gap)
# Sum the absolute gaps: a signed sum can cancel to zero even when the two
# columns differ, which would wrongly suggest they are identical.
print(close_gap.abs().sum())

In [None]:
# Discard the Date and Adj Close columns; the remaining columns feed the model.
df = df.drop(columns=["Date", "Adj Close"])

## 컬럼 이름 및 Histogram 출력

In [None]:
# Print each column name and draw a 1000-bin histogram of its values.
for column_name, column_values in df.items():
    print(column_name)
    plt.hist(column_values, bins=1000)
    plt.show()

## Open, Volume 그래프 시각화

In [None]:
# One wide line plot each for the columns at positions 0 and 4
# (Open and Volume according to the section heading).
for column_position in (0, 4):
    plt.figure(figsize=(14, 3))
    plt.plot(df.iloc[:, column_position])
    plt.show()


In [None]:
raw_data = df.to_numpy()
print(raw_data.shape)

In [None]:
print(raw_data[:5])

## Normalization

In [None]:
maxs = raw_data.max(axis=0)
print(maxs)
mins = raw_data.min(axis=0)
print(mins)

In [None]:
# Min-max scale every column using the per-column minimum and maximum.
column_span = maxs - mins
normalized = (raw_data - mins) / column_span
print(normalized[:5])

In [None]:
preprocessed = normalized

## by Data Sampling

In [None]:
numbers = preprocessed

In [None]:
# Slide a SEQUENCE_LENGTH-row window over `numbers`. Each window is one input
# sample; the row OFFSET steps after the window's last row is its target.
raw_x = []
raw_y = []

# The target row index is i+SEQUENCE_LENGTH+OFFSET-1 and must stay below
# len(numbers), so the last usable start is len(numbers)-SEQUENCE_LENGTH-OFFSET
# inclusive. Stopping one step earlier would throw away the final valid sample.
# A series that is too short gives an empty range and therefore no samples.
for i in range(len(numbers) - SEQUENCE_LENGTH - OFFSET + 1):
  raw_x.append(numbers[i:i+SEQUENCE_LENGTH,:])
  raw_y.append(numbers[i+SEQUENCE_LENGTH+OFFSET-1,:])
# Worked example with a window length of 5 and an offset of 1:
#              0         1
#              01234567890123456789
# numbers[0] = 12345678901234567890
#                 i = 3
#                 <-x->   = [3:8] = 45678
#                      y  = [8]   = 9



In [None]:
print(raw_x[0])
print(raw_y[0])
print()
print(raw_x[1])
print(raw_y[1])

## 데이터셋 나누기

In [None]:
# Stack the windows and targets into arrays and hold out 10% for testing.
x = np.array(raw_x)
y = np.array(raw_y)
print(x.shape)
print(y.shape)

from sklearn.model_selection import train_test_split

# Consecutive windows start one row apart and share almost all of their rows,
# so a shuffled split would place near-copies of every test window (and its
# target row) in the training set. Keep chronological order instead: the
# final 10% of the windows becomes the test set.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.1, shuffle=False)


print("Training Data:")
print(train_x.shape)
print(train_y.shape)

print("Test Data:")
print(test_x.shape)
print(test_y.shape)


In [None]:
# Print the first training window with its target, then plot the first two
# features: the window as dots and the target at x = len(window).
first_window, first_target = train_x[0], train_y[0]
print("train_x[0]\n", first_window)
print()
print("train_y[0]\n", first_target)

target_x = [len(first_window)]
for feature, target_marker, colour in ((0, 'x', 'blue'), (1, 'o', 'red')):
    plt.plot(first_window.T[feature], '.', color=colour)
    plt.plot(target_x, first_target[feature], target_marker, color=colour)

plt.show()

In [None]:
print(train_x.shape)
print(train_y.shape)
input_shape = train_x.shape[1:]
output_shape = train_y.shape[-1]
print(input_shape)
print(output_shape)

In [None]:
from tensorflow import keras
from tensorflow.keras.layers import Dense, RepeatVector, Flatten
from tensorflow.keras.layers import Bidirectional, LSTM, GRU

# LSTM over the input window, a small ReLU hidden layer, then a linear layer
# with one unit per target feature. Trained on MSE while reporting MAPE.
model = keras.Sequential([
    LSTM(128, input_shape=input_shape),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(output_shape),
])
model.compile(loss="mse", optimizer="adam", metrics=["mape"])
model.summary()

# Train for 10 epochs with 10% of the training data held out for validation.
history = model.fit(
    train_x,
    train_y,
    batch_size=BATCH_SIZE,
    epochs=10,
    validation_split=0.1,
    verbose=1,
)

# Training and validation loss per epoch on one figure.
for curve_name in ('loss', 'val_loss'):
    plt.plot(history.history[curve_name])
plt.show()

# Score the held-out test set: the loss first, then the compiled metric.
test_scores = model.evaluate(test_x, test_y)
loss, mape = test_scores
print("loss=", loss)
print("mape=", mape)



# Predict the test set and scatter actual vs predicted values per feature.
# The model's last layer is Dense(output_shape), so the prediction already has
# one row per sample and one column per feature. Squeezing it would drop an
# axis whenever there is a single test sample or a single output feature, and
# the column indexing below would then fail.
y_ = model.predict(test_x)
for i in range(output_shape):
    plt.scatter(test_y[:,i], y_[:,i])
    plt.show()

## 예측, 실제 비교

In [None]:
# First test sample, actual (blue) vs predicted (red): every feature first,
# then the same comparison without the last feature.
for feature_slice in (slice(None), slice(None, -1)):
    plt.plot(test_y[0][feature_slice], 'b.')
    plt.plot(y_[0][feature_slice], 'r.')
    plt.show()

## DeNormalize
n = (x - m) / (M - m)  (m: 컬럼별 최솟값, M: 컬럼별 최댓값)

x = n * (M - m) + m

In [None]:
# Undo the min-max scaling with the same per-column minimum and maximum.
column_span = maxs - mins
denomalized_test_y = mins + column_span * test_y
denomalized_y_ = mins + column_span * y_

In [None]:
# First test sample after denormalisation, actual (blue) vs predicted (red):
# every feature first, then the same comparison without the last feature.
for feature_slice in (slice(None), slice(None, -1)):
    plt.plot(denomalized_test_y[0][feature_slice], 'b.')
    plt.plot(denomalized_y_[0][feature_slice], 'r.')
    plt.show()

# Gold 예측

## 파라미터 설정

In [None]:
SEQUENCE_LENGTH = 10
OFFSET = 1
BATCH_SIZE = 32

## 데이터 로딩

In [None]:
df = pd.read_csv('commodity/Gold.csv')
df.head()

In [None]:
print(len(df))

## 결측치 처리

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df = df.dropna()

In [None]:
df.isnull().sum()

In [None]:
df.info()

## 정렬 순서 확인

### 날짜순서 정렬 확인

In [None]:
df.sort_values(by=['Date'])

In [None]:
sorted_index = df.sort_values(by=['Date']).index

In [None]:
# Check whether the rows are already in Date order.
# sorted_index is a permutation of df.index, so the signed differences always
# sum to zero whatever the row order; count the positions that differ instead
# (0 means the frame is already sorted).
print(sorted_index)
print(sorted_index - df.index)
print((sorted_index != df.index).sum())

### Close 와 Adj Close 컬럼 비교

In [None]:
# Compare the Close and Adj Close columns row by row.
close_gap = df['Close'] - df['Adj Close']
print(close_gap)
# Sum the absolute gaps: a signed sum can cancel to zero even when the two
# columns differ, which would wrongly suggest they are identical.
print(close_gap.abs().sum())

### 불필요 컬럼 삭제

In [None]:
# Discard the Date and Adj Close columns; the remaining columns feed the model.
df = df.drop(columns=["Date", "Adj Close"])

## 컬럼별 분포 보기

In [None]:
# Print each column name and draw a 1000-bin histogram of its values.
for column_name, column_values in df.items():
    print(column_name)
    plt.hist(column_values, bins=1000)
    plt.show()

In [None]:
plt.hist(df.Volume+1, bins=100)
plt.show()

In [None]:
# Replace Volume with log(Volume + 1); the +1 maps a zero volume to 0 rather
# than -inf. From here on the Volume column is in log units.
df['Volume'] = np.log(df.Volume+1)

In [None]:
plt.hist(df.Volume, bins=100)
plt.show()

## Open, Volume 그래프 시각화

In [None]:
# One wide line plot each for the columns at positions 0 and 4
# (Open and Volume according to the section heading).
for column_position in (0, 4):
    plt.figure(figsize=(14, 3))
    plt.plot(df.iloc[:, column_position])
    plt.show()


In [None]:
raw_data = df.to_numpy()
print(raw_data.shape)

In [None]:
print(raw_data[:5])

## Normalization

In [None]:
maxs = raw_data.max(axis=0)
print(maxs)
mins = raw_data.min(axis=0)
print(mins)

In [None]:
# Min-max scale every column using the per-column minimum and maximum.
column_span = maxs - mins
normalized = (raw_data - mins) / column_span
print(normalized[:5])

In [None]:
preprocessed = normalized

## by Data Sampling

In [None]:
numbers = preprocessed

In [None]:
# Slide a SEQUENCE_LENGTH-row window over `numbers`. Each window is one input
# sample; the row OFFSET steps after the window's last row is its target.
raw_x = []
raw_y = []

# The target row index is i+SEQUENCE_LENGTH+OFFSET-1 and must stay below
# len(numbers), so the last usable start is len(numbers)-SEQUENCE_LENGTH-OFFSET
# inclusive. Stopping one step earlier would throw away the final valid sample.
# A series that is too short gives an empty range and therefore no samples.
for i in range(len(numbers) - SEQUENCE_LENGTH - OFFSET + 1):
  raw_x.append(numbers[i:i+SEQUENCE_LENGTH,:])
  raw_y.append(numbers[i+SEQUENCE_LENGTH+OFFSET-1,:])
# Worked example with a window length of 5 and an offset of 1:
#              0         1
#              01234567890123456789
# numbers[0] = 12345678901234567890
#                 i = 3
#                 <-x->   = [3:8] = 45678
#                      y  = [8]   = 9



In [None]:
print(raw_x[0])
print(raw_y[0])
print()
print(raw_x[1])
print(raw_y[1])

## 데이터셋 나누기

In [None]:
# Stack the windows and targets into arrays and hold out 10% for testing.
x = np.array(raw_x)
y = np.array(raw_y)
print(x.shape)
print(y.shape)

from sklearn.model_selection import train_test_split

# Consecutive windows start one row apart and share almost all of their rows,
# so a shuffled split would place near-copies of every test window (and its
# target row) in the training set. Keep chronological order instead: the
# final 10% of the windows becomes the test set.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.1, shuffle=False)


print("Training Data:")
print(train_x.shape)
print(train_y.shape)

print("Test Data:")
print(test_x.shape)
print(test_y.shape)


In [None]:
# Print the first training window with its target, then plot the first two
# features: the window as dots and the target at x = len(window).
first_window, first_target = train_x[0], train_y[0]
print("train_x[0]\n", first_window)
print()
print("train_y[0]\n", first_target)

target_x = [len(first_window)]
for feature, target_marker, colour in ((0, 'x', 'blue'), (1, 'o', 'red')):
    plt.plot(first_window.T[feature], '.', color=colour)
    plt.plot(target_x, first_target[feature], target_marker, color=colour)

plt.show()

In [None]:
print(train_x.shape)
print(train_y.shape)
input_shape = train_x.shape[1:]
output_shape = train_y.shape[-1]
print(input_shape)
print(output_shape)

In [None]:
from tensorflow import keras
from tensorflow.keras.layers import Dense, RepeatVector, Flatten
from tensorflow.keras.layers import Bidirectional, LSTM, GRU

# LSTM over the input window, a small ReLU hidden layer, then a linear layer
# with one unit per target feature. Trained on MSE while reporting MAPE.
model = keras.Sequential([
    LSTM(128, input_shape=input_shape),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(output_shape),
])
model.compile(loss="mse", optimizer="adam", metrics=["mape"])
model.summary()

# Train for 10 epochs with 10% of the training data held out for validation.
history = model.fit(
    train_x,
    train_y,
    batch_size=BATCH_SIZE,
    epochs=10,
    validation_split=0.1,
    verbose=1,
)

# Training and validation loss per epoch on one figure.
for curve_name in ('loss', 'val_loss'):
    plt.plot(history.history[curve_name])
plt.show()

# Score the held-out test set: the loss first, then the compiled metric.
test_scores = model.evaluate(test_x, test_y)
loss, mape = test_scores
print("loss=", loss)
print("mape=", mape)



# Predict the test set and scatter actual vs predicted values per feature.
# The model's last layer is Dense(output_shape), so the prediction already has
# one row per sample and one column per feature. Squeezing it would drop an
# axis whenever there is a single test sample or a single output feature, and
# the column indexing below would then fail.
y_ = model.predict(test_x)
for i in range(output_shape):
    plt.scatter(test_y[:,i], y_[:,i])
    plt.show()

## 예측, 실제 비교

In [None]:
print(test_y[0])

In [None]:
# First test sample, actual (blue) vs predicted (red): every feature first,
# then the same comparison without the last feature.
for feature_slice in (slice(None), slice(None, -1)):
    plt.plot(test_y[0][feature_slice], 'b.')
    plt.plot(y_[0][feature_slice], 'r.')
    plt.show()

## DeNormalize
n = (x - m) / (M - m)  (m: 컬럼별 최솟값, M: 컬럼별 최댓값)

x = n * (M - m) + m

In [None]:
# Undo the min-max scaling with the same per-column minimum and maximum.
denomalized_test_y = test_y * (maxs - mins) + mins
denomalized_y_ = y_ * (maxs - mins) + mins

# In this section Volume was replaced by log(Volume + 1) before scaling, so
# reversing only the min-max step leaves that column in log units. Apply the
# inverse, exp(v) - 1, to get back to the original units. The array columns
# follow df's column order because raw_data came from df.to_numpy().
volume_col = df.columns.get_loc("Volume")
denomalized_test_y[:, volume_col] = np.expm1(denomalized_test_y[:, volume_col])
denomalized_y_[:, volume_col] = np.expm1(denomalized_y_[:, volume_col])

In [None]:
# First test sample after denormalisation, actual (blue) vs predicted (red):
# every feature first, then the same comparison without the last feature.
for feature_slice in (slice(None), slice(None, -1)):
    plt.plot(denomalized_test_y[0][feature_slice], 'b.')
    plt.plot(denomalized_y_[0][feature_slice], 'r.')
    plt.show()

## Open 가격 예측 비교

In [None]:
print(test_y)

In [None]:
# First column of every test sample after denormalisation:
# actual values in blue, predictions in red.
plt.figure(figsize=(14, 3))
for series, marker in ((denomalized_test_y, 'b.'), (denomalized_y_, 'r.')):
    plt.plot(series[:, 0], marker)
plt.show()

In [None]:
# Overlay the predicted first column on the full first-column series.
plt.figure(figsize=(14,3))
# Whole series, drawn against the data frame's row index.
plt.plot(df.iloc[:,0])
# NOTE(review): the predictions cover only the test samples and are drawn at
# x = 0..len(test)-1 in test-set order, so they do not line up with the
# positions of the series above — confirm this overlay is what is intended.
plt.plot(denomalized_y_[:,0], 'r')
plt.show()

# Gold 시기별 예측

## 파라미터 설정

In [None]:
SEQUENCE_LENGTH = 10
OFFSET = 1
BATCH_SIZE = 32

## 데이터 로딩

In [None]:
df = pd.read_csv('commodity/Gold.csv')
df.head()

In [None]:
print(len(df))

## 결측치 처리

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df = df.dropna()

In [None]:
df.isnull().sum()

In [None]:
df.info()

## 정렬 순서 확인

### 날짜순서 정렬 확인

In [None]:
df.sort_values(by=['Date'])

In [None]:
sorted_index = df.sort_values(by=['Date']).index

In [None]:
# Check whether the rows are already in Date order.
# sorted_index is a permutation of df.index, so the signed differences always
# sum to zero whatever the row order; count the positions that differ instead
# (0 means the frame is already sorted).
print(sorted_index)
print(sorted_index - df.index)
print((sorted_index != df.index).sum())

### Close 와 Adj Close 컬럼 비교

In [None]:
# Compare the Close and Adj Close columns row by row.
close_gap = df['Close'] - df['Adj Close']
print(close_gap)
# Sum the absolute gaps: a signed sum can cancel to zero even when the two
# columns differ, which would wrongly suggest they are identical.
print(close_gap.abs().sum())

### 불필요 컬럼 삭제

In [None]:
# Drop Adj Close and keep the dates for period-wise inspection, but as a
# datetime index rather than a column. Left as a string column, Date would
# make df.to_numpy() an object array and break the min-max arithmetic below,
# and the positional column lookups (0 and 4) would no longer hit the numeric
# columns they are meant to plot.
df = df.drop(columns=["Adj Close"])
df["Date"] = pd.to_datetime(df["Date"])
df = df.set_index("Date")

## 컬럼별 분포 보기

In [None]:
# Print each column name and draw a 1000-bin histogram of its values.
for column_name, column_values in df.items():
    print(column_name)
    plt.hist(column_values, bins=1000)
    plt.show()

## Open, Volume 그래프 시각화

In [None]:
# One wide line plot each for the columns at positions 0 and 4.
for column_position in (0, 4):
    plt.figure(figsize=(14, 3))
    plt.plot(df.iloc[:, column_position])
    plt.show()


In [None]:
raw_data = df.to_numpy()
print(raw_data.shape)

In [None]:
print(raw_data[:5])

## Normalization

In [None]:
maxs = raw_data.max(axis=0)
print(maxs)
mins = raw_data.min(axis=0)
print(mins)

In [None]:
# Min-max scale every column using the per-column minimum and maximum.
column_span = maxs - mins
normalized = (raw_data - mins) / column_span
print(normalized[:5])

In [None]:
preprocessed = normalized

## by Data Sampling

In [None]:
numbers = preprocessed

In [None]:
# Slide a SEQUENCE_LENGTH-row window over `numbers`. Each window is one input
# sample; the row OFFSET steps after the window's last row is its target.
raw_x = []
raw_y = []

# The target row index is i+SEQUENCE_LENGTH+OFFSET-1 and must stay below
# len(numbers), so the last usable start is len(numbers)-SEQUENCE_LENGTH-OFFSET
# inclusive. Stopping one step earlier would throw away the final valid sample.
# A series that is too short gives an empty range and therefore no samples.
for i in range(len(numbers) - SEQUENCE_LENGTH - OFFSET + 1):
  raw_x.append(numbers[i:i+SEQUENCE_LENGTH,:])
  raw_y.append(numbers[i+SEQUENCE_LENGTH+OFFSET-1,:])
# Worked example with a window length of 5 and an offset of 1:
#              0         1
#              01234567890123456789
# numbers[0] = 12345678901234567890
#                 i = 3
#                 <-x->   = [3:8] = 45678
#                      y  = [8]   = 9



In [None]:
print(raw_x[0])
print(raw_y[0])
print()
print(raw_x[1])
print(raw_y[1])

## 데이터셋 나누기

In [None]:
# Stack the windows and targets into arrays and hold out 10% for testing.
x = np.array(raw_x)
y = np.array(raw_y)
print(x.shape)
print(y.shape)

from sklearn.model_selection import train_test_split

# Consecutive windows start one row apart and share almost all of their rows,
# so a shuffled split would place near-copies of every test window (and its
# target row) in the training set. Keep chronological order instead: the
# final 10% of the windows becomes the test set, which also keeps the test
# predictions in time order for the period-wise plots.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.1, shuffle=False)


print("Training Data:")
print(train_x.shape)
print(train_y.shape)

print("Test Data:")
print(test_x.shape)
print(test_y.shape)


In [None]:
# Print the first training window with its target, then plot the first two
# features: the window as dots and the target at x = len(window).
first_window, first_target = train_x[0], train_y[0]
print("train_x[0]\n", first_window)
print()
print("train_y[0]\n", first_target)

target_x = [len(first_window)]
for feature, target_marker, colour in ((0, 'x', 'blue'), (1, 'o', 'red')):
    plt.plot(first_window.T[feature], '.', color=colour)
    plt.plot(target_x, first_target[feature], target_marker, color=colour)

plt.show()

In [None]:
print(train_x.shape)
print(train_y.shape)
input_shape = train_x.shape[1:]
output_shape = train_y.shape[-1]
print(input_shape)
print(output_shape)

In [None]:
from tensorflow import keras
from tensorflow.keras.layers import Dense, RepeatVector, Flatten
from tensorflow.keras.layers import Bidirectional, LSTM, GRU

# LSTM over the input window, a small ReLU hidden layer, then a linear layer
# with one unit per target feature. Trained on MSE while reporting MAPE.
model = keras.Sequential([
    LSTM(128, input_shape=input_shape),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(output_shape),
])
model.compile(loss="mse", optimizer="adam", metrics=["mape"])
model.summary()

# Train for 10 epochs with 10% of the training data held out for validation.
history = model.fit(
    train_x,
    train_y,
    batch_size=BATCH_SIZE,
    epochs=10,
    validation_split=0.1,
    verbose=1,
)

# Training and validation loss per epoch on one figure.
for curve_name in ('loss', 'val_loss'):
    plt.plot(history.history[curve_name])
plt.show()

# Score the held-out test set: the loss first, then the compiled metric.
test_scores = model.evaluate(test_x, test_y)
loss, mape = test_scores
print("loss=", loss)
print("mape=", mape)



# Predict the test set and scatter actual vs predicted values per feature.
# The model's last layer is Dense(output_shape), so the prediction already has
# one row per sample and one column per feature. Squeezing it would drop an
# axis whenever there is a single test sample or a single output feature, and
# the column indexing below would then fail.
y_ = model.predict(test_x)
for i in range(output_shape):
    plt.scatter(test_y[:,i], y_[:,i])
    plt.show()

## 예측, 실제 비교

In [None]:
print(test_y[0])

In [None]:
# First test sample, actual (blue) vs predicted (red): every feature first,
# then the same comparison without the last feature.
for feature_slice in (slice(None), slice(None, -1)):
    plt.plot(test_y[0][feature_slice], 'b.')
    plt.plot(y_[0][feature_slice], 'r.')
    plt.show()

## DeNormalize
n = (x - m) / (M - m)  (m: 컬럼별 최솟값, M: 컬럼별 최댓값)

x = n * (M - m) + m

In [None]:
# Undo the min-max scaling with the same per-column minimum and maximum.
column_span = maxs - mins
denomalized_test_y = mins + column_span * test_y
denomalized_y_ = mins + column_span * y_

In [None]:
# First test sample after denormalisation, actual (blue) vs predicted (red):
# every feature first, then the same comparison without the last feature.
for feature_slice in (slice(None), slice(None, -1)):
    plt.plot(denomalized_test_y[0][feature_slice], 'b.')
    plt.plot(denomalized_y_[0][feature_slice], 'r.')
    plt.show()

## Open 가격 예측 비교

In [None]:
print(test_y)

In [None]:
# First column of every test sample after denormalisation. Actual values are
# blue and predictions red, the same colour convention as the other
# comparison plots in this notebook; the legend makes the roles explicit.
plt.figure(figsize=(14,3))
plt.plot(denomalized_test_y[:,0], 'b.', label='actual')
plt.plot(denomalized_y_[:,0], 'r.', label='predicted')
plt.legend()
plt.show()