[View in Colaboratory](https://colab.research.google.com/github/ccehshmily/learnDL/blob/master/tfs_data_preprocess.ipynb)

In [79]:
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import io
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

!wget https://raw.githubusercontent.com/ccehshmily/tfs/master/datasource/sampledata/sampleGOOG.txt -O /tmp/sample_data_GOOG.csv

--2018-05-19 23:48:02--  https://raw.githubusercontent.com/ccehshmily/tfs/master/datasource/sampledata/sampleGOOG.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 218008 (213K) [text/plain]
Saving to: ‘/tmp/sample_data_GOOG.csv’


2018-05-19 23:48:02 (7.70 MB/s) - ‘/tmp/sample_data_GOOG.csv’ saved [218008/218008]



In [80]:
# Normalizes a list of values to the [-1, 1] range
def normalize_list(list_values):
  """Linearly rescale values so the minimum maps to -1 and the maximum to 1.

  Args:
    list_values: iterable of numbers.

  Returns:
    A new list of floats with the same length and order as the input.
    An empty input yields an empty list. If every value is identical there
    is no spread to scale by, so each value maps to 0.0 (the centre of the
    target range) instead of dividing by zero.
  """
  # Materialize once so the input can be traversed several times.
  values = list(list_values)
  if not values:
    return []

  min_val = min(values)
  max_val = max(values)
  avg_val = (min_val + max_val) / 2.0    # midpoint of the observed range
  range_val = (max_val - min_val) / 2.0  # half-width of the observed range

  # Constant input: avoid ZeroDivisionError.
  if range_val == 0:
    return [0.0 for _ in values]

  return [(val - avg_val) / range_val for val in values]

def log_normalize_list(list_values):
  """Take a shifted natural log of each value, then rescale via normalize_list.

  Each value is shifted by (1.1 - minimum) before the log is taken, so the
  smallest input becomes log(1.1) and every log argument is positive.

  Args:
    list_values: sequence of numbers (traversed twice).

  Returns:
    Whatever normalize_list returns for the list of shifted logs.
  """
  lowest = min(list_values)
  logged = []
  for item in list_values:
    logged.append(math.log(item + 1.1 - lowest))
  return normalize_list(logged)

# Validate the method: the smallest input (1) should map to -1.0 and the
# largest (5) to 1.0. The call form of print gives the same output for a
# single argument on Python 2 and also parses on Python 3.
print(log_normalize_list([3.8,1,2,3,4,5]))

[0.6502266418305604, -1.0, -0.15690153631711687, 0.35089826599010526, 0.7154322364014578, 1.0]


In [81]:
def _intraday_change_percent(day):
  """Return one day's open-to-close move as a percentage of its open."""
  return (float(day["Close"]) - float(day["Open"])) / float(day["Open"]) * 100.00


# Preprocess raw data saved in /tmp/sample_data_[code].csv, extract data from
# recent history, and save processed and normalized data to
# /tmp/processed_data_[code].csv and /tmp/normalized_data_[code].csv
def preprocess_raw_data(code, price_lookback, volume_lookback, change_lookback):
  """Expand each raw row with values from the rows after it and normalize.

  The raw CSV must have a header row containing at least the columns
  "Date", "Open", "High", "Low", "Close" and "Volume". Rows that follow a
  given row are treated as that row's history.
  NOTE(review): this presumes the file is ordered newest-first - confirm
  against the data source.

  Every output row holds, in order:
    * "Date": code + "-" + the row's date,
    * price_0 .. price_{4*price_lookback}: the row's own Open, then
      Close/High/Low/Open of each of the next price_lookback rows,
    * volume_0 .. volume_{volume_lookback-1}: Volume of the next rows,
    * change_0 .. change_{change_lookback}: percent open-to-close change of
      the next change_lookback rows, followed by the row's own change (the
      label to predict).

  The last max(lookbacks) rows lack enough history and are not emitted.
  The normalized file applies normalize_list separately to the price,
  volume and change groups of each row.
  NOTE(review): the label is normalized together with the change history,
  so the scaling of the change features depends on the label - confirm that
  this is intended.

  Args:
    code: ticker string used in the file names and the Date prefix.
    price_lookback: number of following rows contributing price columns.
    volume_lookback: number of following rows contributing volume columns.
    change_lookback: number of following rows contributing change columns.

  Returns:
    None. Writes /tmp/processed_data_[code].csv and
    /tmp/normalized_data_[code].csv.
  """
  with open("/tmp/sample_data_" + code + ".csv", 'r') as sample_data:
    sample_data_lines = sample_data.readlines()

  # Read each line into a map from column name to value. Only the line
  # terminator is stripped, so a final line without a trailing newline keeps
  # its last character and CRLF files do not leave a stray "\r".
  column_titles = sample_data_lines[0].rstrip('\r\n').split(',')
  mapped_data = []
  for one_line in sample_data_lines[1:]:
    stripped_line = one_line.rstrip('\r\n')
    if not stripped_line:
      continue  # tolerate blank lines, e.g. at the end of the file
    fields = stripped_line.split(',')  # split once per line, not once per column
    mapped_data.append({column_titles[i]: fields[i] for i in range(len(column_titles))})

  max_lookback = max(price_lookback, volume_lookback, change_lookback)
  expanded_titles = (
      ["Date"]
      + ["price_" + str(i) for i in range(price_lookback*4+1)]
      + ["volume_" + str(i) for i in range(volume_lookback)]
      + ["change_" + str(i) for i in range(change_lookback+1)])
  header_line = ','.join(expanded_titles) + "\n"

  # Expand each day's data by reading from the past N days; the furthest N
  # days cannot be processed. Values stay as strings, exactly as they are
  # written to the processed file, so normalizing from memory below is
  # equivalent to re-reading that file.
  expanded_rows = []
  for i in range(len(mapped_data) - max_lookback):
    today = mapped_data[i]

    price_history = [today["Open"]]
    for past_day in mapped_data[i+1:i+1+price_lookback]:
      price_history.extend([past_day["Close"], past_day["High"], past_day["Low"], past_day["Open"]])

    volume_history = [past_day["Volume"] for past_day in mapped_data[i+1:i+1+volume_lookback]]

    change_history = [str(_intraday_change_percent(past_day))
                      for past_day in mapped_data[i+1:i+1+change_lookback]]
    # Add today's change, this will be the label that we try to predict
    change_history.append(str(_intraday_change_percent(today)))

    expanded_rows.append((code + "-" + today["Date"], price_history, volume_history, change_history))

  with open("/tmp/processed_data_" + code + ".csv", 'w') as processed_data:
    processed_data.write(header_line)
    for date_label, price_history, volume_history, change_history in expanded_rows:
      processed_data.write(','.join([date_label] + price_history + volume_history + change_history) + "\n")

  # Normalize data: each group is rescaled independently within its row.
  with open("/tmp/normalized_data_" + code + ".csv", 'w') as normalized_data:
    normalized_data.write(header_line)
    for date_label, price_history, volume_history, change_history in expanded_rows:
      price_data = [float(price) for price in price_history]
      volume_data = [float(volume) for volume in volume_history]
      change_data = [float(change) for change in change_history]

      nor_price_data = [str(price) for price in normalize_list(price_data)]
      nor_volume_data = [str(volume) for volume in normalize_list(volume_data)]
      nor_change_data = [str(change) for change in normalize_list(change_data)]

      normalized_data.write(','.join([date_label] + nor_price_data + nor_volume_data + nor_change_data) + "\n")

# Ticker to process and how many past rows feed each feature group.
code = "GOOG"
price_lookback = 10
volume_lookback = 25
change_lookback = 25

preprocess_raw_data(code, price_lookback, volume_lookback, change_lookback)

# Validate the method: show the header and first data row of both output
# files, then the number of remaining normalized rows. The paths follow
# `code`, so the check always inspects the files that were just written, and
# the `with` blocks close the handles. The call form of print gives the same
# output for a single argument on Python 2 and also parses on Python 3.
with open("/tmp/processed_data_" + code + ".csv", 'r') as f1:
  print(f1.readline())
  print(f1.readline())
with open("/tmp/normalized_data_" + code + ".csv", 'r') as f2:
  print(f2.readline())
  print(f2.readline())
  print(len(f2.readlines()))

Date,price_0,price_1,price_2,price_3,price_4,price_5,price_6,price_7,price_8,price_9,price_10,price_11,price_12,price_13,price_14,price_15,price_16,price_17,price_18,price_19,price_20,price_21,price_22,price_23,price_24,price_25,price_26,price_27,price_28,price_29,price_30,price_31,price_32,price_33,price_34,price_35,price_36,price_37,price_38,price_39,price_40,volume_0,volume_1,volume_2,volume_3,volume_4,volume_5,volume_6,volume_7,volume_8,volume_9,volume_10,volume_11,volume_12,volume_13,volume_14,volume_15,volume_16,volume_17,volume_18,volume_19,volume_20,volume_21,volume_22,volume_23,volume_24,change_0,change_1,change_2,change_3,change_4,change_5,change_6,change_7,change_8,change_9,change_10,change_11,change_12,change_13,change_14,change_15,change_16,change_17,change_18,change_19,change_20,change_21,change_22,change_23,change_24,change_25

GOOG-2016-05-24,706.859985,704.23999,711.478027,704.179993,706.530029,709.73999,714.580017,700.52002,701.619995,700.320007,706.00,696.799988,702.

In [82]:
# Restrict TensorFlow logging to errors.
tf.logging.set_verbosity(tf.logging.ERROR)

# Keep pandas output compact: at most 10 rows, floats with 4 decimals.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", '{:.4f}'.format)

# Load the normalized rows (comma is read_csv's default separator), show
# summary statistics, then preview the first 10 rows as the cell's output.
google_data = pd.read_csv("/tmp/normalized_data_GOOG.csv")
display.display(google_data.describe())
google_data.head(10)

Unnamed: 0,price_0,price_1,price_2,price_3,price_4,price_5,price_6,price_7,price_8,price_9,...,change_16,change_17,change_18,change_19,change_20,change_21,change_22,change_23,change_24,change_25
count,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,...,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0
mean,0.1346,0.1072,0.3742,-0.1676,0.1138,0.0866,0.3538,-0.1869,0.0943,0.0675,...,0.0033,0.0021,0.0021,0.0026,0.0028,0.0014,0.0014,0.0019,0.0019,-0.0042
std,0.64,0.6095,0.5514,0.5855,0.5515,0.5367,0.4955,0.5252,0.5072,0.4998,...,0.5139,0.5149,0.5163,0.516,0.5181,0.5189,0.5195,0.5221,0.523,0.5107
min,-1.0,-1.0,-0.959,-1.0,-1.0,-1.0,-0.959,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,-0.4475,-0.4484,-0.1306,-0.7399,-0.393,-0.3892,-0.0592,-0.6519,-0.3242,-0.3342,...,-0.3557,-0.3578,-0.3672,-0.3617,-0.3616,-0.36,-0.361,-0.3617,-0.3553,-0.3617
50%,0.223,0.1709,0.4865,-0.1353,0.1821,0.1403,0.4352,-0.1621,0.1296,0.0961,...,0.0078,0.005,0.0069,0.0061,0.008,0.0118,0.0123,0.0185,0.0167,-0.0106
75%,0.7275,0.6814,0.9075,0.3761,0.6045,0.5652,0.7914,0.2708,0.5241,0.4862,...,0.369,0.3718,0.3753,0.377,0.3813,0.3807,0.376,0.3794,0.3766,0.3584
max,1.0,1.0,1.0,0.841,1.0,0.9906,1.0,0.82,1.0,0.9862,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,Date,price_0,price_1,price_2,price_3,price_4,price_5,price_6,price_7,price_8,...,change_16,change_17,change_18,change_19,change_20,change_21,change_22,change_23,change_24,change_25
0,GOOG-2016-05-24,-0.2731,-0.4624,0.0606,-0.4668,-0.297,-0.065,0.2847,-0.7312,-0.6517,...,0.286,-1.0,0.0354,-0.9758,0.5879,-0.3509,0.3618,-0.196,-0.8098,1.0
1,GOOG-2016-05-23,-0.297,-0.065,0.2847,-0.7312,-0.6517,-0.7457,-0.3353,-1.0,-0.5983,...,-1.0,0.0469,-0.9755,0.6056,-0.3437,0.377,-0.187,-0.8077,0.5231,-0.009
2,GOOG-2016-05-20,-0.6517,-0.7457,-0.3353,-1.0,-0.5983,-0.2897,0.0694,-0.7233,-0.5036,...,0.0469,-0.9755,0.6056,-0.3437,0.377,-0.187,-0.8077,0.5231,0.4559,0.6868
3,GOOG-2016-05-19,-0.5382,-0.2413,0.1043,-0.6586,-0.4471,-0.2691,0.7942,-0.4165,0.4096,...,-0.9755,0.6056,-0.3437,0.377,-0.187,-0.8077,0.5231,0.4559,0.0928,0.0068
4,GOOG-2016-05-18,-0.1734,-0.029,0.8331,-0.1486,0.5213,0.5495,0.6617,-0.0617,0.1345,...,0.6056,-0.3437,0.377,-0.187,-0.8077,0.5231,0.4559,0.0928,0.3037,0.3408
5,GOOG-2016-05-17,0.5213,0.5495,0.6617,-0.0617,0.1345,0.2303,0.5592,0.1418,0.2924,...,-0.3437,0.377,-0.187,-0.8077,0.5231,0.4559,0.0928,0.3037,0.4672,-0.497
6,GOOG-2016-05-16,0.1345,0.2303,0.5592,0.1418,0.2924,0.3702,0.7051,0.1272,0.5816,...,0.377,-0.187,-0.8077,0.5231,0.4559,0.0928,0.3037,0.4672,-0.2942,0.6307
7,GOOG-2016-05-13,0.2926,0.3704,0.7052,0.1274,0.5817,0.482,1.0,0.3416,0.9397,...,-0.187,-0.8077,0.5231,0.4559,0.0928,0.3037,0.4672,-0.2942,-0.1611,0.0707
8,GOOG-2016-05-12,0.5817,0.482,1.0,0.3416,0.9397,0.9267,0.9448,0.5062,0.5643,...,-0.8077,0.5231,0.4559,0.0928,0.3037,0.4672,-0.2942,-0.1611,-0.1775,-0.1024
9,GOOG-2016-05-11,0.9948,0.9814,1.0,0.549,0.6087,0.3855,0.7223,0.2174,0.3333,...,0.5231,0.4559,0.0928,0.3037,0.4672,-0.2942,-0.1611,-0.1775,0.7765,-0.384
