### 学习使用Tushare， 下载A股数据

参考：
- [1]: https://mp.weixin.qq.com/s?__biz=MzAwOTgzMDk5Ng==&mid=2650833972&idx=1&sn=4de9f9ee81bc8bf85d1e0a4a8f79b0de&chksm=80adb30fb7da3a19817c72ff6f715ee91d6e342eb0402e860e171993bb0293bc4097e2dc4fe9&mpshare=1&scene=1&srcid=1106BPAdPiPCnj6m2Xyt5p2M#wechat_redirect
- [2]: http://tushare.org/

In [None]:
import sys
sys.path.append('C:\\Users\\docul\\ai4stocks-projects\\myStockAILib')
import stockbasic as sb
import dataprep as dp


import pandas as pd
import tushare as ts
print(ts.__version__)

### 例子，由沪深300 成分股，生成训练集

#### Fetch full data sets

In [None]:
# Date window passed to the fetch helper, as 'YYYY-MM-DD' strings.
sd = '2010-01-01'  # start date
ed = '2017-12-01'  # end date

# CSI 300 (HS300) constituent list as returned by tushare.
stock_list_hs300 = ts.get_hs300s()
# NOTE(review): the original author flagged '002839' to be dropped; nothing here drops it.

# Fetch the raw data for every stock in the list via the project helper.
all_data = dp.fetch_raw_data(stock_list_hs300, sd, ed)

#### Save to CSV

In [None]:
# Write the fetched frame to CSV, then print its shape and first rows.
# NOTE(review): the file name says 20171124 while the fetch cell uses ed = '2017-12-01' — confirm which is intended.
all_data.to_csv('hs300_20100101-20171124.csv')
print(all_data.shape)
print(all_data.head())

#### Read from CSV

In [None]:
# Reload the saved bars: CSV column 1 becomes the index and 'code' is kept as text.
rb = pd.read_csv('hs300_20100101-20171124.csv', dtype={'code': str}, index_col=[1])

# Rebuild the two-level index: the 'code' values first, then the index read from the CSV.
rb = rb.set_index([rb['code'], rb.index])
# Remove the duplicated 'code.1' column.
rb = rb.drop(['code.1'], axis=1)

print(rb.head(5))
all_data = rb

In [None]:
# One row per value of index level 0 with its row count; print every stock code.
stock_list = all_data.groupby(level=0).size().reset_index(name='counts')
for code in stock_list['code']:
    print(code)

#### Generate Stock Features

In [None]:
# Input: all_data, MultiIndex'ed by 'code' and 'date'
#
# Output: all_data_and_features, the same bars joined with the indicator columns
#         produced by the sb.* helpers, re-indexed by ('code', original index).

# Stocks with fewer bars than this are skipped entirely.
CONST_DROP_THRESHOLD = 1000

stock_list = all_data.groupby(level=0).size().reset_index(name='counts')

# Initial call to print 0% progress
total = stock_list.shape[0]
i = 0
dp.printProgressBar(i, total, prefix = 'Progress:', suffix = 'Complete', length = 60)

# Per-stock frames are collected here and concatenated once after the loop:
# growing a DataFrame with .append() copies all rows on every iteration and
# DataFrame.append no longer exists in pandas >= 2.0.
feature_frames = []

for index, row in stock_list.iterrows():  # iterate the stock list
    # slicing one stock_data
    stock_data = all_data.loc[row['code']]

    # skip if stock_data has less than CONST_DROP_THRESHOLD bars
    if stock_data.shape[0] < CONST_DROP_THRESHOLD:
        i += 1
        print('DROP ' + row['code'])
        continue

    # Each helper receives the frame built so far, so the join order matters.
    # add 'EMA8', 'EMA21'
    stock_data = stock_data.join(sb.ttm_propulsion(stock_data))
    # add SQZ
    stock_data = stock_data.join(sb.ttm_squeeze(stock_data))
    # add WAVE
    stock_data = stock_data.join(sb.ttm_wave(stock_data))
    # add ADX
    stock_data = stock_data.join(sb.talib_adx(stock_data))
    # add ATR
    stock_data = stock_data.join(sb.talib_atr(stock_data))
    # add N-bar LOW
    stock_data = stock_data.join(sb.talib_nbarlow(stock_data, N_BAR_LOWEST = 10))

    feature_frames.append(stock_data)

    # update progress bar
    i += 1
    dp.printProgressBar(i, total, prefix = 'Progress:', suffix = 'Complete', length = 60)

if feature_frames:
    all_data_and_features = pd.concat(feature_frames)
    # MultiIndex with both 'code' and 'date'
    all_data_and_features.set_index([all_data_and_features['code'], all_data_and_features.index], inplace=True)
else:
    # Every stock was dropped: keep an empty frame instead of failing on the
    # missing 'code' column.
    all_data_and_features = pd.DataFrame()
    print('WARNING: no stock has at least', CONST_DROP_THRESHOLD, 'bars')

print('original all_data: ')
print(all_data.columns, '\n', all_data.shape)

print('after adding featuresa: ')
print(all_data_and_features.columns, '\n', all_data_and_features.shape)
print(all_data_and_features.head())

#### Create Samples

In [None]:
# Number of distinct values on index level 0 of all_data_and_features,
# i.e. how many stocks survived feature generation. The expression is a
# 1-tuple (n,) and is shown as the notebook cell output.
all_data_and_features.groupby(level=0).size().shape

In [None]:
# Build X_all / Y_all by generating samples stock by stock.
# Per-stock results are collected in lists and concatenated once at the end;
# repeated DataFrame.append copies everything each time and was removed in pandas 2.0.
x_frames = []
y_frames = []

# Initial call to print 0% progress
total = all_data_and_features.groupby(level=0).size().shape[0]
i = 0
dp.printProgressBar(i, total, prefix = 'Progress:', suffix = 'Complete', length = 60)

for code, onestock_df in all_data_and_features.groupby(level=0):
    print(code, ' ', onestock_df.shape)
    onestock_X_all, onestock_Y_all = dp.generate_samples(onestock_df)
    print(onestock_X_all.shape, onestock_Y_all.shape)

    # Add samples to X_all
    x_frames.append(onestock_X_all)
    # Add sell-point information to Y_all.
    y_frames.append(onestock_Y_all)

    # update progress
    i += 1
    dp.printProgressBar(i, total, prefix = 'Progress:', suffix = 'Complete', length = 60)

# X keeps each stock's own index; Y is renumbered 0..n-1, as the original
# append(..., ignore_index=True) did.
X_all = pd.concat(x_frames) if x_frames else pd.DataFrame()
Y_all = pd.concat(y_frames, ignore_index=True) if y_frames else pd.DataFrame()


In [None]:
# Sample count (rows of Y_all) and total timestep rows (rows of X_all).
print('Total Samples: ', Y_all.shape[0])
print('Total timesteps in all Samples: ', X_all.shape[0])

# Samples whose sell price is more than 20% above the buy price.
sold_above_20pct = Y_all.sell_price > (Y_all.buy_price*1.2)
print('No. of samples with positive profit higher than 20%: ', Y_all[sold_above_20pct]['code'].count())

#### Save X_all/Y_all (samples) to CSV

In [None]:
# Persist the generated samples: X (timestep rows) and Y (one row per sample) go to separate CSV files.
X_all.to_csv('hs300_20100101-20171124-samples-X-all.csv')
Y_all.to_csv('hs300_20100101-20171124-samples-Y-all.csv')


#### Read X_all/Y_all from CSV

In [None]:
# Reload the X samples: CSV column 1 becomes the index and 'code' is kept as text.
rbx = pd.read_csv('hs300_20100101-20171124-samples-X-all.csv', dtype={'code': str}, index_col=[1])

# Rebuild the two-level index: the 'code' values first, then the index read from the CSV.
rbx = rbx.set_index([rbx['code'], rbx.index])
# Remove the duplicated 'code.1' column.
rbx = rbx.drop(['code.1'], axis=1)

print(rbx.head(1))

In [None]:
# read Y_all samples
# Reload the Y samples; the first CSV column is used as the index and 'code' is kept as a string.
rby = pd.read_csv('hs300_20100101-20171124-samples-Y-all.csv', index_col=[0], dtype={'code':str})

print(rby.head(1))

In [None]:
# update Y_all buy_date and sell_date
#
# Keep only characters 12..21 of each stored date string.
# NOTE(review): presumably this strips a wrapper around a 'YYYY-MM-DD' date
# written by to_csv — confirm against the CSV contents.
#
# The slicing is done column-wise: the previous row loop passed index labels
# to .iloc (only right when the index is exactly 0..n-1) and built one Series
# per row.
rby['buy_date'] = rby['buy_date'].str[12:22]
rby['sell_date'] = rby['sell_date'].str[12:22]

print(rby.head(5))

In [None]:
# Continue with the frames reloaded from CSV instead of the in-memory results.
X_all = rbx
Y_all = rby

### Visualize stock_data

In [None]:
# -*- coding:utf-8 -*-
import numpy as np
import tushare as ts
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.finance as mpf
%matplotlib inline

In [None]:

# Show which columns are available in the X and Y sample frames.
print(X_all.columns)
print(Y_all.columns)

#### Given 'code', show stock's 'name'

### Define stock plot

In [None]:
from matplotlib.pylab import date2num
import datetime

#
# date_to_num
#


def date_to_num(dates):
    ''' Convert tushare 'date' strings to matplotlib date numbers.

    Input
    =====
    dates: iterable of 'YYYY-MM-DD' strings, e.g. ['2013-01-31', ...]

    Output
    ======
    Return: list with one matplotlib date number per input string, as
            produced by matplotlib's date2num (a float counting days).

    Example
    =======
        stock_data['mpl.date'] = date_to_num(stock_data['date'].values)

    '''
    # Parse each string into a datetime, then let date2num do the conversion.
    return [date2num(datetime.datetime.strptime(text, '%Y-%m-%d'))
            for text in dates]


#
# plog_stock_data
#


def plot_stock_data(stock_data, title_postfix=''):
    ''' Plot one stock's bars and indicators as seven stacked panels sharing the x axis.

    Input
    =====
    stock_data: DataFrame with a two-level index whose second level is 'date'
        ('YYYY-MM-DD' strings, as accepted by date_to_num) and the columns
        'code', 'open', 'close', 'high', 'low', 'volume', 'EMA8', 'EMA21',
        'MTMMA', 'SQUEEZE', 'MACD6', 'HIST1'..'HIST5' and 'ADX'.
    title_postfix: string appended to the stock code in the title.

    Output
    ======
    Return: None. A new matplotlib figure is created and drawn on.
    '''
    # Work on a shallow copy and turn index level 1 ('date') into a column.
    sdata = stock_data.copy(deep=False)
    sdata.reset_index(level=1, inplace=True)

    # convert date to num
    sdata['mpl.date'] = date_to_num(sdata['date'].values)
    dates = sdata['mpl.date'].values
    bar_x = dates - 0.25  # bars are drawn shifted a quarter day to the left

    fig, axes = plt.subplots(7, sharex=True, figsize=(15,14),
                             gridspec_kw={'height_ratios':[3,1,1,1,1,1,1]})

    # axes[0]: k-line (candlesticks) with EMA8 / EMA21 overlays
    mpf.candlestick_ochl(axes[0],
                         sdata[['mpl.date', 'open', 'close', 'high', 'low']].values,
                         width=1.0,
                         colorup = 'g',
                         colordown = 'r')
    axes[0].plot(dates, sdata['EMA8'].values, 'm', label='EMA8')
    axes[0].plot(dates, sdata['EMA21'].values, 'c', label='EMA21')
    axes[0].legend(loc=0)
    axes[0].set_title(stock_data['code'].iloc[0] + ' ' + title_postfix)
    axes[0].set_ylabel('Price')
    axes[0].grid(True)
    axes[0].xaxis_date()

    # axes[1]: volume
    axes[1].bar(bar_x, sdata['volume'].values, width= 0.5)
    axes[1].set_ylabel('Volume')
    axes[1].grid(True)

    # axes[2]: MTMMA bars, green when positive and red otherwise
    for bar in axes[2].bar(bar_x, sdata['MTMMA'].values, width=0.8):
        bar.set_color('g' if bar.get_height() > 0 else 'r')

    # axes[2]: SQUEEZE, a black dot on the zero line wherever the squeeze is
    # ongoing; NaN everywhere else so nothing is drawn there.
    axes[2].plot(bar_x,
                 [0 if x == sb.CONST_SQUEEZE_ONGOING else float('nan') for x in sdata['SQUEEZE'].values],
                 'ko',
                 label='SQUEEZE')
    axes[2].set_ylabel('SQZ')
    axes[2].grid(True)

    # axes[3..5]: TTM WAVE C / B / A, two overlaid bar series per panel.
    # The second series of each pair is drawn on top of the first.
    wave_panels = (
        (axes[3], 'WAVE C', ('MACD6', 'red'), ('HIST5', 'orange')),
        (axes[4], 'WAVE B', ('HIST4', 'magenta'), ('HIST3', 'teal')),
        (axes[5], 'WAVE A', ('HIST2', 'lawngreen'), ('HIST1', 'yellow')),
    )
    for wave_ax, ylabel, lower, upper in wave_panels:
        for column, color in (lower, upper):
            wave_ax.bar(bar_x, sdata[column].values, color=color, width=0.8, alpha=0.8)
        wave_ax.set_ylabel(ylabel)
        wave_ax.grid(True)

    # axes[6]: ADX
    axes[6].plot(dates, sdata['ADX'].values, 'm', label='ADX')
    axes[6].set_ylabel('ADX')
    axes[6].grid(True)
    


### Visualize one stock sample

In [None]:
import random, math

# plot a random good sample

CONST_PROFIT_THRESHOLD = 2  # profit bigger than

# Positions of all samples whose profit beats the threshold. Choosing among
# them directly gives the same uniform pick as retrying random indices, but
# cannot loop forever when no sample qualifies.
good_positions = [position
                  for position, profit in enumerate(Y_all['profit'].values)
                  if profit > CONST_PROFIT_THRESHOLD]

if not good_positions:
    print('No sample with profit above', CONST_PROFIT_THRESHOLD)
else:
    pindex = random.choice(good_positions)
    picked = Y_all.iloc[pindex]
    # Sample number pindex owns rows [pindex * L, (pindex + 1) * L) of X_all,
    # where L is dp.CONST_LOOKBACK_SAMPLES.
    pstock_data = X_all.iloc[(pindex * dp.CONST_LOOKBACK_SAMPLES):((pindex +1) * dp.CONST_LOOKBACK_SAMPLES)]
    title_postfix = ' ==  Profit: ' + str(round(picked.profit * 100, 1)) + '%,' + ' B/S: ' + picked.buy_date + ' / ' + picked.sell_date
    plot_stock_data(pstock_data, title_postfix)


#### Calculate Y_all's buy_date and sell_date

In [None]:
# Calculate Y_all's buy_date and sell_date as matplotlib date numbers,
# then derive how long each position was held.
# convert date to num
Y_all['mpl.buy_date'] = date_to_num(Y_all['buy_date'].values)
Y_all['mpl.sell_date'] = date_to_num(Y_all['sell_date'].values)
# Difference of the two date numbers: sell date minus buy date.
Y_all['hold_days'] = Y_all['mpl.sell_date'] - Y_all['mpl.buy_date']

In [None]:
# Sanity check: report every sample that was bought and sold on the same day
# (hold_days == 0), then print how many there are.
zero_hold = Y_all[Y_all['hold_days'] == 0]
for index, row in zero_hold.iterrows():
    print("ERROR")
    print(index, row['hold_days'], row['profit'])
i = len(zero_hold)
print('Total', i)


In [None]:
# Calculate Y_all 'profit' as a fraction: sell price over buy price, minus 1.
Y_all['profit'] = Y_all['sell_price']/Y_all['buy_price'] - 1

# Calculate Y_all 'profit.per.day': the profit fraction divided by the holding period.
# NOTE(review): rows with hold_days == 0 divide by zero here (inf/NaN) — see the sanity-check cell.
Y_all['profit.per.day'] = Y_all['profit']/Y_all['hold_days']

# Shown as the notebook cell output.
Y_all.head(1)

In [None]:
# keep 'profit', which is a rate of profit (fraction), and drop the listed bookkeeping columns.
# Any column not listed in the drop call (e.g. 'profit.per.day') stays in Y_all_cleanup.
Y_all_cleanup = Y_all.drop(['mpl.buy_date', 'mpl.sell_date', 'hold_days', 'buy_date', 'buy_price', 'code', 'sell_date', 'sell_price', 'sell_reason'], axis=1)
print(Y_all_cleanup.head(1))
print(Y_all_cleanup.shape)
# NOTE(review): the message says 20% but the filter uses 0.2239; count() also reports one number per remaining column.
print('No. of samples with positive profit higher than 20%: ', Y_all_cleanup[Y_all_cleanup.profit>0.2239].count())

#### Visualize Y_all 'hold_days' and 'profit'

In [None]:
# plot 'hold_days' (Day) and 'profit' (%) relationship
#  - colorbar indicates the fraction earned per Day ('profit.per.day').
#
# Only samples with profit above 22.39% are shown.
# Use `Y_profit = Y_all` instead to plot every sample.
Y_profit = Y_all[Y_all['profit'] > 0.2239]

plt.figure(figsize=(15, 5))
# The y axis shows the total profit in percent, matching the axis label.
# Previously 'profit.per.day' was used for both y and colour, so the plot
# did not show the profit itself.
plt.scatter(Y_profit['hold_days'].values,  # hold days
            Y_profit['profit'].values*100,  # profit percentage %
            c=Y_profit['profit.per.day'],  # increment per day
            marker='o')
plt.colorbar()
plt.grid(True)
plt.xlabel('Days')
plt.ylabel('Profit Percentage')
plt.show()

#### Visualize Y_all 'profit' Distribution Histogram
#### Percentile Y_all 'profit'

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Y_profit_np = Y_all_cleanup['profit'].values

# quantile
print(np.percentile(Y_profit_np,95))  # 95th percentile
# percentiles at 10% steps: 0th, 10th, ..., 90th
print(np.percentile(Y_profit_np, np.arange(0, 100, 10)))

# total number of samples vs. samples at or above the 22.39% profit threshold
print(Y_all_cleanup.shape[0], 'vs.', Y_all_cleanup[Y_all_cleanup.profit>=0.2239].shape[0])


# Histogram of the profits in (0.2, 0.3].
# Both conditions are combined into one mask: chaining two boolean filters
# applies a mask built on the full frame to an already filtered frame, which
# pandas has to re-align (and warns about).
in_band = (Y_all_cleanup['profit'] > 0.2) & (Y_all_cleanup['profit'] <= 0.3)
sns.distplot(Y_all_cleanup.loc[in_band, 'profit'].values, bins=10, kde=False, rug=False)
plt.grid(True)
plt.show()

#### Calculate sigmoid('profit' * 100 - 22.39 + 2)

In [None]:
from scipy.special import expit

# Shift the profit (in percent) so that 22.39% maps to +2, then squash with
# the logistic function.
t = Y_all['profit'].values  * 100 - 22.39 + 2
print(t.shape)

Y_all['sigmoid.profit'] = expit(t)

# Distribution of the shifted profit before the sigmoid is applied.
sns.distplot(t, bins=50, kde=False, rug=False)
plt.grid(True)
plt.show()

# Distribution of the raw profit, for comparison.
sns.distplot(Y_all_cleanup['profit'].values, bins=50, kde=False, rug=False)
plt.grid(True)
plt.show()

#### Calculate sigmoid('profit.per.day')

In [None]:
from scipy.special import expit

# Daily profit in percent, as a numpy array.
t = Y_all['profit.per.day'].values*100
print(t.shape)

# True if any daily-profit value is NaN.
print(np.isnan(t).any())

# Line plot of 'profit.per.day' against the Y_all index.
Y_all['profit.per.day'].plot()
plt.show()

#sns.distplot(Y_all['profit.per.day'].values*100, bins=10, kde=False, rug=False)
# NOTE(review): with the distplot above commented out, these two calls only show an empty grid.
plt.grid(True)
plt.show()

#sns.distplot(Y_all_cleanup['profit'].values, bins=50, kde=False, rug=False)
#plt.grid(True)
#plt.show()

#### Sample Standardization

归一化


INPUT
=====
X_all: DataFrame with all samples and all features.

OUTPUT
======
X_all_cleanup: DataFrame with all samples,
    - but trimming off unnecessary features. 'ATR', 'LOW10', etc. are dropped.

X_all_cleanup_std: DataFrame,
    - which keep one sample in every CONST_ONE_OUT_OF_NUMBER samples.
    - with all features standardized.

In [None]:
# drop column 'ATR', 'LOW10', 'open', 'high', 'low', 'code'
# The remaining columns are the features that get standardized afterwards.
X_all_cleanup = X_all.drop(['ATR', 'LOW10', 'open', 'high', 'low', 'code'], axis=1)

# print(X_all_cleanup.head())
# X_all_cleanup.shape
# Summary statistics of the remaining feature columns.
print(X_all_cleanup.describe())

#### Pick up X samples, and Standardize them

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

# Standardize every sample (a window of CONST_LOOKBACK_SAMPLES consecutive
# rows of X_all_cleanup) independently. The result has 13 columns in this order:
#   volume, close, EMA8, EMA21, ADX, SQUEEZE, MTMMA, HIST1..HIST5, MACD6
scaler = MinMaxScaler()
absscaler = MaxAbsScaler()

# NOTE: subsampling by CONST_ONE_OUT_OF_NUMBER is not applied to X in this
# cell; every sample is kept.
CONST_ONE_OUT_OF_NUMBER = 1

# Note:
#   - 'close', 'EMA8', 'EMA21' these three are correlated, and should be rescaled based on same value.
CONST_LOOKBACK_SAMPLES = 60

# Columns scaled into [-1, 1] by their largest absolute value within the window.
MAX_ABS_COLUMNS = ('MTMMA', 'HIST1', 'HIST2', 'HIST3', 'HIST4', 'HIST5', 'MACD6')

total_samples = int(X_all_cleanup.shape[0] / CONST_LOOKBACK_SAMPLES)

std_sample_list = []  # standardized samples, one DataFrame per window

# Keep the progress total at 1 or more.
# NOTE(review): presumably printProgressBar divides by its total — confirm.
progress_total = max(total_samples - 1, 1)

# Initial call to print 0% progress
dp.printProgressBar(0, progress_total, prefix = 'Samples Standardization:', suffix = 'Complete', length = 60)

for i in range(0, total_samples):
    # slice out one sample
    one_sample = X_all_cleanup.iloc[(i * CONST_LOOKBACK_SAMPLES):((i+1) * CONST_LOOKBACK_SAMPLES), :]
    # new frame sharing the window's index
    standardized_sample = pd.DataFrame(index=one_sample.index)

    # Scaling 'volume' to [0, 1] within the window
    standardized_sample['volume'] = scaler.fit_transform(one_sample['volume'].values.reshape(-1,1)).ravel()

    # Scaling 'close', 'EMA8', 'EMA21' with the min/max of 'close' so the
    # three series stay comparable on one scale.
    pmin = one_sample['close'].min()
    pmax = one_sample['close'].max()
    price_span = pmax - pmin
    if price_span == 0:
        # Flat window: avoid 0/0 and map 'close' to 0, the same way
        # MinMaxScaler treats a constant feature.
        price_span = 1.0
    for column in ('close', 'EMA8', 'EMA21'):
        standardized_sample[column] = (one_sample[column].values - pmin) / price_span

    # Scaling 'ADX' to [0, 1] within the window
    standardized_sample['ADX'] = scaler.fit_transform(one_sample['ADX'].values.reshape(-1,1)).ravel()

    # Copy 'SQUEEZE' unchanged
    standardized_sample['SQUEEZE'] = one_sample['SQUEEZE']

    # Scaling 'MTMMA', 'HIST1' ~ 'HIST5', 'MACD6'
    for column in MAX_ABS_COLUMNS:
        standardized_sample[column] = absscaler.fit_transform(one_sample[column].values.reshape(-1,1)).ravel()

    # add it to std_sample_list
    std_sample_list.append(standardized_sample)

    # update progress bar
    dp.printProgressBar(i, progress_total, prefix = 'Samples Standardization:', suffix = 'Complete', length = 60)

print('starting to concat ...')
# pd.concat raises on an empty list, so fall back to an empty frame.
X_all_cleanup_std = pd.concat(std_sample_list) if std_sample_list else pd.DataFrame()
print('DONE')

In [None]:
# Shape, number of samples (rows / window length) and summary statistics of the standardized X frame.
print(X_all_cleanup_std.shape)
print('Selected X samples: ', X_all_cleanup_std.shape[0]/CONST_LOOKBACK_SAMPLES)
print(X_all_cleanup_std.describe())


#### Pick up Y samples

INPUT
=====
Y_all_cleanup: DataFrame with 'profit' as only column

OUTPUT
======
Y_all_cleanup_std: DataFrame,
    - based from Y_all_cleanup.
    - which keep one sample in every CONST_ONE_OUT_OF_NUMBER samples.

Y_all_binary_profit: DataFrame,
    - based from Y_all_cleanup_std.
    - change 'profit' to a binary value. int(1) or int(0).
        - profit bigger than CONST_PROFIT_THRESHOLD is stored as 1.
        - profit less than or equal to CONST_PROFIT_THRESHOLD is stored as 0.
        


In [None]:
CONST_PROFIT_THRESHOLD = 0.2239  # take any profit value less than or equal to this number as a LOSS.

#
# Y_all_cleanup_std
#

total_samples = int(X_all_cleanup.shape[0] / CONST_LOOKBACK_SAMPLES)
total_samples_y = Y_all_cleanup.shape[0]

if total_samples_y != total_samples:
    print('ERROR: unmatched X and Y samples')

# Keep one sample out of every CONST_ONE_OUT_OF_NUMBER (positions 0, N, 2N, ...).
# A strided slice selects the same rows as appending them one at a time,
# without the quadratic copying.
Y_all_cleanup_std = Y_all_cleanup.iloc[::CONST_ONE_OUT_OF_NUMBER].copy()

#
# Y_all_binary_profit
#

# create binary profit value: 0 for loss, 1 for win.
# The whole column is assigned at once. The former row-wise
# `.loc[index]['profit'] = ...` was a chained assignment, which pandas may
# apply to a temporary copy and silently discard.
# The column keeps its original dtype, so the values are stored as 1.0 / 0.0.
Y_all_binary_profit = Y_all_cleanup_std.copy()
Y_all_binary_profit['profit'] = (
    Y_all_cleanup_std['profit'] > CONST_PROFIT_THRESHOLD
).astype(Y_all_cleanup_std['profit'].dtype)




In [None]:
# Both frames should have the same number of rows; only the 'profit' values differ.
print('Y_all_cleanup_std: ', Y_all_cleanup_std.shape)
print('Y_all_binary_profit: ', Y_all_binary_profit.shape)

In [None]:
# Count the rows labelled 1; the per-column counts are shown as the notebook cell output.
Y_all_binary_profit[Y_all_binary_profit['profit']>0.5].count()  # 1 means Good Profit, 0 means Loss

#### Change Y_all_binary_profit to Categorical array

INPUT:
=====
Y_all_binary_profit: Dataframe with binary 'profit' field.

OUTPUT:
=======
Y_all_binary_categorical: Numpy data array in shape [x, 2]
    - two category column. One for Good Profit, the other for Loss.

In [None]:
from keras.utils import np_utils

# One-hot encode the binary labels with Keras' to_categorical.
# NOTE(review): the whole DataFrame is passed, not only the 'profit' column — confirm it has no other columns at this point.
Y_all_binary_categorical = np_utils.to_categorical(Y_all_binary_profit)
print(Y_all_binary_categorical)
print(Y_all_binary_categorical.shape)

### Choose Balanced samples for 'Good Profit' and 'Loss'

There are 6626 samples of 'Good Profit', profit bigger than 22.39%. To balance that, choose 6626 (1 out of 9) samples of 'Loss' randomly from the X/Y datasets.

Together, they make a balanced train/test sets.

In [None]:
#
# pick_one_sample_x_and_y
#


def pick_one_sample_x_and_y(index, X_samples_all, Y_samples_all, len_X_sample=60, len_Y_sample=1):
    ''' Return the X rows and Y rows that belong to sample number `index`.

    Samples are stored back to back: sample k occupies rows
    [k * len, (k + 1) * len) of its frame, addressed by position.

    Input
    =====
    index: int, sequential (0-based) sample number
    X_samples_all: DataFrame holding the X rows of all samples
    Y_samples_all: DataFrame holding the Y rows of all samples
    len_X_sample:  int, number of rows per X sample. Default 60.
    len_Y_sample:  int, number of rows per Y sample. Default 1.

    Output
    ======
    one_sample_x: DataFrame
        the sample's X part
    one_sample_y: DataFrame
        the sample's Y part
    '''
    x_start = index * len_X_sample
    y_start = index * len_Y_sample

    one_sample_x = X_samples_all.iloc[x_start:x_start + len_X_sample, :]
    one_sample_y = Y_samples_all.iloc[y_start:y_start + len_Y_sample, :]

    return one_sample_x, one_sample_y


In [None]:

#
# generate balanced dataset
#

import random
random.seed('2017-12-02')

# Selection rule, applied to every row of Y_all_binary_profit in order:
#   - a 'Good Profit' sample (profit == 1) is always picked;
#   - a 'Loss' sample (profit == 0) is picked only when a random integer drawn
#     from 1..9 equals 9, i.e. about 1 out of 9, which roughly balances the
#     two classes.

# how many 'profit'/'loss' samples are selected
i = 0  # 'profit'
j = 0  # 'loss'

X_frame = []  # collection of X sample dataframes.
Y_frame = []  # collection of Y sample dataframes.

# pick_one_sample_x_and_y slices by position, so the running position is used
# rather than the index label: the two only coincide when the index is 0..n-1.
for position, (index, row) in enumerate(Y_all_binary_profit.iterrows()):
    # generate a random number between 1 and 9, inclusive
    selector = random.randint(1, 9)

    if (row['profit'] == 0) and (selector <= 8):
        # skip
        continue

    # pick it
    one_x_sample, one_y_sample = pick_one_sample_x_and_y(position,
                                                         X_all_cleanup_std,  # X samples all
                                                         Y_all_binary_profit,  # Y samples all
                                                         dp.CONST_LOOKBACK_SAMPLES,  # len of each X sample
                                                         1  # len of each Y sample
                                                        )

    # debug code
    if (row['profit'] == 1):
        i += 1
    else:
        j += 1

    # add it to X_frame and Y_frame
    X_frame.append(one_x_sample)
    Y_frame.append(one_y_sample)

# pd.concat raises on an empty list, so fall back to empty frames.
X_all_picked = pd.concat(X_frame) if X_frame else pd.DataFrame()
Y_all_picked = pd.concat(Y_frame) if Y_frame else pd.DataFrame()

In [None]:
# Picked 'profit' samples, picked 'loss' samples, and their sum,
# followed by the sample counts implied by the picked X and Y frames.
print(i,j, i+j)
print('X picked: ', X_all_picked.shape[0]/dp.CONST_LOOKBACK_SAMPLES)
print('Y picked: ', Y_all_picked.shape[0])

#### Split to Train and Test

In [None]:
import random

#
# splict_train_and_test
#


def splict_train_and_test(X_samples_all, Y_samples_all, len_X_sample=60, len_Y_sample=1, train_test_ratio=7):
    ''' Randomly split samples into X_train, Y_train and X_test, Y_test.

    Every sample is assigned independently: an integer is drawn from 1..10
    and the sample goes to the train set when the draw is <= train_test_ratio,
    otherwise to the test set. The split sizes are therefore only
    approximately train_test_ratio : (10 - train_test_ratio).

    Input
    =====
    X_samples_all: DataFrame, X_all set
    len_X_sample:  integer, is one X_sample's length. Default 60.
    Y_samples_all: DataFrame  Y_all set
    len_Y_sample:  integer, is one Y_sample's length. Default 1.

    train_test_ratio: integer between 1 and 10, inclusive.
        Out of every 10 samples, roughly how many are used for training;
        the others are used for testing.

    Output
    ======
    X_train, Y_train: DataFrame
    X_test,  Y_test : DataFrame
        A set that receives no sample is returned as a zero-row frame with
        the columns of the corresponding input.
    '''

    def _concat_or_empty(frames, template):
        # pd.concat raises ValueError on an empty list (e.g. no test samples
        # when train_test_ratio is 10); return a zero-row slice instead.
        return pd.concat(frames) if frames else template.iloc[0:0]

    n_total_samples = int(X_samples_all.shape[0] / len_X_sample)

    Xtn_frame = []  # X_train
    Ytn_frame = []  # Y_train
    Xtt_frame = []  # X_test
    Ytt_frame = []  # Y_test

    for index in range(0, n_total_samples):
        # slice out one sample
        one_x_sample, one_y_sample = pick_one_sample_x_and_y(index,
                                                             X_samples_all,  # X samples all
                                                             Y_samples_all,  # Y samples all
                                                             len_X_sample,  # len of each X sample
                                                             len_Y_sample  # len of each Y sample
                                                            )

        # generate a random number
        selector = random.randint(1, 10)

        if selector <= train_test_ratio:
            # put this sample to X_train, and Y_train
            Xtn_frame.append(one_x_sample)
            Ytn_frame.append(one_y_sample)
        else:
            # put this sample to X_test, and Y_test
            Xtt_frame.append(one_x_sample)
            Ytt_frame.append(one_y_sample)

    print('start concat...')
    # concat
    X_train = _concat_or_empty(Xtn_frame, X_samples_all)
    Y_train = _concat_or_empty(Ytn_frame, Y_samples_all)
    X_test = _concat_or_empty(Xtt_frame, X_samples_all)
    Y_test = _concat_or_empty(Ytt_frame, Y_samples_all)

    print('Done!')
    return X_train, Y_train, X_test, Y_test


In [None]:
# Split the balanced samples: about 7 out of every 10 go to the train set, the rest to the test set.
X_train, Y_train, X_test, Y_test = splict_train_and_test(X_all_picked, Y_all_picked, dp.CONST_LOOKBACK_SAMPLES, 1, 7)

In [None]:
# Sample counts of each split: X rows divided by the window length, and Y rows.
print("Size of X_train/Y_train: ", int(X_train.shape[0] / dp.CONST_LOOKBACK_SAMPLES), Y_train.shape[0])
print("Size of X_test/Y_test: ", int(X_test.shape[0] / dp.CONST_LOOKBACK_SAMPLES), Y_test.shape[0])

# Rows with profit > 0 in each split; count() reports one number per column.
print('How many have positive profit in Y_train?: ', Y_train[Y_train.profit>0].count())
print('How many have positive profit in Y_test?: ', Y_test[Y_test.profit>0].count())


#### Save standardized X/Y samples to csv

In [None]:
# Save X sample 
# X_all_cleanup_std.to_csv('hs300_20100101-20171124-samples-cleanup-std.csv')
#print(all_data.shape)
#print(all_data.head())

#### Convert X/Y Train/Test to numpy array

In [None]:
from keras.utils import np_utils

# Reshape the flat X frames into (samples, timesteps, features).
# Both sizes are taken from the data instead of the literals 60 and 13: the
# window length is the same constant that was used to split the samples, and
# the feature count is the number of columns.
# NOTE: the model's input_shape must use the same two numbers.
n_timesteps = dp.CONST_LOOKBACK_SAMPLES
n_features = X_train.shape[1]

X_train_np = X_train.values.reshape(-1, n_timesteps, n_features)
X_test_np = X_test.values.reshape(-1, n_timesteps, n_features)

# Labels are used as they are (one row per sample).
Y_train_np = Y_train.values
Y_test_np = Y_test.values


# Y_train_np = np_utils.to_categorical(Y_train)
# Y_test_np = np_utils.to_categorical(Y_test)

print('Train X/Y: ', X_train_np.shape, Y_train_np.shape)
print('Test  X/Y: ', X_test_np.shape, Y_test_np.shape)


### 创建 Keras 模型

In [None]:
### LOAD PACKAGES 
from numpy.random import seed
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.optimizers import SGD
from keras.models import Sequential
from keras.layers import Dense, Flatten

#### Conv1D 模型

##### Common Settings:
- Number of filters: (power of 2, e.g. 32, 64, 128, 512)
- Kernel size: 3, 5, 1
- Stride: 1, 2

In [None]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.datasets import imdb

# Conv1D binary classifier over windows of CONST_LOOKBACK_SAMPLES timesteps
# with 13 features each: three stacked convolutions (256, 128, 32 filters,
# kernel size 3), max pooling, two dense layers, dropout and a single
# sigmoid output unit.
# NOTE(review): no activation is set on the Conv1D / hidden Dense layers
# (the 'tanh' options are commented out), so those layers are linear —
# confirm this is intended.
model = Sequential()
model.add(Conv1D(filters=256, kernel_size=3, padding='valid', input_shape=(dp.CONST_LOOKBACK_SAMPLES, 13)))  # , activation='tanh'
print('Model input shape: ', model.input_shape)
# The label now matches the layer: it has 256 filters, not 512.
print('Conv1D, 256', model.output_shape)

#model.add(Conv1D(filters=256, kernel_size=3, padding='same'))  # , activation='tanh'

model.add(Conv1D(filters=128, kernel_size=3, padding='same'))  # , activation='tanh'
print('Conv1D, 128', model.output_shape)

model.add(Conv1D(filters=32, kernel_size=3, padding='same'))  # , activation='tanh'
print('Conv1D, 32: ', model.output_shape)

# we use max pooling:
model.add(MaxPooling1D(2))
print('MaxPooling1D by 2: model output shape: ', model.output_shape)

model.add(Flatten())
print('Flatten： ', model.output_shape)

# We add a vanilla hidden layer:
model.add(Dense(896, kernel_initializer='normal'))  # , activation='tanh'))
print('Dense, hidden_dims: model output shape: ', model.output_shape)

# We add a vanilla hidden layer:
model.add(Dense(16, kernel_initializer='normal'))  # , activation='tanh'))
print('Dense, hidden_dims 16: model output shape: ', model.output_shape)

model.add(Dropout(0.2))
print('Dropout 0.2: model output shape: ', model.output_shape)

# We project onto a single unit output layer:
model.add(Dense(1, activation='sigmoid'))
print('Dense 1: ', model.output_shape)

# Alternative optimizer that was tried:
# sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
# model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

# Compile model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that printed an extra "None").
model.summary()

#### Start Training
#### Continue previous Training

In [None]:
# Start training, or
# continue a previous training run: do not call model.compile() again, which would reset the inner state of the optimizer.
# Re-running this cell therefore continues from the current weights.

# K.clear_session()

batch_size = 32
epochs = 10

# The test split is used as validation data during fitting.
model.fit(X_train_np, Y_train_np,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_test_np, Y_test_np))

In [None]:
import tensorflow as tf
from keras.backend import tensorflow_backend as K

# Rebuild and train the 1-D CNN inside an explicit TensorFlow session.
# NOTE(review): K.clear_session() at the end presumably discards the graph that
# `model` lives in — confirm before saving or predicting with it afterwards.
with tf.Session(config=tf.ConfigProto(
                    intra_op_parallelism_threads=0)) as sess:
    K.set_session(sess)   # your keras code follows

    # apply a convolution 1d of length 3 to a sequence with CONST_LOOKBACK_SAMPLES timesteps, each with 13-dimensions,
    # with 256 output filters
    model = Sequential()
    model.add(Conv1D(filters=256, kernel_size=3, padding='valid', input_shape=(dp.CONST_LOOKBACK_SAMPLES, 13)))  # , activation='tanh'
    print('Model input shape: ', model.input_shape)
    # label fixed: this layer has 256 filters, not 512
    print('Conv1D, 256', model.output_shape)

    #model.add(Conv1D(filters=256, kernel_size=3, padding='same'))  # , activation='tanh'

    model.add(Conv1D(filters=128, kernel_size=3, padding='same'))  # , activation='tanh'
    print('Conv1D, 128', model.output_shape)

    model.add(Conv1D(filters=32, kernel_size=3, padding='same'))  # , activation='tanh'
    print('Conv1D, 32: ', model.output_shape)

    # we use max pooling:
    model.add(MaxPooling1D(2))
    print('MaxPooling1D by 2: model output shape: ', model.output_shape)

    model.add(Flatten())
    print('Flatten： ', model.output_shape)

    # We add a vanilla hidden layer:
    model.add(Dense(896, kernel_initializer='normal'))  # , activation='tanh'))
    print('Dense, hidden_dims: model output shape: ', model.output_shape)

    # We add a second, narrower hidden layer:
    model.add(Dense(16, kernel_initializer='normal'))  # , activation='tanh'))
    print('Dense, hidden_dims 16: model output shape: ', model.output_shape)

    model.add(Dropout(0.2))
    print('Dropout 0.2: model output shape: ', model.output_shape)

    # We project onto a single unit output layer:
    model.add(Dense(1, activation='sigmoid'))
    print('Dense 1: ', model.output_shape)

    # Pass the configured optimizer object. The original passed the string
    # 'sgd', which ignored the learning rate / decay / momentum set here.
    sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])


    batch_size = 32
    epochs = 1
    model.fit(X_train_np, Y_train_np,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_test_np, Y_test_np))

    # clear the session, to avoid session confusion.
    K.clear_session()

### Save Model and Weights to disk

Ref: https://machinelearningmastery.com/save-load-keras-deep-learning-models/

In [None]:

# Persist the trained network: architecture as JSON, weights as HDF5.

# file names
model_jason_file_name = 'model-2017-12-03-v2.json'
model_weights_hdf5 = 'model-2017-12-03-v2.h5'

# serialize model to JSON
model_json = model.to_json()
with open(model_jason_file_name, "w") as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
model.save_weights(model_weights_hdf5)
print("Saved model to disk")


#### Load model from disk

In [None]:
from keras.models import model_from_json

# Load the architecture JSON and rebuild the model.
# The context manager closes the file even if read() raises.
with open(model_jason_file_name, 'r') as json_file:
    loaded_model_json = json_file.read()

loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights(model_weights_hdf5)
print("Loaded model from disk")

#### Use loaded model to predict

In [None]:
# evaluate loaded model on test data
#loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
#score = loaded_model.evaluate(X, Y, verbose=0)
#print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

## Use NN to Predict

### Step 1: fetch today's K Line

In [None]:
stock_list_hs300 = ts.get_hs300s()  # CSI 300 constituent stocks

sd_t = '2015-05-01'  # start date
ed_t = '2017-12-02'  # end date

# fetch all data in the list. _t means 'today'
all_data_t = dp.fetch_raw_data(stock_list_hs300, sd_t, ed_t)

In [None]:
# Sanity check: first constituents, last fetched bars and overall size.
print(stock_list_hs300.head(2))
print(all_data_t.tail(2))
print(all_data_t.shape)


### Step 2: Generate features

In [None]:
# Build the feature table used for prediction.
#
# Input: all_data_t, MultiIndex'ed by 'code' and 'date'
# Output: all_data_and_features_t, same MultiIndex, with the indicator columns
#         returned by the sb.* helpers joined on (an empty DataFrame if every
#         stock was dropped).
#

CONST_DROP_THRESHOLD_t = 440  # Wave C needs 377, sample lookback needs 60.

stock_list = all_data_t.groupby(level=0).size().reset_index(name='counts')
# Initial call to print 0% progress
total = stock_list.shape[0]
i = 0
dp.printProgressBar(i, total, prefix = 'Progress:', suffix = 'Complete', length = 60)

# Per-stock frames are collected and concatenated once at the end;
# appending to a growing DataFrame re-copies all previous rows on every stock.
featured_frames_t = []

for index, row in stock_list.iterrows():  # iterate the stock list
    # slicing one stock_data
    stock_data = all_data_t.loc[row['code']]

    # skip if stock_data has less than CONST_DROP_THRESHOLD_t bars
    if stock_data.shape[0] < CONST_DROP_THRESHOLD_t:
        i += 1
        print('DROP ' + row['code'])
        continue

    # add 'EMA8', 'EMA21'
    stock_data = stock_data.join(sb.ttm_propulsion(stock_data))
    # add SQZ
    stock_data = stock_data.join(sb.ttm_squeeze(stock_data))
    # add WAVE
    stock_data = stock_data.join(sb.ttm_wave(stock_data))
    # add ADX
    stock_data = stock_data.join(sb.talib_adx(stock_data))
    # add ATR
    stock_data = stock_data.join(sb.talib_atr(stock_data))
    # add N-bar LOW
    stock_data = stock_data.join(sb.talib_nbarlow(stock_data, N_BAR_LOWEST = 10))

    featured_frames_t.append(stock_data)

    # update progress bar
    i += 1
    dp.printProgressBar(i, total, prefix = 'Progress:', suffix = 'Complete', length = 60)

if featured_frames_t:
    # stock data with all available features, such as 'SQZ', 'HIST1', and all.
    all_data_and_features_t = pd.concat(featured_frames_t)
    # MultiIndex with both 'code' and 'date'
    all_data_and_features_t.set_index([all_data_and_features_t['code'], all_data_and_features_t.index], inplace=True)
else:
    # every stock was below the threshold: keep an empty table instead of
    # failing on the missing 'code' column
    all_data_and_features_t = pd.DataFrame()


In [None]:

# Compare the raw table with the feature-augmented one (columns and shape).
print('original all_data: ')
print(all_data_t.columns, '\n', all_data_t.shape)

print('after adding featuresa: ')
print(all_data_and_features_t.columns, '\n', all_data_and_features_t.shape)
print(all_data_and_features_t.head())

### Step 3: In today's K Line, find valid buy samples

In [None]:
# This function is reused from dp.generate_samples()

#
# generate_buy_samples_given_index
#

# number of bars to look back to form a sample
# CONST_LOOKBACK_SAMPLES = 60

def generate_buy_samples_given_index(stock_data, index):
    ''' Build the look-back sample for one bar, if that bar is a buy point.

    Explain
    =======
    Checks the bar labelled 'index' with sb.is_squeeze_buy_point(). If it
    qualifies, the dp.CONST_LOOKBACK_SAMPLES rows ending at that bar
    (inclusive) are returned as one sample.

    Input
    =====
    stock_data: DataFrame
        one stock's data with full features 'SQZ', WAVE C/B/A, ATR, ADX, etc.
    index:
        label of the bar to test, as found in stock_data.index
        (get_X_all_t passes a (code, date) tuple).

    Output
    ======
    DataFrame
        The sample rows, keeping their original index. Empty when the bar is
        not a buy point, when fewer than dp.CONST_LOOKBACK_SAMPLES bars are
        available, or when the window contains any NaN.
    '''
    if not sb.is_squeeze_buy_point(stock_data, index):
        return pd.DataFrame()

    # Use the library-wide window length, the same constant the model input
    # shape and the sample counting are based on, instead of a notebook global.
    lookback = dp.CONST_LOOKBACK_SAMPLES

    # Yes, this is a squeeze buy point: walk back `lookback` bars from it.
    location_of_buy_point = stock_data.index.get_loc(index)
    first_location = location_of_buy_point - lookback + 1
    if first_location < 0:
        # not enough history to form a full window
        return pd.DataFrame()

    # Exactly `lookback` rows, the buy bar being the last one.
    x_sample = stock_data.iloc[first_location:(location_of_buy_point + 1)]
    if x_sample.isnull().values.any():
        # incomplete features somewhere in the window
        return pd.DataFrame()

    return pd.DataFrame().append(x_sample, ignore_index = False)

In [None]:

def get_X_all_t(today_index):
    ''' Collect the buy samples of every stock for one date.

    Input
    =====
    today_index: date label looked up in the second index level of
        all_data_and_features_t.

    Output
    ======
    DataFrame with the concatenated samples returned by
    generate_buy_samples_given_index() (empty when no stock qualifies).
    '''
    print('checking: ', today_index)

    grouped = all_data_and_features_t.groupby(level=0)
    n_stocks = grouped.size().shape[0]
    done = 0
    # Initial call to print 0% progress
    dp.printProgressBar(done, n_stocks, prefix = 'Progress:', suffix = 'Complete', length = 60)

    collected = pd.DataFrame()
    for code, onestock_df in grouped:
        done += 1

        # MultiIndex key of this stock's bar on the requested date
        key = (code, today_index)
        if key in onestock_df.index:
            samples = generate_buy_samples_given_index(onestock_df, key)
            collected = collected.append(samples, ignore_index = False)
            # the bar is only redrawn for stocks that have the date
            dp.printProgressBar(done, n_stocks, prefix = 'Progress:', suffix = 'Complete', length = 60)

    return collected

In [None]:
# Initialize X_all
X_all_t = pd.DataFrame()

# The triple-quoted block below is a bare string expression, i.e. disabled code:
# it keeps the per-day calls for November 2017 for reference without running them.
'''
X_all_t = X_all_t.append(get_X_all_t('2017-11-01'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-02'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-03'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-06'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-07'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-08'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-09'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-10'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-13'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-14'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-15'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-16'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-17'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-20'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-21'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-22'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-23'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-24'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-27'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-28'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-29'), ignore_index=False)
X_all_t = X_all_t.append(get_X_all_t('2017-11-30'), ignore_index=False)
'''

# Only 2017-12-01 is actually collected by this cell.
X_all_t = X_all_t.append(get_X_all_t('2017-12-01'), ignore_index=False)


In [None]:
# Start from an empty table and add the buy samples of each listed
# October 2017 date, in chronological order.
X_all_t = pd.DataFrame()

for sample_date in (
        '2017-10-09', '2017-10-10', '2017-10-11', '2017-10-12', '2017-10-13',
        '2017-10-16', '2017-10-17', '2017-10-18', '2017-10-19', '2017-10-20',
        '2017-10-23', '2017-10-24', '2017-10-25', '2017-10-26', '2017-10-27',
        '2017-10-30', '2017-10-31'):
    X_all_t = X_all_t.append(get_X_all_t(sample_date), ignore_index=False)



In [None]:
# Row count divided by the window length gives the number of samples
# (assumes every sample in X_all_t spans dp.CONST_LOOKBACK_SAMPLES rows — TODO confirm).
print('number of valid samples: ', X_all_t.shape[0]/dp.CONST_LOOKBACK_SAMPLES)

### Step 4: Standardization X_all_t

In [None]:
# COPY'ed from above
#
# Standardize the prediction samples one window at a time: every window of
# `lookback_t` rows is rescaled using only its own values.

# drop column 'ATR', 'LOW10', 'open', 'high', 'low', 'code'
X_all_cleanup_t = X_all_t.drop(['ATR', 'LOW10', 'open', 'high', 'low', 'code'], axis=1)


from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

scaler = MinMaxScaler()
absscaler = MaxAbsScaler()

# Window length: taken from dp so it matches the model input shape and the
# sample counting, instead of relying on a notebook-level global.
lookback_t = dp.CONST_LOOKBACK_SAMPLES

total_samples = int(X_all_cleanup_t.shape[0] / lookback_t)
X_all_cleanup_std_t = pd.DataFrame()  # initialize a dataframe.

# Initial call to print 0% progress
dp.printProgressBar(0, total_samples - 1, prefix = 'Samples Standardization:', suffix = 'Complete', length = 60)

for i in range(0, total_samples):
    # slice out one sample
    one_sample = X_all_cleanup_t.iloc[(i * lookback_t):((i+1) * lookback_t), :]
    # copy to new sample skeleton
    standardized_sample = pd.DataFrame(index=one_sample.index)

    # Scaling 'volume'
    standardized_sample['volume'] = scaler.fit_transform(one_sample['volume'].values.reshape(-1,1))

    # Scaling 'close', 'EMA8', 'EMA21'. They are correlated, so all three
    # share the min/max of 'close' and keep their relative positions.
    pmin = one_sample['close'].min()
    pmax = one_sample['close'].max()
    for price_col in ('close', 'EMA8', 'EMA21'):
        standardized_sample[price_col] = (one_sample[price_col].values - pmin) / (pmax - pmin)

    # Scaling 'ADX'
    standardized_sample['ADX'] = scaler.fit_transform(one_sample['ADX'].values.reshape(-1,1))

    # Copy 'SQUEEZE'
    standardized_sample['SQUEEZE'] = one_sample['SQUEEZE']

    # Scaling 'MTMMA', 'HIST1' ~ 'HIST5', 'MACD6' by their max absolute value
    for osc_col in ('MTMMA', 'HIST1', 'HIST2', 'HIST3', 'HIST4', 'HIST5', 'MACD6'):
        standardized_sample[osc_col] = absscaler.fit_transform(one_sample[osc_col].values.reshape(-1,1))

    # NOTE(review): kept as DataFrame.append on purpose. The column order of
    # the accumulated frame feeds a positional reshape in the predict step and
    # should stay identical to how the training set was built — confirm before
    # switching to pd.concat.
    X_all_cleanup_std_t = X_all_cleanup_std_t.append(standardized_sample)
    dp.printProgressBar(i, total_samples - 1, prefix = 'Samples Standardization:', suffix = 'Complete', length = 60)


In [None]:
# Show the column/dtype/non-null summary. info() prints its report itself;
# the original printed the bound method object instead of calling it.
X_all_cleanup_std_t.info()

### Step 5: Predict

In [None]:
# Reshape the flat standardized table to (samples, timesteps, features) once.
# The timestep count comes from dp.CONST_LOOKBACK_SAMPLES, the same constant
# the model's input_shape is built from, rather than a hard-coded 60/13.
X_predict_t = X_all_cleanup_std_t.values.reshape(
    -1, dp.CONST_LOOKBACK_SAMPLES, X_all_cleanup_std_t.shape[1])
print(X_predict_t.shape)
result = model.predict(X_predict_t, batch_size=32, verbose=1)

In [None]:
# make Y_all_std_t
# One row per sample, labelled 0..n-1 (rows of X_all_cleanup_std_t divided by the window length).
yindex=range(0, int(X_all_cleanup_std_t.shape[0]/dp.CONST_LOOKBACK_SAMPLES))

Y_all_std_predicted_t = pd.DataFrame(index=yindex, columns=['predict.profit'])

# `result` is the model.predict() output of the previous cell.
Y_all_std_predicted_t['predict.profit'] = result
# Print the predictions sorted by score.
print(Y_all_std_predicted_t.sort_values(['predict.profit']))

### Step 6: Visualize it!


In [None]:
# after 60 epochs
# 9347/9347 [==============================] - 65s - loss: 0.4525 - acc: 0.7912 - val_loss: 0.8121 - val_acc: 0.6486

# pindex = 2  # 2 Weichai Power # 27 CITIC Bank # 3 Yango Group # 5 Focus Media # 22 ICBC # 5  # 3  # 27  # 2  # Profit Good (2 is max)
# pindex = # 15 Yili # 7 Luxshare Precision # 4 Shuanghui Development #21 Guangshen Railway  # 16 Yangtze Power # Loss (16 is less)

# after another 100 epochs, accu=0.
# 9347/9347 [==============================] - 59s - loss: 0.3649 - acc: 0.8394 - val_loss: 1.0057 - val_acc: 0.6653.

pindex = 18  # 0 OCT (Overseas Chinese Town) # 5 Focus Media # 10 SIPG # 1 Midea Group # 22 ICBC # 3 Yango Group # 18 Daqin Railway # Profit Good (18 is max)
# pindex = 7  # 7 Luxshare Precision # 9 Baotou Steel  # 11 Sinopec # 16 Yangtze Power # 8 Shanghai Airport # 21 Guangshen Railway # 12 Wanhua Chemical # Loss (12 is less)

# make a title
title_postfix = ' == Predicted (Profit +22%) Confidence Level: ' + str(Y_all_std_predicted_t.iloc[pindex]['predict.profit'])

# plot it
# NOTE(review): plot_one_x_sample is defined in a later cell; that cell must be run first.
plot_one_x_sample(pindex, X_all_t, dp.CONST_LOOKBACK_SAMPLES, title_postfix)

# plot_one_x_sample(pindex, X_all_cleanup_std_t, dp.CONST_LOOKBACK_SAMPLES, title_postfix)


In [None]:
#
# plot_one_x_sample
#


def plot_one_x_sample(pindex, X_samples_all, len_X_sample=60, title_postfix=''):
    ''' Plot one X sample.

    Input
    =====
    pindex: Integer. 0-based position of the sample to plot.
    X_samples_all: DataFrame, X_all set: samples stacked one after another,
        each taking len_X_sample consecutive rows.
    len_X_sample:  integer, is one X_sample's length. Default 60.

    title_postfix: String, a title postfix

    Output
    ======
    None. An out-of-range pindex prints an error message and plots nothing.
    '''
    # validity check (the original tested an unrelated name `index` here,
    # so negative positions were never rejected reliably)
    if (pindex >= X_samples_all.shape[0] / len_X_sample) or (pindex < 0):
        print('Error: Wrong Input index')
        return

    # rows [pindex * len, (pindex + 1) * len) hold the requested sample
    pstock_data = X_samples_all.iloc[(pindex * len_X_sample):((pindex + 1) * len_X_sample)]
    plot_stock_data(pstock_data, title_postfix)

    return
        


In [None]:
#
# stock_code_to_name
#

def stock_code_to_name(code, stock_list):
    ''' From stock 'code' to get stock 'name'

    Input
    =====
    code: stock code, eg. '000338'
    stock_list: DataFrame to search in; must provide the columns
        'code' and 'name'.

    Output
    ======
    name: string, of the stock name. eg. '潍柴动力'.
        None when the code is not in stock_list. If the code appears more
        than once, the first match is returned.
    '''
    matches = stock_list.loc[stock_list['code'] == code, 'name']
    if matches.empty:
        return None
    return matches.iloc[0]
    

### 解决 matplotlib 中文显示的问题

In [None]:
mpl.rcParams['font.sans-serif'] = ['SimHei']  # set the default font (so Chinese labels can be rendered)
mpl.rcParams['axes.unicode_minus'] = False  # fix the minus sign '-' showing up as a box in saved figures

In [None]:
# Walk-through: add each feature to one stock ('002839') step by step and plot it.
# NOTE(review): the helpers are called unqualified here (ttm_propulsion, ...), while
# other cells use the sb.* versions — confirm they are defined in the session.
df1 = all_data.loc['002839']
print(df1.shape)

# add 1st feature
df1_propulsion = ttm_propulsion(df1)
# a = df1_propulsion.join(df1.drop(['code'], axis=1))
a = df1.join(df1_propulsion)

# a.rename(columns={'EMA8':'EMA8a', 'EMA21':'EMA21a'}, inplace=True)

# add 2nd feature, concatenation
b = a.join(ttm_squeeze(a))

# add 3rd feature, concat. TTM Wave C/B/A
d = b.join(ttm_wave(b))

print(d.columns)
print(d.describe())

# plotting window (overwrites the notebook-level sd / ed)
sd = '2017-05-04'
ed = '2017-12-04'
print("TTM Wave C:")
d.loc[sd:ed][['HIST5', 'MACD6']].plot()
plt.show()
#print(d.loc['2017-09-10':ed][['close', 'HIST5', 'MACD6', 'SQUEEZE', 'MTMMA']])

print("TTM Wave B:")
d.loc[sd:ed][['HIST3', 'HIST4']].plot()
plt.show()
#print(d.loc[sd:ed][['close', 'EMA8', 'EMA21', 'HIST3', 'HIST4']])

print("TTM Wave A:")
d.loc[sd:ed][['HIST1', 'HIST2']].plot()
plt.show()
#print(d.loc[sd:ed][['close', 'EMA8', 'EMA21', 'HIST1', 'HIST2']])

# add 4th feature, concat. ADX
e = d.join(talib_adx(d))
print(e.columns)
print("ADX:")
e.loc[sd:ed]['ADX'].plot()
plt.show()
e.loc[sd:ed][['close']].plot()
plt.show()
print(e.loc[sd:ed][['ADX', 'SQUEEZE', 'MTMMA']])

# add 5th feature, N-Bar low
f = e.join(talib_nbarlow(e, 10))
print(f.columns)
f.loc[sd:ed][['close', 'LOW10']].plot()
plt.show()

# add 6th feature, ATR

g = f.join(talib_atr(f))
print(g.columns)
g.loc[sd:ed][['ATR']].plot()
plt.show()


In [None]:
# Fetch the list returned by ts.get_sz50s() (presumably the SSE 50 constituents) and echo its columns.
stock_list_sz50 = ts.get_sz50s()
stock_list_sz50.columns

### 获取日K线

In [None]:
# Download K-line bars for two stock codes with tushare's default arguments.
df1 = ts.get_k_data('000001')
df2 = ts.get_k_data('000002')
df1.head()  # a bare expression: only the last one in a notebook cell is displayed
df2.head()

In [None]:
import matplotlib.pyplot as plt

# Index the frame by its 'date' column and plot the closing price.
# NOTE(review): `df` is not assigned in the neighbouring cells (they create df1/df2) — confirm which frame is meant.
df.set_index('date', inplace=True)
print(df.head(2))
df['close'].plot()
plt.show()

### Pandas Panel 学习

- Ref: Panel 结构： http://www.jianshu.com/p/424e61c2f8b8
- Ref: 米筐 Pandas 教程： https://www.ricequant.com/community/topic/3558/

In [None]:
import numpy as np

# Build a Panel from a dict of two random DataFrames with different shapes, keyed by stock code.
# NOTE(review): pd.Panel is deprecated/removed in newer pandas releases — confirm the pinned pandas version.
data = {'000001' : pd.DataFrame(np.random.randn(10, 3)),'000002' : pd.DataFrame(np.random.randn(3, 5))}
pnl = pd.Panel(data)
print(pnl['000001'].describe())
print(pnl['000002'].describe())


In [None]:
# combine df1 and df2 into Panel
# NOTE(review): pd.Panel is deprecated/removed in newer pandas releases — confirm the pinned pandas version.
data = {'000001' : df1,'000002' : df2}
pnl = pd.Panel(data)
# panel['']
print(pnl)

# Pull one stock back out of the Panel and index it by date.
dfextracted = pnl['000001']
dfextracted.set_index('date', inplace=True)
dfextracted.head()

### 获取近期5分钟行情


In [None]:
# Fetch bars for '600000' with ktype='5' (5-minute bars, per the section title).
m5df = ts.get_k_data('600000', ktype='5')
m5df.head()

# Selecting with an empty column list keeps the index but no columns.
newdf = m5df[[]]
newdf.head()

### Join， 选取指定日期的数据

给定df，包含全部日期数据。
给定日期范围dates。
选出dates内的数据：df_dates。

只使用‘close’

In [None]:
import pandas as pd

# Build an empty frame indexed by every calendar day of the range, then
# left-join the 'close' column onto it and drop the days without data.
start_date = '2017-01-01'
end_date = '2017-01-31'
dates = pd.date_range(start_date, end_date)
df_dates = pd.DataFrame(index=dates)
df_dates.shape
# print(df_dates)

# slicing, use only 'close', and rename it to 'stock ID'
# NOTE(review): the join only matches if df's index holds the same datetime
# values as `dates` (not date strings) — confirm how `df` was indexed.
df_dates = df_dates.join(df['close'])
df_dates.rename(columns={'close':'600000'}, inplace=True)
df_dates.dropna(inplace=True)
print(df_dates)

### 学习 MultiIndex Dataframe

- Ref： http://blog.csdn.net/tpoy0099/article/details/49074551

### Example: fetch_raw_data()

In [None]:
import tushare as ts
#Eg.
# stock_list_hs300 = ts.get_hs300s()
# A two-row stock list with only a 'code' column is enough for dp.fetch_raw_data here.
stock_list_test = pd.DataFrame([{'code':'000001'}, {'code':'000002'}])
print(stock_list_test)

# NOTE(review): `total` is not set in this cell; it is whatever an earlier cell left behind — confirm intent.
dp.printProgressBar(0, total, prefix = 'Progress:', suffix = 'Complete', length = 60)

a = dp.fetch_raw_data(stock_list_test, '2010-01-01', '2017-12-20')
print(a.columns)
print(a)

### Example MultiIndex Slicing

Ref:
- [1] https://pandas.pydata.org/pandas-docs/stable/advanced.html


In [None]:
stock_id_1 = '000001'
stock_id_2 = '000002'

# to get one stock's data
# (.loc with a single first-level key selects all rows of that stock)
stock_1_data = a.loc[stock_id_1]
stock_2_data = a.loc[stock_id_2]
print(stock_1_data.shape)
print(stock_2_data.shape)
print(stock_1_data.describe())
print(stock_2_data.describe())

# to get all stock's data on a specific date
# Don't know yet.
# TODO: a cross-section on the date level (e.g. DataFrame.xs) looks like the way to do it — untested here.

### 使用举例

- 使用了两个DataFrame函数：join()， 和 drop()
- 使用 rename()
- 使用花括号，curly braces {}, to define a dictionary.

In [None]:
# examples. Get base data
import tushare as ts
import matplotlib.pyplot as plt
import pandas as pd

# Download one stock's bars for a fixed date range and index them by the 'date' column.
df1 = ts.get_k_data('000421', start='2011-01-01', end='2017-12-01')
df1.set_index('date', inplace=True)
print(df1.shape, df1.index)
print(df1.columns)
print(df1.head())

### 例子， 为股票增加feature

In [None]:

# Walk-through: add each feature to df1 step by step and plot it.
# NOTE(review): the helpers are called unqualified here (ttm_propulsion, ...), while
# other cells use the sb.* versions — confirm they are defined in the session.

# add 1st feature
df1_propulsion = ttm_propulsion(df1)
# a = df1_propulsion.join(df1.drop(['code'], axis=1))
a = df1.join(df1_propulsion)

# a.rename(columns={'EMA8':'EMA8a', 'EMA21':'EMA21a'}, inplace=True)

# add 2nd feature, concatenation
b = a.join(ttm_squeeze(a))

# add 3rd feature, concat. TTM Wave C/B/A
d = b.join(ttm_wave(b))

print(d.columns)
print(d.describe())

# plotting window (overwrites the notebook-level sd / ed)
sd = '2017-05-04'
ed = '2017-12-04'
print("TTM Wave C:")
d.loc[sd:ed][['HIST5', 'MACD6']].plot()
plt.show()
#print(d.loc['2017-09-10':ed][['close', 'HIST5', 'MACD6', 'SQUEEZE', 'MTMMA']])

print("TTM Wave B:")
d.loc[sd:ed][['HIST3', 'HIST4']].plot()
plt.show()
#print(d.loc[sd:ed][['close', 'EMA8', 'EMA21', 'HIST3', 'HIST4']])

print("TTM Wave A:")
d.loc[sd:ed][['HIST1', 'HIST2']].plot()
plt.show()
#print(d.loc[sd:ed][['close', 'EMA8', 'EMA21', 'HIST1', 'HIST2']])

# add 4th feature, concat. ADX
e = d.join(talib_adx(d))
print(e.columns)
print("ADX:")
e.loc[sd:ed]['ADX'].plot()
plt.show()
e.loc[sd:ed][['close']].plot()
plt.show()
print(e.loc[sd:ed][['ADX', 'SQUEEZE', 'MTMMA']])

# add 5th feature, N-Bar low
f = e.join(talib_nbarlow(e, 10))
print(f.columns)
f.loc[sd:ed][['close', 'LOW10']].plot()
plt.show()

# add 6th feature, ATR

g = f.join(talib_atr(f))
print(g.columns)
g.loc[sd:ed][['ATR']].plot()
plt.show()



In [None]:
# EXAMPLE
# create Series from Dictionary

data = {'A':1, 'B':2, 'C':3}
y_sample = pd.Series(data)
print(y_sample)
print(y_sample['A'])

# Second record with a missing value in 'C'.
# NOTE(review): pd.np is a legacy alias that newer pandas releases no longer provide — confirm the pinned version.
data2 = {'A':10, 'B':20, 'C':pd.np.nan}
y_sample2 = pd.Series(data2)

print(y_sample2)

# Appending a Series adds it as one row; check for NaN after the first row only.
dft = pd.DataFrame()
dft = dft.append(y_sample, ignore_index=True)
if dft.isnull().values.any():
    print('there is NaN')
else:
    print('there is no NaN')

dft = dft.append(y_sample2, ignore_index=True)
print(dft)

# Appending the frame to itself doubles its rows.
print('append a dataframe')
dft = dft.append(dft, ignore_index=True)
print(dft)

## 选出符合规则的数据做 训练用例

从给定的股票数据，根据 买入规则， 卖出规则，选出相符合的数据序列，用作 训练用例


In [None]:
import pandas as pd

# generate_samples
# Explain:
#    - from a given stock_data, generate buy/sell samples
#    - currently, using is_squeeze_buy_point() to find buy points
#                 using get_sell_point() to find sell points
# Input:
#    - stock_data: stock_data with features
# Output:
#    - X_all: all samples' X part, concatenated, in DataFrame
#    - Y_all: all samples' Y part, concatenated, in DataFrame
#

# number of bars to look back to form a sample
CONST_LOOKBACK_SAMPLES = 30

def generate_samples(stock_data):
    ''' Build training samples (X windows and Y trade records) for one stock.

    Every bar accepted by is_squeeze_buy_point() whose trade reaches a sell
    point (get_sell_point() reason != 0) yields one sample, provided a full
    window of CONST_LOOKBACK_SAMPLES bars ending at the buy bar exists and
    neither the window nor the trade record contains NaN.

    Input
    =====
    stock_data: DataFrame with features (must include 'code' and 'close').

    Output
    ======
    (X_all, Y_all):
        X_all: the sample windows stacked in order, original index kept.
        Y_all: one row per sample with code, buy/sell date, buy/sell price
               and sell reason.
    '''

    def _sample_for(bar_label, bar):
        # Return (x_window, y_record) for a usable buy bar, otherwise None.
        if not is_squeeze_buy_point(stock_data, bar_label):
            return None

        # It is a buy point: find where the trade would be closed.
        exit_label, exit_reason = get_sell_point(stock_data, bar_label)
        if exit_reason == 0:
            # the data ends before any sell rule fires
            return None

        # Window of CONST_LOOKBACK_SAMPLES bars ending at the buy bar.
        buy_pos = stock_data.index.get_loc(bar_label)
        window_start = buy_pos - CONST_LOOKBACK_SAMPLES + 1
        if window_start < 0:
            # not enough history
            return None
        x_window = stock_data.iloc[window_start:(buy_pos + 1)]
        if x_window.isnull().values.any():
            return None

        y_record = pd.Series({'code': bar['code'],
                              'buy_date': bar_label,
                              'buy_price': stock_data['close'][bar_label],
                              'sell_date': exit_label,
                              'sell_price': stock_data['close'][exit_label],
                              'sell_reason': exit_reason})
        if y_record.isnull().values.any():
            return None

        return x_window, y_record

    samples_x = pd.DataFrame()
    samples_y = pd.DataFrame()

    for bar_label, bar in stock_data.iterrows():
        sample = _sample_for(bar_label, bar)
        if sample is None:
            continue
        x_window, y_record = sample
        # X keeps the original row labels, Y is renumbered 0..n-1
        samples_x = samples_x.append(x_window, ignore_index = False)
        samples_y = samples_y.append(y_record, ignore_index = True)

    return samples_x, samples_y

In [None]:
# Testing and Studying
# Examples of basic syntax

import talib
import matplotlib.pyplot as plt

# data preparation
# stock_data = stock_1_data
sd = '2015-01-30'
ed = '2016-01-30'
# Work on an explicit copy: a column is added below, and assigning into a
# slice of `g` triggers pandas' chained-assignment warning.
stock_data = g[sd:ed].copy()
print(stock_data.columns)
print(type(stock_data.index))

cur_date = '2015-10-01'
remaining_s_data = stock_data[cur_date:]

print('first index in remaing_s_data', remaining_s_data.index[0])
print('it\'s location in stock_data', stock_data.index.get_loc(remaining_s_data.index[0]))

# Take the 5 bars ending at (and including) the first bar on/after cur_date.
buy_point_loc = stock_data.index.get_loc(remaining_s_data.index[0])
first_loc = buy_point_loc - 5 + 1

# slicing
sample_x_test = stock_data.iloc[first_loc:(buy_point_loc+1)]

print(sample_x_test)

print('remaining data\'s index')
print(remaining_s_data.index)

# 10 days low
N_BAR_LOWEST = 10
nbar_lowest_col_name = 'LOW' + str(N_BAR_LOWEST)
stock_data[nbar_lowest_col_name] = talib.MIN(stock_data['low'].values, timeperiod = N_BAR_LOWEST)
# shift by one bar so each row holds the low of the *previous* N bars
stock_data[nbar_lowest_col_name] = stock_data[nbar_lowest_col_name].shift(1)

stock_data[['close', nbar_lowest_col_name]].plot()
plt.show()
print(stock_data[['low', 'close', nbar_lowest_col_name]])

print('===========')
print('===========')
print('===========')
print('===========')

X_all, Y_all = generate_samples(stock_data)
print(X_all.shape, Y_all.shape)
print(Y_all)





### 定义 SQUEEZE 选股规则

In [None]:
import pandas as pd

# is_squeeze_buy_point
# Purpose:
#    - Decide whether the bar labelled `index` is a squeeze buy point.
# Input:
#    - stock_data: DataFrame with the columns 'SQUEEZE', 'HIST5' and 'MACD6'
#    - index: label of the bar to test
# Output:
#    - True or False
#

# values of the 'SQUEEZE' column
CONST_SQUEEZE_RELEASED = -1
CONST_SQUEEZE_ONGOING = 1

def is_squeeze_buy_point(stock_data, index):
    # RULE:
    #   a) TTM Wave C must not be at or below zero: the bar is rejected when
    #      'HIST5' <= 0 or 'MACD6' <= 0.
    #   b) 'SQUEEZE' is either ongoing on this bar, or this bar is the first
    #      released bar (the previous bar was still ongoing).
    if (stock_data.loc[index, 'HIST5'] <= 0) or (stock_data.loc[index, 'MACD6'] <= 0):
        return False

    squeeze_now = stock_data.loc[index, 'SQUEEZE']
    if squeeze_now == CONST_SQUEEZE_ONGOING:
        return True
    if squeeze_now != CONST_SQUEEZE_RELEASED:
        return False

    # Released on this bar: accept only if the previous bar was still squeezing.
    # For the very first bar the shifted value is NaN, which compares unequal.
    squeeze_before = stock_data['SQUEEZE'].shift(1)[index]
    return bool(squeeze_before == CONST_SQUEEZE_ONGOING)

### 定义卖出规则

In [None]:
# get_sell_point
# Purpose:
#    - Find where a trade opened at `buy_index` would be closed.
# Input:
#    - stock_data: DataFrame with 'close', 'ATR' and the 'LOW<n_low>' column
#    - buy_index: label of the bar where the trade starts
#    - multi_atr: stop distance below the buy price, in multiples of ATR
#    - n_low: selects the 'LOW<n_low>' column used for the N-bar-low rule
# Output:
#    - sell_index: label of the sell bar, or 0 if no rule fires before the end
#    - sell_reason: 1 for stop loss, 2 for N-bar low breakthrough,
#                   0 if no sell point was reached
#

CONST_SELL_REASON_STOP_LOSS = 1
CONST_SELL_REASON_N_LOW =2

def get_sell_point(stock_data, buy_index, multi_atr = 2, n_low = 10):
    # RULE: bars are scanned in order, starting with the buy bar itself.
    #   a) close <= buy price - multi_atr * ATR(at the buy bar)  -> stop loss
    #   b) close <= the bar's 'LOW<n_low>' value                 -> N-bar low
    # Rule a) is tested first on every bar.
    entry_price = stock_data['close'][buy_index]
    stop_price = entry_price - stock_data['ATR'][buy_index] * multi_atr
    low_column = 'LOW' + str(n_low)

    for bar_label, bar in stock_data[buy_index:].iterrows():
        closing = bar['close']
        if closing <= stop_price:
            return bar_label, CONST_SELL_REASON_STOP_LOSS
        if closing <= bar[low_column]:
            return bar_label, CONST_SELL_REASON_N_LOW

    # ran out of bars without hitting a sell rule
    return 0, 0

In [None]:

# Skeleton of the sample-generation loop, made runnable.
# Input:
#    - stock_data: stock data with features

# iterate each row of stock_data
for index, row in stock_data.iterrows():
    print(index)
    # is this row a buy-point? (is_squeeze_buy_point takes the frame and the label only)
    if not is_squeeze_buy_point(stock_data, index):
        # check next row
        continue

    # Yes, this is a buying-point
    # Find the sell-point, starting from this bar
    sell_index, sell_reason = get_sell_point(stock_data, index)

    # do we find a valid sell-point? Maybe before our sequence ends, it doesn't reach any sell-point.
    # (get_sell_point reports that case with sell_reason == 0)
    if sell_reason == 0:
        # check next row
        continue

    # Back fetch N-record
    # check N-record valid or not
    # Add N-record to X_all
    # Add sell-point information to Y_all.

    #### END ####