# Data Preprocessing Part

In [1]:
import pandas as pd
import numpy as np
import warnings
import random
warnings.filterwarnings('ignore')

## Read Data

In [2]:
df1 = pd.read_csv('../data/data_part1.csv')
df2 = pd.read_csv('../data/data_part2.csv')
stockprice = pd.concat([df1,df2], axis=0)
stockprice.drop(columns=['Unnamed: 0'],inplace=True)
stockprice.sort_values(by=['time'],ascending=True,inplace=True)

## Select tradedates & Stock ID

In [3]:
##select tradedates
tradedate = stockprice['time']
tradedate = tradedate.drop_duplicates()
tradedate = list(tradedate)

##select stockID
stockid = stockprice['code']
stockid = stockid.drop_duplicates()
stockid = list(stockid)

##sort data according to stockid and time
stockprice.sort_values(by=['code','time'],ascending=True,inplace=True)
stockprice = stockprice.reset_index(drop=True)

## Drop stocks which were delisted from market during 2017-2019

In [4]:
ss = stockprice.loc[np.isnan(stockprice['close']),'code']
ss = list(ss.drop_duplicates())

filtered = []
for i in range(0,len(stockid)):
    if stockid[i] not in ss:
        filtered.append(stockid[i])

df = stockprice.loc[stockprice['code'].isin(filtered)]

## Import package 'talib' to compute several features

In [5]:
import talib
df['MOM'] = talib.MOM(df.loc[:, 'close'],timeperiod=5)

df['RSI']=talib.RSI(df.close.values, timeperiod=6)
df['EMA6'] = talib.EMA(df.close.values, timeperiod=6)  
df['EMA12'] = talib.EMA(df.close.values, timeperiod=12)   
df['MACD'],df['MACDsignal'],df['MACDhist'] = talib.MACD(df.close.values,
                            fastperiod=6, slowperiod=12, signalperiod=9)
df['atr'] = talib.ATR(df.high.values,df.low.values,df.close.values, timeperiod=14)

## Drop nan data due to features computing & save modified data

In [6]:
remaindate = tradedate[19:]
ddf = df.loc[df['time'].isin(remaindate)]
ddf = ddf.reset_index(drop=True)

#save modified data
ddf.to_csv('modifieddata.csv')

## Split 80% data in the front into training set and last 20% into test set

In [7]:
remaindate = ddf['time']
remaindate = remaindate.drop_duplicates()
remaindate = list(remaindate)

traindate = remaindate[:570]
testdate = remaindate[570:710]

traindata = ddf.loc[ddf['time'].isin(traindate)]
testdata = ddf.loc[ddf['time'].isin(testdate)]
##save training data and test data
traindata.to_csv('traindata.csv')
testdata.to_csv('testdata.csv')

## Implement PCA on training set

In [8]:
##train set
trMOM = traindata.MOM.values
trRSI = traindata.RSI.values
trEMA6 = traindata.EMA6.values
trEMA12 = traindata.EMA12.values
trMACD = traindata.MACD.values
tratr = traindata.atr.values

ss = pd.DataFrame({'MOM':trMOM,'RSI':trRSI,'EMA6':trEMA6,'EMA12':trEMA12,'MACD':trMACD,'ATR':tratr})

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

##Standardlization
scaler = StandardScaler()
scaler.fit(ss)
trans_ss = scaler.transform(ss)

##PCA
pca = PCA(n_components=3)
newss = pca.fit_transform(trans_ss)
print(pca.explained_variance_ratio_)

[0.48619757 0.3416973  0.09348087]


## Implement PCA on test set

In [9]:
##test set
teMOM = testdata.MOM.values
teRSI = testdata.RSI.values
teEMA6 = testdata.EMA6.values
teEMA12 = testdata.EMA12.values
teMACD = testdata.MACD.values
teatr = testdata.atr.values

xx = pd.DataFrame({'MOM':teMOM,'RSI':teRSI,'EMA6':teEMA6,'EMA12':teEMA12,'MACD':teMACD,'ATR':teatr})
##标准化&主成分
trans_xx = scaler.transform(xx)
newxx = pca.transform(trans_xx)

## Label 1 for up trend and -1 for down trend

In [10]:
traindata = traindata.reset_index(drop=True)   
Ytrain = []
for i in range(0,len(traindata)):
    if i%10 == 9:
        if traindata.loc[i,'close'] > traindata.loc[i-4,'close']:
            Ytrain.append(1)
        else:
            Ytrain.append(-1)   


testdata = testdata.reset_index(drop=True)
Ytest = []
for i in range(0,len(testdata)):
    if i%10 == 9:
        if testdata.loc[i,'close'] > testdata.loc[i-4,'close']:
            Ytest.append(1)
        else:
            Ytest.append(-1)

## Transform X data in training set and test set

In [11]:
Xtrain = []
for i in range(0,len(newss)):
    if i%10 == 4:
       temp = []
       for j in range(4,-1,-1): 
           temp.append(newss[i-j,0])
           temp.append(newss[i-j,1])
           temp.append(newss[i-j,2])
       Xtrain.append(temp)
    
Xtest = []
for i in range(0,len(newxx)):
    if i%10 == 4:
       temp = []
       for j in range(4,-1,-1): 
           temp.append(newxx[i-j,0])
           temp.append(newxx[i-j,1])
           temp.append(newxx[i-j,2])
       Xtest.append(temp)