In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import pickle


In [3]:
def loadDataFiles(market_path='Market_train', news_path='News_train'):
    """Load the pickled market and news objects from disk.

    Parameters
    ----------
    market_path : str, optional
        Path of the pickle holding the market data. Defaults to
        'Market_train' in the current working directory.
    news_path : str, optional
        Path of the pickle holding the news data. Defaults to
        'News_train' in the current working directory.

    Returns
    -------
    tuple
        ``(market_df, news_df)`` exactly as they were unpickled.

    Raises
    ------
    FileNotFoundError
        If either file does not exist.
    """
    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
    # file. Only point this at pickles you created yourself or fully trust.
    # The `with` blocks make sure both file handles are closed, even if
    # unpickling fails.
    with open(market_path, "rb") as market_file:
        market_df = pickle.load(market_file)
    with open(news_path, "rb") as news_file:
        news_df = pickle.load(news_file)
    print('Finished loading datafiles!')
    return market_df, news_df


In [4]:
def preprocess_data(mkt_df, news_df):
    """Join per-day, per-asset averaged news features onto the market data.

    Steps:
      1. Reduce the 'time' column of both tables to calendar dates.
      2. Derive a single 'assetCode' for every news row from the text of its
         'assetCodes' entry (only the first listed code is kept).
      3. Drop the columns listed in ``irrelevantColumns`` from the news table
         and 'assetName' from the market table.
      4. Average the remaining news columns per (time, assetCode).
      5. Left-join the averaged news onto the market rows and replace every
         missing value in the result with 0.

    Neither input frame is modified, so the function can be called again on
    the same objects (e.g. when the notebook cell is re-run).

    Parameters
    ----------
    mkt_df : pandas.DataFrame
        Market table; must contain 'time', 'assetCode' and 'assetName'.
    news_df : pandas.DataFrame
        News table; must contain 'time', 'assetCodes' and every column named
        in ``irrelevantColumns``.

    Returns
    -------
    pandas.DataFrame
        One row per market row: the market columns (without 'assetName')
        followed by the averaged news columns, with missing values set to 0.

    Raises
    ------
    KeyError
        If a required column is missing from either input.
    """
    # Each 'assetCodes' entry is parsed as text: keep what precedes the first
    # comma, then take the token that follows the first single quote.
    first_codes = np.asarray([
        codes.split(',')[0].split("'")[1] for codes in news_df['assetCodes']
    ])

    irrelevantColumns = ['sourceTimestamp', 'firstCreated', 'sourceId',
                         'headline', 'provider', 'subjects', 'audiences',
                         'headlineTag', 'marketCommentary', 'assetCodes', 'assetName']

    # drop() returns new frames, so the assignments below never touch the
    # caller's data.
    news = news_df.drop(columns=irrelevantColumns)
    news['time'] = pd.to_datetime(news['time']).dt.date
    news['assetCode'] = first_codes

    market = mkt_df.drop(columns=['assetName'])
    market['time'] = pd.to_datetime(market['time']).dt.date

    # One averaged news row per (day, asset); sort=False keeps groups in order
    # of first appearance.
    daily_news = news.groupby(['time', 'assetCode'], sort=False).mean().reset_index()

    # join news reports to market data, note many assets will have many days without news data
    merged = pd.merge(market, daily_news, how='left', on=['time', 'assetCode'], copy=False)
    merged = merged.fillna(0)
    print('Finished preprocessing data!')
    return merged


In [5]:
# Read the pickled market and news tables via loadDataFiles.
market_data, news_data = loadDataFiles()


Finished loading datafiles!


In [6]:
# Combine the market table with the aggregated news features into one table.
X = preprocess_data(market_data, news_data)


Finished preprocessing data!


In [7]:
def normalizeY(ydf):
    """Shift and halve the input: ``(ydf + 1) / 2``.

    Values in [-1, 1] therefore land in [0, 1]. Works on anything that
    supports ``+`` and ``/`` with numbers (scalars, arrays, Series).
    """
    return (ydf + 1) / 2


In [8]:
# Keep only the rows whose target value lies inside [-1, 1].
X = X[(X['returnsOpenNextMktres10'] >= -1) & (X['returnsOpenNextMktres10'] <= 1)]

# Separate the target from the features; pop() removes the column from X.
y = X.pop('returnsOpenNextMktres10')

# Report the target range before and after rescaling with normalizeY.
print(np.amin(y))
print(np.amax(y))
print("normalized")
y = normalizeY(y)
print(np.amin(y))
print(np.amax(y))

# Set the first two columns aside (presumably time and asset code, going by
# the variable name) and keep the remaining columns as the features.
assetCodesAndTime, X = X.iloc[:, :2], X.iloc[:, 2:]

-0.9987179121017122
0.9988220255803867
normalized
0.0006410439491438824
0.9994110127901934


In [9]:
def regularize(df, verbose=True):
    """Min-max scale every column of ``df`` to the range [0, 1], in place.

    Each column is replaced by ``(col - min) / (max - min)``. A column whose
    values are all identical has ``max - min == 0``; dividing by that would
    turn the entire column into NaN, so such a column is set to 0.0 instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric columns. It is modified in place.
    verbose : bool, optional
        When True (the default, matching the previous behaviour) each scaled
        column is printed. Pass False to avoid the output.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after scaling.
    """
    for column in df:
        values = df[column]
        colmin = values.min()
        span = values.max() - colmin
        if span == 0:
            # Constant column: avoid 0/0. Subtracting the minimum yields
            # zeros while leaving any missing entries missing.
            df[column] = (values - colmin).astype(float)
        else:
            df[column] = (values - colmin) / span
        if verbose:
            print(df[column])
    return df


In [10]:
# Rescale every feature column of X with regularize (per-column min-max scaling).
X = regularize(X)

0          0.002125
1          0.001672
2          0.000949
3          0.019357
4          0.000985
5          0.001351
6          0.000967
7          0.000246
8          0.004640
9          0.000328
10         0.007367
11         0.001121
12         0.001334
13         0.000208
14         0.001061
15         0.000505
16         0.000332
17         0.000244
18         0.003803
19         0.002013
20         0.003046
21         0.016840
22         0.002762
23         0.002416
24         0.001544
25         0.000831
26         0.000152
27         0.000480
28         0.000222
29         0.000288
             ...   
4072926    0.001722
4072927    0.000174
4072928    0.000111
4072929    0.000489
4072930    0.007432
4072931    0.000957
4072932    0.000540
4072933    0.000774
4072934    0.000553
4072935    0.000527
4072936    0.002020
4072937    0.005244
4072938    0.000632
4072939    0.000801
4072940    0.000389
4072941    0.001538
4072942    0.000795
4072943    0.000538
4072944    0.000717


0          0.023142
1          0.021083
2          0.024181
3          0.022264
4          0.023601
5          0.024932
6          0.023749
7          0.024225
8          0.023013
9          0.023763
10         0.027787
11         0.025692
12         0.024065
13         0.023062
14         0.023385
15         0.022064
16         0.021059
17         0.023219
18         0.022780
19         0.025290
20         0.023028
21         0.026117
22         0.022970
23         0.024582
24         0.023058
25         0.022971
26         0.023243
27         0.023298
28         0.023232
29         0.023617
             ...   
4072926    0.024242
4072927    0.020419
4072928    0.024490
4072929    0.023701
4072930    0.023109
4072931    0.020477
4072932    0.021162
4072933    0.022567
4072934    0.022630
4072935    0.022934
4072936    0.025358
4072937    0.022384
4072938    0.022596
4072939    0.023465
4072940    0.018426
4072941    0.023255
4072942    0.022750
4072943    0.022328
4072944    0.023347


0          0.028497
1          0.010507
2          0.000000
3          0.000000
4          0.000000
5          0.000000
6          0.000000
7          0.000000
8          0.006477
9          0.000000
10         0.000000
11         0.000000
12         0.033967
13         0.000000
14         0.000000
15         0.000000
16         0.000000
17         0.000000
18         0.018998
19         0.014896
20         0.000000
21         0.025283
22         0.000000
23         0.000000
24         0.000000
25         0.000000
26         0.000000
27         0.000000
28         0.000000
29         0.018135
             ...   
4072926    0.000000
4072927    0.000000
4072928    0.000000
4072929    0.000000
4072930    0.000000
4072931    0.000864
4072932    0.000000
4072933    0.000000
4072934    0.000000
4072935    0.000000
4072936    0.000000
4072937    0.000000
4072938    0.000000
4072939    0.000000
4072940    0.000000
4072941    0.000000
4072942    0.000000
4072943    0.000000
4072944    0.000000


0          0.033840
1          0.022991
2          0.000000
3          0.000000
4          0.000000
5          0.000000
6          0.000000
7          0.000000
8          0.006509
9          0.000000
10         0.000000
11         0.000000
12         0.021711
13         0.000000
14         0.000000
15         0.000000
16         0.000000
17         0.000000
18         0.004528
19         0.018557
20         0.000000
21         0.014838
22         0.000000
23         0.000000
24         0.000000
25         0.000000
26         0.000000
27         0.000000
28         0.000000
29         0.015525
             ...   
4072926    0.000000
4072927    0.000000
4072928    0.000000
4072929    0.000000
4072930    0.000000
4072931    0.002224
4072932    0.000000
4072933    0.000000
4072934    0.000000
4072935    0.000000
4072936    0.000000
4072937    0.000000
4072938    0.000000
4072939    0.000000
4072940    0.000000
4072941    0.000000
4072942    0.000000
4072943    0.000000
4072944    0.000000


0          0.001745
1          0.003974
2          0.000000
3          0.000000
4          0.000000
5          0.000000
6          0.000000
7          0.000000
8          0.000775
9          0.000000
10         0.000000
11         0.000000
12         0.001874
13         0.000000
14         0.000000
15         0.000000
16         0.000000
17         0.000000
18         0.000000
19         0.000969
20         0.000000
21         0.007022
22         0.000000
23         0.000000
24         0.000000
25         0.000000
26         0.000000
27         0.000000
28         0.000000
29         0.000000
             ...   
4072926    0.000000
4072927    0.000000
4072928    0.000000
4072929    0.000000
4072930    0.000000
4072931    0.004071
4072932    0.000000
4072933    0.000000
4072934    0.000000
4072935    0.000000
4072936    0.000000
4072937    0.000000
4072938    0.000000
4072939    0.000000
4072940    0.000000
4072941    0.000000
4072942    0.000000
4072943    0.000000
4072944    0.000000
