In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import math
from tensorflow import keras
from sklearn.preprocessing import normalize
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.models import load_model
from collections import defaultdict

In [2]:
def loadDataFiles():
    market_df = pickle.load(open('Market_train',"rb"))
    news_df = pickle.load(open("News_train", "rb"))
    print('Finished loading datafiles!')
    return market_df, news_df


In [3]:
def preprocess_data(mkt_df, news_df):
    mkt_df['time'] = pd.to_datetime(mkt_df['time'])
    news_df['time'] = pd.to_datetime(news_df['time'])
    mkt_df['time'] = mkt_df['time'].dt.date
    news_df['time'] = news_df['time'].dt.date
    assetCodes = []
    index = 0
    for x in news_df['assetCodes']:
        x = x.split(',')[0].split("'")[1]
        assetCodes.append(x)
    news_df['assetCode'] = np.asarray(assetCodes)
    irrelevantColumns = ['sourceTimestamp', 'firstCreated', 'sourceId', 
                         'headline', 'provider', 'subjects', 'audiences',
                        'headlineTag', 'marketCommentary', 'assetCodes', 'assetName']
    news_df.drop(irrelevantColumns, axis=1, inplace=True)
    mkt_df.drop(['assetName'], axis=1, inplace=True)
    modifiednews = news_df.groupby(['time','assetCode'], sort=False).aggregate(np.mean).reset_index()
    
    # join news reports to market data, note many assets will have many days without news data
    merged = pd.merge(mkt_df, modifiednews, how='left', on=['time', 'assetCode'], copy=False) 
    merged = merged.fillna(0)
    print('Finished preprocessing data!')
    return merged


In [4]:
market_data, news_data = loadDataFiles()


Finished loading datafiles!


In [5]:
X = preprocess_data(market_data, news_data)


Finished preprocessing data!


In [6]:
X = X[X['returnsOpenNextMktres10'] >= -1]
X = X[X['returnsOpenNextMktres10'] <= 1]
y = X['returnsOpenNextMktres10']
print(len(y[y < 0]))
X.drop(['returnsOpenNextMktres10'], axis=1, inplace=True)
assetCodesAndTime = X.iloc[:, :2]
X = X.iloc[:, 2:]

1994287


In [7]:
def regularize(df):
    for column in df:
        colmin = np.amin(df[column])
        colmax = np.amax(df[column])
        df[column] = (df[column] - colmin) / (colmax - colmin)
    return df

In [8]:
X = regularize(X)

In [9]:
def saveModel(model, model_name):
    model.save(model_name + '.h5')

In [10]:
def loadModel(filename):
    model = load_model(filename)
    return model

In [11]:
# def createBiaser(X,y):
#     negCount = 0.0 # total negatives in the dataset
#     posCount = 0.0 # total positives in the dataset
#     totalPosScore = 0.0 #cumulative score of all fields that has been classified as positive in the entire dataset
#     totalNegScore = 0.0 # same for negative
#     isPos = False 
#     isNeg = False 
#     positiveScores = defaultdict(float) # cumulative score for each field that has been classified as positive stored in a dict
#     negativeScores = defaultdict(float) # same for negative
    
#     # initializing data
#     for index, row in y.iteritems():
#         if y[index] < 0:
#             negCount += 1
#             isNeg = True
#             isPos = False
#         else:
#             posCount += 1
#             isNeg = False
#             isPos = True
            
#         for feature in X.iloc[[index]]:
# #             print(X.iloc[index][feature])
#             if isPos == True:
#                 positiveScores[feature] += X.iloc[index][feature]
#                 totalPosScore += X.iloc[index][feature]
#             elif isNeg == True:
#                 negativeScores[feature] += X.iloc[index][feature]
#                 totalNegScore += X.iloc[index][feature]
          
#     return negCount, posCount, totalPosScore, totalNegScore, positiveScores, negativeScores

In [12]:
# X = regularize(X)
# def printData():
#     negCount, posCount, totalPosScore, totalNegScore, positiveScores, negativeScores = createBiaser(X[:1000],y[:1000])
#     print("Total Negatives: ", negCount)
#     print("Total Positives: ", posCount)
#     print("Total Positive Score: ", totalPosScore)
#     print("Total Negative Score: ", totalNegScore)
#     for feature in X.iloc[[1]]:
#         print("Feature: ", feature)
#         print("positiveScore: ", positiveScores[feature])
#         print("negativeScore: ", negativeScores[feature])

In [13]:
def splitDataset(X, y, split):
    index = int(split*len(y.index))
    y_train, y_test = np.split(y, [index])
    X_train, X_test = X.iloc[:index, :], X.iloc[index:, :]
    return X_train, y_train, X_test, y_test

In [14]:
X = regularize(X)
X_train, y_train, X_test, y_test = splitDataset(X, y, 0.7)
print(len(y_train[y_train < 0]))
print(len(y_test[y_test < 0]))
# negCount, posCount, totalPosScore, totalNegScore, positiveScores, negativeScores = createBiaser(X_train,y_train)
# loss = classify(negCount, posCount, totalPosScore, totalNegScore, positiveScores, negativeScores)

1374968
619319


In [15]:
# def classify(negCount, posCount, totalPosScore, totalNegScore, PositiveScore, negativeScore):
#     loss = []
#     totalPosScore = 0.0
#     totalNegScore = 0.0
#     priorPos = math.log(posCount) - math.log(posCount + negCount)
#     priorNeg = math.log(negCount) - math.log(posCount + negCount)

#     for feature in X_test.iloc[[1]]:
#         if feature in positiveScores:
#             if positiveScores[feature] != 0 and totalPosScore != 0:
#                 posteriorPos = math.log(positiveScores[feature]) - math.log(totalPosScore)
#             else:
#                 posteriorPos = 0.0
#         totalPosScore += posteriorPos

#         if feature in negativeScores:
#             if negativeScores[feature] != 0 and totalNegScore != 0:
#                 posteriorNeg = math.log(negativeScores[feature]) - math.log(totalNegScore)
#             else:
#                 posteriorNeg = 0.0
#         totalNegScore += posteriorNeg

#     totalPosScore += priorPos
#     totalNegScore += priorNeg
#     for val in y_test:
#         if totalPosScore > totalNegScore:
#             loss.append(val - totalPosScore)
#         else:
#             loss.append(val - totalNegScore)
        
#     return loss

In [16]:
# loss = classify(negCount, posCount, totalPosScore, totalNegScore, positiveScores, negativeScores)
# # for item in loss:
# #     print(item)

# mean = sum(loss) / float(len(loss))
# print(mean)

In [17]:
def classwify(y):

    y[y > 0] = 1
    y[y <= 0] = 2
    
    return y

In [18]:
y_train_new = np.copy(y_train)
y_test_new = np.copy(y_test)
y_train_new = classwify(y_train)
y_test_new = classwify(y_test)

In [19]:
clf = GaussianNB()
clf.fit(X_train, y_train_new)
probs = clf.score(X_test, y_test_new)
print(probs)

ValueError: Unknown label type: (array([-0.99871791, -0.99814278, -0.99382127, ...,  0.99264803,
        0.99424496,  0.99440021]),)