# Pattern Recognition in Financial Data

## Author: Adamantios Ntakaris

### Week 1 - Seminar 1

##### Business School - University of Edinburgh 





In [None]:
# Install hdf5 library by interacting with the interpreter 
import sys
!{sys.executable} -m pip install hdf5storage
!{sys.executable} -m pip install tensorflow

# Import Libraries 
import hdf5storage
import pandas as pd
import keras
import tensorflow as tf

# 1. Read data

In [None]:
# Load the MATLAB (.mat) limit-order-book snapshot file and wrap the array
# stored under its 'LOB' key in a DataFrame (columns are unnamed at this point).
lob_mat_path = 'S092215-v50-AMZN_OCT2_states.mat'
LOB_input = hdf5storage.loadmat(lob_mat_path)
lob_states = LOB_input['LOB']
LOB_Amazon = pd.DataFrame.from_dict(lob_states)

In [None]:
# Preview the first five rows of the freshly loaded frame.
# Question for the reader: why is this representation of the data problematic?
LOB_Amazon.head(n=5)

In [None]:
# Add Column Titles
# Layout: three summary columns, followed by four fields for each of the
# ten order-book levels (Level 1 = best quotes ... Level 10 = deepest).
summary_columns = ["Time", "Mid_Price", "Spread"]
fields_per_level = ("AskPrice", "AskVolume", "BidPrice", "BidVolume")

level_columns = []
for level in range(1, 11):
    for field in fields_per_level:
        level_columns.append(field + str(level))  # e.g. "AskPrice1", "BidVolume10"

LOB_Amazon.columns = summary_columns + level_columns

In [None]:
# Preview the first five rows again, now with descriptive column titles.
# Much better now?
LOB_Amazon.head(n=5)

# 2. Convert epoch time to readable format

In [None]:
# Conversion module
import datetime

# Version 1.0: epoch timestamps -> datetime objects.
# The "Time" values are divided by 1000.0 because they are treated as
# milliseconds (1 sec = 1000 ms). NOTE: fromtimestamp() converts to the local
# time zone of the machine running the kernel.
new_time_column = [
    datetime.datetime.fromtimestamp(item / 1000.0)
    for item in LOB_Amazon["Time"]
]

# Version 2.0: datetime objects -> readable "HH:MM:SS.ffffff" strings.
# The seconds field (%S) must be part of the format: without it the string
# would jump from minutes straight to microseconds, and every timestamp inside
# the same minute would lose the information of which second it belongs to.
readable_time = [
    current_row.strftime("%H:%M:%S.%f")
    for current_row in new_time_column
]

In [None]:
# Replace Unix time with the Readable Time Format (=Version 2.0).
# This overwrites the numeric "Time" column with the formatted strings held in
# readable_time, so the original epoch values are no longer available in
# LOB_Amazon after this cell has run.
LOB_Amazon['Time'] = readable_time 

In [None]:
# How does it look now? Show the first ten rows with the readable time column.
LOB_Amazon.iloc[:10]

# 3. Plot Time Series

In [None]:
# Plot Best Ask and Bid Prices
import matplotlib.pyplot as plt

# Time-framing: keep only the rows at positions 86000 .. 562499 of the book.
LOB_Amazon_Sample = LOB_Amazon.iloc[86000:562500]

# Draw both level-1 quotes against the readable time column; pandas returns
# the Axes it drew on, which is then used for the legend and the y-label.
best_quote_columns = ["AskPrice1", "BidPrice1"]
quotes_axes = LOB_Amazon_Sample.plot(
    x="Time",
    y=best_quote_columns,
    figsize=(18, 9),
    linewidth=1.0,
    fontsize=20,
)

quotes_axes.legend(loc=1, prop={'size': 20})
# Changes the global default font size; it is applied after this figure was created.
plt.rcParams.update({'font.size': 10})
quotes_axes.set_ylabel('Amazon Stock Price', fontsize=30);

In [None]:
# Plot Mid-Price
# The line colour is passed as a plain colour string. (A set literal such as
# {"green"} only works by accident, because pandas accepts any one-element
# collection; a set is unordered and is not a documented colour type.)
LOB_Amazon_Sample.plot(x="Time",
                       y=["Mid_Price"], figsize=(18, 9), linewidth=2.0,
                       color="green",
                       fontsize=20
                      )
plt.legend(loc=1, prop={'size': 20})
# Changes the global default font size; it is applied after this figure was created.
plt.rcParams.update({'font.size': 20})
plt.ylabel('Mid Price',fontsize=30)
plt.show()

# 4. Feature Engineering

In [None]:
# Lagged Features
# For each best-quote price, add a column holding the value of the previous
# row (shift by one); the first row of every lagged column is therefore NaN.
lag_targets = (
    ('AskPrice1', 'Ask_lag_1'),
    ('BidPrice1', 'Bid_lag_1'),
)
for price_column, lagged_column in lag_targets:
    LOB_Amazon[lagged_column] = LOB_Amazon[price_column].shift(1)

In [None]:
# Rolling Window
# Mean of the best ask price over a window of 4 consecutive rows (the current
# row and the three before it); the first three rows have no full window -> NaN.
ask_price_window = LOB_Amazon['AskPrice1'].rolling(window=4)
LOB_Amazon['rolling_mean'] = ask_price_window.mean()

In [None]:
# Preview the first five rows, including the newly engineered columns.
LOB_Amazon.head(n=5)

# 5. Prepare Train & Test sets

In [None]:
# Identify the Columns that will be utilized as Inputs to the Regressors.
#
# LOB_Amazon_Sample was sliced out of LOB_Amazon *before* the lagged and
# rolling-mean columns were added to LOB_Amazon, so the sample itself does not
# contain them. Pull the same rows (matched by index label) from the current
# LOB_Amazon so that the engineered features are part of the model inputs.
sample_with_features = LOB_Amazon.loc[LOB_Amazon_Sample.index]

# Rows: skip the first 4 and drop the last one.
# Columns: position 3 onwards, i.e. everything after Time, Mid_Price and Spread.
Feature_Matrix = sample_with_features.iloc[4:-1,3:]
Mid_Price_Series = LOB_Amazon_Sample['Mid_Price'].iloc[4:]

# Extract Targets/Labels
Forecasting_Variable = Mid_Price_Series

print('Mid_Price_Series length:',len(Mid_Price_Series))
print('Feature_Matrix length:', len(Feature_Matrix))

# Why I did this iloc to Feature_Matrix & Mid_Price_Series?

In [None]:
# Calculate Stock Returns for Label Preparation

# Returns: relative change of the mid-price with respect to the previous row.
previous_mid_price = Mid_Price_Series.shift(1)
returns = Mid_Price_Series.div(previous_mid_price) - 1

# Indexing (i.e., identify the type of differences).
# Non-negative returns form the "up" class, strictly negative ones the "down"
# class. NaN compares False in both tests, so it ends up in neither group.
is_non_negative = (returns >= 0).values
is_negative = (returns < 0).values
idx_pos = returns.index[is_non_negative]
idx_neg = returns.index[is_negative]

# Based on Indexing convert/label returns: +1 for up, -1 for down.
returns.loc[idx_pos] = 1
returns.loc[idx_neg] = -1

# Remove NaN values (the first row has no previous mid-price to compare with).
returns = returns.dropna()
print(returns.shape)

In [None]:
# Time Series K-Fold Cross Validation
#
# For every fold produced by TimeSeriesSplit, three classifiers are fitted on
# the training part and the number of misclassified test points is printed:
#   1. Gaussian Naive Bayes   (full fold)
#   2. Bernoulli Naive Bayes  (full fold)
#   3. SVM with RBF kernel    (only the leading fraction of the fold)
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.svm import SVC

# Fraction of each fold's train/test indices that is handed to the SVM, and the
# factor that scales the SVM error count back up to the full fold size.
# Both are derived from one constant so they cannot drift apart.
SVM_SUBSAMPLE_FRACTION = 0.1
SVM_SCALE_UP_FACTOR = int(round(1 / SVM_SUBSAMPLE_FRACTION))

MISLABELED_MESSAGE = "Number of mislabeled points out of a total %d points : %d"

# Features and labels are paired purely by row position below, so a length
# mismatch would silently attach labels to the wrong rows.
assert len(Feature_Matrix) == len(returns), (
    "Feature_Matrix (%d rows) and returns (%d rows) must have the same length"
    % (len(Feature_Matrix), len(returns))
)

tscv = TimeSeriesSplit()
print(tscv)
print(62*'#')

counter = 0
for train_index, test_index in tscv.split(Feature_Matrix):
    counter += 1
    print('Fold %d:' %counter)
    print("TRAIN:", len(train_index), "TEST:", len(test_index))

    X_train, X_test = Feature_Matrix.iloc[train_index,:], Feature_Matrix.iloc[test_index,:]
    y_tr, y_te = returns.iloc[train_index], returns.iloc[test_index]
    print(62*'-')

    #---------------------------------------------------------------------------------------------#
    #---------------------------------------- Classifiers ----------------------------------------#
    #---------------------------------------------------------------------------------------------#

    ## ----------------------------------------------------------------------------------------- ##
    # 1. Naive Bayes - Gaussian
    gnb = GaussianNB()

    # Predicted Labels
    y_pr_gnb = gnb.fit(X_train, y_tr).predict(X_test)
    print('1. Naive Bayes: Gaussian Case')
    print(MISLABELED_MESSAGE % (X_test.shape[0], (y_te != y_pr_gnb).sum()))
    print(62*'-')

    ## ----------------------------------------------------------------------------------------- ##
    # 2. Naive Bayes - Bernoulli
    bnb = BernoulliNB()

    # Predicted Labels
    y_pr_bnb = bnb.fit(X_train, y_tr).predict(X_test)
    print('2. Naive Bayes: Bernoulli Case')
    print(MISLABELED_MESSAGE % (X_test.shape[0], (y_te != y_pr_bnb).sum()))
    print(62*'-')

    ## ----------------------------------------------------------------------------------------- ##
    # 3. Support Vector Machines - SVM
    train_portion = np.int_(len(train_index) * SVM_SUBSAMPLE_FRACTION)
    test_portion = np.int_(len(test_index) * SVM_SUBSAMPLE_FRACTION)

    # Consider a smaller dataset due to SVM convergence challenges (i.e., time limitations).
    # From here on train_index / test_index (and X_train, X_test, y_tr, y_te)
    # refer to the leading subset of the fold, not to the full fold.
    train_index = train_index[0:train_portion]
    test_index = test_index[0:test_portion]

    X_train, X_test = Feature_Matrix.iloc[train_index,:], Feature_Matrix.iloc[test_index,:]
    y_tr, y_te = returns.iloc[train_index], returns.iloc[test_index]

    svc = SVC(kernel='rbf')

    # Predicted Labels
    y_pr_svc = svc.fit(X_train, y_tr).predict(X_test)
    print('3. SVM: ')
    print("TRAIN_subset:", len(train_index), "TEST_subset:", len(test_index))
    print(MISLABELED_MESSAGE % (X_test.shape[0], (y_te != y_pr_svc).sum()))
    # Why 10? It is 1 / SVM_SUBSAMPLE_FRACTION: a rough extrapolation of the
    # subset error count to the size of the full test fold.
    print('Full size data equivalent mislabeled points:', (y_te != y_pr_svc).sum() * SVM_SCALE_UP_FACTOR)
    print(62*'#')
    print('')