In [27]:
import warnings
warnings.filterwarnings('ignore') # Filter out warnings
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from scipy import stats
import math

In [28]:
trade_path = 'trade.csv'
quote_path = 'quote.csv'
df_quote = pd.read_csv(quote_path)
df_trade = pd.read_csv(trade_path) 

In [29]:
df_trade['NEW_TIME'] = df_trade['DATE'].map(str) + str("  ")+ df_trade['TIME_M'].map(str)
#df_trade.head(5)

In [30]:
df_quote['NEW_TIME'] = df_quote['DATE'].map(str) + str("  ")+ df_quote['TIME_M'].map(str)
#df_quote.head(5)

In [31]:
#df_trade.columns.values
#df_quote.columns.values

In [32]:
print("Originally, quote and trade csv each has {}, {} rows.".format(len(df_quote.index),len(df_trade.index)))

Originally, quote and trade csv each has 173887, 5875 rows.


In [33]:
# remove rows in the quote data where ASK=0 or BID=0 
df_quote = df_quote[df_quote['ASK']>0][df_quote['BID']>0][df_quote['BIDSIZ']>0][df_quote['ASKSIZ']>0]
# add a new time column
df_quote['Time'] = pd.to_datetime(df_quote['NEW_TIME'])
df_quote = df_quote.sort_values(by='Time').reset_index()
# add a spread column
df_quote['Spread'] = df_quote['ASK']-df_quote['BID']
print("after cleaning, now quote csv has {} rows.".format(len(df_quote.index)))

after cleaning, now quote csv has 172115 rows.


In [34]:
# df_trade = df_trade.loc[df_trade['DATE']==20160104]
# add a new time column
df_trade['Time'] = pd.to_datetime(df_trade['NEW_TIME'])
df_trade = df_trade.sort_values(by='Time').reset_index()

## Problem 3: 1) design a statistical test for Roll's model

#### We are only looking at transcation happening between 9：30 and 12:30 on Jan.4th, 2016.

In [35]:
# specify a timeframe
time_range = pd.date_range(start='2016-01-04 09:30',end='2016-01-04 15:30',periods=None,freq='0.1H')
print("there are {} intervals.".format(len(time_range.tolist())))

there are 61 intervals.


In [36]:
Spread_list, Cov_list = [], []
for i in range(len(time_range)-1):
    s = df_quote[(df_quote['Time']>time_range[i])&(df_quote['Time']<time_range[i+1])].Spread.mean()
    Spread_list.append(s)
    
    p_diff = df_trade[(df_trade['Time']>time_range[i])&(df_trade['Time']<time_range[i+1])].PRICE.diff().values
    Cov = np.corrcoef(p_diff[1:len(p_diff)-1],p_diff[2:len(p_diff)])[0,1]
    Cov_list.append(2*np.sqrt(np.abs(Cov)))

In [80]:
# run the t-test
Result = stats.ttest_ind(Spread_list,Cov_list)
print("the p-value of the t test result is {} .".format(Result[1]))

the p-value of the t test result is 1.1528482086579586e-11 .


### Conclusion: Because the p-value is very close to 0 (way smaller than 0.05), we conclude that Roll's model doesn't fit with the data we used.

##  2) design a statistical test for the generalized Roll's model

#### cov(d_i,d_i-1) = -(s/2)*(lambda+s/2) ....[1] and var(d) = (s/2)^2 + (s/2+lambda)^2 + var(u)....[2]
#### where u: variance of non-trade information term; lambda: info content about trade

In [81]:
start = '2016-01-04 09:30'
end = '2016-01-04 15:30'
new_trade = df_trade[(df_trade['Time']>=start)&(df_trade['Time']<=end)]

#### suppose we know var(u), then plug equation [1] into [2],
#### then var(d)=(s/2)^2 + (cov(d_i,d_i-1)/(-s/2))^2 + var(u) ... [3], and we can get estimated s from [3]
#### which is estimated_s = sqrt( 2*(var(d)-var(u)) +/- 2*sqrt( (var(d)-var(u))^2 - 4*cov(d_i,d_i-1)  ) )

### Reasoning: for a couple of assumed var_u, calculate estimated_spread for each var_u, find the best fit to the real spread (by running statistical test), then the best var_u will give us the best lambda

In [112]:
var_u_list = [0.6,0.4,0.3,0.1,0.05,0.01,0.005,0.001,0.0005,0.0001,0]
Result_dict = dict()  # {var_u: p_value}

for var_u in var_u_list:
    s_hat_list = []
    
    # Get estimated spread
    for i in range(len(time_range)-1):
        # compute cov(d_i,d_i-1) term
        p_diff = df_trade[(df_trade['Time']>time_range[i])&(df_trade['Time']<time_range[i+1])].PRICE.diff().values  
        cov = np.corrcoef(p_diff[1:len(p_diff)-1],p_diff[2:len(p_diff)])[0,1]
        # compute var(d) term
        var_d = np.var(p_diff[1:])
        # compute estimated_s
        s_hat = math.sqrt(2*(var_d-var_u)+2*np.sqrt((var_d-var_u)**2-4*cov))
        s_hat_list.append(s_hat)
        # compute the corresponding lambda_hat
        lambda_hat = ((-cov)/(s_hat/2))-(s_hat/2)  
        #lambda_hat = np.sqrt(var_d - var_u - (s_hat/2)**2)-s_hat/2
    
    # replace nan with 0! 
    for (i, item) in enumerate(s_hat_list):
        if math.isnan(item):
            s_hat_list[i] = 0.00
    
    # compare it with real spread
    r = stats.ttest_ind(Spread_list,s_hat_list)
    Result_dict[var_u] = r[1],lambda_hat

In [114]:
# visualize our result
for k,v in Result_dict.items():
    print("var_u: {}, (p_value,lambda_hat): {}".format(k,v))

var_u: 0.6, (p_value,lambda_hat): (3.3820165292057212e-11, -0.08651616646006477)
var_u: 0.4, (p_value,lambda_hat): (7.5418399739238733e-09, -0.17324839204442416)
var_u: 0.3, (p_value,lambda_hat): (1.3234439784712371e-07, -0.22059808216624305)
var_u: 0.1, (p_value,lambda_hat): (4.3876928359647279e-05, -0.32273730757483637)
var_u: 0.05, (p_value,lambda_hat): (0.00017857558644884277, -0.34959407265208725)
var_u: 0.01, (p_value,lambda_hat): (0.00053069155772554079, -0.37137966221697738)
var_u: 0.005, (p_value,lambda_hat): (0.00060651927752655623, -0.37411963081237543)
var_u: 0.001, (p_value,lambda_hat): (0.00067460360106771141, -0.37631411743500182)
var_u: 0.0005, (p_value,lambda_hat): (0.00068361498323359408, -0.37658858313263438)
var_u: 0.0001, (p_value,lambda_hat): (0.00069090744863192733, -0.37680818029486335)
var_u: 0, (p_value,lambda_hat): (0.00069274223673362412, -0.3768630829976905)


### Conclusion: When lambda=-0.3768, var_u=0.0001, Generalized Roll's model fits the best. In general, it fits a lot better than the Roll's model.

## 3) test for Markov Chain