# Hidden Markov Model

In [1]:
import pandas as pd
from datetime import datetime
from matplotlib import pyplot as plt
%matplotlib inline
from  statsmodels.tsa.stattools import adfuller,kpss, acf
import statsmodels.api
import numpy as np
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error,mean_absolute_error, mean_squared_log_error,balanced_accuracy_score
from statistics import mean
import statsmodels.api as sm
from sklearn import preprocessing
import seaborn as sns 
import gapminder
import sys
import warnings
import time
from pomegranate import *
from sklearn.metrics import accuracy_score, f1_score
import json
from math import e

#if not sys.warnoptions:
#    warnings.simplefilter("ignore")

In [2]:
def ema(x,window):
    '''exponential moving average'''
    return np.array(x.ewm(span=window).mean())


def bollinger(x,window):
    '''Bolinger bands & moving average'''
    rolling_mean = x.rolling(window).mean()
    rolling_std = x.rolling(window).std()
    roll_mean = rolling_mean
    higher_band = rolling_mean + (rolling_std * 2)
    lower_band = rolling_mean - (rolling_std * 2)
    return [lower_band[window:],roll_mean[window:],higher_band[window:]]


def macd(v): 
    '''moving average convergence divergence'''
    return ema(v,12)-ema(v,26)

def momentum(x,lags):
    return (1-x/x.shift(lags))*100

def a_d(close,low,high,vol):
    '''accumulation/distribution indicator''' 
    return ((close-low)-(high-close)*vol)/(high-low)

def trend_deterministic(dat):
    '''indicators which help to determine price direction, i.e. 1 stands for up, -1 for down'''
    dat['tr-ema']=np.where(dat['ema']<dat['mid'],1,-1)
    dat['tr-macd']=np.where(dat[['macd']]>dat[['macd']].shift(1),1,-1)
    dat['tr-momentum']=np.where(dat['momentum']>0,1,-1)
    return dat
        
def return_size(df):
    df['return']=df.loc[:,'close']-df.loc[:,'close'].shift(1)
    #df['size']=np.nan
    #for i in range(1,len(df['return'])): 
    #    if df['return'][i]>=0:
    #        df.iloc[i,-1]='P'#for positive
    #    if df['return'][i]<0 :
    #        df.iloc[i,-1]='N'#for negative
    df=df.dropna()
    return df

def plot_return(df):
    plt.figure(figsize=(15,2))
    if df.name=='Hour':
        plt.plot(df[['date']][8:15],df[['return']][8:15] )
    else:
        plt.plot(df[['date']][:30],df[['return']][:30] )
    plt.title(str(df.name)+' Return')
    plt.xlabel('time')
    plt.ylabel('return')
    plt.subplot()
    
def plot_close(df):
    plt.figure(figsize=(15,2))
    plt.plot(df[['date']],df[['close']])
    plt.title(str(df.name)+' Price Close')
    plt.xlabel('time')
    plt.ylabel('price')
    plt.subplot()

def rename_col(df):
    df.columns=['date','time','open','high','low','close','vol']
    return df

def format_time(df):
    df['date']=df['date']+' '+df['time']
    df=df.drop(columns=['time'])
    df['date']=[datetime.strptime(i, '%d/%m/%y %H:%M:%S') for i in list(df['date'])]
    return df

#### Open and format data

In [3]:
minute=pd.read_csv('C:\\Users\\Dell\\Desktop\\coursework\\Stonks-2\\AMZN_min.csv')
minute=minute.iloc[:,2:]
#day.columns=['date','open','high','low','close','vol']
minute=rename_col(minute)
#hour=rename_col(hour)
#day['date']=[datetime.strptime(i, '%Y-%m-%d') for i in list(day['date'])]
minute=format_time(minute)


In [4]:
#generate hourly data
hourly = minute.copy()
hourly['year'] = [i.year for i in hourly['date']]
hourly['month'] = [i.month for i in hourly['date']]
hourly['day'] = [i.day for i in hourly['date']]
hourly['hour'] = [i.hour for i in hourly['date']]
hour_low = hourly.groupby(['year', 'month', 'day', 'hour']).min()['low']
hour_high = hourly.groupby(['year', 'month', 'day', 'hour']).max()['high']
hour_open = hourly.groupby(['year', 'month', 'day', 'hour']).first()['open']
hour_date = hourly.groupby(['year', 'month', 'day', 'hour']).first()['date']
hour_close=hourly.groupby(['year', 'month', 'day', 'hour']).last()['close']
hour_vol=hourly.groupby(['year', 'month', 'day', 'hour']).sum()['vol']
AMZNhour=pd.DataFrame([hour_open,hour_low, hour_high, hour_close, hour_vol]).transpose()
AMZNhour=pd.concat([hour_date, AMZNhour], axis=1)
#AMZNhour.to_excel('AMZNhour.xlsx')

In [5]:
#generate daily data
daily = minute.copy()
daily['year'] = [i.year for i in daily['date']]
daily['month'] = [i.month for i in daily['date']]
daily['day'] = [i.day for i in daily['date']]
daily_low = daily.groupby(['year', 'month', 'day']).min()['low']
daily_high = daily.groupby(['year', 'month', 'day']).max()['high']
daily_open = daily.groupby(['year', 'month', 'day']).first()['open']
daily_date = daily.groupby(['year', 'month', 'day']).first()['date']
daily_close=daily.groupby(['year', 'month', 'day']).last()['close']
daily_vol=daily.groupby(['year', 'month', 'day']).sum()['vol']
AMZNdaily=pd.DataFrame([daily_open,daily_low, daily_high, daily_close, daily_vol]).transpose()
AMZNdaily=pd.concat([daily_date, AMZNdaily], axis=1)
for date in range(AMZNdaily.shape[0]):
    AMZNdaily.iloc[date,0] = AMZNdaily.iloc[date,0].strftime("%d.%m.%Y %H:%M:%S")
    AMZNdaily.iloc[date,0] = str(AMZNdaily.iloc[date,0])[:10]
AMZNdaily.head(3)
#AMZNdaily.to_excel('AMZNdaily.xlsx')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date,open,low,high,close,vol
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019,2,19,19.02.2019,1601.0,1600.56,1634.0,1627.23,1888575.0
2019,2,20,20.02.2019,1630.0,1610.12,1634.93,1622.12,1678423.0
2019,2,21,21.02.2019,1619.85,1600.91,1623.56,1619.49,1881385.0


In [4]:
day=pd.read_excel('C:\\Users\\Dell\\Desktop\\coursework\\Stonks-2\\AMZNdaily.xlsx')
hour=pd.read_excel('C:\\Users\\Dell\\Desktop\\coursework\\Stonks-2\\AMZNhour.xlsx')
#day['date']=[datetime.strptime(i, '%Y-%m-%d') for i in list(day['date'])]
#hour['date']=[datetime.strptime(i, '%d/%m/%y %H:%M:%S') for i in list(hour['date'])]
day['return']=day.loc[:,'close']-day.loc[:,'close'].shift(1)
hour['return']=hour.loc[:,'close']-hour.loc[:,'close'].shift(1)
minute['return']=minute.loc[:,'close']-minute.loc[:,'close'].shift(1)
day=day.iloc[1:,:]
hour=hour.iloc[1:,:]
minute=minute.iloc[1:,:]
day['frac_high'] = (day['high'] - day['open'])/day['open']
day['frac_low'] = (day['low'] - day['open'])/day['open']

#day=return_size(day)
#hour=return_size(hour)

In [5]:
#overnight
day['overnight'] = day.loc[:,'open'] - day.loc[:,'close'].shift(1)
day.loc[:,'open'] = day.loc[:,'close'].shift(1)
day=day.iloc[1:,:]

time_m=[str(j)[-8:] for j in minute['date'].to_list()]
indices_end_m = [v for v, x in enumerate(time_m) if x == "16:00:00"]
for ind in range(0, len(indices_end_m) - 1):
    minute.iloc[indices_end_m[ind] + 1,1] = minute.iloc[indices_end_m[ind],4]
    
time_h=[str(p)[-8:] for p in hour['date'].to_list()]
indices_end_h = [y for y, u in enumerate(time_h) if u == "16:00:00"]
for ind_ in range(0, len(indices_end_h) - 1):
    hour.iloc[indices_end_h[ind_] + 1,1] = hour.iloc[indices_end_h[ind_],4]

In [None]:
#plot for hourly prices is for 1 day, from 11am to 17pm
#plot fot minute prices and returns is for 1 day, from 10am to 16 pm 
day.name='Day'
hour.name='Hour'
minute.name='Minute'

plot_close(day)
plt.figure(figsize=(15,2))
plt.plot(hour[['date']][8:15],hour[['close']][8:15])
plt.title(str(hour.name)+' Price Close')
plt.xlabel('time')
plt.ylabel('price')
plt.subplot()

plt.figure(figsize=(15,2))
plt.plot(minute[['date']][:380],minute[['close']][:380])
plt.title(str(minute.name)+' Price Close')
plt.xlabel('time')
plt.ylabel('price')
plt.subplot()

plot_return(day)
plot_return(hour)
plot_return(minute)

In [None]:
plt.figure(figsize=(15,2))
plt.plot(day[['date']][:25],day[['return']][:25])
plt.xlabel('time')
plt.ylabel('dollars');

In [None]:
whole_range = pd.read_csv('AMZN_all_dates.csv')
whole_range['Date']=[datetime.strptime(i, '%Y-%m-%d') for i in list(whole_range['Date'])]
whole_range['year']=[i.year for i in whole_range['Date']]
whole_range['diff']=whole_range['Close'].diff(1)
whole_range.head(3)

In [None]:
fig, ax = plt.subplots(figsize=(15,6))
ax.plot(whole_range['Date'], whole_range['Close'].astype(int))
ax.set_yticks(range(0, int(whole_range['Close'].max()),200))
ax.set_xlabel('year')
ax.set_ylabel('AMZN stock price')

In [None]:
fig, ax = plt.subplots(figsize=(15,5))a
ax.plot(whole_range['Date'],whole_range['diff'])
#ax.set_yticks(range(int(whole_range['diff'].min()), int(whole_range['diff'].max()),40))
ax.set_xlabel('year')
ax.set_ylabel('AMZN stock return')

#### EDA

In [None]:
print('sd deviations are: daily {},hourly {}, minute {}'.format(day.iloc[-29:,-1].std(),hour.iloc[-174:,-1].std(), minute.iloc[-8144:,-2].std()))
print('means are: daily {},hourly {}, minute {}'.format(day.iloc[-29:,-1].mean(),hour.iloc[-174:,-1].mean(), minute.iloc[-8144:,-2].mean()))
#больше сглаживания

In [None]:
plt.figure(figsize=(15,3))
plt.plot(day.iloc[-29:,0], day.iloc[-29:,-1])
plt.tight_layout()
plt.xlabel('time')
plt.ylabel('return USD');
plt.figure(figsize=(15,3))
plt.plot(hour.iloc[-174:,0], hour.iloc[-174:,-1])
plt.tight_layout()
plt.xlabel('time')
plt.ylabel('return USD');
plt.figure(figsize=(15,3))
plt.plot( minute.iloc[-8144:,0],  minute.iloc[-8144:,-2])
plt.tight_layout()
plt.xlabel('time')
plt.ylabel('return USD');

In [None]:
#correlation between lags
for j in range(1,4):
    print('DAY correlation for {} lags ={}'.format(j,day['return'].corr(day['return'].shift(j))))
print('------------------------------------------------')
for j in range(1,4):
    print('HOUR correlation for {} lags ={}'.format(j,hour['return'].corr(hour['return'].shift(j))))
print('------------------------------------------------')
for j in range(1,4):
    print('MINUTE correlation for {} lags ={}'.format(j,minute['return'].corr(minute['return'].shift(j))))
#для дневдных зависит только от предыдущего лага как по Маркову

In [None]:
#autocorrelation is small, quickly converges to zero and stays around it
min_r=statsmodels.tsa.stattools.acf(np.array(minute['return'][1:]), nlags=5, unbiased=True)
day_r=statsmodels.tsa.stattools.acf(np.array(day['return'][1:]), nlags=5, unbiased=True)
hour_r=statsmodels.tsa.stattools.acf(np.array(hour['return'][1:]), nlags=5, unbiased=True)
_corr=pd.DataFrame([day_r,hour_r, min_r]).transpose()
_corr.columns=['day return', 'hour return','minute return']
_corr

In [None]:
minute['return'].corr(minute['overnight'])

#### Test on stationarity 

In [None]:
#Lags chosen through AIC minimization
#All tests show stationarity
adf=adfuller(day['return'],regression='c',autolag='AIC')
print('Daily - ADF - p-value = {}, number of lags = {}'.format(adf[1],adf[2]))
kps=kpss(day['return'],regression='ct')
print('Daily -KPSS - p-value = {}, number of lags = {}'.format(kps[1],kps[2]))
print('----------------------------------------------------')
adf=adfuller(hour['return'],regression='c',autolag='AIC')
print('Hourly - ADF - p-value = {}, number of lags = {}'.format(adf[1],adf[2]))
kps=kpss(hour['return'],regression='ct')
print('Hourly -KPSS - p-value = {}, number of lags = {}'.format(kps[1],kps[2]))
print('----------------------------------------------------')
adf=adfuller(minute['return'][:5000],maxlag=40,regression='ct',autolag='AIC')#since high computational cost
print('Minute - ADF - p-value = {}, number of lags = {}'.format(adf[1],adf[2]))
kps=kpss(minute['return'][:5000],regression='ct')
print('Minute -KPSS - p-value = {}, number of lags = {}'.format(kps[1],kps[2]))

In [None]:
'''For prediction of continuous data, I need to know pdf of returns. So here I want to see if they can be approximated by normal
distribution. Minute (2nd histogram) seems like normal. For others, I would use Mixture models (combine pdfs of normal 
distributions with different parameters with different weights)'''
day[['return']].hist(bins=40);
hour[['return']].hist(bins=40);
minute[['return']].hist(bins=40);

In [None]:
#close
day['overnight']=day.loc[:,'open']-day.loc[:,'close'].shift(1)


In [None]:
day=day.iloc[1:,:]
day['overnight'].corr(day['close'])

### Data preparation

#### Split

In [6]:
def split(df):
    ind_train = int(df.shape[0] * 0.7)
    margin = int(df.shape[0] * 0.01) # as we implement embargoing
    #ind_cv = int(df.shape[0] * 0)
    train = df.iloc[:ind_train]
   # cv = df.iloc[ind_train+margin:ind_train+margin+ind_cv]
    test = df.iloc[ind_train+margin:day.shape[0]]
    return train,  test
#day_cv.to_csv('day_cv.csv')
#day_test.to_csv('day_test.csv')

In [7]:
def trend(data, step, mode):
    df = data.copy()
    df['sign'] = np.nan
    for row in range(df.shape[0]):
        if df.iloc[row, 6] > 0:
            df.iloc[row, -1] = 1
        else:
            df.iloc[row, -1] = 0
    df_incr = []
    df_decr = []
    d = 0
    k = 0
    df['sum'] = df['sign'].rolling(window = step).sum()
    while d < df.shape[0] - 1:
        if df.iloc[d,-1] == step and df.iloc[d + 1,-1] != step:
            df_incr.append(df.iloc[k+1:d+1,:])
            k = d
        elif df.iloc[d,-1] == 0 and df.iloc[d + 1,-1] != 0:
            df_decr.append(df.iloc[k+1:d+1, :])
            k = d
        d += 1
        
    df_pos_fin = df_incr[0].iloc[:,:]
    for entry in range(1,len(df_incr)):
        df_pos_fin = df_pos_fin.append(df_incr[entry])
    
    df_neg_fin = df_decr[0].iloc[:,:]
    for entry_ in range(1,len(df_decr)):
        df_neg_fin = df_neg_fin.append(df_decr[entry_])

    if mode == 'pos':
        return df_pos_fin
    elif mode == 'neg':
        return df_neg_fin

In [8]:
#day_pos, day_neg = trend(day, 8) #max 8
#day_train_pos, day_cv_pos, day_test_pos = split(day_pos)
#day_train_neg, day_cv_neg, day_test_neg = split(day_neg)
#a, b = trend(day, 3, 'pos')
#a.head(20)

#### HMM -- day -- univariate -- close - close -- long sequence

In [9]:
 def init_parameters(states):
    pi =  np.random.rand(states)
    pi = pi/pi.sum()
    end =  np.random.rand(states)
    end = end/end.sum()
    trans = np.random.rand(states, states)
    for row in range(trans.shape[0]):
        trans[row] = trans[row] / trans[row].sum()
    return pi, end, trans

def make_cv(df, step, col_n, mode):
    cv = []
    a = step - 1 
    b = 0
    if mode == 'neg':
        while a < df.shape[0]:
            if df.iloc[a,-1] == 0:
                arr = np.array(df.iloc[b:a - (step - 1), col_n])
                if arr.size > 1:
                    cv.append(arr)
                b = a + 1
            a += 1
    if mode == 'pos':
        while a < df.shape[0]:
            if df.iloc[a,-1] == step:
                arr = np.array(df.iloc[b:a - (step - 1), col_n])
                if arr.size > 1:
                    cv.append(arr) #close price
                b = a + 1
            a += 1
    return cv

In [10]:
def test_hmm(df_test, model_neg, model_pos, step, col_n, mode):
    try_test = make_cv(df_test, step, col_n, mode)
    lab_true = []
    lab_pred = []
    if mode == 'pos':
        for item in range(len(try_test)):
            lab_true.append('P')
    elif mode == 'neg':
        for item in range(len(try_test)):
            lab_true.append('N')
    for i in try_test:
        nprob = model_neg.forward(i)[-1]
        total_neg_prob=0
        for pr_neg in nprob:
            if pr_neg!=-np.Inf:
                total_neg_prob+=pr_neg
        pprob = model_pos.forward(i)[-1]
        total_pos_prob=0
        for pr_pos in pprob:
            if pr_pos!=-np.Inf:
                total_pos_prob+=pr_pos
        if total_neg_prob > total_pos_prob:
            lab_pred.append('N')
        else:
            lab_pred.append('P')
    
    return f1_score(lab_true, lab_pred, average='micro')

def final_check_day(mode, col_n, ind):
    if mode == 'neg':
        return test_hmm(day_neg_testsets[ind], day_neg_mod[ind], day_pos_mod[ind], day_neg_steps[ind], col_n, 'neg' )
    elif mode == 'pos':
        return test_hmm(day_pos_testsets[ind], day_neg_mod[ind],day_pos_mod[ind], day_pos_steps[ind],col_n, 'pos' )

In [11]:
#day_data_neg = list([np.array(day_neg_fin['overnight']), np.array(day_neg_fin['frac_high']), 
                   #  np.array(day_neg_fin['frac_low']),
                   #  np.array(day_neg_fin['return'])])
def train_hmm(data, mode, col, col_number):
    n_states = []
    n_steps = []
    n_pi = []
    n_end = []
    n_trans = []
    n_dists = []
    n_prob = []
    n_mod = []
    n_cv = []
    n_testsets = []
    for step in range(3, 6):
        df = trend(data, step, mode) 
        df_train, df_test = split(df)
        for states in range(3,9):
            for try_ in range(30):
                df_pi, df_end, df_trans = init_parameters(states)
                df_data = list()
                df_data.append(np.array(df_train[col]))
                dists = list()
                mean = df_train[col].mean()
                std = df_train[col].std()
                for i in range(states):
                    dists.append(NormalDistribution(numpy.random.randint(50,100)/100 * mean, numpy.random.randint(50,100)/100 * std))
                mod = HiddenMarkovModel.from_matrix(df_trans, dists, df_pi, df_end)
                mod.fit(df_data, algorithm='baum-welch');
                #test = make_cv(df_test, step, col_number, mode)#[:4]
                #prob = 0
                #for i in cv:
                 #   prob += mod.log_probability(i)
                n_states.append(states)
                n_steps.append(step)
                n_pi.append(df_pi)
                n_end.append(df_end)
                n_trans.append(df_trans)
                n_dists.append(dists)
               # n_prob.append(prob)
                n_mod.append(mod)
                #n_cv.append()
                n_testsets.append(df_test)

   # best_ind = n_prob.index(max(n_prob))
   # print(""""number of states = {}, number of steps = {}, starting probabilities = {}, end probabilities = {},
    #                  transition matrix = {}, emission matrix = {}, highest log_probability = {}"""
     #                 .format(n_states[best_ind], n_steps[best_ind], n_pi[best_ind], n_end[best_ind], n_trans[best_ind],
      #                        n_dists[best_ind], n_prob[best_ind]))
   # return n_mod[best_ind], n_testsets[best_ind]
    return n_states, n_steps, n_pi, n_end, n_trans, n_dists,  n_mod,  n_testsets

In [156]:
day_pos_states, day_pos_steps, day_pos_pi, day_pos_end, day_pos_trans, day_pos_dist, day_pos_mod, day_pos_testsets = train_hmm(day, 'pos', 'return', 6)
day_neg_states, day_neg_steps, day_neg_pi, day_neg_end, day_neg_trans, day_neg_dist, day_neg_mod, day_neg_testsets = train_hmm(day, 'neg',  'return', 6)

In [12]:
def best_model(mod_length, col_n, func):
    score = []
    for idx in range(mod_length):
        p1 = func('neg', col_n, idx)
        p2 = func('pos', col_n, idx)
        score.append(p1+p2)
    all_max = list(filter(lambda x: x> 1, score))
    max_ = max(all_max)
    max_idx = score.index(max_)
    return score[max_idx], max_idx

def emission_matr(df,cap):
    emiss = []
    for stat in df.states:
        try: 
            mean_, std_ = stat.distribution.parameters
            emiss.append('N({}, {})'.format(round(mean_, 6), round(std_,6)))
        except AttributeError:
            pass
    return print(pd.DataFrame(emiss).to_latex(caption='Emission matrix '+cap))

def trans_matr(data, caption):
    df=pd.DataFrame(data.dense_transition_matrix())
    df = df.iloc[:-1,:-2]
    pi = df.iloc[-1,:]
    df = df.iloc[:-1,:]
    for r in range(df.shape[0]):
        df.iloc[r,:] = df.iloc[r,:]/df.iloc[r,:].sum()
    return print(df.round(6).to_latex(caption='Transition matrix ' + caption)), print(pi.round(6).to_latex(caption=
                                                                                                           'Initial state perobabilities vector ' + caption))

In [181]:
day_neg_return_mod,day_neg_return_ind = best_model(len(day_neg_mod), 6, final_check_day)
print('Total accuracy: {}'.format(day_neg_return_mod))
print('Index: {}'.format(day_neg_return_ind))

Total accuracy: 1.5
Index: 174


In [183]:
print('negative: {}'.format(final_check_day('neg', 6, 174)))
print('positive: {}'.format(final_check_day('pos', 6, 174)))

negative: 0.8333333333333334
positive: 0.6666666666666666


In [190]:
with open('day_neg_mod_ret_long.json', 'w') as js:
    json.dump(day_neg_mod[174].to_json(), js)
with open('day_pos_mod_ret_long.json', 'w') as js:
    json.dump(day_pos_mod[174].to_json(), js)

#### Short return

In [164]:
day_pos_states_sh, day_pos_steps_sh, day_pos_pi_sh, day_pos_end_sh, day_pos_trans_sh, day_pos_dist_sh, day_pos_mod_sh, day_pos_testsets_sh = train_hmm(day[:800], 'pos', 'return', 6)
day_neg_states_sh, day_neg_steps_sh, day_neg_pi_sh, day_neg_end_sh, day_neg_trans_sh, day_neg_dist_sh, day_neg_mod_sh, day_neg_testsets_sh = train_hmm(day[:800], 'neg',  'return', 6)

In [168]:
def final_check_day_sh(mode, col_n, ind):
    if mode == 'neg':
        return test_hmm(day_neg_testsets_sh[ind], day_neg_mod_sh[ind], day_pos_mod_sh[ind], day_neg_steps_sh[ind], col_n, 'neg' )
    elif mode == 'pos':
        return test_hmm(day_pos_testsets_sh[ind], day_neg_mod_sh[ind],day_pos_mod_sh[ind], day_pos_steps_sh[ind],col_n, 'pos' )

In [178]:
day_neg_return_mod_sh,day_neg_return_ind_sh = best_model(len(day_neg_mod_sh), 6, final_check_day_sh)
print('Total accuracy: {}'.format(day_neg_return_mod_sh))
print('Index: {}'.format(day_neg_return_ind_sh))

Total accuracy: 2.0
Index: 413


In [176]:
print('negative: {}'.format(final_check_day_sh('neg', 6, 413)))
print('positive: {}'.format(final_check_day_sh('pos', 6, 413)))

negative: 1.0
positive: 1.0


In [189]:
with open('day_neg_mod_ret_short.json', 'w') as js:
    json.dump(day_neg_mod_sh[413].to_json(), js)
with open('day_pos_mod_ret_short.json', 'w') as js:
    json.dump(day_pos_mod_sh[413].to_json(), js)

#### Multivariable return

In [185]:
day_pos_states_c, day_pos_steps_c, day_pos_pi_c, day_pos_end_c, day_pos_trans_c, day_pos_dist_c, day_pos_mod_c, day_pos_testsets_c = train_hmm(day[:800], 'pos', 'close', 4)
day_neg_states_c, day_neg_steps_c, day_neg_pi_c, day_neg_end_c, day_neg_trans_c, day_neg_dist_c, day_neg_mod_c, day_neg_testsets_c = train_hmm(day[:800], 'neg',  'close', 4)

In [186]:
def final_check_day_c(mode, col_n, ind):
    if mode == 'neg':
        return test_hmm(day_neg_testsets_c[ind], day_neg_mod_c[ind], day_pos_mod_c[ind], day_neg_steps_c[ind], col_n, 'neg' )
    elif mode == 'pos':
        return test_hmm(day_pos_testsets_c[ind], day_neg_mod_c[ind],day_pos_mod_c[ind], day_pos_steps_c[ind],col_n, 'pos' )

In [187]:
day_neg_return_mod_c,day_neg_return_ind_c = best_model(len(day_neg_mod_c), 4, final_check_day_c)
print('Total accuracy: {}'.format(day_neg_return_mod_c))
print('Index: {}'.format(day_neg_return_ind_c))

Total accuracy: 2.0
Index: 437


In [188]:
print('negative: {}'.format(final_check_day_c('neg', 4, 437)))
print('positive: {}'.format(final_check_day_c('pos', 4, 437)))

negative: 1.0
positive: 1.0


In [191]:
with open('day_neg_mod_cl_short.json', 'w') as js:
    json.dump(day_neg_mod_c[437].to_json(), js)
with open('day_pos_mod_cl_short.json', 'w') as js:
    json.dump(day_pos_mod_c[437].to_json(), js)

#### Close long

In [25]:
day_pos_states_c2, day_pos_steps_c2, day_pos_pi_c2, day_pos_end_c2, day_pos_trans_c2, day_pos_dist_c2, day_pos_mod_c2, day_pos_testsets_c2 = train_hmm(day, 'pos', 'close', 4)
day_neg_states_c2, day_neg_steps_c2, day_neg_pi_c2, day_neg_end_c2, day_neg_trans_c2, day_neg_dist_c2, day_neg_mod_c2, day_neg_testsets_c2 = train_hmm(day, 'neg',  'close', 4)

In [26]:
def final_check_day_c_long(mode, col_n, ind):
    if mode == 'neg':
        return test_hmm(day_neg_testsets_c2[ind], day_neg_mod_c2[ind], day_pos_mod_c2[ind], day_neg_steps_c2[ind], col_n, 'neg' )
    elif mode == 'pos':
        return test_hmm(day_pos_testsets_c2[ind], day_neg_mod_c2[ind],day_pos_mod_c2[ind], day_pos_steps_c2[ind],col_n, 'pos' )

In [27]:
day_neg_return_mod_c2,day_neg_return_ind_c2 = best_model(len(day_neg_mod_c2), 4, final_check_day_c_long)
print('Total accuracy: {}'.format(day_neg_return_mod_c2))
print('Index: {}'.format(day_neg_return_ind_c2))

Total accuracy: 1.4444444444444444
Index: 369


In [30]:
print('negative: {}'.format(final_check_day_c_long('neg', 4, 369)))
print('positive: {}'.format(final_check_day_c_long('pos', 4, 369)))

negative: 0.6666666666666666
positive: 0.7777777777777778


In [31]:
with open('day_neg_mod_close_long.json', 'w') as js:
    json.dump(day_neg_mod_c2[369].to_json(), js)
with open('hour_pos_mod_close_long.json', 'w') as js:
    json.dump(day_pos_mod_c2[369].to_json(), js)

#### Hourly long return

In [18]:
hour_pos_states, hour_pos_steps, hour_pos_pi, hour_pos_end, hour_pos_trans, hour_pos_dist, hour_pos_mod, hour_pos_testsets = train_hmm(hour.iloc[-1859:,:], 'pos', 'return', 6)
hour_neg_states, hour_neg_steps, hour_neg_pi, hour_neg_end, hour_neg_trans, hour_neg_dist, hour_neg_mod, hour_neg_testsets = train_hmm(hour.iloc[-1859:,:], 'neg',  'return', 6)

In [19]:
def final_check_hour_ret_long(mode, col_n, ind):
    if mode == 'neg':
        return test_hmm(hour_neg_testsets[ind], hour_neg_mod[ind], hour_pos_mod[ind], hour_neg_steps[ind], col_n, 'neg' )
    elif mode == 'pos':
        return test_hmm(hour_pos_testsets[ind], hour_neg_mod[ind],hour_pos_mod[ind], hour_pos_steps[ind],col_n, 'pos' )

In [22]:
hour_neg_return_mod,hour_neg_return_ind = best_model(len(hour_neg_mod), 6, final_check_hour_ret_long)
print('Total accuracy: {}'.format(hour_neg_return_mod))
print('Index: {}'.format(hour_neg_return_ind))

Total accuracy: 1.5
Index: 474


In [23]:
print('negative: {}'.format(final_check_hour_ret_long('neg', 6, 474)))
print('positive: {}'.format(final_check_hour_ret_long('pos', 6, 474)))

negative: 0.8000000000000002
positive: 0.7


In [24]:
with open('hour_neg_mod_ret_long.json', 'w') as js:
    json.dump(hour_neg_mod[474].to_json(), js)
with open('hour_pos_mod_ret_long.json', 'w') as js:
    json.dump(hour_pos_mod[474].to_json(), js)

#### Hourly long close

In [32]:
hour_pos_states_cl_l, hour_pos_steps_cl_l, hour_pos_pi_cl_l, hour_pos_end_cl_l, hour_pos_trans_cl_l, hour_pos_dist_cl_l, hour_pos_mod_cl_l, hour_pos_testsets_cl_l = train_hmm(hour.iloc[-1859:,:], 'pos', 'close', 4)
hour_neg_states_cl_l, hour_neg_steps_cl_l, hour_neg_pi_cl_l, hour_neg_end_cl_l, hour_neg_trans_cl_l, hour_neg_dist_cl_l, hour_neg_mod_cl_l, hour_neg_testsets_cl_l = train_hmm(hour.iloc[-1859:,:], 'neg',  'close', 4)

In [33]:
def final_check_hour_cl_long(mode, col_n, ind):
    if mode == 'neg':
        return test_hmm(hour_neg_testsets_cl_l[ind], hour_neg_mod_cl_l[ind], hour_pos_mod_cl_l[ind], hour_neg_steps_cl_l[ind], col_n, 'neg' )
    elif mode == 'pos':
        return test_hmm(hour_pos_testsets_cl_l[ind], hour_neg_mod_cl_l[ind],hour_pos_mod_cl_l[ind], hour_pos_steps_cl_l[ind],col_n, 'pos' )

In [34]:
hour_neg_close_mod,hour_neg_close_ind = best_model(len(hour_neg_mod_cl_l), 4, final_check_hour_cl_long)
print('Total accuracy: {}'.format(hour_neg_close_mod))
print('Index: {}'.format(hour_neg_close_ind))

Total accuracy: 1.6
Index: 467


In [35]:
print('negative: {}'.format(final_check_hour_cl_long('neg', 4, 467)))
print('positive: {}'.format(final_check_hour_cl_long('pos', 4, 467)))

negative: 1.0
positive: 0.6


In [36]:
with open('hour_neg_mod_close_long.json', 'w') as js:
    json.dump(hour_neg_mod_cl_l[467].to_json(), js)
with open('hour_pos_mod_close_long.json', 'w') as js:
    json.dump(hour_pos_mod_cl_l[467].to_json(), js)

#### Hour close short

In [39]:
hour_pos_states_cl_sh, hour_pos_steps_cl_sh, hour_pos_pi_cl_sh, hour_pos_end_cl_sh, hour_pos_trans_cl_sh, hour_pos_dist_cl_sh, hour_pos_mod_cl_sh, hour_pos_testsets_cl_sh = train_hmm(hour.iloc[-800:,:], 'pos', 'close', 4)
hour_neg_states_cl_sh, hour_neg_steps_cl_sh, hour_neg_pi_cl_sh, hour_neg_end_cl_sh, hour_neg_trans_cl_sh, hour_neg_dist_cl_sh, hour_neg_mod_cl_sh, hour_neg_testsets_cl_sh = train_hmm(hour.iloc[-800:,:], 'neg',  'close', 4)

In [42]:
def final_check_hour_cl_short(mode, col_n, ind):
    if mode == 'neg':
        return test_hmm(hour_neg_testsets_cl_sh[ind], hour_neg_mod_cl_sh[ind], hour_pos_mod_cl_sh[ind], hour_neg_steps_cl_sh[ind], col_n, 'neg' )
    elif mode == 'pos':
        return test_hmm(hour_pos_testsets_cl_sh[ind], hour_neg_mod_cl_sh[ind],hour_pos_mod_cl_sh[ind], hour_pos_steps_cl_sh[ind],col_n, 'pos' )

In [43]:
hour_neg_close_mod_sh,hour_neg_close_ind_sh = best_model(len( hour_neg_mod_cl_sh), 4, final_check_hour_cl_short)
print('Total accuracy: {}'.format(hour_neg_close_mod_sh))
print('Index: {}'.format(hour_neg_close_ind_sh))

Total accuracy: 1.4666666666666668
Index: 361


In [44]:
print('negative: {}'.format(final_check_hour_cl_short('neg', 4, 361)))
print('positive: {}'.format(final_check_hour_cl_short('pos', 4, 361)))

negative: 0.6666666666666666
positive: 0.8000000000000002


In [45]:
with open('hour_neg_mod_close_short.json', 'w') as js:
    json.dump(hour_neg_mod_cl_sh[361].to_json(), js)
with open('hour_pos_mod_close_short.json', 'w') as js:
    json.dump(hour_pos_mod_cl_sh[361].to_json(), js)

##### Hour return short

In [46]:
hour_pos_states_ret_sh, hour_pos_steps_ret_sh, hour_pos_pi_ret_sh, hour_pos_end_ret_sh, hour_pos_trans_ret_sh, hour_pos_dist_ret_sh, hour_pos_mod_ret_sh, hour_pos_testsets_ret_sh = train_hmm(hour.iloc[-800:,:], 'pos', 'return', 6)
hour_neg_states_ret_sh, hour_neg_steps_ret_sh, hour_neg_pi_ret_sh, hour_neg_end_ret_sh, hour_neg_trans_ret_sh, hour_neg_dist_ret_sh, hour_neg_mod_ret_sh, hour_neg_testsets_ret_sh = train_hmm(hour.iloc[-800:,:], 'neg',  'return', 6)

In [47]:
def final_check_hour_ret_short(mode, col_n, ind):
    if mode == 'neg':
        return test_hmm(hour_neg_testsets_ret_sh[ind], hour_neg_mod_ret_sh[ind], hour_pos_mod_ret_sh[ind], hour_neg_steps_ret_sh[ind], col_n, 'neg' )
    elif mode == 'pos':
        return test_hmm(hour_pos_testsets_ret_sh[ind], hour_neg_mod_ret_sh[ind],hour_pos_mod_ret_sh[ind], hour_pos_steps_ret_sh[ind],col_n, 'pos' )

In [48]:
hour_neg_ret_mod_sh, hour_neg_ret_ind_sh = best_model(len( hour_neg_mod_ret_sh), 6, final_check_hour_ret_short)
print('Total accuracy: {}'.format(hour_neg_ret_mod_sh))
print('Index: {}'.format(hour_neg_ret_ind_sh))

Total accuracy: 1.6750000000000003
Index: 353


In [90]:
print('negative: {}'.format(final_check_hour_ret_short('neg', 6, 353)))
print('positive: {}'.format(final_check_hour_ret_short('pos', 6, 353)))
print('steps: {}'.format(hour_pos_steps_ret_sh[353]))
print('states: {}'.format(hour_pos_states_ret_sh[353]))

NameError: name 'final_check_hour_ret_short' is not defined

In [50]:
with open('hour_neg_mod_return_short.json', 'w') as js:
    json.dump(hour_neg_mod_ret_sh[353].to_json(), js)
with open('hour_pos_mod_return_short.json', 'w') as js:
    json.dump(hour_pos_mod_ret_sh[353].to_json(), js)

#### Minute return long

In [52]:
min_pos_states_ret, min_pos_steps_ret, min_pos_pi_ret, min_pos_end_ret, min_pos_trans_ret, min_pos_dist_ret, min_pos_mod_ret, min_pos_testsets_ret = train_hmm(minute.iloc[-1859:,:], 'pos', 'return', 6)
min_neg_states_ret, min_neg_steps_ret, min_neg_pi_ret, min_neg_end_ret, min_neg_trans_ret, min_neg_dist_ret, min_neg_mod_ret, min_neg_testsets_ret = train_hmm(minute.iloc[-1859:,:], 'neg',  'return', 6)

In [56]:
def final_check_min_ret_long(mode, col_n, ind):
    if mode == 'neg':
        return test_hmm(min_neg_testsets_ret[ind], min_neg_mod_ret[ind], min_pos_mod_ret[ind], min_pos_steps_ret[ind], col_n, 'neg' )
    elif mode == 'pos':
        return test_hmm(min_pos_testsets_ret[ind], min_neg_mod_ret[ind], min_pos_mod_ret[ind], min_pos_steps_ret[ind],col_n, 'pos' )

In [57]:
min_neg_ret_mod_l, min_neg_ret_ind_l = best_model(len( min_neg_mod_ret), 6, final_check_min_ret_long)
print('Total accuracy: {}'.format(min_neg_ret_mod_l))
print('Index: {}'.format(min_neg_ret_ind_l))

Total accuracy: 1.5
Index: 496


In [91]:
print('negative: {}'.format(final_check_min_ret_long('neg', 6, 496)))
print('positive: {}'.format(final_check_min_ret_long('pos', 6, 496)))
print('steps: {}'.format(min_pos_steps_ret[496]))
print('states: {}'.format(min_pos_states_ret[496]))

NameError: name 'final_check_min_ret_long' is not defined

In [59]:
with open('min_neg_mod_return_long.json', 'w') as js:
    json.dump(min_neg_mod_ret[496].to_json(), js)
with open('min_pos_mod_return_long.json', 'w') as js:
    json.dump(min_pos_mod_ret[496].to_json(), js)

#### Minute return short

In [15]:
min_pos_states_ret_sh, min_pos_steps_ret_sh, min_pos_pi_ret_sh, min_pos_end_ret_sh, min_pos_trans_ret_sh, min_pos_dist_ret_sh, min_pos_mod_ret_sh, min_pos_testsets_ret_sh = train_hmm(minute.iloc[-800:,:], 'pos', 'return', 6)
min_neg_states_ret_sh, min_neg_steps_ret_sh, min_neg_pi_ret_sh, min_neg_end_ret_sh, min_neg_trans_ret_sh, min_neg_dist_ret_sh, min_neg_mod_ret_sh, min_neg_testsets_ret_sh = train_hmm(minute.iloc[-800:,:], 'neg',  'return', 6)

In [88]:
def trans_matr(data, caption):
    df=pd.DataFrame(data.dense_transition_matrix())
    df = df.iloc[:-1,:-2]
    pi = df.iloc[-1,:]
    df = df.iloc[:-1,:]
    for r in range(df.shape[0]):
        df.iloc[r,:] = df.iloc[r,:]/df.iloc[r,:].sum()
    return print(df.round(6).to_latex(caption='Transition matrix ' + caption)), print(pi.round(6).to_latex(caption=
                                                                                                           'Initial state perobabilities vector ' + caption))

In [92]:
def final_check_min_ret_short(mode, col_n, ind):
    if mode == 'neg':
        return test_hmm(min_neg_testsets_ret_sh[ind], min_neg_mod_ret_sh[ind], min_pos_mod_ret_sh[ind], min_pos_steps_ret_sh[ind], col_n, 'neg' )
    elif mode == 'pos':
        return test_hmm(min_pos_testsets_ret_sh[ind], min_neg_mod_ret_sh[ind], min_pos_mod_ret_sh[ind], min_pos_steps_ret_sh[ind],col_n, 'pos' )

In [93]:
min_neg_ret_mod_sh, min_neg_ret_ind_sh = best_model(len( min_neg_mod_ret_sh), 6, final_check_min_ret_short)
print('Total accuracy: {}'.format(min_neg_ret_mod_sh))
print('Index: {}'.format(min_neg_ret_ind_sh))

Total accuracy: 2.0
Index: 413


In [94]:
print('negative: {}'.format(final_check_min_ret_short('neg', 6, 413)))
print('positive: {}'.format(final_check_min_ret_short('pos', 6, 413)))
print('steps: {}'.format(min_pos_steps_ret_sh[413]))
print('states: {}'.format(min_pos_states_ret_sh[413]))

negative: 1.0
positive: 1.0
steps: 5
states: 4


In [95]:
with open('min_neg_mod_return_short.json', 'w') as js:
    json.dump( min_neg_mod_ret_sh[413].to_json(), js)
with open('min_pos_mod_return_short.json', 'w') as js:
    json.dump( min_pos_mod_ret_sh[413].to_json(), js)