In [101]:
import warnings
import logging
import itertools
import pandas as pd
import numpy as np
from pandas_datareader import data
import matplotlib.pyplot as plt
from hmmlearn.hmm import GaussianHMM
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import argparse
import sys
import os
from datetime import timedelta

# Suppress warning in hmmlearn
warnings.filterwarnings("ignore")

1. Define input parameters

In [102]:
metrics = True
plot = True
out_dir = os.getcwd()

In [103]:
n_intervals_frac_change=50
n_intervals_frac_high=10
n_intervals_frac_low=10
n_hidden_states=4
n_latency_days = 15
hmm = GaussianHMM(n_components=n_hidden_states)
predicted_close = None

2. Import data of stock prices, and split train and test data given test_size (0.33):

In [104]:
#def _split_train_test_data(test_size):
used_data = pd.read_csv('Copy.csv',index_col=0)

# Do not shuffle the data as it is a time series
train_data, test_data = train_test_split(used_data, test_size=test_size, shuffle=False)

In [105]:
used_data = pd.read_csv('HMM_data.csv',index_col=0) #The path to used data
LEN_DATA = len(used_data)
NUM_TRAIN = LEN_DATA-37
train_data = used_data[0:NUM_TRAIN]
test_data = used_data[NUM_TRAIN:]
days = len(test_data)

In [106]:
print(used_data)

         sea_ice_extent  sea_ice_extent_next_month  sea_ice_extent_in_2months  \
Date                                                                            
1979-01        15604191                   16378929                   16521089   
1979-02        16378929                   16521089                   15561238   
1979-03        16521089                   15561238                   14085613   
1979-04        15561238                   14085613                   12653185   
1979-05        14085613                   12653185                   10521427   
...                 ...                        ...                        ...   
2020-10         5591115                    9476049                   12254325   
2020-11         9476049                   12254325                   13480000   
2020-12        12254325                   13480000                   14390000   
2021-01        13480000                   14390000                   14640000   
2021-02        14390000     

3. Definition and initialization of computation of prediction

In [107]:
#def _compute_all_possible_outcomes(self, n_intervals_frac_change, n_intervals_frac_high, n_intervals_frac_low):

frac_change_range = np.linspace(-0.1, 0.1, n_intervals_frac_change)
frac_high_range = np.linspace(-0.1, 0.1, n_intervals_frac_high)
frac_low_range = np.linspace(-0.1, 0.1, n_intervals_frac_low)

_possible_outcomes = np.array(list(itertools.product(frac_change_range, frac_high_range, frac_low_range)))


In [108]:
print("Training data period is from " + str(train_data.index[0]) + " to " + str(train_data.index[-1]))

Training data period is from 1979-01 to 2018-01


4. Fit the HMM model

In [109]:
##"""Fit the continuous emission Gaussian HMM."""
# Feature Extractor
def extract_features(data):
        """Extract the features - open, close, high, low price - from the Yahooo finance generated dataframe."""
        this_month = np.array(data['sea_ice_extent'])
        next_month = np.array(data['sea_ice_extent_next_month'])
        in_2months = np.array(data['sea_ice_extent_in_2months'])
        in_3months = np.array(data['sea_ice_extent_in_3months'])
 
        frac_change = (next_month - this_month) / this_month
        frac_change2 = (in_2months - this_month) / this_month
        frac_change3 = (in_3months - this_month) / this_month

        # Put the observations into one array
        return np.column_stack((frac_change, frac_change2, frac_change3))

observations = extract_features(train_data)

# Fit the HMM using the fit feature of hmmlearn
hmm.fit(observations)

print(observations)


[[ 0.04964935  0.05875973 -0.00275266]
 [ 0.00867944 -0.04992335 -0.14001624]
 [-0.05809853 -0.14741619 -0.23411919]
 ...
 [ 0.22022038  0.345634    0.43388851]
 [ 0.10277949  0.17510618  0.19955338]
 [ 0.06558582  0.08775453  0.03921632]]


5. Get the predicted and actual stock prices and create a DF for saving if you'd like to get a metric for the model

In [110]:
def get_most_probable_outcome(day_index):

        previous_data_start_index = max(0, day_index - n_latency_days)
        previous_data_end_index = max(0, day_index - 1)
        previous_data = test_data.iloc[previous_data_start_index: previous_data_end_index]
        previous_data_features = extract_features(previous_data)
    
        outcome_score = []

        # Score all possible outcomes and select the most probable one to use for prediction
        for possible_outcome in _possible_outcomes:
            total_data = np.row_stack((previous_data_features, possible_outcome))
            outcome_score.append(hmm.score(total_data))

        # Get the index of the most probable outcome and return it
        most_probable_outcome = _possible_outcomes[np.argmax(outcome_score)]

        return most_probable_outcome

def predict_close_price(day_index):
        """Predict close price for a given day."""
        this_month = test_data.iloc[day_index]['sea_ice_extent']
        predicted_frac_change, pred_frac_change2, pred_frac_change3 = get_most_probable_outcome(day_index)
        
        ##Here, please change the parameter name for the corresponding time lead.
        return this_month * (1 + predicted_frac_change) #Prediction for next month (time lead = 1 month)
        # return this_month * (1 + pred_frac_change2) #Prediction for 2 months later (time lead = 2 months)
        # return this_month * (1 + pred_frac_change3) #Prediction for 3 months later (time lead = 3 months)

  
predicted_next_month = []
print("Predicting Sea ice extent from " + str(test_data.index[0]) + " to " + str(test_data.index[-1]))
for day_index in tqdm(range(days)):
    predicted_next_month.append(predict_close_price(day_index))
predicted_extent1 = predicted_next_month
#return predicted_close_prices

  0%|          | 0/37 [00:00<?, ?it/s]

Predicting Sea ice extent from 2018-02 to 2021-02


100%|██████████| 37/37 [00:51<00:00,  1.38s/it]


In [111]:
print(predicted_extent1)
#np.save("HMM_lag1.npy", predicted_extent1) #Save predicted data in a numpy file

[14950736.614285715, 15261775.432653062, 14580758.46122449, 11126773.8, 9833478.3, 7795222.2, 5060698.2, 4351457.7, 7228505.957142858, 11265622.175510205, 13252605.465306124, 14556954.367346939, 15430694.04489796, 15589772.840816326, 14325664.665306123, 11079268.200000001, 9768530.700000001, 7155660.600000001, 4519077.3, 4006541.7, 6493376.351020409, 10695436.671428572, 13428639.54489796, 14621942.700000001, 15622077.942857144, 15811325.46734694, 14492470.206122449, 11312613.0, 9772418.700000001, 6901128.9, 4687056.0, 3690817.2, 5922017.724489796, 10384976.148979593, 13429739.846938776, 14277795.918367347, 15241653.06122449]


In [115]:
actual_next_month = test_data.loc[:, ['sea_ice_extent_next_month']]
actual_extent1 = np.array(actual_next_month['sea_ice_extent_next_month'])
actual_extent1 = actual_extent1/1000000 #The predicted data is divided by 1000000 for unit transferring
predicted_extent1 = np.array(predicted_extent1)/1000000
#np.save("HMM_actual_lag1.npy", predicted_extent1)  #Save predicted data in a numpy file

In [88]:
for i in range(len(predicted_extent1)):
	print('Observed: %f, Predicted: %f' % (actual_extent1[i],predicted_extent1[i]))

Observed: 0.860593, Predicted: 0.694504
Observed: 0.660648, Predicted: 0.777012
Observed: 0.224368, Predicted: 0.596362
Observed: -0.222498, Predicted: 0.202182
Observed: -0.926767, Predicted: -0.201563
Observed: -1.871614, Predicted: -0.837872
Observed: -2.116675, Predicted: -1.691544
Observed: -1.497949, Predicted: -1.912957
Observed: -0.423532, Predicted: -1.353936
Observed: 0.140285, Predicted: -0.383198
Observed: 0.653660, Predicted: 0.126213
Observed: 0.910187, Predicted: 0.590048
Observed: 0.956892, Predicted: 0.821820
Observed: 0.585754, Predicted: 0.864019
Observed: 0.207954, Predicted: 0.528695
Observed: -0.244939, Predicted: 0.187352
Observed: -1.147752, Predicted: -0.221838
Observed: -2.058758, Predicted: -1.037532
Observed: -2.235852, Predicted: -1.860629
Observed: -1.713780, Predicted: -2.020633
Observed: -0.585325, Predicted: -1.548940
Observed: 0.190236, Predicted: -0.529378
Observed: 0.672740, Predicted: 0.171343
Observed: 0.966376, Predicted: 0.607287
Observed: 1.0219

In [116]:
# Caculate MSE and NMSE
mse1 = np.mean((predicted_extent1-actual_extent1)**2)
rmse1 = np.sqrt(mse1)
print(rmse1)
nrmse1 = rmse1/(actual_extent1.mean())
nrmse1

1.6535264395828884


0.15588655111031574

In [28]:
# Calculate R-square score
from sklearn.metrics import r2_score
y_true1 = actual_extent1
y_pred1= predicted_extent1
r1=r2_score(y_true1, y_pred1)
r1

0.783486923818859