# In [26]:
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults
import pandas as pd
import numpy as np
from pathlib import Path
import os
from datetime import datetime, timedelta
import warnings
import time

# In [1]:
def format_financial_data(data, indices, start_date, end_date):
    """ Reindexes, interpolates, windows and min-max normalizes financial data.

    Parameters:
    data - A pandas dataframe with date indices containing one or more columns of data
    indices - The indices to include (duplicates are dropped). Indices that are
              missing from data are added and their values interpolated
    start_date - The starting date to include (datetime object or string)
    end_date - The final date to include (datetime object or string); the
               label slice is inclusive

    Returns:
    stock_data - A new dataframe with every column normalized to [0,1] over
                 the selected window. The input dataframe is not modified.
    fin_stats - Dataframe indexed by column name with the columns 'min' and
                'max' of the data before normalization

    Notes:
    NaN values (e.g. leading gaps that interpolation cannot fill) are ignored
    when computing min and max and stay NaN in the output. A column whose
    values are all equal has zero range and normalizes to NaN.
    """
    # Add indices which are present in the news data but not in the
    # financial data and interpolate missing values
    stock_data = data.reindex(indices.drop_duplicates())
    stock_data = stock_data.interpolate()
    stock_data = stock_data.loc[start_date:end_date]

    # Column-wise extremes; Series indexed by column name
    minimum = stock_data.min()
    maximum = stock_data.max()
    fin_stats = pd.DataFrame({'min': minimum, 'max': maximum}, columns=['min', 'max'])

    # Normalize to [0,1]. The arithmetic aligns on column names and returns a
    # new frame, so nothing is written into a slice of another dataframe.
    stock_data = (stock_data - minimum) / (maximum - minimum)
    return stock_data, fin_stats

def fit_all_models(par,data):
    """ Fits ARIMA models with the parameters specified in par
    to the provided data for every eligible date.

    A date is eligible when it lies more than max(par['lookback'], 5) days
    after the first date in data. For each eligible date one model per series
    is fitted (through try_fit) on all observations up to and including that
    date. The parameters of each fit are used as starting values for the fit
    of the same series on the next eligible date.

    Parameters:
    par - Dictionary of parameters. 'lookback' is read here; the dictionary is
          also passed on to try_fit
    data - A pandas dataframe with date indices and the columns '1 YEAR',
           '3 YEAR' and 'S&P'

    Returns:
    A pandas dataframe indexed by 'date' with the columns '1 YEAR', '3 YEAR'
    and 'S&P', holding the fitted model for every eligible date. The frame is
    empty (but has the same columns) when no date is eligible.
    """
    columns = ['1 YEAR', '3 YEAR', 'S&P']
    series = {col: data[col] for col in columns}
    dates = data.index
    n = len(dates)
    start_params = {col: [0, 0.5, 0.5] for col in columns}

    # Models should only be fitted if they are further ahead of the first
    # date than lookback (because this is the first NLP-prediction) and further
    # ahead than 5 days, since this is the needed degrees of freedom
    first_eligible = dates[0] + timedelta(days=max(par['lookback'], 5)) if n else None

    # Rows are collected in a list and turned into a dataframe once;
    # growing a dataframe row by row is quadratic and DataFrame.append
    # no longer exists in recent pandas versions.
    rows = []
    t = time.time()
    for ctr, date in enumerate(dates):
        if not ctr % 50:
            print("Processing... {} %".format(round(100 * ctr / n,1)),end='\r')
        if date > first_eligible:
            cutoff = date.strftime('%Y-%m-%d')
            row = {'date': date}
            for col in columns:
                fitted = try_fit(series[col].loc[:cutoff].values, par, start_params[col])
                row[col] = fitted
                # Warm-start the next fit of this series
                start_params[col] = fitted.params
            rows.append(row)

    ARIMA_models = pd.DataFrame(rows, columns=['date'] + columns)
    ARIMA_models.set_index('date',inplace=True)

    total_time = round(time.time() - t,3)
    # Average over the fits that were actually performed (three per eligible date)
    n_fits = len(columns) * len(rows)
    average_time = round(total_time / n_fits, 3) if n_fits else 0.0
    print("Total time: {} s".format(total_time))
    print("Average time per fitting: {} s".format(average_time))
    return ARIMA_models

def predict_arima(ARIMA_models, steps):
    """ Predicts the values steps ahead from the models in ARIMA_models.

    Parameters:
    ARIMA_models - A dataframe indexed by date with the columns '1 YEAR',
                   '3 YEAR' and 'S&P'. Every cell holds a fitted model whose
                   forecast(steps=...) result has the forecasted values as
                   its first element.
    steps - How many days ahead to predict.

    Returns:
    A dataframe containing the prediction steps ahead for each category.
    The date is the date when the forecast is made and not the day we want to forecast.
    """
    columns = ['1 YEAR', '3 YEAR', 'S&P']
    # Rows are collected first and the dataframe is built once, since growing
    # it row by row is quadratic and DataFrame.append no longer exists in
    # recent pandas versions. Iterating over the rows (instead of looking every
    # date up with .loc) also keeps duplicate dates working.
    rows = []
    for d, models in ARIMA_models[columns].iterrows():
        row = {'date': d}
        for col in columns:
            # [0] selects the forecasted values, [-1] the value `steps` ahead
            row[col] = models[col].forecast(steps=steps)[0][-1]
        rows.append(row)

    ARIMA_preds = pd.DataFrame(rows, columns=['date'] + columns)
    # Keep the prediction columns numeric even when there are no rows
    ARIMA_preds = ARIMA_preds.astype({col: float for col in columns})
    ARIMA_preds.set_index('date',inplace=True)
    return ARIMA_preds

def try_fit(data,par,start,maxiter=1000,disp=0,internal_request=False):
    """ Fits an ARIMA model, trying several optimization routines in turn.

    Warnings are promoted to exceptions while fitting, so a solver that emits
    any warning (presumably a convergence warning) counts as failed and the
    next solver is tried. The first fit that completes without a warning is
    returned. Exceptions other than warnings are not caught.

    If every solver fails, the whole procedure is repeated once with the
    start parameters (0.5, 0.5, 0.5) and twice the number of iterations. If
    that also fails, a final "ncg" fit is made with warnings left as ordinary
    warnings, and its (possibly suboptimal) result is returned.

    Parameters:
    data - The observations to fit the model to
    par - Dictionary with the ARIMA order in the keys 'p', 'd' and 'q'
    start - Start parameters passed to the optimizer
    maxiter - Maximum number of iterations per solver
    disp - Passed on to fit(); also used for the retry
    internal_request - True when called from the retry; callers should leave
                       this at its default

    Returns:
    The fitted model as returned by ARIMA(...).fit(...)
    """
    order = (par['p'], par['d'], par['q'])
    # Tried in this order until one of them finishes without a warning
    solvers = ("powell", "lbfgs", "bfgs", "newton", "nm", "cg", "ncg")

    with warnings.catch_warnings():
        warnings.filterwarnings('error')
        for solver in solvers:
            try:
                # A fresh model is created for every attempt
                m = ARIMA(data, order=order)
                return m.fit(start_params=start, maxiter=maxiter, solver=solver, disp=disp)
            except Warning:
                continue

    if not internal_request:
        # Second round: different start parameters, more iterations and the
        # same verbosity as requested by the caller
        return try_fit(data, par, start=(0.5,0.5,0.5), maxiter=2*maxiter,
                       disp=disp, internal_request=True)

    # If no solution was found after two tries with different start_params
    # and solvers, we'll just return a suboptimal solution.
    print("Suboptimal solution returned.")
    m = ARIMA(data, order=order)
    return m.fit(start_params=start, maxiter=maxiter, solver="ncg", disp=disp)

def shuffle_and_partition(x,y,test_part=0.2):
    """ Randomly splits paired samples into a training and a test set.

    The samples are shuffled with NumPy's global random generator (seed it
    with np.random.seed for reproducible splits) and the same permutation is
    applied to x and y, so the pairing between them is preserved.

    Parameters:
    x - Array-like with the inputs; indexed along its first axis
    y - Array-like with the targets; its length decides the number of samples
    test_part - Fraction of the samples to put in the test set. The training
                set gets int(n * (1 - test_part)) samples and the test set
                gets the rest.

    Returns:
    (x_train, y_train), (x_test, y_test) as numpy arrays
    """
    # Convert first so that plain lists (and other array-likes) can be
    # indexed with an index array
    x = np.asarray(x)
    y = np.asarray(y)

    n = len(y)
    n_train = int(n * (1 - test_part))
    indices = np.arange(n)
    np.random.shuffle(indices)

    train_indices = indices[:n_train]
    test_indices = indices[n_train:]

    return (x[train_indices], y[train_indices]), (x[test_indices], y[test_indices])