# Just remember.... Start with the simplist model, and add features
## Features
- Daily returns
 - daily_returns[t] = (price[t]/price[-1])-1

## Predictions
1. Will the price stay the same or go up tomorrow. up=1, others=0

## Things I want to find out
- Can we predict gold resonably well
    - What features are corrrelated?
    - What are good features?
    - Do we need lots of them?
- Has trading changed in modern times
    - Has automated trading made things different?
    - Do we overfit if we only look at old data?
    - Was old data correlated to different features that are no longer important?
       
## New quesitions
- Do the dates work correctly?
 - Asuming the day is correct, we use
 - UK Gold 10:30am (GMT)
 - UK Gold 3pm (GMT)
 - FX NYC 12pm (4pm GMT)
  - http://www.federalreserve.gov/pubs/bulletin/2005/winter05_index.pdf

- Score seems kind of high, why?
- Is the model accurate today?
    - Let's test on the data from the last year

- How long is a model valid before it's predictions don't work?

- Can the model predict further out?

- Are they correct if we retrain every day?

- Can we predict bigger changes rather than just positive?

## Notes discovered
We are only predicting gold against USD. What about the global economy? Do we use a bag of currencies? What are their weightings?

In [30]:
# Log message
def lm(text):
    logging.info(text)

In [45]:
import pandas as pd
import re
import quandl
import os

import matplotlib.pyplot as plt
%matplotlib inline

import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.debug("test")



In [46]:

# import quandl
def download_quandl(codes, filename=None, load_file=True):
    if filename is None:
        filename = re.sub('[^-a-zA-Z0-9_.() ]+', '_', codes)

    if load_file and os.path.exists(filename):
        lm("Loading file:%s" % filename)
        return pd.read_csv(filename)
    
    the_data = quandl.get(codes)
    the_data.describe()
    the_data.head()
    the_data.to_csv(filename)
    return the_data

currencies = {
#     'AUD': 'FRED/XUDLADD', 
#     'JPY': 'FRED/XUDLJYD',
#     'GBP': 'FRED/XUDLGBD',
#     #'EUR': 'FRED/DEXUSEU',
#     'EUR': 'FRED/XUDLERG',
#     'CAD': 'FRED/XUDLCDD',
#     'CHF': 'FRED/XUDLSFD',
#     'CNY': 'FRED/XUDLBK73',
#     'NZD': 'FRED/XUDLNDD',
    
    'AUD': 'FRED/DEXUSAL', 
    'JPY': 'FRED/DEXJPUS',
    'GBP': 'FRED/DEXUSUK',
    'EUR': 'FRED/DEXUSEU',
    'CAD': 'FRED/DEXCAUS',
    'CHF': 'FRED/DEXSZUS',
    'CNY': 'FRED/DEXCHUS',
    'NZD': 'FRED/DEXUSNZ',
    }

def load_and_prepare_data():
    lm("Load currencies")
    df_curr = download_quandl([currencies[k] for k in currencies], 'currencies')
    logging.debug(df_curr.head())
    #df_curr.describe()
    df_curr.set_index('DATE', inplace=True)
    df_curr.columns = [a for a in currencies]

    lm("Are we using the correct timezone?")
    #FRED = noon NYC time
    # London 10:30,3pm
    # NYC 



    lm("inverse currencies so they are all 'how many x does 1 usd buy'")
    for curr in currencies:
        if currencies[curr][:-2] != 'US':
            df_curr[curr] = 1. / df_curr[curr]
    logging.debug(df_curr['GBP'][:10])

    lm("Lets get the gold")
    df_gold = download_quandl("LBMA/GOLD")
    logging.debug(df_gold.head())
    df_gold.set_index('Date', inplace=True)
    pd.concat([df_gold, df_curr], axis=1)
    df_gold.drop([c for c in df_gold.columns.values if c != 'USD (AM)'], axis=1, inplace=True)
    df_gold.columns = ['GOLD']

    df_concat = pd.concat([df_gold, df_curr], axis=1)

    lm("Forward fill weekends and holidays")
    logging.debug( df_concat.isnull().sum())
    df_concat.fillna(method='ffill', inplace=True)
    logging.debug(df_concat.isnull().sum())
    return df_concat

def set_date_range(df, start_date, end_date):
    lm("Set date range Dates")
    logging.debug(start_date, end_date)
    # using a 15 year perriod
    lm("Using aa 15 year period of data")
    df = df[start_date:end_date].copy()
    logging.debug(df.head())
    return df
    
# df_raw = load_and_prepare_data()

# start_date = '2001-01-04'
# end_date = '2016-01-04'
# df_train = set_date_range(df_raw, start_date, end_date)

In [47]:
def calc_daily_ret(df):
    lm("calculate daily returns")
    # calculate dailt returns
    df_dr = (df / df.shift(1)) -1
    df_dr.columns = ["%s_%s" % (col, 'dr') for col in df.columns]
    df_dr.fillna(method='bfill', inplace=True)
    return df_dr

def calc_rolling_averages(df_in, windows):
    lm("caluclate rolling averages")
    # caluclate rolling averages
    def calc_rolling_mean(df_in, windows):
        new_columns = []
        for window in windows:
            new_df = pd.rolling_mean(df_in, window)
            new_df.columns = ["%s_%s" % (col, window) for col in new_df.columns]
            new_df.fillna(method='bfill', inplace=True)
            new_columns.append(new_df)

        result = pd.concat(new_columns, axis=1)
        return result

    # Get rolling averages from daily returns
    df_dr_rm = calc_rolling_mean(df_in, windows)
    return df_dr_rm

def draw_info(df_in, windows):
    def draw_graphs(df_in, cols, windows=None):
        if windows:
            cols = ['%s_%s' % (col, window) for col in cols for window in windows]

        logging.debug(cols)
        df_in[cols].plot(figsize=(15, 5)).legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.axhline(0)
        plt.show()

    lm("Draw a graph")
    draw_graphs(df_dr_rm, df_dr.columns, [180])
    #draw_graphs(df_dr_rm, df_dr.columns, [30])

    #df_all = pd.concat([df, df_dr, df_dr_rm], axis=1)

# df_dr = calc_daily_ret(df_train)
# windows = [2, 7, 30, 180]
# df_rw = calc_rolling_averages(df_dr, windows)

# df_y = create_y_labels(df_dr)




In [48]:
# Kept here incase I need some of this later

# from sklearn.linear_model import LinearRegression
# import numpy as np

# def rolling_linear_regression(df, n, col):
#     def func(indexes, df): 
#         d_tmp = df.loc[indexes]
                
#         X = np.array(range(len(indexes))).reshape([-1, 1])
#         y = d_tmp[col].reshape([-1, 1])
        
#         lr.fit(X, y)
#         result = lr.coef_
        
#         return result
    
#     lr = LinearRegression()
#     result = pd.rolling_apply(df.index.values, n, lambda i: func(i, df))
#     return result

# def rate_of_change(df, n, col):
#     prev = df.shift(n)[col]
#     result = 100 * (df[col] - prev) / prev
#     return result

# def roc_ratio(df, n, col):
#     prev = df.shift(n)[col]
    
    
# for n in [5, 10, 20]:
#     df['lr_%s' % n] = rolling_linear_regression(df, n, 'v')
#     df['roc_%s' % n] = rate_of_change(df, n, 'v')

# df['v'].plot()
# df[['lr_%s' % n for n in [5, 10, 20]]].plot()
# df[['roc_%s' % n for n in [5, 10, 20]]].plot()

In [49]:
lm('fookit, lets see is we can predict')
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold

from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

def test_train(X_in, y_in, clf, param_grid, columns=None): 
    logging.debug( X_in.shape)
    logging.debug(y_in.shape)
    logging.debug(X_in.columns.values)

    lm("Ensure no nulls in dataset")
    if X_in.isnull().any().any():
        raise "Empy values in dataset"
    
    if columns:
        logging.debug(columns)
        X_in = X_in[columns]
    logging.debug(X_in.columns.values)
    
    lm('split the data')
    #X_train, X_test, y_train, y_test = train_test_split(X_in, y_in, test_size=0.3)

    gs = GridSearchCV(clf, param_grid)
    
    logging.debug("Training")
    gs.fit(X_in, y_in)
    
    logging.debug("Trained")
    return gs.best_estimator_

INFO:root:fookit, lets see is we can predict


In [50]:
def predict_and_score(clf, X_test, y_test):
    from sklearn.metrics import f1_score
    lm('Lets see if it scores')
    pred = clf.predict(X_test)

    score = f1_score(y_test, pred)
    return score

## Lets see how it works for the last year

In [59]:
def load_and_calculate():
    df_raw = load_and_prepare_data()
    start_date = '2001-01-04'
    end_date = '2016-01-04'
    df_train = set_date_range(df_raw, start_date, end_date)
    df_dr = calc_daily_ret(df_train)
    windows = [2, 7, 30, 180]
    df_rw = calc_rolling_averages(df_dr, windows)
    
    df_X = pd.concat([df_dr, df_rw], axis=1)
    return df_X

def create_y_labels(df_in, days_ahead=1, threshold=0.):
    # Extract the labels vector
    lm("Create the y labels vector")
    df_y = (df_in.shift(-days_ahead) > threshold) * 1.
    return df_y

def get_data_ready():   
    df_X = load_and_calculate()
    df_y = create_y_labels(df_X['GOLD_dr'])
    return df_X, df_y

In [62]:
def run(train_start, train_end, test_start, test_end):
    X, y = get_data_ready()

    # Get data for training
    X_train = X[train_start: train_end]
    y_train = y[train_start: train_end]

    param_grid={
        'kernel': ['linear', 'rbf'],
    }
    clf = SVC()
    best_clf = test_train(X_train, y_train, clf, param_grid)

    # Get data for testing
    X_test = X[test_start: test_end]
    y_test = y[test_start: test_end]
    
    logging.info("Actual")
    logging.info(predict_and_score(best_clf, X_test, y_test))
    
    logging.info("random sample")
    pred_sample = y_test.sample(y_test.shape[0])
    from sklearn.metrics import f1_score
    logging.info(f1_score(y_test, pred_sample))
    
data_start = '2001-01-04'
data_end = '2016-01-04'
#run('2001-01-04', '2014-01-04', '2014-01-04', '2016-01-04')
#run('2011-01-04', '2016-01-04', '2001-01-04', '2006-01-04')
run('2001-01-04', '2011-01-04', '2011-01-04', '2016-01-04')

INFO:root:Load currencies
INFO:root:Loading file:currencies
INFO:root:Are we using the correct timezone?
INFO:root:inverse currencies so they are all 'how many x does 1 usd buy'
INFO:root:Lets get the gold
INFO:root:Loading file:LBMA_GOLD
INFO:root:Forward fill weekends and holidays
INFO:root:Set date range Dates
INFO:root:Using aa 15 year period of data
INFO:root:calculate daily returns
INFO:root:caluclate rolling averages
INFO:root:Create the y labels vector
INFO:root:Ensure no nulls in dataset
INFO:root:split the data
INFO:root:Actual
INFO:root:Lets see if it scores
INFO:root:0.649921507064
INFO:root:random sample
INFO:root:0.48309178744


## Can we predict further out?
For this we will try to predict from the rolling averages in the future

In [None]:

    
def run_further_out(train_start, train_end, test_start, test_end):    
    df_y = create_y_labels(df_dr)
    
    
    return df_X, df_y


        
    # Get data for training
    X_train = X[train_start: train_end]
    y_train = y[train_start: train_end]

    param_grid={
        'kernel': ['linear', 'rbf'],
    }
    clf = SVC()
    best_clf = test_train(X_train, y_train, clf, param_grid)

    # Get data for testing
    X_test = X[test_start: test_end]
    y_test = y[test_start: test_end]
    
    logging.debug("Actual")
    logging.debug(predict_and_score(best_clf, X_test, y_test))
    
    logging.debug("random sample")
    pred_sample = y_test.sample(y_test.shape[0])
    from sklearn.metrics import f1_score
    logging.debug(f1_score(y_test, pred_sample))
    
data_start = '2001-01-04'
data_end = '2016-01-04'
#run('2001-01-04', '2014-01-04', '2014-01-04', '2016-01-04')
#run('2011-01-04', '2016-01-04', '2001-01-04', '2006-01-04')
run('2001-01-04', '2014-01-04', '2014-01-04', '2016-01-04')

draw_info()