In [46]:
import numpy as np
import pandas as pd
#import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Selection of the most correlated features

In [47]:
def top_correlated(df, nb_features, threshold, correlation_method=3):
    """Select top 20 correlated features from DataFrame"""
    # select settle prices only
    keep_col = [0, 1] + list(range(5, 557, 4))
    selected_columns = df.iloc[:, keep_col].columns
    
    # compute correlation matrix
    df_cor = pd.DataFrame(columns=['pearson', 'spearman', 'kendall'])
    df_cor['pearson'] = df[selected_columns].corr(method='pearson')['p1']
    df_cor['spearman'] = df[selected_columns].corr(method='spearman')['p1']
    df_cor['kendall'] = df[selected_columns].corr(method='kendall')['p1']
    df_cor['score'] = (abs(df_cor['pearson']) + abs(df_cor['spearman']) + abs(df_cor['kendall'])) / 3
    
    # sort depending on the correlation_method 
    col = df_cor.columns[correlation_method]
    df_cor_sorted = df_cor.sort_values(by=col, ascending=False)[col]    
    
    # retrieve the top nb_features correlated with p1
    selected_features = df_cor_sorted.index[:nb_features] 
    
    # eliminate features that are too correlated to each other
    df_cor_count = df_cor.loc[selected_features].copy()
    df_cor_count[df_cor_count < threshold] = 0
    df_cor_count[df_cor_count >= threshold] = 1
    df_cor_count = df_cor_count.sum(axis=1)
    df_cor_count = df_cor_count[df_cor_count > 0]
    df_cor_count = df_cor_count.sort_values(ascending=False)
    
    # keep 20 at most
    limit = min(20, len(df_cor_count))
    selected_features = df_cor_count.index[:limit]
    
    return selected_features

## Reduction of dimension by PCA

In [48]:
def pca_selection(df, n=20):
    "Perform PCA to reduce then number of features"
    # first scale data
    X = df.values
    scaler = StandardScaler()
    X_sc = scaler.fit_transform(X)
    
    # PCA
    pca = PCA(n_components=n)
    X_pca = pca.fit_transform(X_sc)
    
    return pd.DataFrame(X_pca, index=df.index)

## Reformatting the dataframe

In [49]:
df = pd.read_csv('metals_daily_train.csv')
df = df.dropna(axis=0)

def reformat_as_sequence(df, input_seq_len, output_seq_len):
    # columns of the reformatted dataframe
    cols= ['date_t']
    for i in range(input_seq_len-1, 0, -1):
        cols += ['p1_(t-{})'.format(i)]
    cols += ['p1_t']
    for i in range(1, output_seq_len+1):
        cols += ['p1_(t+{})'.format(i)]
    df_reformat = pd.DataFrame(columns=cols)
    
    # t will slide through the time series and create a row
    lim = len(df) - output_seq_len
    for t in range(input_seq_len, lim):
        row = [df['date'].iloc[t]] + list(df['p1'].iloc[t-input_seq_len:t+output_seq_len].values)
        df_reformat.loc[t] = row
    return df_reformat

df_ref = reformat_as_sequence(df, 10, 3)
df_ref.head(16)

Unnamed: 0,date_t,p1_(t-9),p1_(t-8),p1_(t-7),p1_(t-6),p1_(t-5),p1_(t-4),p1_(t-3),p1_(t-2),p1_(t-1),p1_t,p1_(t+1),p1_(t+2),p1_(t+3)
10,20081215.0,457.032497,465.530103,482.060575,472.395859,471.922241,472.073147,472.134908,472.519568,473.139085,467.806318,468.585995,474.348364,493.223915
11,20081216.0,465.530103,482.060575,472.395859,471.922241,472.073147,472.134908,472.519568,473.139085,467.806318,468.585995,474.348364,493.223915,494.082888
12,20081217.0,482.060575,472.395859,471.922241,472.073147,472.134908,472.519568,473.139085,467.806318,468.585995,474.348364,493.223915,494.082888,494.46642
13,20081218.0,472.395859,471.922241,472.073147,472.134908,472.519568,473.139085,467.806318,468.585995,474.348364,493.223915,494.082888,494.46642,499.496445
14,20081219.0,471.922241,472.073147,472.134908,472.519568,473.139085,467.806318,468.585995,474.348364,493.223915,494.082888,494.46642,499.496445,498.971557
15,20081222.0,472.073147,472.134908,472.519568,473.139085,467.806318,468.585995,474.348364,493.223915,494.082888,494.46642,499.496445,498.971557,505.969313
16,20081223.0,472.134908,472.519568,473.139085,467.806318,468.585995,474.348364,493.223915,494.082888,494.46642,499.496445,498.971557,505.969313,505.688433
17,20081229.0,472.519568,473.139085,467.806318,468.585995,474.348364,493.223915,494.082888,494.46642,499.496445,498.971557,505.969313,505.688433,519.860552
18,20081230.0,473.139085,467.806318,468.585995,474.348364,493.223915,494.082888,494.46642,499.496445,498.971557,505.969313,505.688433,519.860552,519.289737
19,20090105.0,467.806318,468.585995,474.348364,493.223915,494.082888,494.46642,499.496445,498.971557,505.969313,505.688433,519.860552,519.289737,518.833986


## Train Test Split

In [50]:
#from sklearn.model_selection import train_test_split #to adapt 
#train_df,test_df=train_test_split(dataframe, test_size=n)