In [1]:
import numpy as np
import pandas as pd
#import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Example dataframe

In [20]:
df = pd.read_csv('metals_daily_train.csv')
df = df.dropna(axis=0)
df

Unnamed: 0,date,p0,p1,p2,f000_open,f000_high,f000_low,f000_settle,f001_open,f001_high,...,f136_open,f136_high,f136_low,f136_settle,f137_open,f137_high,f137_low,f137_settle,week,week_date
109,20081201,444.511058,457.032497,457.032497,53.080,56.330,52.620,56.290,49.110,52.150,...,9420.0,9680.0,9315.0,9540.0,9520.0,9800.0,9495.0,9650.0,2030,20081201
110,20081202,446.908899,465.530103,459.323035,55.990,56.290,54.680,55.300,51.800,52.240,...,9480.0,9600.0,9430.0,9510.0,9640.0,9730.0,9560.0,9630.0,2030,20081201
111,20081203,453.484820,482.060575,459.696940,56.500,56.720,54.650,55.210,53.010,53.020,...,9495.0,9580.0,9400.0,9500.0,9530.0,9690.0,9505.0,9590.0,2030,20081201
112,20081204,447.532919,472.395859,459.964389,55.500,57.810,54.880,57.620,51.750,54.420,...,9485.0,9485.0,9120.0,9145.0,9400.0,9445.0,9205.0,9225.0,2030,20081201
113,20081205,447.084228,471.922241,459.503235,58.500,60.000,56.630,56.750,54.700,56.180,...,8710.0,9000.0,8595.0,8665.0,8885.0,8940.0,8670.0,8715.0,2030,20081201
114,20081208,447.227191,472.073147,459.650169,55.030,55.850,54.140,54.670,51.820,53.770,...,8315.0,9010.0,8315.0,8900.0,8400.0,9060.0,8400.0,8970.0,2031,20081208
115,20081209,447.285703,472.134908,459.710306,55.250,55.650,54.170,55.380,53.340,53.820,...,9270.0,9300.0,8955.0,9135.0,9270.0,9375.0,9010.0,9170.0,2031,20081208
116,20081210,447.650117,472.519568,460.084842,55.160,55.600,53.780,54.440,53.360,53.710,...,9165.0,9400.0,9165.0,9295.0,9290.0,9460.0,9170.0,9355.0,2031,20081208
117,20081211,448.237028,473.139085,460.688056,54.860,55.320,52.600,55.320,53.450,53.600,...,9350.0,9350.0,9140.0,9205.0,9395.0,9395.0,9190.0,9270.0,2031,20081208
118,20081212,449.094065,467.806318,459.073933,55.850,56.500,54.220,54.460,53.920,54.500,...,9200.0,9215.0,8910.0,9090.0,9230.0,9230.0,8895.0,9070.0,2031,20081208


## Selection of the most correlated features

In [24]:
def top_correlated(df, nb_features, threshold, correlation_method=3):
    """Select top 20 correlated features from DataFrame"""
    # select settle prices only
    keep_col = [0, 2] + list(range(7, 557, 4))
    selected_columns = df.iloc[:, keep_col].columns
    
    # compute correlation matrix
    df_cor = pd.DataFrame(columns=['pearson', 'spearman', 'kendall'])
    df_cor['pearson'] = df[selected_columns].corr(method='pearson')['p1']
    df_cor['spearman'] = df[selected_columns].corr(method='spearman')['p1']
    df_cor['kendall'] = df[selected_columns].corr(method='kendall')['p1']
    df_cor['score'] = (abs(df_cor['pearson']) + abs(df_cor['spearman']) + abs(df_cor['kendall'])) / 3
    
    # sort depending on the correlation_method 
    col = df_cor.columns[correlation_method]
    df_cor_sorted = df_cor.sort_values(by=col, ascending=False)[col]    
    
    # retrieve the top nb_features correlated with p1
    selected_features = df_cor_sorted.index[:nb_features] 
    
    # eliminate features that are too correlated to each other
    df_cor_count = df_cor.loc[selected_features].copy()
    df_cor_count[df_cor_count < threshold] = 0
    df_cor_count[df_cor_count >= threshold] = 1
    df_cor_count = df_cor_count.sum(axis=1)
    df_cor_count = df_cor_count[df_cor_count > 0]
    df_cor_count = df_cor_count.sort_values(ascending=False)
    
    # keep 20 at most
    limit = min(20, len(df_cor_count))
    selected_features = df_cor_count.index[:limit]
    
    return selected_features

In [25]:
selected_features = top_correlated(df, 50, 0.75)
selected_features

Index(['p1', 'f091_settle', 'f029_settle', 'f090_settle', 'f131_settle',
       'f132_settle', 'f130_settle', 'f133_settle', 'f028_settle',
       'f135_settle', 'f134_settle', 'f011_settle', 'f010_settle',
       'f002_settle', 'f004_settle', 'f003_settle', 'f097_settle'],
      dtype='object')

## Reduction of dimension by PCA

In [26]:
def pca_selection(df, n=20):
    "Perform PCA to reduce then number of features"
    # first scale data
    X = df.values
    scaler = StandardScaler()
    X_sc = scaler.fit_transform(X)
    
    # PCA
    pca = PCA(n_components=n)
    X_pca = pca.fit_transform(X_sc)
    
    return pd.DataFrame(X_pca, index=df.index)

In [27]:
df_pca = pca_selection(df)
df_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
109,-1.551894,32.879244,1.749189,1.96107,13.635106,8.942555,10.719506,7.749649,-3.093242,-0.339466,0.897852,1.780742,1.949166,0.825261,2.144371,-1.138135,-2.836095,0.064952,-3.091704,-2.62116
110,-2.504254,33.212287,1.425177,2.605029,13.200869,8.479536,10.016911,8.050657,-3.381756,0.11552,0.39068,1.31772,1.425832,0.692797,2.05283,-0.675515,-2.353422,0.028417,-2.956447,-2.175771
111,-3.258635,33.17491,1.251037,2.999204,13.010946,8.238106,9.759872,8.311804,-3.444505,0.100865,0.706402,1.35424,1.175833,0.295533,1.998927,-0.359771,-2.037847,0.014768,-2.885288,-1.991641
112,-4.128095,33.789522,0.739162,3.379791,12.946017,8.287326,10.349546,7.794329,-3.002137,0.403191,1.355227,1.370025,0.740066,0.043532,1.751352,-0.450039,-2.114068,-0.201304,-2.881472,-2.166792
113,-5.582836,35.11899,0.347171,3.950416,13.222885,8.327585,11.132295,6.996393,-2.502764,0.240236,1.748788,1.169872,0.469832,-0.038427,1.579187,-0.377464,-1.491747,-0.460078,-2.72704,-2.101481


## Reformatting the dataframe

In [28]:
def reformat_as_sequence(df, input_seq_len, output_seq_len):
    # columns of the reformatted dataframe
    cols= ['date_t']
    for i in range(input_seq_len-1, 0, -1):
        cols += ['p1_(t-{})'.format(i)]
    cols += ['p1_t']
    for i in range(1, output_seq_len+1):
        cols += ['p1_(t+{})'.format(i)]
    df_reformat = pd.DataFrame(columns=cols)
    
    # t will slide through the time series and create a row
    lim = len(df) - output_seq_len
    for t in range(input_seq_len, lim):
        row = [df['date'].iloc[t]] + list(df['p1'].iloc[t-input_seq_len:t+output_seq_len].values)
        df_reformat.loc[t] = row
    return df_reformat

df_ref = reformat_as_sequence(df, 20, 3)
df_ref.head()

Unnamed: 0,date_t,p1_(t-19),p1_(t-18),p1_(t-17),p1_(t-16),p1_(t-15),p1_(t-14),p1_(t-13),p1_(t-12),p1_(t-11),...,p1_(t-6),p1_(t-5),p1_(t-4),p1_(t-3),p1_(t-2),p1_(t-1),p1_t,p1_(t+1),p1_(t+2),p1_(t+3)
20,20090106.0,457.032497,465.530103,482.060575,472.395859,471.922241,472.073147,472.134908,472.519568,473.139085,...,494.082888,494.46642,499.496445,498.971557,505.969313,505.688433,519.860552,519.289737,518.833986,519.061761
21,20090107.0,465.530103,482.060575,472.395859,471.922241,472.073147,472.134908,472.519568,473.139085,467.806318,...,494.46642,499.496445,498.971557,505.969313,505.688433,519.860552,519.289737,518.833986,519.061761,518.970627
22,20090108.0,482.060575,472.395859,471.922241,472.073147,472.134908,472.519568,473.139085,467.806318,468.585995,...,499.496445,498.971557,505.969313,505.688433,519.860552,519.289737,518.833986,519.061761,518.970627,518.909889
23,20090109.0,472.395859,471.922241,472.073147,472.134908,472.519568,473.139085,467.806318,468.585995,474.348364,...,498.971557,505.969313,505.688433,519.860552,519.289737,518.833986,519.061761,518.970627,518.909889,518.826397
24,20090112.0,471.922241,472.073147,472.134908,472.519568,473.139085,467.806318,468.585995,474.348364,493.223915,...,505.969313,505.688433,519.860552,519.289737,518.833986,519.061761,518.970627,518.909889,518.826397,519.008596


## Train Test Split

In [50]:
#from sklearn.model_selection import train_test_split #to adapt 
#train_df,test_df=train_test_split(dataframe, test_size=n)