In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle 
import itertools
import os 
import warnings
warnings.filterwarnings("ignore")

In [None]:
def curation_download(data):
    """Turn a raw price-history export into a float frame indexed by date.

    Keeps the OHLC, volume and USD-turnover columns, reverses the row order
    (presumably the export is newest-first - TODO confirm), parses
    "Exchange Date" into timestamps and uses it as an index named "Date".
    The input frame is not modified.
    """
    wanted = ["Exchange Date", "Close", "Open", "Low", "High", "Volume", "Turnover - USD"]

    frame = data.copy()
    frame = frame[wanted][::-1]
    frame["Exchange Date"] = pd.to_datetime(frame["Exchange Date"])

    indexed = frame.set_index("Exchange Date").rename_axis("Date")
    return indexed.astype(np.float64)


def technical_indicators(data, column):
    """Append moving-average, Bollinger-band and MACD indicators for one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied and never modified.
    column : str
        Name of the column the indicators are computed from.

    Returns
    -------
    tuple
        ``(df, df.columns)``: the copy of ``data`` extended with the columns
        SMA_10, SMA_50, EMA_10, EMA_50, SMA_20, upper_band, lower_band,
        EMA_12, EMA_26, MACD and Signal_Line (in that order), and its columns.
    """
    df = data.copy()
    source = df[column]

    # Simple moving averages
    df["SMA_10"] = source.rolling(window=10).mean()
    df["SMA_50"] = source.rolling(window=50).mean()

    # Exponential moving averages. EMA_10 previously used span=50, which made
    # it an exact duplicate of EMA_50.
    df["EMA_10"] = source.ewm(span=10, adjust=False).mean()
    df["EMA_50"] = source.ewm(span=50, adjust=False).mean()

    # Bollinger bands: 20-period mean +/- 2 rolling standard deviations
    rolling_20 = source.rolling(window=20)
    band_width = 2 * rolling_20.std()
    df["SMA_20"] = rolling_20.mean()
    df["upper_band"] = df["SMA_20"] + band_width
    df["lower_band"] = df["SMA_20"] - band_width

    # MACD: difference of the 12- and 26-span EMAs, plus its 9-span signal line
    df["EMA_12"] = source.ewm(span=12, adjust=False).mean()
    df["EMA_26"] = source.ewm(span=26, adjust=False).mean()
    df["MACD"] = df["EMA_12"] - df["EMA_26"]
    df["Signal_Line"] = df["MACD"].ewm(span=9, adjust=False).mean()

    return df, df.columns


def additional_features(data):
    """Build three extra features on the same index as ``data``.

    f01: rolling mean of Close divided by rolling std of Close
         (window 200, at least 20 observations).
    f02: High minus Close.
    f03: Open minus Low.

    The input frame is not modified.
    """
    df = data.copy()
    close_window = df["Close"].rolling(window=200, min_periods=20)

    return pd.DataFrame(
        {
            "f01": close_window.mean() / close_window.std(),
            "f02": df["High"] - df["Close"],
            "f03": df["Open"] - df["Low"],
        },
        index=df.index,
    )

In [None]:
# check VW, BMW
stock_names = ["Daimler", "Bayer"]

# Equity price histories, cleaned with curation_download
daimler = curation_download(
    pd.read_excel("../data/Daimler(EUR)/Price History.xlsx", skiprows = 32, usecols = "A:J")
)
bayer = curation_download(
    pd.read_excel("../data/Bayer(EUR)/Price History.xlsx", skiprows = 30, usecols = "A:I")
)
all_stocks = [daimler, bayer]


# macro-economic data
# Gold: keep close and volume, reverse the row order, index by date
comex_gold = pd.read_excel("../data/Comex(Gold)/Price History.xlsx", skiprows = 30, usecols = "A:I")
comex_gold = comex_gold[["Exchange Date", "Close", "Volume"]][::-1]
comex_gold["Exchange Date"] = pd.to_datetime(comex_gold["Exchange Date"])
comex_gold = comex_gold.set_index("Exchange Date").rename_axis("Date").astype(np.float64)

# EUR/USD quotes: same treatment, but the row order is left as read
eur_usd = pd.read_excel("../data/EUR-USD/Price History.xlsx", skiprows = 26, usecols = "A:H")
eur_usd = eur_usd[["Exchange Date", "Bid", "Ask", "High", "Low", "Open"]]
eur_usd["Exchange Date"] = pd.to_datetime(eur_usd["Exchange Date"])
eur_usd = eur_usd.set_index("Exchange Date").rename_axis("Date").astype(np.float64)

In [None]:
# Technical indicators on the close price, one frame (and its column index) per stock
technical_df, technical_cols = [], []

for stock in all_stocks:
    indicators, indicator_cols = technical_indicators(stock, "Close")
    technical_df.append(indicators)
    technical_cols.append(indicator_cols)

In [None]:
# Preview the first row of the first stock's indicator frame
technical_df[0].head(1)

In [None]:
# Extra features (f01-f03), one frame per stock
add_feat = [additional_features(stock) for stock in all_stocks]

In [None]:
# Preview the first row of the first stock's extra features
add_feat[0].head(1)

In [None]:
# merge everything including macro economic data

final_df = []

for indicators, extras in zip(technical_df, add_feat):
    feat_df = (
        indicators
        .merge(extras, on = "Date")
        # the gold close collides with the stock close and becomes "Close_gold"
        .merge(comex_gold["Close"], on = "Date", suffixes = [None, "_gold"])
        # FX High/Low/Open collide with the stock columns and get the "_fx" suffix
        .merge(eur_usd[["High", "Low", "Open"]], on = "Date", suffixes = [None, "_fx"])
        # drop the stock's own High/Low/Open, then every row with a missing value
        .drop(["High", "Low", "Open"], axis = 1)
        .dropna()
    )
    final_df.append(feat_df)

final_df[0].head(1)

### Correlation matrix (annotated heatmap)

In [None]:
# Pairwise correlations of the first stock's features, drawn as an annotated heatmap
corrMatrix = final_df[0].corr()

fig = plt.figure(figsize = (20, 20))
ax = fig.add_subplot()
sns.heatmap(data = corrMatrix, annot = True, ax = ax)

plt.show()

In [None]:
# Remove near-duplicate features: a column is dropped when its absolute
# correlation with any column to its left exceeds 0.98.
final_data = []

for i in range(len(final_df)):

    drop_matrix = final_df[i].corr().abs()
    # Keep only the strict upper triangle so every pair is inspected once.
    # The builtin ``bool`` is used because the ``np.bool`` alias was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(drop_matrix.shape, dtype = bool), k = 1)
    upper = drop_matrix.where(upper_mask)
    to_drop = [column for column in upper.columns if (upper[column] > 0.98).any()]
    # The drop is in place, so final_df[i] and final_data[i] are the same reduced frame.
    final_df[i].drop(to_drop, axis = 1, inplace = True)
    final_data.append(final_df[i])

print(final_data[0].shape)
final_data[0].head(1)

### Create labels and plot label-feature correlation

In [None]:
def get_daily_vol(df, span=20):
    """Return the exponentially weighted standard deviation of row-to-row returns.

    Returns are ``df.pct_change()``; the weighting uses ``ewm(span=span)``.
    """
    return df.pct_change().ewm(span=span).std()


def create_labels(df, daily_vol, t_final = 10, upper_lower_multipliers = [1, 1]):
    """Label every row of ``daily_vol`` by which price barrier is reached first.

    For each day an upper and a lower barrier are placed around that day's
    close, scaled by the day's volatility, and a vertical (time) barrier is
    placed ``t_final`` rows further on.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Close" column addressable by the labels of ``daily_vol.index``.
    daily_vol : pandas.DataFrame
        Volatility per day; rows are iterated and the barriers are read back
        through the key "Close", so a "Close" column is expected.
    t_final : int
        Number of following rows scanned for a barrier touch.
    upper_lower_multipliers : list
        ``[upper, lower]`` volatility multipliers; only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``daily_vol`` with columns "date_passed", "label",
        "initial", "upper", "lower", "break" and "final". "label" is 1 / -1
        when a barrier is touched or exceeded, otherwise a signed ratio.
        Rows that touch no barrier and have no vertical barrier keep NaN.

    NOTE(review): ``close_prices`` is not defined in this function, so a
    multiplier <= 0 raises NameError unless a global of that name exists.
    """

    out = pd.DataFrame(index = daily_vol.index, columns = ["date_passed", "label", "initial", "upper", 
                                                           "lower", "break", "final"])
    
    for day, vol in daily_vol.iterrows():
        # Number of rows from the start up to and including `day` (label slice is inclusive)
        days_passed = len(daily_vol.loc[daily_vol.index[0] : day])

        # Vertical barrier: the row at position days_passed + t_final, if it exists
        if (days_passed + t_final < len(daily_vol.index) and t_final != 0):
            vert_barrier = daily_vol.index[days_passed + t_final]

        else:
            vert_barrier = np.nan

        # `vol` is the row Series, so each barrier is a Series read back via ["Close"]
        if upper_lower_multipliers[0] > 0:
            top_barrier = df["Close"][day] + df["Close"][day] * upper_lower_multipliers[0] * vol
        else:
            # NOTE(review): `close_prices` is undefined here (see docstring)
            top_barrier = pd.Series(index=close_prices.index)

        if upper_lower_multipliers[1] > 0:
            bot_barrier = df["Close"][day] - df["Close"][day] * upper_lower_multipliers[1] * vol
        else:
            # NOTE(review): `close_prices` is undefined here (see docstring)
            bot_barrier = pd.Series(index=close_prices.index)

        breakthrough_date = vert_barrier
        out.at[day, "initial"] = df["Close"][day]
        out.at[day, "upper"] = top_barrier["Close"]
        out.at[day, "lower"] = bot_barrier["Close"]
        # "break" always records the vertical barrier; it is not updated on a later touch
        out.at[day, "break"] = breakthrough_date

        # Scan the next t_final rows (fewer near the end) for the first barrier touch
        for future_date in daily_vol.index[days_passed : min(days_passed + t_final, len(daily_vol.index))]:
            if ((df["Close"].loc[future_date] >= top_barrier["Close"] and top_barrier["Close"] != 0)):
                out.at[day, "date_passed"] = future_date
                out.at[day, "label"] = 1 
                out.at[day, "final"] = df["Close"].loc[future_date]
                breakthrough_date = future_date
                break

            elif ((df["Close"].loc[future_date] <= bot_barrier["Close"] and bot_barrier["Close"] != 0)):
                out.at[day, "date_passed"] = future_date
                out.at[day, "label"] = -1
                out.at[day, "final"] = df["Close"].loc[future_date]
                breakthrough_date = future_date
                break

        # No touch inside the window: judge the price at the vertical barrier.
        # When vert_barrier is NaN this comparison is False (NaN != NaN) and the row is skipped.
        if (breakthrough_date == vert_barrier):
            price_initial = df["Close"].loc[day]
            price_final = df["Close"].loc[breakthrough_date]

            if price_final > top_barrier["Close"]:
                out.at[day, "date_passed"] = "break_through"
                out.at[day, "label"] = 1
                out.at[day, "final"] = price_final

            elif price_final < bot_barrier["Close"]:
                out.at[day, "date_passed"] = "break_through"
                out.at[day, "label"] = -1
                out.at[day, "final"] = price_final

            else:
                # Price move relative to each barrier distance; keep the ratio of larger magnitude
                out.at[day, "date_passed"] = "calculated"
                out.at[day, "label"] = max([(price_final - price_initial) / (top_barrier["Close"] - price_initial),
                                            (price_final - price_initial) / (price_initial - bot_barrier["Close"])],
                                           key=abs)
                out.at[day, "final"] = price_final
    
    return out

In [None]:
# Barrier labels per stock, reduced to a binary target
labels_df = []

for i, features in enumerate(final_data):
    vol = get_daily_vol(features[["Close"]]).dropna()
    labels = create_labels(final_df[i], vol, t_final = 10, upper_lower_multipliers = [2, 2])
    # keep only the rows before the first missing label
    first_missing = labels["label"].isnull().values.argmax()
    labels = labels[:first_missing]
    # positive labels become 1, everything else (including 0) becomes -1
    labels["label"] = np.where(labels["label"] > 0, 1, -1)
    labels_df.append(labels)

In [None]:
# Attach the binary label to each feature frame and drop rows with missing values
final_df = [
    features.join(labels["label"], on = "Date").dropna()
    for features, labels in zip(final_data, labels_df)
]

In [None]:
# Correlation of every feature of the second stock with the label, highest first.
# The first entry is skipped (presumably the label's correlation with itself - TODO confirm).
x = final_df[1].corr()["label"].sort_values(ascending=False)[1:]
y = x.index

bar_positions = np.arange(len(y))

fig, ax = plt.subplots(figsize = (10, 7))
ax.barh(bar_positions, x, align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(y)
ax.invert_yaxis()
ax.axvline(linewidth=1, color='black')
ax.set_title('Correlation Between Attributes and Target', fontsize=16)
ax.set_xlabel('Correlation', fontsize=14)
ax.set_ylabel('Attributes', fontsize=14)
plt.show()

## Create input

In [None]:
from statsmodels.tsa.stattools import adfuller 

def getWeights_FFD(d=0.1, thres=1e-5):
    """Return the fractional-differencing weights for order ``d`` as a column vector.

    Weights follow w_0 = 1, w_k = -w_{k-1} * (d - k + 1) / k and generation
    stops at the first weight whose magnitude is below ``thres``. The result
    is reversed, so the weight 1 sits in the last row; shape is (n, 1).
    """
    weights = [1.]
    lag = 1
    candidate = -weights[-1] / lag * (d - lag + 1)
    while not abs(candidate) < thres:
        weights.append(candidate)
        lag += 1
        candidate = -weights[-1] / lag * (d - lag + 1)

    return np.array(weights[::-1]).reshape(-1, 1)


def transfer_data_by_frac_diff_FFD(col, d=0.1, thres=1e-4):
    """Fractionally difference a series with a fixed-width weight window.

    Parameters
    ----------
    col : pandas.Series
        Series to transform.
    d : float
        Differencing order passed to ``getWeights_FFD``.
    thres : float
        Weight cut-off passed to ``getWeights_FFD``; it determines the window width.

    Returns
    -------
    pandas.Series
        One value per position from ``width`` onwards, indexed by ``col.index[width:]``.

    Raises
    ------
    Exception
        If the weight window is not shorter than the series.
    """
    w = getWeights_FFD(d, thres)
    width = len(w) - 1

    # window size can't be larger than the size of data
    if width >= col.shape[0]:
        raise Exception("width is oversize")

    weights = w.ravel()
    values = col.values
    # Positional windows: unlike label slicing this does not depend on the index
    # being unique and sorted. The result is built once instead of growing a
    # Series element by element.
    transformed = [
        np.dot(weights, values[end - width:end + 1])
        for end in range(width, col.shape[0])
    ]

    return pd.Series(transformed, index=col.index[width:])


def MemoryVsCorr(total_range, series, treshold):
    """Tabulate the ADF test and correlation with the original for a grid of orders.

    ``total_range`` is ``[start, stop, num]`` for ``np.linspace``. For each
    order the series is fractionally differenced with cut-off ``treshold``
    and ``adfuller`` is run on it. The returned frame has one row per order
    with columns 'order', 'adf' (first element of the adfuller result),
    'corr' (correlation between the differenced series and the equally long
    tail of the original) and '5%' (the '5%' entry of the fifth element of
    the adfuller result).
    """
    interval = np.linspace(total_range[0], total_range[1], total_range[2])
    result = pd.DataFrame(np.zeros((len(interval), 4)), columns=['order', 'adf', 'corr', '5%'])
    result['order'] = interval

    for row, order in enumerate(interval):
        transformed = transfer_data_by_frac_diff_FFD(series, order, treshold)

        adf_output = adfuller(transformed, maxlag=1, regression='c')  # autolag='AIC'
        result.loc[row, 'adf'] = adf_output[0]
        result.loc[row, '5%'] = adf_output[4]['5%']

        # compare against the tail of the original that lines up with the shortened output
        transformed_values = transformed.values
        n_lost = len(series) - len(transformed_values)
        original_tail = series[n_lost:].reset_index(drop=True).values
        result.loc[row, 'corr'] = np.corrcoef(original_tail, transformed_values)[0, 1]

    return result


def plotMemoryVsCorr(result, seriesName):
    """Plot the 'adf' and '5%' columns (left axis) and 'corr' (right axis) against 'order'.

    ``result`` is a frame with columns 'order', 'adf', '5%' and 'corr';
    ``seriesName`` is inserted into the figure title.
    """
    adf_color = 'xkcd:deep red'
    corr_color = 'xkcd:cornflower blue'

    fig, adf_axis = plt.subplots()
    corr_axis = adf_axis.twinx()

    adf_axis.plot(result.order, result['adf'], color=adf_color)
    adf_axis.plot(result.order, result['5%'], color='xkcd:slate')
    corr_axis.plot(result.order, result['corr'], color=corr_color)

    adf_axis.set_xlabel('order of differencing')
    adf_axis.set_ylabel('adf', color=adf_color)
    adf_axis.tick_params(axis='y', labelcolor=adf_color)
    corr_axis.set_ylabel('corr', color=corr_color)
    corr_axis.tick_params(axis='y', labelcolor=corr_color)

    plt.title('ADF test statistics and correlation for %s' % (seriesName))
    plt.show()
    

def df_after_frac(df_before, df_after):
    """Left-join a differenced series onto selected columns and cut the leading rows.

    ``df_after`` becomes a column named "Diff" keyed by "Date" and is joined
    onto Close, Volume, MACD and Signal_Line of ``df_before``. The first
    ``len(df_before) - len(df_after)`` rows of the joined frame are dropped.
    """
    n_lost = len(df_before) - len(df_after)

    frac = df_after.to_frame().reset_index()
    frac.columns = ["Date", "Diff"]
    frac = frac.set_index("Date")

    base = df_before[["Close", "Volume", "MACD", "Signal_Line"]]
    joined = base.merge(frac, on='Date', how='left')

    return joined[n_lost:]

In [None]:
# The label is not an input feature: remove it (in place) from every frame
for frame in final_df:
    frame.drop(columns = "label", inplace = True)

final_df[0].head(1)

In [None]:
%%time

final_data = final_df.copy()

# For every stock and column, find the first differencing order on the grid
# linspace(0, 1, 21) whose ADF statistic falls below its 5% critical value.
adf_list = []
for stock_num, stock_name in enumerate(stock_names):
    stock_frame = final_data[stock_num]
    # NOTE: the columns are wrapped in a list on purpose; later cells read the
    # stored orders with `.loc[0][col].item()`.
    df_adf = pd.DataFrame(columns=[stock_frame.columns])
    print(stock_name)
    values_df = []
    for col in stock_frame.columns:
        test_adf = MemoryVsCorr([0, 1, 21], stock_frame[col], 1e-3)
        for i in range(len(test_adf)):
            grid_row = test_adf.loc[i]
            if grid_row["adf"] < grid_row["5%"]:
                values_df.append(grid_row["order"])
                break
        else:
            # no order on the grid passed the test
            values_df.append("out of range")
    df_adf.loc[len(df_adf)] = values_df
    adf_list.append(df_adf)

In [None]:
# Apply the order found for each column; columns whose order is 0 are kept unchanged
df_frac = []

for i, data in enumerate(final_data):
    df = pd.DataFrame(index = data.index)
    stock_orders = adf_list[i].loc[0]
    for col in data.columns:
        order = stock_orders[col].item()
        if order == 0:
            df = df.join(data[[col]])
            continue

        fractioned = transfer_data_by_frac_diff_FFD(data[col], order, thres = 1e-3)
        fractioned = fractioned.to_frame().reset_index()
        fractioned.columns = ["Date", "diff_{}".format(col)]
        fractioned = fractioned.set_index("Date")
        df = df.join(fractioned, on = "Date")
    # differencing shortens the columns differently; keep only the complete rows
    df_frac.append(df.dropna())

In [None]:
from pyts.image import GramianAngularField
import math

def create_data(df, df_label, stock_names, window = 30, split_date = "2019-01-01"):
    """Encode rolling feature windows as Gramian angular field images and save them.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame; every column becomes one image channel.
    df_label : pandas.DataFrame
        Frame with a "label" column, positionally aligned with ``df``.
    stock_names : str
        Name of the single stock being processed; used as the output sub-directory.
    window : int, default 30
        Number of consecutive rows per image (the models in this notebook expect 30).
    split_date : str, default "2019-01-01"
        Windows whose following row is dated before this go to the training set,
        all others to the test set.

    Writes ``train.npy`` / ``target_train.csv`` under ``../eikon_api/<name>/Train``
    and ``test.npy`` / ``target_test.csv`` under ``../eikon_api/<name>/Test``.
    Returns nothing.
    """
    train_dax = []
    test_dax = []
    target_train = []
    target_test = []

    # Rescale each column so that its minimum maps to -1 and its maximum to +1
    df_scaled = ((df - df.max()) + (df - df.min()))/(df.max() - df.min())

    # The transformer has no per-window configuration, so build it once
    gasf = GramianAngularField(method = "summation")

    for i in range(window, len(df_scaled), 1):
        # rows i-window .. i-1, transposed so that every feature is one sample
        X = df_scaled.iloc[i-window:i].T
        X_gasf = gasf.fit_transform(X)
        # stack the per-feature images along the last axis
        dimensional = np.dstack(list(X_gasf))

        # the label belongs to the last row inside the window
        target = df_label["label"].iloc[i-1]

        # the split is decided by the date of row i, the row right after the window
        if (df.index[i:i+1] < split_date)[0]:
            train_dax.append(dimensional)
            target_train.append(target)

        else:
            test_dax.append(dimensional)
            target_test.append(target)

    path_train = "../eikon_api/{}/Train".format(stock_names)
    path_test = "../eikon_api/{}/Test".format(stock_names)

    # exist_ok lets the cell be re-run without failing on the existing folders
    os.makedirs(path_train, exist_ok = True)
    os.makedirs(path_test, exist_ok = True)

    np.save(os.path.join("../eikon_api/{}".format(stock_names), "Train", "train.npy"), train_dax)
    np.save(os.path.join("../eikon_api/{}".format(stock_names), "Test", "test.npy"), test_dax)

    np.savetxt(os.path.join("../eikon_api/{}".format(stock_names), "Train", "target_train.csv"), target_train)
    np.savetxt(os.path.join("../eikon_api/{}".format(stock_names), "Test", "target_test.csv"), target_test)

In [None]:
# Re-attach the labels to the differenced features; the left join keeps
# feature rows that have no label
final_df = [
    pd.merge(frac, labels["label"], on = "Date", how = "left")
    for frac, labels in zip(df_frac, labels_df)
]

In [None]:
# Split every frame into model inputs and the label column
cnn_labels = [frame[["label"]] for frame in final_df]
cnn_data = [frame.drop("label", axis = 1) for frame in final_df]

In [None]:
# Write the image tensors and targets of every stock to disk
for stock_features, stock_labels, stock_name in zip(cnn_data, cnn_labels, stock_names):
    create_data(stock_features, stock_labels, stock_name)

In [None]:
# Read back the arrays and targets written by create_data, one entry per stock
train, train_label = [], []
test, test_label = [], []

for name in stock_names:
    stock_dir = "../eikon_api/{}".format(name)
    train.append(np.load(os.path.join(stock_dir, "Train", "train.npy")))
    train_label.append(pd.read_csv("../eikon_api/{}/Train/target_train.csv".format(name), header = None))
    test.append(np.load(os.path.join(stock_dir, "Test", "test.npy")))
    test_label.append(pd.read_csv("../eikon_api/{}/Test/target_test.csv".format(name), header = None))

In [None]:
# Show every channel of the first training image of the first stock
plt.figure(figsize=(10,10))

first_image = train[0][0]
n_channels = train[0].shape[3]
for i in range(n_channels):
    plt.subplot(6, 6, i+1)
    plt.axis('off')
    plt.title(cnn_data[0].columns[i])
    plt.imshow(first_image[:, :, i], cmap = "gray")

plt.show()

In [None]:
# Targets as 1-D arrays with the -1 class recoded to 0
y_train_list, y_test_list = [], []

for stock_train_labels, stock_test_labels in zip(train_label, test_label):
    y_train = stock_train_labels.iloc[:, 0].values
    y_test = stock_test_labels.iloc[:, 0].values
    y_train_list.append(np.where(y_train == -1, 0, y_train))
    y_test_list.append(np.where(y_test == -1, 0, y_test))

In [None]:
# Class balance of the second stock's test targets
unique, counts = np.unique(y_test_list[1], return_counts=True)
{value: count for value, count in zip(unique, counts)}

In [None]:
import tensorflow as tf

def _build_cnn(n_channels, pool_type, conv_activation, drop_out_conv, drop_out_layer):
    """Assemble and compile the shared CNN for 30x30 images with ``n_channels`` channels.

    Layout: Conv(64) -> 2 x [Conv(32) -> optional pooling -> optional dropout]
    -> Flatten -> 2 x [Dense(256) -> optional dropout] -> Dense(2, softmax).

    ``pool_type`` "max" or "average" selects the pooling layer; any other value
    adds no pooling. A dropout rate of 0 adds no dropout layer.
    """
    layers = tf.keras.layers

    model = tf.keras.models.Sequential()
    model.add(layers.Conv2D(filters = 64, kernel_size = (3, 3), input_shape = (30, 30, n_channels),
                            activation = conv_activation))

    # two identical convolution stages
    for _ in range(2):
        model.add(layers.Conv2D(32, (3, 3), activation = conv_activation))
        if pool_type == "max":
            model.add(layers.MaxPooling2D(pool_size=(2, 2)))
        elif pool_type == "average":
            model.add(layers.AveragePooling2D(pool_size=(2, 2)))
        if drop_out_conv != 0:
            model.add(layers.Dropout(drop_out_conv))

    model.add(layers.Flatten())

    # two identical fully connected stages
    for _ in range(2):
        model.add(layers.Dense(units = 256, activation = conv_activation))
        if drop_out_layer != 0:
            model.add(layers.Dropout(drop_out_layer))

    model.add(layers.Dense(units = 2, activation = "softmax"))

    # The targets are integer class ids (0/1) and the head is a 2-way softmax,
    # which is the sparse categorical setting. With "binary_crossentropy" the
    # 1-D targets are broadcast against both output units.
    model.compile(optimizer = "adam", loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

    return model


def build_model_11(pool_type = "max", conv_activation = "relu", drop_out_conv = 0, drop_out_layer = 0.2):
    """Build the CNN for inputs of shape (30, 30, 11)."""
    return _build_cnn(11, pool_type, conv_activation, drop_out_conv, drop_out_layer)


def build_model_10(pool_type = "max", conv_activation = "relu", drop_out_conv = 0, drop_out_layer = 0.2):
    """Build the CNN for inputs of shape (30, 30, 10)."""
    return _build_cnn(10, pool_type, conv_activation, drop_out_conv, drop_out_layer)


def build_model_5(pool_type = "max", conv_activation = "relu", drop_out_conv = 0, drop_out_layer = 0.2):
    """Build the CNN for inputs of shape (30, 30, 5)."""
    return _build_cnn(5, pool_type, conv_activation, drop_out_conv, drop_out_layer)

In [None]:
# arrays as output otherwise will not work with sk-GS

class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts the samples into consecutive blocks.

    The samples are divided into ``n_splits`` equally long blocks (samples
    left over at the end are unused). Inside each block the first 80% of the
    positions form the training indices and the rest the test indices.
    """

    def __init__(self, n_splits):
        self.n_splits = n_splits

    def get_n_splits(self, X, y, groups):
        """Return the configured number of splits; the arguments are ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` arrays, one pair per block."""
        total = len(X)
        block_len = total // self.n_splits
        positions = np.arange(total)
        train_len = int(0.8 * block_len)

        for block in range(self.n_splits):
            block_start = block * block_len
            block_end = block_start + block_len
            cut = block_start + train_len
            yield positions[block_start:cut], positions[cut:block_end]

In [None]:
from sklearn.model_selection import cross_val_score

# feature importance
# For every stock: score a baseline model on all channels, then one model per
# 5-channel combination, using blocked time-series cross-validation.
btscv = BlockingTimeSeriesSplit(n_splits = 5)

feat_importance_df = []
# baseline builder by stock position (the input channel count differs per stock)
baseline_builders = {0: build_model_11, 1: build_model_10}

for i in range(len(train)):
    print(stock_names[i])
    selection_df = pd.DataFrame()
    combination_list = []
    score_list = []
    y_train = y_train_list[i]
    feature_names = cnn_data[i].columns
    combinations = list(itertools.combinations(range(len(feature_names)), 5))

    for j in range(len(combinations) + 1):
        if j == 0 and i in baseline_builders:
            print("Base line")
            X_train = train[i].copy()
            build_fn = baseline_builders[i]
            combo_cols = ["Base Line"]
        else:
            channel_idx = list(combinations[j-1])
            combo_cols = [feature_names[idx] for idx in channel_idx]
            print(combo_cols)
            # pick the five channels of this combination along the last axis
            X_train = np.take(train[i], channel_idx, axis = 3)
            build_fn = build_model_5

        model = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn = build_fn, epochs = 10,
                                                               batch_size = 16, verbose = 0)
        combination_list.append(combo_cols)

        tf.keras.backend.clear_session()
        all_scores = cross_val_score(model, X_train, y_train, cv=btscv, scoring = "accuracy")
        mean_score = np.mean(all_scores)
        print(mean_score)
        score_list.append(mean_score)

    selection_df["Combo"] = combination_list
    selection_df["Score"] = score_list
    feat_importance_df.append(selection_df)

### final model

#### GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV

btscv = BlockingTimeSeriesSplit(n_splits=5)

param_grid = {
    "pool_type":["max", "average"],
    "conv_activation":["relu", "tanh"],
    "drop_out_layer":[0.1, 0.2],
    "batch_size":[1, 16, 32, 64],
    "epochs":[10, 20]
}

# The original cell referenced an undefined `build_model`, `X` and `y` and looped
# over the rows of `selection_df` while indexing `stock_names`. It now runs one
# grid search per stock on that stock's full training tensor.
# NOTE(review): assumes the final model uses all channels - confirm, or swap in
# the selected channel subset.
builders_by_channels = {11: build_model_11, 10: build_model_10, 5: build_model_5}

grid_results = []
for i in range(len(stock_names)):
    X = train[i]
    y = y_train_list[i]
    n_channels = X.shape[-1]
    if n_channels not in builders_by_channels:
        raise ValueError("no model builder for {} input channels".format(n_channels))

    model_CV = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn = builders_by_channels[n_channels])
    grid = GridSearchCV(estimator=model_CV, param_grid=param_grid, n_jobs=-1, cv=btscv)
    grid_result = grid.fit(X, y)
    grid_results.append(grid_result)

    print("Best {}: {} using {}".format(stock_names[i], grid_result.best_score_, grid_result.best_params_))
    # kept for the next cell; after the loop these hold the last stock's results
    means = grid_result.cv_results_["mean_test_score"]
    stds = grid_result.cv_results_["std_test_score"]
    params = grid_result.cv_results_["params"]

In [None]:
# One line per parameter combination: mean score, standard deviation, parameters
for grid_mean, grid_std, grid_params in zip(means, stds, params):
    print("{} ({}) with: {}".format(grid_mean, grid_std, grid_params))

### Backtesting

## The following is legacy feature-group analysis (might be used again later)

- Close, Open (stock), Low (stock), High (stock) = Group 1
- Volume, Turnover = Group 2
- SMA 10, SMA 50, EMA 10, EMA 50, SMA 20, upper band, lower band, EMA 12, EMA 26, MACD, Signal Line = Group 3
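- f01 = "X" in the `groups` list below (no group assigned)
- f02, f03, f04, f05, f06 = Group 2 (one removed)
- f07 = "X" (no group assigned)
- f08 = "X" (no group assigned)
- f09 = "X" (no group assigned)
- f10 = "X" (no group assigned)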
- Close (Gold) = Gold
- High, Low, Open  = FX

- select one variable from every group to overcome multicollinearity

groups = ["Base Line", "Group 1", "Group 1", "Group 1", "Group 1", "Group 2", "Group 2", "Group 3", "Group 3", 
          "Group 3", "Group 3", "Group 3", "Group 3", "Group 3", "Group 3", "Group 3", "Group 3", "Group 3", "X",
         "Group 2", "Group 2", "Group 2", "Group 2", "Group 2", "X", "X", "X", "X", "Gold", "FX", "FX", "FX"]

In [None]:
def feature_classification(data):
    """Translate a score difference into an influence label.

    Exactly 0 gives "neutral", a positive value "negative influence" and
    anything else "positive influence".
    """
    if data == 0:
        return "neutral"
    return "negative influence" if data > 0 else "positive influence"

    
# Classify and group the feature-importance scores of every stock.
# NOTE(review): this loop reads a row labelled "total" and a column named after
# the stock, whereas feat_importance_df as built above only has the columns
# "Combo" and "Score" on a default index. It looks written for an earlier
# layout - confirm before running.
feat_imp_results = []

for i in range(len(feat_importance_df)):
    
    result = feat_importance_df[i].copy()
    
    # difference of every row to the "total" row
    result["Difference"] = result - result.loc["total"]
    result = result[[stock_names[i], "Difference"]]
    result["Classification"] = result["Difference"].apply(lambda x: feature_classification(x))
    # `groups` must hold one entry per row of `result`
    result["Group"] = groups
    # reorder the rows so that members of the same group are adjacent
    idx_gb = result.groupby("Group", as_index=False)
    list_idx = [list(i) for i in idx_gb.groups.values()]
    list_idx = [item for sublist in list_idx for item in sublist]
    result = result.reindex(list_idx)
    result = result.sort_values(["Group", stock_names[i], "Classification"])
    feat_imp_results.append(result)

In [None]:
from IPython.display import display

# Show the results of every stock, one table per feature group
for stock_result in feat_imp_results:
    for group in stock_result["Group"].unique():
        display(stock_result[stock_result["Group"] == group])

In [None]:
# Per stock: keep the rows classified as "positive influence", take the first
# row of every group except "Group 1", and add the "diff_Close" row of "Group 1".
final_df = []

for i in range(len(feat_imp_results)):
    feat_df = feat_imp_results[i][(feat_imp_results[i]["Classification"] != "negative influence") &
                                 (feat_imp_results[i]["Classification"] != "neutral")]
    feat_group1 = feat_df[feat_df["Group"] == "Group 1"].reset_index().copy()
    feat_group1.set_index("index", inplace = True)
    feat_df = feat_df[feat_df["Group"] != "Group 1"]
    feat_df = feat_df.drop_duplicates(subset = ["Group"], keep = "first")
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
    feat_df = pd.concat([feat_df, feat_group1.loc[["diff_Close"]]])
    final_df.append(feat_df)

In [None]:
# Row labels selected for the second stock
list(final_df[1].index)

In [None]:
# Per stock: print all feature columns and the positions of the selected ones
for i, selected in enumerate(final_df):
    cols = list(cnn_data[i].columns)
    print(cols)
    selected_names = list(selected.index)
    stock_idx = [position for position, name in enumerate(cols) if name in selected_names]
    print(stock_idx)

In [None]:
# `cols` and `stock_idx` are left over from the loop above, so this prints the
# values of the last stock processed there
print(cols)
print(stock_idx)

In [None]:
# Display the input feature frame of the second stock
cnn_data[1]