In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle 
import itertools
import os 
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

In [None]:
def curation_download(data):
    """Turn a raw price-history export into a float frame indexed by date.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "Exchange Date", "Close", "Open", "Low",
        "High" and "Volume". The input is not modified.

    Returns
    -------
    pandas.DataFrame
        The five price/volume columns as float64, with the row order of
        ``data`` reversed and a datetime index named "Date".
    """
    value_columns = ["Close", "Open", "Low", "High", "Volume"]

    # Keep only the needed columns and flip the row order of the export.
    frame = data[["Exchange Date"] + value_columns].iloc[::-1].copy()
    frame["Exchange Date"] = pd.to_datetime(frame["Exchange Date"])

    frame = frame.set_index("Exchange Date").rename_axis("Date")
    return frame.astype(np.float64)


def technical_indicators(data, column):
    """Append moving-average, Bollinger-band and MACD columns for one price column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the price series; it is copied, not modified.
    column : str
        Name of the column the indicators are computed from.

    Returns
    -------
    (pandas.DataFrame, pandas.Index)
        The copy of ``data`` extended with the indicator columns, and the
        column index of that extended frame.
    """
    df = data.copy()
    price = df[column]

    # SMA
    df["SMA_10"] = price.rolling(window = 10).mean()
    df["SMA_50"] = price.rolling(window = 50).mean()

    # EMA -- the 10-period EMA previously used span = 50 and was therefore
    # an exact duplicate of EMA_50.
    df["EMA_10"] = price.ewm(span = 10, adjust = False).mean()
    df["EMA_50"] = price.ewm(span = 50, adjust = False).mean() # look at adjust

    # Bollinger Bands: 20-period mean +/- two rolling standard deviations
    rolling_20 = price.rolling(window = 20)
    band_width = 2 * rolling_20.std()
    df["SMA_20"] = rolling_20.mean()
    df["upper_band"] = df["SMA_20"] + band_width
    df["lower_band"] = df["SMA_20"] - band_width

    # MACD
    df["EMA_12"] = price.ewm(span = 12, adjust = False).mean()
    df["EMA_26"] = price.ewm(span = 26, adjust = False).mean() #might lead to leakage
    df["MACD"] = df["EMA_12"] - df["EMA_26"]
    df["Signal_Line"] = df["MACD"].ewm(span = 9, adjust = False).mean()

    return df, df.columns


def additional_features(data):
    """Build two price-range features on the index of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "High", "Close", "Open" and "Low" columns.

    Returns
    -------
    pandas.DataFrame
        Columns "f01" (High minus Close) and "f02" (Open minus Low),
        sharing the index of ``data``.
    """
    return pd.DataFrame(index = data.index).assign(
        f01 = data["High"] - data["Close"],  # upper shadow
        f02 = data["Open"] - data["Low"],    # lower shadow
    )

In [None]:
# check VW, BMW
stock_names = ["Daimler", "Bayer", "SAP", "Deutsche Bank"]

# Every workbook carries a different number of header rows, hence the
# per-file skiprows values.
daimler = curation_download(
    pd.read_excel("../data/Daimler(USD)/Price History.xlsx", skiprows = 32, usecols = "A:I"))
bayer = curation_download(
    pd.read_excel("../data/Bayer(USD)/Price History.xlsx", skiprows = 30, usecols = "A:I"))
sap = curation_download(
    pd.read_excel("../data/SAP(USD)/Price History.xlsx", skiprows = 31, usecols = "A:I"))
db = curation_download(
    pd.read_excel("../data/DB(USD)/Price History.xlsx", skiprows = 29, usecols = "A:I"))

# Same order as stock_names.
all_stocks = [daimler, bayer, sap, db]

# economic data
# Gold: keep close and volume, reverse the row order, index by date.
comex_gold = (
    pd.read_excel("../data/Comex(Gold)/Price History.xlsx", skiprows = 31, usecols = "A:I")
    .loc[:, ["Exchange Date", "Close", "Volume"]]
    .iloc[::-1]
    .assign(**{"Exchange Date": lambda frame: pd.to_datetime(frame["Exchange Date"])})
    .set_index("Exchange Date")
    .rename_axis("Date")
    .astype(np.float64)
)

# EUR/USD: same treatment, but the row order of the export is kept as-is.
eur_usd = (
    pd.read_excel("../data/EUR-USD/Price History.xlsx", skiprows = 26, usecols = "A:H")
    .loc[:, ["Exchange Date", "Bid", "Ask", "High", "Low", "Open"]]
    .assign(**{"Exchange Date": lambda frame: pd.to_datetime(frame["Exchange Date"])})
    .set_index("Exchange Date")
    .rename_axis("Date")
    .astype(np.float64)
)

In [None]:
# Indicator frame and its column index for every stock, in all_stocks order.
indicator_results = [technical_indicators(stock, "Close") for stock in all_stocks]

technical_df = [frame for frame, _ in indicator_results]
technical_cols = [columns for _, columns in indicator_results]

In [None]:
# Show the first row of every indicator frame.
for indicator_frame in technical_df:
    display(indicator_frame.head(1))

In [None]:
# Price-range features for every stock, in all_stocks order.
add_feat = [additional_features(stock) for stock in all_stocks]

for extra_features in add_feat:
    display(extra_features.head(1))

In [None]:
# merge everything including economic data
final_df = []
for indicators, extras in zip(technical_df, add_feat):
    merged = pd.merge(indicators, extras, on = "Date")
    # Overlapping column names from the right-hand side get a suffix;
    # the stock's own columns keep their names.
    merged = pd.merge(merged, comex_gold["Close"], on = "Date", suffixes = [None, "_gold"])
    merged = pd.merge(merged, eur_usd[["High", "Low", "Open"]], on = "Date", suffixes = [None, "_fx"])
    # Drop every row that still contains a missing value.
    final_df.append(merged.dropna())

for merged in final_df:
    display(merged.head(1))

### Correlation matrix (annotation True)

In [None]:
# One annotated correlation heatmap per stock.
for merged in final_df:
    fig, ax = plt.subplots(figsize = (20, 20))
    sns.heatmap(merged.corr(), annot = True, ax = ax)
    plt.show()

In [None]:
final_data = []

for frame in final_df:

    abs_corr = frame.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once.
    # The builtin bool is used because the np.bool alias was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(upper_mask)
    # A column is dropped when it correlates above 0.98 with any column before it.
    to_drop = [column for column in upper.columns if (upper[column] > 0.98).any()]
    # Non-mutating drop: re-running this cell gives the same result.
    final_data.append(frame.drop(columns = to_drop))

for frame in final_data:

    print(frame.shape)
    display(frame.head(1))

In [None]:
# check order
for name, frame in zip(stock_names, final_data):
    first_date, last_date = frame.index[0], frame.index[-1]
    print(name)
    print(first_date)
    print(last_date)
    print(frame.shape)
    print("*"*35)

### Create labels and plot the label–feature correlation

In [None]:
def get_daily_vol(df, span=20):
    """Exponentially weighted standard deviation of period-over-period returns.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Price data; returns are taken with ``pct_change``.
    span : int, default 20
        Span of the exponentially weighted window.

    Returns
    -------
    Same type as ``df``
        EW standard deviation of the percentage changes.
    """
    return df.pct_change().ewm(span=span).std()


def create_labels(df, daily_vol, t_final = 10, upper_lower_multipliers = [1, 1]):
    """Label every row of ``daily_vol`` with a barrier-based outcome.

    For each day an upper and a lower price barrier are placed around that
    day's close, scaled by the day's volatility. The closes of the next
    ``t_final`` rows are scanned; the first barrier reached decides the label.
    If neither is reached, the close at the vertical barrier is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Close" column covering every label in ``daily_vol.index``.
    daily_vol : pandas.DataFrame
        Volatility per day with a "Close" column. Index labels are assumed to
        be unique, because the row position is used to look ahead.
    t_final : int, default 10
        Number of following rows scanned for a barrier touch.
    upper_lower_multipliers : sequence of two numbers, default [1, 1]
        Volatility multipliers for the upper and lower barrier. A value <= 0
        disables that barrier (stored as NaN). This path previously raised
        NameError on an undefined variable.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``daily_vol`` with columns "date_passed", "label",
        "initial", "upper", "lower", "break" and "final". "label" is 1 / -1
        for a barrier touch, a signed ratio when the vertical barrier decides,
        and stays NaN for rows with no touch and no vertical barrier.
    """
    out = pd.DataFrame(index = daily_vol.index, columns = ["date_passed", "label", "initial", "upper",
                                                           "lower", "break", "final"])
    close = df["Close"]
    dates = daily_vol.index
    n_days = len(dates)
    up_mult, low_mult = upper_lower_multipliers[0], upper_lower_multipliers[1]

    for position, (day, vol) in enumerate(daily_vol.iterrows()):
        # Rows up to and including `day`; also the position of the next row.
        days_passed = position + 1

        if days_passed + t_final < n_days and t_final != 0:
            vert_barrier = dates[days_passed + t_final]
        else:
            vert_barrier = np.nan

        price_initial = close.loc[day]
        vol_close = vol["Close"]

        # A disabled barrier is NaN, so every comparison against it is False.
        top_barrier = price_initial + price_initial * up_mult * vol_close if up_mult > 0 else np.nan
        bot_barrier = price_initial - price_initial * low_mult * vol_close if low_mult > 0 else np.nan

        out.at[day, "initial"] = price_initial
        out.at[day, "upper"] = top_barrier
        out.at[day, "lower"] = bot_barrier
        # "break" always records the vertical barrier, even when a horizontal one is hit first.
        out.at[day, "break"] = vert_barrier

        barrier_hit = False
        for future_date in dates[days_passed : min(days_passed + t_final, n_days)]:
            future_price = close.loc[future_date]

            if future_price >= top_barrier and top_barrier != 0:
                hit_label = 1
            elif future_price <= bot_barrier and bot_barrier != 0:
                hit_label = -1
            else:
                continue

            out.at[day, "date_passed"] = future_date
            out.at[day, "label"] = hit_label
            out.at[day, "final"] = future_price
            barrier_hit = True
            break

        # No touch and no vertical barrier: leave the row unlabeled.
        if barrier_hit or pd.isna(vert_barrier):
            continue

        price_final = close.loc[vert_barrier]
        out.at[day, "final"] = price_final

        if price_final > top_barrier:
            out.at[day, "date_passed"] = "break_through"
            out.at[day, "label"] = 1

        elif price_final < bot_barrier:
            out.at[day, "date_passed"] = "break_through"
            out.at[day, "label"] = -1

        else:
            # Price move as a fraction of the distance to each enabled barrier;
            # the ratio with the larger magnitude becomes the label.
            price_move = price_final - price_initial
            ratios = []
            if up_mult > 0:
                ratios.append(price_move / (top_barrier - price_initial))
            if low_mult > 0:
                ratios.append(price_move / (price_initial - bot_barrier))
            if not ratios:
                # Both barriers disabled: fall back to the plain relative return.
                ratios.append(price_move / price_initial)

            out.at[day, "date_passed"] = "calculated"
            out.at[day, "label"] = max(ratios, key=abs)

    return out

In [None]:
labels_df = []

for stock_frame in final_data:
    close_vol = get_daily_vol(stock_frame[["Close"]]).dropna()
    barrier_labels = create_labels(stock_frame, close_vol, t_final = 10, upper_lower_multipliers = [2, 2])

    # Cut the frame at the first row that has no label.
    first_unlabeled = barrier_labels["label"].isnull().values.argmax()
    barrier_labels = barrier_labels[:first_unlabeled]

    # Collapse to a binary target: non-negative -> 1, negative -> -1.
    barrier_labels["label"] = np.where(barrier_labels["label"] >= 0, 1, -1)
    labels_df.append(barrier_labels)

In [None]:
# Attach the binary label to every feature frame and keep complete rows only.
final_df = [
    features.join(labels["label"], on = "Date").dropna()
    for features, labels in zip(final_data, labels_df)
]

In [None]:
for name, labeled in zip(stock_names, final_df):

    # Correlation of every column with the label, strongest positive first.
    # [1:] skips the top entry -- presumably the label's correlation with itself.
    label_corr = labeled.corr()["label"].sort_values(ascending=False)[1:].copy()
    attributes = label_corr.index
    bar_positions = np.arange(len(attributes))

    fig = plt.figure(figsize = (10, 7))
    ax = fig.add_subplot()
    ax.set_title('Correlation Between Attributes and Target ({})'.format(name), fontsize=16)
    ax.barh(bar_positions, label_corr, align='center')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(attributes)
    ax.invert_yaxis()
    ax.set_xlabel('Correlation', fontsize=14)
    ax.set_ylabel('Attributes', fontsize=14)
    ax.axvline(linewidth=1, color='black')
    plt.show()

## Create input

In [None]:
from statsmodels.tsa.stattools import adfuller 

def getWeights_FFD(d=0.1, thres=1e-5):
    """Weights of a fixed-width fractional differencing window.

    Weights are generated by the recursion ``w_k = -w_{k-1} / k * (d - k + 1)``
    starting from ``w_0 = 1`` and stop before the first weight whose
    magnitude is below ``thres``.

    Parameters
    ----------
    d : float, default 0.1
        Differencing order.
    thres : float, default 1e-5
        Magnitude below which further weights are discarded.

    Returns
    -------
    numpy.ndarray
        Column vector of shape (n_weights, 1), ordered so that the initial
        weight 1.0 is the last element.
    """
    weights = [1.]
    lag = 1
    candidate = -weights[-1] / lag * (d - lag + 1)

    while not abs(candidate) < thres:
        weights.append(candidate)
        lag += 1
        candidate = -weights[-1] / lag * (d - lag + 1)

    weights.reverse()
    return np.array(weights).reshape(-1, 1)


def transfer_data_by_frac_diff_FFD(col, d=0.1, thres=1e-4):
    """Fractionally difference a series with a fixed-width weight window.

    Parameters
    ----------
    col : pandas.Series
        Series to transform.
    d : float, default 0.1
        Differencing order passed to ``getWeights_FFD``.
    thres : float, default 1e-4
        Weight cut-off passed to ``getWeights_FFD``.

    Returns
    -------
    pandas.Series
        One value per full window, labeled with the index value of the
        window's last observation. It is ``len(weights) - 1`` shorter than ``col``.

    Raises
    ------
    Exception
        If the weight window does not fit into ``col``.
    """
    # Apply weights to values
    w = getWeights_FFD(d, thres)
    width = len(w) - 1
    n_obs = col.shape[0]

    # window size can't be larger than the size of data
    if width >= n_obs:
        raise Exception("width is oversize")

    # Positional windows over the raw values: this avoids growing a Series one
    # element at a time and does not depend on index labels being unique.
    values = col.to_numpy()
    weights_row = w.T
    diffed = [np.dot(weights_row, values[end - width:end + 1])[0]
              for end in range(width, n_obs)]

    return pd.Series(diffed, index = col.index[width:])


def MemoryVsCorr(total_range, series, treshold):
    """Scan differencing orders and record the ADF statistic and correlation.

    Parameters
    ----------
    total_range : sequence
        ``(start, stop, num)`` handed to ``numpy.linspace`` to build the
        grid of differencing orders.
    series : pandas.Series
        Series to be fractionally differenced.
    treshold : float
        Weight cut-off forwarded to ``transfer_data_by_frac_diff_FFD``.

    Returns
    -------
    pandas.DataFrame
        One row per order with columns 'order', 'adf' (test statistic),
        'corr' (correlation between the transformed values and the equally
        long tail of ``series``) and '5%' (ADF critical value).
    """
    orders = np.linspace(total_range[0], total_range[1], total_range[2])
    result = pd.DataFrame(np.zeros((len(orders), 4)), columns=['order', 'adf', 'corr', '5%'])
    result['order'] = orders

    for row, order in enumerate(orders):
        transformed = transfer_data_by_frac_diff_FFD(series, order, treshold)

        adf_output = adfuller(transformed, maxlag=1, regression='c') #autolag='AIC'
        result.loc[row, 'adf'] = adf_output[0]
        result.loc[row, '5%'] = adf_output[4]['5%']

        # Compare against the tail of the original that has the same length.
        transformed_values = transformed.values
        n_lost = len(series) - len(transformed_values)
        original_tail = series[n_lost:].reset_index(drop=True).values
        result.loc[row, 'corr'] = np.corrcoef(original_tail, transformed_values)[0, 1]

    return result


def df_after_frac(df_before, df_after):
    """Attach a differenced series to selected columns of the original frame.

    Parameters
    ----------
    df_before : pandas.DataFrame
        Frame keyed by "Date" with "Close", "Volume", "MACD" and
        "Signal_Line" columns.
    df_after : pandas.Series
        Transformed series; it becomes the "Diff" column.

    Returns
    -------
    pandas.DataFrame
        The four selected columns left-merged with "Diff" on "Date", with
        the first ``len(df_before) - len(df_after)`` rows removed.
    """
    n_missing = len(df_before) - len(df_after)

    # Single-column frame named "Diff" whose index is called "Date".
    frac_frame = df_after.rename("Diff").rename_axis("Date").to_frame()

    selected = df_before[["Close", "Volume", "MACD", "Signal_Line"]]
    merged = selected.merge(frac_frame, on='Date', how='left')

    return merged.iloc[n_missing:]

In [None]:
# Remove the label in place so only feature columns remain, then show each frame.
for labeled in final_df:
    labeled.drop(columns = "label", inplace = True)
    display(labeled)

In [None]:
%%time

# For every stock and every feature column, find the first differencing order
# on the grid whose ADF statistic lies below the 5% critical value.
stock_num = 0
adf_list = []
while stock_num < (len(stock_names)):
    # NOTE(review): the column Index is wrapped in a list here; a later cell reads
    # values back with `.loc[0][col].item()`, so that access presumably depends on
    # the column structure this produces -- confirm before simplifying.
    df_adf = pd.DataFrame(columns=[final_df[stock_num].columns])
    print(stock_names[stock_num])
    # One entry per feature column, in column order.
    values_df = []
    for col in final_df[stock_num].columns:
        # Grid of 21 orders from 0 to 1, weight cut-off 1e-3.
        test_adf = MemoryVsCorr([0, 1, 21], final_df[stock_num][col], 1e-3)
        for i in range(len(test_adf)):
            if test_adf.loc[i]["adf"] < test_adf.loc[i]["5%"]:
                # First order on the grid that passes the test.
                values_df.append(test_adf.loc[i]["order"])
                break
            elif i == (len(test_adf)-1):
                # No order on the grid passed; a string marker is stored instead of a number.
                values_df.append("out of range")
            else:
                pass
    # Store the chosen orders as the single row of this stock's frame.
    df_adf.loc[len(df_adf)] = values_df
    adf_list.append(df_adf)
    stock_num += 1

In [None]:
# Selected differencing orders per stock.
for name, adf_orders in zip(stock_names, adf_list):
    print(name)
    display(adf_orders)

In [None]:
df_frac = []

for stock_idx, features in enumerate(final_df):
    stationary = pd.DataFrame(index = features.index)
    # Row 0 holds the differencing order chosen for every column of this stock.
    chosen_orders = adf_list[stock_idx].loc[0]

    for col in features.columns:
        order = chosen_orders[col].item()

        if order == 0:
            # Order 0: the column is used unchanged.
            stationary = stationary.join(features[[col]])
            continue

        # NOTE(review): an "out of range" marker in adf_list would also reach this
        # branch and be passed on as the differencing order -- confirm it cannot occur.
        print(stock_names[stock_idx])
        print(col)
        display(chosen_orders)
        fractioned = transfer_data_by_frac_diff_FFD(features[col], order, thres = 1e-3)

        fractioned = fractioned.to_frame().reset_index()
        fractioned.columns = ["Date", "diff_{}".format(col)]
        fractioned.set_index("Date", inplace=True)
        display(fractioned)
        stationary = stationary.join(fractioned, on = "Date")
        print("-" * 35)

    # Keep only the rows for which every column has a value.
    df_frac.append(stationary.dropna())

# Persist the intermediate results as five consecutive pickles in one file.
# The context manager closes the handle even if one of the dumps raises.
# The dump order must match the load order used when reading the file back.
with open("../data/adf_data_new.pkl", "wb") as file:
    pickle.dump(stock_names, file)
    pickle.dump(labels_df, file)
    pickle.dump(final_df, file)
    pickle.dump(adf_list, file)
    pickle.dump(df_frac, file)

In [None]:
# Restore the intermediate results, reading the five pickles in the order
# they were written. The context manager closes the handle even on a failed load.
# NOTE: pickle.load can execute arbitrary code -- only load files created by this notebook.
with open("../data/adf_data_new.pkl", "rb") as file:
    stock_names = pickle.load(file)
    labels_df = pickle.load(file)
    final_df = pickle.load(file)
    adf_list = pickle.load(file)
    df_frac = pickle.load(file)

In [None]:
# First row of the labels, the feature frame and the differenced frame per stock.
for name, labels, features, frac in zip(stock_names, labels_df, final_df, df_frac):
    print(name)
    display(labels.head(1))
    display(features.head(1))
    display(frac.head(1))
    print("*"*35)

In [None]:
# Left-merge the label onto every differenced frame (all feature rows are kept).
final_df = [
    pd.merge(frac, labels["label"], on = "Date", how = "left")
    for frac, labels in zip(df_frac, labels_df)
]

In [None]:
cnn_data = []
cnn_labels = []

# Split every merged frame into its label column and the remaining inputs.
for labeled in final_df:
    target = labeled[["label"]]
    inputs = labeled.drop(columns = "label")
    print(len(target) == len(inputs))
    cnn_labels.append(target)
    cnn_data.append(inputs)

In [None]:
from pyts.image import GramianAngularField
import math

def create_data(df, df_label, stock_names):
    """Encode rolling 30-row windows as Gramian angular field images and save them.

    For each method ("summation", "difference") every window of 30
    consecutive rows is turned into one image per column, the images are
    stacked along the last axis, and the label of the window's last row is
    recorded. Results are written to ``../eikon_api/<method>/<stock_names>/Train``
    as ``train.npy`` and ``target_train.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame; every column becomes one image channel.
    df_label : pandas.DataFrame
        Frame with a "label" column, aligned row by row with ``df``
        (checked by an assertion on the last row of each window).
    stock_names : str
        Name used in the printed progress line and in the output path.

    Returns
    -------
    None
    """
    window = 30

    # Rescale every column so its minimum maps to -1 and its maximum to 1.
    col_min, col_max = df.min(), df.max()
    df_scaled = ((df - col_max) + (df - col_min)) / (col_max - col_min)

    for method in ["summation", "difference"]:
        print(stock_names, method)
        gasf = GramianAngularField(method = method)
        train_dax = []
        target_train = []

        # NOTE(review): the range stops at len - 1, so the window ending on the
        # very last row is never produced -- confirm this is intended.
        for i in range(window, len(df_scaled), 1):
            # Transposed so that every column of df is one sample (row) for the encoder.
            X = df_scaled.iloc[i - window:i].T
            X_gasf = gasf.fit_transform(X)

            # The label row must be the last row of the window.
            assert df_label.iloc[i-1].name == X.columns[-1]

            # One image per feature, stacked depth-wise into a multi-channel image.
            train_dax.append(np.dstack(list(X_gasf)))
            target_train.append(df_label["label"].iloc[i-1])

        path_train = "../eikon_api/{}/{}/Train".format(method, stock_names)

        # exist_ok lets the notebook be re-run without FileExistsError.
        os.makedirs(path_train, exist_ok = True)

        np.save(os.path.join(path_train, "train.npy"), train_dax)
        np.savetxt(os.path.join(path_train, "target_train.csv"), target_train)

In [None]:
# Write the GAF tensors and targets for every stock.
for inputs, target, name in zip(cnn_data, cnn_labels, stock_names):
    create_data(inputs, target, name)

In [None]:
train_sum = []
train_diff = []
train_label = []

# Summation images; the targets are read once, from the summation folder.
for name in stock_names:
    train_sum.append(np.load(os.path.join("../eikon_api/{}/{}".format("summation", name),
                                          "Train", "train.npy")))
    train_label.append(pd.read_csv("../eikon_api/{}/{}/Train/target_train.csv".format("summation", name),
                                   header = None))

# Difference images.
for name in stock_names:
    train_diff.append(np.load(os.path.join("../eikon_api/{}/{}".format("difference", name),
                                           "Train", "train.npy")))

In [None]:
# Number of stocks and the shape of the first stock's data for each collection.
shape_report = [
    ("Length Train Sum: ", "Shape Train Sum: ", train_sum),
    ("Length Train Diff: ", "Shape Train Diff: ", train_diff),
    ("Length Train: ", "Shape Train: ", train_label),
]

for position, (length_caption, shape_caption, collection) in enumerate(shape_report):
    if position > 0:
        print("*" * 40)
    print(length_caption, len(collection))
    print(shape_caption, collection[0].shape)

In [None]:
plt.figure(figsize=(10,10))

# First summation sample of the first stock: one grayscale panel per feature channel.
first_sum_sample = train_sum[0][0]
for channel in range(first_sum_sample.shape[2]):
    plt.subplot(4, 3, channel+1)
    plt.axis('off')
    plt.title(cnn_data[0].columns[channel])
    plt.imshow(first_sum_sample[:, :, channel], cmap = "gray")

plt.tight_layout()
plt.savefig("../data/plots/gaf-sum.png", dpi=150)
plt.show()

In [None]:
plt.figure(figsize=(10,10))

# First difference sample of the first stock: one grayscale panel per feature channel.
first_diff_sample = train_diff[0][0]
for channel in range(first_diff_sample.shape[2]):
    plt.subplot(4, 3, channel+1)
    plt.axis('off')
    plt.title(cnn_data[0].columns[channel])
    plt.imshow(first_diff_sample[:, :, channel], cmap = "gray")

plt.tight_layout()
plt.savefig("../data/plots/gaf-diff.png", dpi=150)
plt.show()

In [None]:
y_train_list = []

# Map the -1 class to 0; every other value is kept as it is.
for target_frame in train_label:
    raw_target = target_frame.iloc[:, 0].values
    y_train_list.append(np.where(raw_target == -1, 0, raw_target))

In [None]:
# Class counts and shares per stock (expects exactly two classes).
for name, target in zip(stock_names, y_train_list):
    print("{}: {}".format(name, len(target)))
    unique, counts = np.unique(target, return_counts=True)
    print(dict(zip(unique, counts)))
    total = sum(counts)
    for cls in (0, 1):
        print("Share of {}: {}".format(unique[cls], round(counts[cls]/total, 2)))
    print("*"*35)

In [None]:
import tensorflow as tf

def cnn_model(shape, pool_type = "max", conv_activation = "relu",batch_norm = True, drop_out_layer = 0.2, 
              neurons = 128, lr = 0.00001):
    """Build and compile a small two-class CNN.

    Architecture: two convolution blocks (32 and 64 filters, 3x3 kernels),
    each optionally followed by batch normalization and 2x2 pooling, then
    two dense layers with optional dropout and a 2-unit softmax output.

    Parameters
    ----------
    shape : tuple
        Input shape of one sample, passed as ``input_shape`` to the first
        convolution.
    pool_type : {"max", "average", None}, default "max"
        Pooling applied after each convolution block; any other value
        (including None) adds no pooling layer.
    conv_activation : str, default "relu"
        Activation of the convolution layers and of the hidden dense layers.
    batch_norm : bool, default True
        Whether a BatchNormalization layer follows each convolution.
    drop_out_layer : float, default 0.2
        Dropout rate after each hidden dense layer; 0 disables dropout.
    neurons : int, default 128
        Units in each hidden dense layer.
    lr : float, default 0.00001
        Learning rate of the Adam optimizer.

    Returns
    -------
    tf.keras.Model
        The compiled model. It expects integer class ids (0 / 1) as targets:
        the loss is sparse categorical cross-entropy, which matches the
        2-unit softmax output. The previous "binary_crossentropy" did not
        match that output when given 1-D integer targets.
    """
    model = tf.keras.models.Sequential()

    for block_idx, filters in enumerate((32, 64)):
        # Only the first layer declares the input shape.
        first_layer_kwargs = {"input_shape": shape} if block_idx == 0 else {}
        model.add(tf.keras.layers.Conv2D(filters = filters, kernel_size = (3, 3),
                                         activation = conv_activation, **first_layer_kwargs))
        if batch_norm:
            model.add(tf.keras.layers.BatchNormalization())
        if pool_type == "max":
            model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
        elif pool_type == "average":
            model.add(tf.keras.layers.AveragePooling2D(pool_size=(2, 2)))

    model.add(tf.keras.layers.Flatten())

    for _ in range(2):
        model.add(tf.keras.layers.Dense(units = neurons, activation = conv_activation))
        if drop_out_layer != 0:
            model.add(tf.keras.layers.Dropout(drop_out_layer))

    model.add(tf.keras.layers.Dense(units = 2, activation =  "softmax"))

    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=lr), 
                  loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

    return model

In [None]:
# arrays as output otherwise will not work with sk-GS
from sklearn.utils import indexable
from sklearn.utils.validation import _num_samples

class WalkingForward():
    """Walk-forward splitter with a fixed-size sliding training window.

    The samples are cut into ``n_splits + 1`` equally sized consecutive
    chunks; any remainder is added to the first training window. Each fold
    trains on ``train_splits`` chunks and tests on the ``test_splits``
    chunks that follow, then slides forward by one chunk.

    Note: with the defaults ``train_splits = 2`` and ``test_splits = 1``,
    ``split`` yields ``n_splits - 1`` folds, while ``get_n_splits`` reports
    ``n_splits``.
    """

    def __init__(self, n_splits):
        self.n_splits = n_splits

    def get_n_splits(self, X = None, y = None, groups = None):
        """Return the configured ``n_splits``; all arguments are ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None, train_splits = 2, test_splits = 1):
        """Yield ``(train_indices, test_indices)`` arrays for every fold.

        Raises
        ------
        ValueError
            If there are fewer samples than ``n_splits + 1``.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_folds = self.n_splits + 1
        train_splits, test_splits = int(train_splits), int(test_splits)
        indices = np.arange(n_samples)

        split_size = n_samples // n_folds
        if split_size == 0:
            raise ValueError("Cannot split {} samples into {} folds.".format(n_samples, n_folds))
        remainder = n_samples % n_folds

        test_size = split_size * test_splits
        train_size = split_size * train_splits
        test_starts = range(train_size + remainder, n_samples - (test_size - split_size), split_size)

        for fold, test_start in enumerate(test_starts):
            # Only the first training window absorbs the leftover samples.
            extra = remainder if fold == 0 else 0
            yield (indices[(test_start - train_size - extra):test_start],
                   indices[test_start:test_start + test_size])

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score

wfcv = WalkingForward(n_splits = 11)

# GAF tensors per encoding; both lists are indexed like stock_names.
train_by_method = {"summation": train_sum, "difference": train_diff}

# Exhaustive search over feature subsets of size 4, 5 and 6 for every stock
# and encoding. Row 0 of every result table is the baseline using all features.
for method in ["summation", "difference"]:
    for num in range(4, 7, 1):
        for i in range(len(train_sum)):

            selection_df = pd.DataFrame()

            combinations_list = []
            acc_list = []
            auc_list = []
            f1_list = []
            precision_list = []
            recall_list = []
            y_pred_list = []
            y_true_list = []

            print(stock_names[i], method, num)
            train = train_by_method[method][i].copy()
            y_train = y_train_list[i].copy()

            # The last tensor axis must hold one channel per feature column.
            assert len(cnn_data[i].columns) == train.shape[3]
            all_combinations = list(itertools.combinations(range(len(cnn_data[i].columns)), num))

            print("Number of Combinations with {} features: ".format(num), len(all_combinations))

            for j in range(len(all_combinations)+1):
                if j == 0:
                    print("Base Line")
                    combinations_list.append(["Base Line"])
                    X_train = train.copy()
                else:
                    combo = list(all_combinations[j-1])
                    combo_cols = [cnn_data[i].columns[idx] for idx in combo]
                    combinations_list.append(combo_cols)
                    assert len(combo_cols) == num
                    X_train = np.stack([train[:, :, :, x] for x in combo], axis = 3)

                class_list = []
                real_class_list = []

                input_shape = X_train.shape[1:]
                tf.keras.backend.clear_session()
                # One model per feature subset; it keeps training from fold to fold.
                model = cnn_model(shape = input_shape)

                for train_idx, test_idx in wfcv.split(X_train):

                    history = model.fit(X_train[train_idx,:,:,:], y_train[train_idx],
                                        validation_data=(X_train[test_idx,:,:,:], y_train[test_idx]), epochs = 5, 
                                        verbose = 1, batch_size = 16)

                    # Sequential.predict_classes was removed in TensorFlow 2.6;
                    # the argmax over the softmax output gives the same class ids.
                    predict_class = np.argmax(model.predict(X_train[test_idx,:,:,:]), axis = -1)
                    class_list.append(list(predict_class))
                    real_class_list.append(y_train[test_idx])

                # Concatenate the out-of-sample predictions of all folds.
                y_pred_ = [item for sublist in class_list for item in sublist]
                y_test_ = [item for sublist in real_class_list for item in sublist]

                acc_list.append(accuracy_score(y_test_, y_pred_))
                auc_list.append(roc_auc_score(y_test_, y_pred_))
                f1_list.append(f1_score(y_test_, y_pred_))
                precision_list.append(precision_score(y_test_, y_pred_))
                recall_list.append(recall_score(y_test_, y_pred_))
                y_pred_list.append(y_pred_)
                y_true_list.append(y_test_)

            selection_df["{}_{}".format(num, method)] = combinations_list
            selection_df["accuracy"] = acc_list
            selection_df["auc"] = auc_list
            selection_df["f1"] = f1_list
            selection_df["precision"] = precision_list
            selection_df["recall"] = recall_list
            selection_df["y_pred"] = y_pred_list
            selection_df["y_true"] = y_true_list
            display(selection_df)
            selection_df.to_csv("../data/scores/score{}_{}_{}.csv".format(num, method, stock_names[i]), index = False)

### Results of the feature selection

In [None]:
from ast import literal_eval

def score_curation(stock_name):
    """Collect all feature-selection score files of one stock into a single frame.

    Every regular file in ``../data/scores`` whose name contains
    ``stock_name`` is read. Files whose name contains "summation" are tagged
    with method "sum", those containing "difference" with "diff". The two
    groups are loaded independently, so they may differ in size.

    Parameters
    ----------
    stock_name : str
        Substring identifying the stock's score files.

    Returns
    -------
    pandas.DataFrame
        Columns "features" (comma-joined names), "accuracy", "auc", "f1",
        "precision", "recall", "y_pred", "y_true", "method" and "len"
        (number of features). Summation rows come first.

    Raises
    ------
    ValueError
        From ``pandas.concat`` when no matching score file exists.
    """
    score_dir = "../data/scores"
    score_columns = ["features", "accuracy", "auc", "f1", "precision", "recall", "y_pred", "y_true"]
    list_columns = ["features", "y_pred", "y_true"]

    def _load_scores(file_name, method_tag):
        # Read one score file, parse its list-valued columns and tag the method.
        scores = pd.read_csv(os.path.join(score_dir, file_name))
        scores.columns = score_columns
        for column in list_columns:
            scores[column] = scores[column].apply(literal_eval)
        scores["method"] = method_tag
        return scores

    stock_files = [f for f in os.listdir(score_dir)
                   if os.path.isfile(os.path.join(score_dir, f)) and stock_name in f]

    frames = [_load_scores(f, "sum") for f in stock_files if "summation" in f]
    frames += [_load_scores(f, "diff") for f in stock_files if "difference" in f]

    final = pd.concat(frames, ignore_index = True)
    final["len"] = final["features"].apply(len)
    final["features"] = [','.join(map(str, names)) for names in final["features"]]

    return final

#### Daimler

In [None]:
daimler_scores = score_curation("Daimler")

# Mean baseline metrics (all features) per encoding method.
daimler_baseline = daimler_scores[daimler_scores["features"] == "Base Line"]
daimler_metrics = ["accuracy", "auc", "f1", "precision", "recall"]
display(daimler_baseline.groupby(["features", "method", "len"])[daimler_metrics].mean().reset_index())

daimler_scores.sort_values(by = "accuracy", ascending = False)

In [None]:
features_daimler = ["Volume", "Signal_Line", "f01", "diff_Close_gold"]
# Channel positions of the chosen features, in column order.
features_daimler_idx = [position for position, column in enumerate(cnn_data[0].columns)
                        if column in features_daimler]
gs_daimler = np.stack([train_sum[0][..., position] for position in features_daimler_idx], axis = 3)

print(gs_daimler.shape)

#### Bayer

In [None]:
bayer_scores = score_curation("Bayer")

# Mean baseline metrics (all features) per encoding method.
bayer_baseline = bayer_scores[bayer_scores["features"] == "Base Line"]
bayer_metrics = ["accuracy", "auc", "f1", "precision", "recall"]
display(bayer_baseline.groupby(["features", "method", "len"])[bayer_metrics].mean().reset_index())

bayer_scores.sort_values(by = "accuracy", ascending = False)

In [None]:
features_bayer = ["diff_Close", "Volume", "MACD", "Signal_Line", "f01", "f02"]
# Channel positions of the chosen features, in column order.
features_bayer_idx = [position for position, column in enumerate(cnn_data[1].columns)
                      if column in features_bayer]
# Bayer uses the difference encoding.
gs_bayer = np.stack([train_diff[1][..., position] for position in features_bayer_idx], axis = 3)

print(gs_bayer.shape)

#### SAP

In [None]:
sap_scores = score_curation("SAP")

# Mean baseline metrics (all features) per encoding method.
sap_baseline = sap_scores[sap_scores["features"] == "Base Line"]
sap_metrics = ["accuracy", "auc", "f1", "precision", "recall"]
display(sap_baseline.groupby(["features", "method", "len"])[sap_metrics].mean().reset_index())

sap_scores.sort_values(by = "accuracy", ascending = False)

In [None]:
features_sap = ["diff_Close", "MACD", "f01", "diff_Close_gold", "diff_High_fx"]
# Channel positions of the chosen features, in column order.
features_sap_idx = [position for position, column in enumerate(cnn_data[2].columns)
                    if column in features_sap]
gs_sap = np.stack([train_sum[2][..., position] for position in features_sap_idx], axis = 3)

print(gs_sap.shape)

#### Deutsche Bank

In [None]:
db_scores = score_curation("Deutsche Bank")

# Mean baseline metrics (all features) per encoding method.
db_baseline = db_scores[db_scores["features"] == "Base Line"]
db_metrics = ["accuracy", "auc", "f1", "precision", "recall"]
display(db_baseline.groupby(["features", "method", "len"])[db_metrics].mean().reset_index())

db_scores.sort_values(by = "accuracy", ascending = False)

In [None]:
features_db = ["diff_Close", "MACD", "diff_Close_gold", "diff_High_fx"]
# Channel positions of the chosen features, in column order.
features_db_idx = [position for position, column in enumerate(cnn_data[3].columns)
                   if column in features_db]
gs_db = np.stack([train_sum[3][..., position] for position in features_db_idx], axis = 3)

print(gs_db.shape)

In [None]:
# Selected-feature tensors, one per stock, in the order Daimler, Bayer, SAP, Deutsche Bank.
gs_data = [gs_daimler, gs_bayer, gs_sap, gs_db]

### Final model / hyperparameter tuning

#### randomized search

In [None]:
# Shape of the selected-feature tensor of every stock.
for caption, selected in [("Daimler shape: ", gs_daimler), ("Bayer shape: ", gs_bayer),
                          ("SAP shape: ", gs_sap), ("Deutsche Bank: ", gs_db)]:
    print(caption, selected.shape)

In [None]:
# implement own random search
# add none to pooling layer


# Candidate values per hyperparameter for the random search.
param_grid = {
    "pool_type":["max", "average"],  # pooling after each convolution block
    "conv_activation":["relu", "sigmoid", "tanh"],
    "lr":[0.001, 0.0001],  # learning rate
    "batch_norm":[True, False],
    "drop_out_layer":[0, 0.1, 0.2, 0.3, 0.4, 0.5],  # dropout rate; 0 disables dropout
    "batch_size":[1, 16, 32, 64],
    "neurons" : [32, 64, 128, 256],  # units per hidden dense layer
    "epochs":[10, 20, 30, 50]
}

In [None]:
# Size of the full grid: the product of the number of options per parameter.
com = int(np.prod([len(options) for options in param_grid.values()]))
print('There are {} combinations'.format(com))
# The factor 100 is presumably an assumed run time in seconds per combination.
print('This would take {:.0f} days to finish.'.format((100 * com) / (60 * 60 * 24)))

In [None]:
import random

wfcv = WalkingForward(n_splits = 11)

random_results_df = []
score_df_list = []

# Random search: 60 distinct hyperparameter draws per stock, each evaluated
# with walk-forward validation on that stock's selected-feature tensor.
for i, model_data in enumerate(gs_data):
    print(stock_names[i])
    shape = model_data.shape[1:]
    y_train = y_train_list[i].copy()

    # Row k of random_results holds the parameters behind row k of score_df.
    random_results = pd.DataFrame(columns = param_grid.keys())
    score_df = pd.DataFrame()
    acc_list = []
    auc_list = []
    f1_list = []
    precision_list = []
    recall_list = []
    y_pred_list = []
    y_true_list = []

    for j in range(60):
        # Redraw until the configuration has not been tried for this stock yet.
        is_in = True
        while is_in:
            params = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}
            is_in = (random_results == list(params.values())).all(1).any()

        print(params)
        random_results.loc[len(random_results)] = list(params.values())
        tf.keras.backend.clear_session()
        model_cv = cnn_model(shape = shape, pool_type = params["pool_type"], 
                             conv_activation = params["conv_activation"], batch_norm = params["batch_norm"], 
                             drop_out_layer = params["drop_out_layer"], neurons = params["neurons"], 
                             lr = params["lr"])

        class_list = []
        real_class_list =  []
        # The same model instance keeps training from fold to fold.
        for train_idx, test_idx in wfcv.split(model_data):

            history = model_cv.fit(model_data[train_idx,:,:,:], y_train[train_idx],
                                   validation_data=(model_data[test_idx,:,:,:], y_train[test_idx]),
                                   epochs = params["epochs"], verbose = 1, batch_size = params["batch_size"])

            # Sequential.predict_classes was removed in TensorFlow 2.6;
            # the argmax over the softmax output gives the same class ids.
            predict_class = np.argmax(model_cv.predict(model_data[test_idx,:,:,:]), axis = -1)
            class_list.append(list(predict_class))
            real_class_list.append(y_train[test_idx])
            print(predict_class)

        # Concatenate the out-of-sample predictions of all folds.
        y_pred_ = [item for sublist in class_list for item in sublist]
        y_test_ = [item for sublist in real_class_list for item in sublist]

        acc_list.append(accuracy_score(y_test_, y_pred_))
        auc_list.append(roc_auc_score(y_test_, y_pred_))
        f1_list.append(f1_score(y_test_, y_pred_))
        precision_list.append(precision_score(y_test_, y_pred_))
        recall_list.append(recall_score(y_test_, y_pred_))
        y_pred_list.append(y_pred_)
        y_true_list.append(y_test_)

    score_df["accuracy"] = acc_list
    score_df["auc"] = auc_list
    score_df["f1"] = f1_list
    score_df["precision"] = precision_list
    score_df["recall"] = recall_list
    score_df["y_pred"] = y_pred_list
    score_df["y_true"] = y_true_list
    random_results_df.append(random_results)
    score_df_list.append(score_df)

# Persist both result lists; the context manager closes the file even if a dump fails.
with open("../data/best_param_score_new.pkl", "wb") as file:
    pickle.dump(random_results_df, file)
    pickle.dump(score_df_list, file)

In [None]:
from collections import Counter

# Best configuration (highest accuracy) per stock, with class counts of the
# true and the predicted labels.
for name, scores, sampled_params in zip(stock_names, score_df_list, random_results_df):

    print("Stock", name)

    # reset_index keeps the original row number in the "index" column.
    ranked = scores.sort_values(by = "accuracy", ascending = False).reset_index()
    best_param_idx = ranked.loc[0]["index"]
    display(ranked.loc[[0]])

    display(sampled_params.loc[[best_param_idx]])

    best_row = scores.loc[best_param_idx]
    true_counts = Counter(best_row["y_true"])
    print("True", true_counts)

    # Share of the most frequent true class.
    print("Zero Share", max(list(true_counts.values())) / 
          sum(list(true_counts.values())))

    print("Pred", Counter(best_row["y_pred"]))
    print("*" * 40)