In [4]:
import pandas as pd
import numpy as np
from leak_detect import detect_vertical_leakage, detect_horizontal_leakage
# to read all parameter descriptions
# ?detect_vertical_leakage

In [7]:
# Data source (Alphavantage API): https://www.alphavantage.co/documentation/
# Number of rows to look ahead when building the prediction target.
prediction_window = 5
data = pd.read_csv('data/daily_adjusted_AAPL.csv')
# Target for each row: the 'open' value `prediction_window` rows further down
# (the price after 5 days, assuming one chronologically ordered row per day).
data = data.assign(target=data['open'].shift(-prediction_window))


## Vertical leakage

In [8]:
# Data creation function
def create_features(data_input):
    """Build demo features, two of which deliberately read from later rows.

    Works on a copy of ``data_input`` (the caller's frame is not modified)
    and returns that copy with three extra columns:

    * ``return_5day`` - relative change of 'close' versus 5 rows earlier.
    * ``return_2day_leaky`` - relative change of 'close' versus 2 rows
      *later* (negative shift), i.e. intentionally leaky.
    * ``open_10day_before_leaky`` - 'open' taken from 10 rows *later*
      (negative shift), i.e. intentionally leaky.

    NOTE(review): "earlier"/"later" equal past/future only if rows are in
    ascending chronological order - confirm against the CSV.
    """
    def _relative_change(current, reference):
        # Fractional change of `current` with respect to `reference`.
        return (current - reference) / reference

    features = data_input.copy()
    close = features['close']

    # Legitimate feature: compare with the close 5 rows back.
    features['return_5day'] = _relative_change(close, close.shift(5))

    # Leaky on purpose: shift(-2) pulls the close from 2 rows ahead.
    features['return_2day_leaky'] = _relative_change(close, close.shift(-2))
    # Leaky on purpose: shift(-10) pulls the open from 10 rows ahead.
    features['open_10day_before_leaky'] = features['open'].shift(-10)

    return features

In [9]:
# Raw columns handed to the feature function, and the derived columns to audit.
input_feature_cols = [
    'open',
    'high',
    'low',
    'close',
]
output_feature_cols = [
    'return_5day',
    'return_2day_leaky',
    'open_10day_before_leaky',
]
# With only_nan=False the recorded output shows two checks being run:
# NaN replacement and adding an imaginary component.
detect_vertical_leakage(
    create_features,
    data,
    input_feature_cols,
    output_feature_cols,
    only_nan=False,
    direction='upward',
)

Checking for vertical leakage in upward direction...
By replacing 'input_feature_cols' after row number 2516 with NANs:
Oops vertical leakage detected!!
List of columns and number of previous rows into which data is leaking from a row:
return_2day_leaky : 2
open_10day_before_leaky : 10


By adding imaginary component to 'input_feature_cols' after row number 2516:
Oops vertical leakage detected!!
List of columns and number of previous rows into which data is leaking from a row:
return_2day_leaky : 2
open_10day_before_leaky : 10




True

In [12]:
# Example where the feature function fills the NaNs in its output with a constant value.
# The NaN-replacement check then no longer detects the leakage, but the
# imaginary-component (complex number) check still does.

In [13]:
def create_features(data_input):
    """Build the leaky demo features, with their NaNs filled by constants.

    Works on a copy of ``data_input`` and returns it with three extra columns:

    * ``return_5day`` - relative change of 'close' versus 5 rows earlier
      (NaNs are left in place).
    * ``return_2day_leaky`` - relative change of 'close' versus 2 rows
      *later*; NaNs are replaced with 0.1.
    * ``open_10day_before_leaky`` - 'open' from 10 rows *later*; NaNs are
      replaced with 100.

    NOTE(review): "earlier"/"later" equal past/future only if rows are in
    ascending chronological order - confirm against the CSV.
    """
    frame = data_input.copy()
    close_now = frame['close']

    # Legitimate feature: compare with the close 5 rows back.
    close_5_rows_back = close_now.shift(5)
    frame['return_5day'] = (close_now - close_5_rows_back) / close_5_rows_back

    # Leaky on purpose: negative shift reads the close 2 rows ahead.
    close_2_rows_ahead = close_now.shift(-2)
    leaky_return = (close_now - close_2_rows_ahead) / close_2_rows_ahead
    frame['return_2day_leaky'] = leaky_return.fillna(0.1)

    # Leaky on purpose: negative shift reads the open 10 rows ahead.
    open_10_rows_ahead = frame['open'].shift(-10)
    frame['open_10day_before_leaky'] = open_10_rows_ahead.fillna(100)

    return frame

In [14]:
# Same audit as before, now run against the NaN-filling feature function.
input_feature_cols = [
    'open',
    'high',
    'low',
    'close',
]
output_feature_cols = [
    'return_5day',
    'return_2day_leaky',
    'open_10day_before_leaky',
]
# With only_nan=False the recorded output shows two checks being run:
# NaN replacement and adding an imaginary component.
detect_vertical_leakage(
    create_features,
    data,
    input_feature_cols,
    output_feature_cols,
    only_nan=False,
    direction='upward',
)

Checking for vertical leakage in upward direction...
By replacing 'input_feature_cols' after row number 2516 with NANs:
No vertical leakage detected. Good to go! Yay!!


By adding imaginary component to 'input_feature_cols' after row number 2516:
Oops vertical leakage detected!!
List of columns and number of previous rows into which data is leaking from a row:
return_2day_leaky : 2
open_10day_before_leaky : 10




True

## Horizontal leakage

In [15]:
# Number of rows to look ahead when building the prediction target.
prediction_window = 5
# Start again from a freshly loaded frame for the horizontal-leakage demo.
data = pd.read_csv('data/daily_adjusted_AAPL.csv')
# Target for each row: the 'open' value `prediction_window` rows further down
# (the price after 5 days, assuming one chronologically ordered row per day).
data = data.assign(target=data['open'].shift(-prediction_window))


In [16]:
# Data creation function
def create_features(data_input):
    """Build demo features, one of which is derived from the target column.

    Works on a copy of ``data_input`` and returns it with three extra columns:

    * ``return_5day`` - relative change of 'close' versus 5 rows earlier.
    * ``return_2day`` - relative change of 'close' versus 2 rows earlier.
    * ``open_1day_before_leaky`` - the 'target' column shifted by -1, i.e. a
      feature that intentionally leaks the label.

    ``data_input`` must contain 'close' and 'target' columns.
    """
    features = data_input.copy()
    close = features['close']

    # Backward-looking returns: output column -> how many rows back to compare.
    lookbacks = {'return_5day': 5, 'return_2day': 2}
    for column, lag in lookbacks.items():
        earlier_close = close.shift(lag)
        features[column] = (close - earlier_close) / earlier_close

    # Leaky on purpose: this "feature" is read straight from the target column.
    features['open_1day_before_leaky'] = features['target'].shift(-1)

    return features


In [17]:
# Column roles for the horizontal-leakage check.
target_cols = [
    'target',
]
input_feature_cols = [
    'open',
    'high',
    'low',
    'close',
]
output_feature_cols = [
    'return_5day',
    'return_2day',
    'open_1day_before_leaky',
]


In [19]:
# Checks both directions: leakage from target cols into feature cols,
# and from input feature cols into target cols.
detect_horizontal_leakage(
    create_features,
    data,
    target_cols,
    output_feature_cols,
    input_feature_cols,
)

Checking for leakage from target columns to feature columns...
By replacing target with NANs:
Oops horizontal leakage detected!! 
List of columns and their respective number of rows with leaky data:
open_1day_before_leaky : 5027


By adding imaginary component to target columns:
Oops horizontal leakage detected!! 
List of columns and their respective number of rows with leaky data:
open_1day_before_leaky : 5027


---------------------------------------------------------------------------
Checking for leakage from input feature columns to target columns...
By replacing input feature columns with NANs:
No horizontal leakage detected. Good to go! Yay!!


By adding imaginary component to input feature columns:
No horizontal leakage detected. Good to go! Yay!!




True