In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import pytz
import os
import sys
import pickle

from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import ArrayType, IntegerType, FloatType

In [2]:
# Root of the project checkout, taken from the BDS_HOME environment variable
# (a KeyError is raised here if the variable is not set).
application_root_directory = os.environ['BDS_HOME']
sys.path.append(application_root_directory)

# Also put the Django project directory on the import path.
application_django_directory = application_root_directory + '/badassdatascience/django'
sys.path.append(application_django_directory)

# NOTE(review): presumably needed so the Django ORM can be called from the
# notebook's running event loop -- confirm against the Django documentation.
os.environ['DJANGO_ALLOW_ASYNC_UNSAFE'] = 'true'

# NOTE(review): project-local module; presumably performs the Django setup
# required by the `utilities` imports in the next cell -- confirm.
import boilerplate

In [3]:
from utilities.date_and_time_related_calculations import compute_datetime_information
from utilities.make_dataframe_from_db import make_candlestick_dataframe

from utilities.basic import udf_difference_an_array, udf_deal_with_offset, nan_helper, nan_count_spark

In [4]:
# Notebook-wide settings read by the cells below.
config = {
    # True: rebuild the base frame and write it to parquet.
    # False: read the previously written parquet file instead.
    'pull_and_compute_data' : False,
    
    # Passed to make_candlestick_dataframe() and used to build the parquet file name.
    'price_type_name' : 'mid',
    'instrument_name' : 'EUR/USD',
    'interval_name' : 'Minute',
    # Passed to compute_datetime_information().
    'tz' : pytz.timezone('US/Eastern'),

    # Half-life for the exponentially weighted statistics
    # (the calculate_emw_* helpers also default to 10).
    'halflife' : 10,
    # Passed to udf_difference_an_array() together with the timestamp arrays.
    'seconds_divisor' : 60,

    # When True, only every `modulus_integer`-th sample is kept before the split.
    'reduce_vector_sizes' : True,
    'modulus_integer' : 30,
    
    # When True, the per-date Spark frame is limited to 10 rows.
    'reduce_size_for_debugging' : False,

    # Window lengths (in array elements) of the look-back input and the look-ahead target.
    'n_back' : 180,
    'n_forward' : 30,

    # Fractions of the (sub-sampled) rows assigned to each group, taken in this order.
    'train_val_test_split' : {
        'train' : 0.75,
        'val' : 0.125,
        'test' : 0.125,
    },
    
    # Directory of the cached parquet file.
    # NOTE(review): absolute, machine-specific path; consider deriving it from BDS_HOME.
    'output_directory' : '/home/emily/Desktop/projects/test/badass-data-science/badassdatascience/forecasting/deep_learning/output',
}

In [5]:
# Spark settings handed to the SparkSession builder further down.
# NOTE(review): the memory/core values look sized for one specific workstation -- confirm
# they are appropriate wherever this notebook is run.
spark_config = SparkConf().setAll(
    [
        ('spark.executor.memory', '15g'),
        ('spark.executor.cores', '3'),
        ('spark.cores.max', '3'),
        ('spark.driver.memory', '15g'),
        # Enables Arrow for pandas <-> Spark conversions (createDataFrame / toPandas).
        ('spark.sql.execution.arrow.pyspark.enabled', 'true'),
    ]
)

In [None]:
def make_initial_pdf(config):
    """
    Build the initial pandas frame: candlesticks plus the weekday-shift lookup.

    Steps:
      1. Load the candlestick frame for the configured price type, instrument
         and interval, and pass it through compute_datetime_information()
         with the configured time zone.
      2. Copy the frame's index into a 'timestamp' column.
      3. Left-merge the weekday-shift lookup table (a CSV file) on
         ['weekday_tz', 'hour_tz'] and sort the result by 'datetime_tz'.

    Parameters
    ----------
    config : dict
        Must contain 'price_type_name', 'instrument_name', 'interval_name',
        'tz' and 'output_directory'.  The optional key
        'shifted_weekday_lookup_table_filename' names the lookup CSV inside
        'output_directory'; it defaults to 'df_weekday_shift_lookup_table.csv'.

    Returns
    -------
    pandas.DataFrame
        A copy of the merged frame, sorted by 'datetime_tz'.
    """
    pdf = compute_datetime_information(
        make_candlestick_dataframe(
            config['price_type_name'],
            config['instrument_name'],
            config['interval_name'],
        ),
        config['tz'],
    )

    pdf['timestamp'] = pdf.index

    # The lookup table used to be read from a hard-coded absolute path; it is
    # now located through the configuration (same file for the current config).
    lookup_table_path = (
        config['output_directory']
        + '/'
        + config.get('shifted_weekday_lookup_table_filename', 'df_weekday_shift_lookup_table.csv')
    )

    pdf = (
        pd.merge(
            pdf,
            pd.read_csv(lookup_table_path),
            on = ['weekday_tz', 'hour_tz'],
            how = 'left',
        )
        .sort_values(by = 'datetime_tz')
    )

    #pdf['lhc'] = pdf[['l', 'h', 'c']].mean(axis=1)

    return pdf.copy()

In [None]:
def shift_days_and_hours_as_needed(df):
    """
    Attach an 'original_date_shifted' column holding each row's shifted calendar date.

    The shift is derived from `weekday_shifted - weekday_tz`:
      * a positive difference moves the date forward by that many days;
      * a difference of exactly -6 moves the date forward by one day;
      * anything else (including a missing 'weekday_shifted') leaves the date unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'datetime_tz' (values with a .date() method), 'weekday_tz',
        'hour_tz' and 'weekday_shifted'.  The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns plus 'original_date_shifted',
        sorted by 'datetime_tz'.
    """
    # Work on a private copy: the helper columns below must not leak into the caller's frame.
    work = df.copy()
    work['original_date'] = [x.date() for x in work['datetime_tz']]
    work['to_shift'] = work['weekday_shifted'] - work['weekday_tz']

    key_columns = ['weekday_tz', 'hour_tz', 'weekday_shifted', 'original_date']

    # One row per distinct (weekday, hour, shifted weekday, date) combination.
    pdf_date_to_shift = (
        work
        .sort_values(by = 'datetime_tz')
        [key_columns + ['to_shift']]
        .drop_duplicates()
    )

    def days_to_add(to_shift):
        # Comparisons with NaN are False, so a missing shift falls through to "no shift".
        if to_shift > 0:
            return float(to_shift)
        if to_shift == -6:
            return 1.0
        return 0.0

    pdf_date_to_shift['original_date_shifted'] = [
        original_date + datetime.timedelta(days = days_to_add(to_shift))
        for original_date, to_shift in zip(
            pdf_date_to_shift['original_date'],
            pdf_date_to_shift['to_shift'],
        )
    ]

    pdf = (
        pd.merge(
            work.drop(columns = ['to_shift']),
            pdf_date_to_shift,
            on = key_columns,
            how = 'left',
        )
        .drop(columns = ['original_date', 'to_shift'])
        .sort_values(by = ['datetime_tz'])
    )

    return pdf.copy()

In [None]:
# Cached base frame lives at
#   <output_directory>/<price type>-<instrument>-<interval>.parquet
# where any '/' in the instrument name is replaced by '_'.
parquet_stem = '-'.join(
    [
        config['price_type_name'],
        config['instrument_name'].replace('/', '_'),
        config['interval_name'],
    ]
)
initial_df_filepath = config['output_directory'] + '/' + parquet_stem + '.parquet'

if not config['pull_and_compute_data']:
    # Reuse the frame written by an earlier run.
    df = pd.read_parquet(initial_df_filepath)

else:
    # Rebuild from the database and attach the shifted dates.
    df = shift_days_and_hours_as_needed(make_initial_pdf(config))

    # Keep only rows for which the lookup table supplied a shifted weekday.
    has_shifted_weekday = df['weekday_shifted'].notna()
    df = df[has_shifted_weekday].copy()

    # Per-candle signals: close minus open, and high minus low.
    df['Return'] = df['c'] - df['o']
    df['Volatility'] = df['h'] - df['l']

    df.to_parquet(initial_df_filepath)

In [None]:
# Drop the raw candlestick and calendar columns; only the derived signals,
# the timestamps and the shifted date are used from here on.
# (A stray pasted URL inside the call used to make this cell a syntax error.)
df = df.drop(
    columns = [
        'o', 'l', 'h', 'c',
        'instrument', 'weekday_tz', 'hour_tz', 'weekday_shifted',
    ],
)

In [None]:
# Display the frame after the column drop.
df

In [None]:
# https://stackoverflow.com/questions/58002668/pandas-groupby-ewm

def _calculate_emw_statistic(df, column_name, statistic, halflife):
    """
    Shared implementation for calculate_emw_mean() and calculate_emw_var().

    Rows are processed in 'timestamp' order within each 'original_date_shifted'
    group, and `statistic` ('mean' or 'var') is taken from the exponentially
    weighted window with the given half-life.

    The returned array is aligned with the rows of `df` as passed in:
    element i belongs to row i, whether or not `df` is sorted by timestamp.
    """
    # Row positions that put df in timestamp order (stable: ties keep their input order).
    order = np.argsort(df['timestamp'].to_numpy(), kind = 'stable')

    ordered_result = (
        df
        .iloc[order]
        .groupby(['original_date_shifted'])
        [column_name]
        .transform(
            lambda x : getattr(x.ewm(halflife = halflife), statistic)()
        )
        .to_numpy()
    )

    # Undo the sort so the values can be assigned straight back onto df.
    result = np.empty_like(ordered_result)
    result[order] = ordered_result
    return result


def calculate_emw_mean(df, column_name, halflife = 10):
    """
    Exponentially weighted mean of `column_name`, restarted for every
    'original_date_shifted' group and evaluated in 'timestamp' order.

    Returns a NumPy array with one value per row of `df`, in `df`'s row order.
    """
    return _calculate_emw_statistic(df, column_name, 'mean', halflife)


def calculate_emw_var(df, column_name, halflife = 10):
    """
    Exponentially weighted variance of `column_name`, restarted for every
    'original_date_shifted' group and evaluated in 'timestamp' order.

    Returns a NumPy array with one value per row of `df`, in `df`'s row order.
    The first observation of each group has no variance and yields NaN.
    """
    return _calculate_emw_statistic(df, column_name, 'var', halflife)

In [None]:
# Per-date exponentially weighted mean and variance of each raw signal.
# The half-life now comes from the configuration instead of the helpers' default.
for signal_column in ['volume', 'Return', 'Volatility']:
    df['emw_' + signal_column.lower() + '_mean'] = calculate_emw_mean(
        df, signal_column, halflife = config['halflife'],
    )

for signal_column in ['volume', 'Return', 'Volatility']:
    df['emw_' + signal_column.lower() + '_var'] = calculate_emw_var(
        df, signal_column, halflife = config['halflife'],
    )

# The first row of every date has no variance (NaN) and is removed here,
# along with any row that has a missing value in another column.
df = df.dropna().copy()

# Standardize each signal with its running mean and standard deviation.
# NOTE(review): a running variance of exactly 0 produces inf/NaN here; this
# cell does not guard against that.
for signal_column in ['volume', 'Return', 'Volatility']:
    prefix = 'emw_' + signal_column.lower()
    df['scaled_' + signal_column.lower()] = (
        (df[signal_column] - df[prefix + '_mean']) / (df[prefix + '_var']**0.5)
    )

In [None]:
# Keep only the identifying columns and the three scaled signals.
df = df[
    [
        'timestamp', 'original_date_shifted',
        'scaled_volume', 'scaled_return', 'scaled_volatility',
        'datetime_tz',
    ]
].copy()

# Display the reduced frame.
df

In [7]:
# Start (or reuse) a local Spark session using every local core ('local[*]')
# and the settings collected in `spark_config`.
spark = (
    SparkSession
    .builder
    .master('local[*]')
    .appName('badass')
    .config(conf = spark_config)
    .getOrCreate()
)    

25/02/10 02:30:21 WARN Utils: Your hostname, emily-MS-7B96 resolves to a loopback address: 127.0.1.1; using 192.168.1.251 instead (on interface wlp5s0)
25/02/10 02:30:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/10 02:30:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
def move_to_spark(pdf, spark):
    """
    Convert a pandas frame to Spark and also return it joined onto its own timestamps.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Frame to convert; must contain a 'timestamp' column.
    spark : SparkSession
        Session used for the conversion.

    Returns
    -------
    (sdf, sdf_all_possible_timestamps)
        `sdf` is the Spark version of `pdf`.  `sdf_all_possible_timestamps` is
        the outer join of `sdf`'s distinct timestamps with `sdf`, ordered by
        'timestamp'.

    NOTE(review): the timestamps are taken from `sdf` itself, so the join does
    not add rows for timestamps that are absent from the data -- confirm
    whether a complete time grid was intended, and whether the second frame is
    used anywhere downstream.
    """
    sdf = spark.createDataFrame(pdf)

    # Distinct timestamps with a temporary marker column.
    distinct_timestamps = sdf.select('timestamp').distinct()
    flagged_timestamps = (
        distinct_timestamps
        .withColumn('dummy_variable', f.lit(True))
        .orderBy('timestamp')
    )

    # Outer-join the data back on, then discard the marker again.
    joined = flagged_timestamps.join(sdf, ['timestamp'], 'outer')
    sdf_all_possible_timestamps = joined.drop('dummy_variable').orderBy('timestamp')

    return sdf, sdf_all_possible_timestamps

In [None]:
# Move the scaled pandas frame into Spark.
sdf, sdf_all_possible_timestamps = move_to_spark(df, spark)

In [None]:
# Preview the first five rows of the Spark frame.
sdf.show(5)

In [None]:
# Preview the first five rows of the timestamp-joined frame.
sdf_all_possible_timestamps.show(5)

In [None]:
# One row per shifted date, holding that date's signals as time-ordered arrays.
#
# An orderBy() placed before groupBy().agg(collect_list(...)) does not guarantee
# the order of the collected elements.  Whole rows are therefore collected as
# structs and sorted inside each group; 'datetime_tz' is the first struct field
# and so acts as the sort key.
sdf_arrays = (
    sdf
    .groupBy('original_date_shifted')
    .agg(
        f.sort_array(
            f.collect_list(
                f.struct(
                    f.col('datetime_tz'),
                    f.col('timestamp'),
                    f.col('scaled_return'),
                    f.col('scaled_volume'),
                    f.col('scaled_volatility'),
                )
            )
        ).alias('rows_in_time_order')
    )

    # Unpack one array per signal.  Nulls are removed from each array, as
    # collect_list() on the individual columns would have done, so the length
    # test below still detects dates with missing values.
    .select(
        'original_date_shifted',
        f.filter(f.col('rows_in_time_order.scaled_return'), lambda x : x.isNotNull()).alias('return'),
        f.filter(f.col('rows_in_time_order.scaled_volume'), lambda x : x.isNotNull()).alias('volume'),
        f.filter(f.col('rows_in_time_order.scaled_volatility'), lambda x : x.isNotNull()).alias('volatility'),
        f.filter(f.col('rows_in_time_order.timestamp'), lambda x : x.isNotNull()).alias('timestamp'),
    )

    .withColumn('array_length_return', f.array_size(f.col('return')))
    .withColumn('array_length_volume', f.array_size(f.col('volume')))
    .withColumn('array_length_volatility', f.array_size(f.col('volatility')))
    .withColumn('array_length_timestamp', f.array_size(f.col('timestamp')))

    # Keep only dates whose four arrays all have the same length
    # (the volatility array was previously left out of this check).
    .withColumn(
        'length_test',
        (
            (f.col('array_length_return') == f.col('array_length_volume')) &
            (f.col('array_length_return') == f.col('array_length_volatility')) &
            (f.col('array_length_return') == f.col('array_length_timestamp'))
        )
    )
    .where(f.col('length_test'))
    .withColumnRenamed('array_length_return', 'array_length')
    .drop('array_length_volume', 'array_length_timestamp', 'array_length_volatility')

    # udf_difference_an_array() receives the divisor as a column, so it is
    # added as a literal and dropped again afterwards.
    .withColumn('seconds_divisor', f.lit(config['seconds_divisor']))
    .withColumn('diff_timestamp', udf_difference_an_array(f.col('timestamp'), f.col('seconds_divisor')))
    .withColumn('array_length_diff_timestamp', f.array_size(f.col('diff_timestamp')))
    .drop('seconds_divisor')

    .orderBy('original_date_shifted')
)

In [None]:
# Preview the per-date arrays.
sdf_arrays.show(5)

In [None]:
#from utilities.seasonal_calculations import udf_normalized_spark_friendly_sine_with_24_hour_period
#from utilities.seasonal_calculations import udf_normalized_spark_friendly_cosine_with_24_hour_period

In [None]:
#sdf_arrays = (
#    sdf_arrays
#    .withColumn('sine_for_full_day', udf_get_the_sine_for_full_day(f.col('timestamp')))
#    .withColumn('cosine_for_full_day', udf_get_the_cosine_for_full_day(f.col('timestamp')))
#    .orderBy('original_date_shifted')
#)

In [8]:
#
# TEMP
#
# Uncomment to save the per-date arrays built above as a waypoint.
#sdf_arrays.write.parquet('output/proto.parquet')

#
# temp loading from saved waypoint
#
# NOTE(review): this replaces whatever `sdf_arrays` was computed above, and the
# relative path is resolved against the notebook's current working directory.
sdf_arrays = spark.read.parquet('output/proto.parquet')


In [9]:
# Preview the arrays loaded from the waypoint.
sdf_arrays.show(5)

+---------------------+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------------------+---------------------------+
|original_date_shifted|              return|              volume|          volatility|           timestamp|array_length|length_test|      diff_timestamp|array_length_diff_timestamp|
+---------------------+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------------------+---------------------------+
|           2013-05-24|[0.68261013472090...|[0.68261013472090...|[0.68261013472090...|[1369342860, 1369...|        1434|       true|[1, 1, 1, 1, 1, 1...|                       1433|
|           2013-05-27|[-0.6826101347209...|[0.68261013472090...|[-0.6826101347209...|[1369602060, 1369...|        1424|       true|[1, 1, 1, 1, 1, 1...|                       1423|
|           2013-05-28|[-0.6826101347209...|[-0.6826101347209...|[-0.6826101347209...|[136

In [10]:
# Optional debugging shortcut: keep only 10 rows of the per-date frame.
# NOTE(review): limit() is applied without an explicit ordering, so which rows
# are kept is up to Spark.
if config['reduce_size_for_debugging']:
    sdf_arrays = sdf_arrays.limit(10)

In [11]:
# Shortest and longest per-date array, obtained with a single aggregation.
array_length_bounds = (
    sdf_arrays
    .agg(
        f.min(f.col('array_length')).alias('min_array_length'),
        f.max(f.col('array_length')).alias('max_array_length'),
    )
    .first()
)

min_array_length = array_length_bounds['min_array_length']
# One more than the longest observed array.
max_array_length = 1 + array_length_bounds['max_array_length']

# Attach both values to every row as constant columns.
sdf_arrays = (
    sdf_arrays
    .withColumn('min_array_length', f.lit(min_array_length))
    .withColumn('max_array_length', f.lit(max_array_length))
)

In [12]:
# Sum of the timestamp differences per date, using Spark SQL's AGGREGATE
# higher-order function with an initial accumulator of 0.
sdf_sum_diff = (
    sdf_arrays
    .select(
        'original_date_shifted',
        'diff_timestamp',
    )
    .withColumn('sum_diff_timestamp', f.expr('AGGREGATE(diff_timestamp, 0, (acc, x) -> acc + x)'))
)

# Largest per-date sum; every per-date sum is collected to the driver first.
max_sum_diff_timestamp = np.max(np.array(sdf_sum_diff.select('sum_diff_timestamp').collect()))

# Attach the value to every row as a constant column.
sdf_arrays = (
    sdf_arrays
    .withColumn('max_sum_diff_timestamp', f.lit(max_sum_diff_timestamp))
)

In [13]:
# Preview the frame with the constant length/offset columns attached.
sdf_arrays.show(2)

+---------------------+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------------------+---------------------------+----------------+----------------+----------------------+
|original_date_shifted|              return|              volume|          volatility|           timestamp|array_length|length_test|      diff_timestamp|array_length_diff_timestamp|min_array_length|max_array_length|max_sum_diff_timestamp|
+---------------------+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------------------+---------------------------+----------------+----------------+----------------------+
|           2013-05-24|[0.68261013472090...|[0.68261013472090...|[0.68261013472090...|[1369342860, 1369...|        1434|       true|[1, 1, 1, 1, 1, 1...|                       1433|              76|            1440|                  1438|
|           2013-05-27|[-0.6826101347209...|

In [14]:
# Apply udf_deal_with_offset() to each signal array, passing the timestamp
# differences and `max_array_length`.
# NOTE(review): presumably this places each value at its offset within the day
# and pads the gaps with NaN (NaN values show up in the output below) --
# confirm against utilities.basic.
sdf_arrays = (
    sdf_arrays
    .withColumn('corrected_offset_return', udf_deal_with_offset(f.col('return'), f.col('diff_timestamp'), f.col('max_array_length')))
    .withColumn('corrected_offset_volume', udf_deal_with_offset(f.col('volume'), f.col('diff_timestamp'), f.col('max_array_length')))
    .withColumn('corrected_offset_volatility', udf_deal_with_offset(f.col('volatility'), f.col('diff_timestamp'), f.col('max_array_length')))
    .withColumn('corrected_offset_length', f.size(f.col('corrected_offset_volume')))
    # The raw arrays are no longer needed.
    # NOTE(review): 'diff_sum' is not among the columns shown in the output
    # above, so that entry appears to be a leftover.
    .drop(
        'return', 'volume', 'volatility',
        'timestamp', 'diff_timestamp', 'diff_sum' #, 'max_sum_diff_timestamp'
        )
    )

In [15]:
# Length passed to udf_deal_with_offset() (longest per-date array plus one).
max_array_length

1440

In [16]:
# Spot-check one of the offset-corrected arrays.
sdf_arrays.select('corrected_offset_volatility').show()

+---------------------------+
|corrected_offset_volatility|
+---------------------------+
|       [0.68261015, 0.38...|
|       [-0.68261015, 1.0...|
|       [-0.68261015, 1.0...|
|       [-0.68261015, -0....|
|       [0.68261015, -0.5...|
|       [-0.68261015, -0....|
|       [-0.68261015, -0....|
|       [0.68261015, -1.0...|
|       [0.68261015, 0.83...|
|       [-0.68261015, -0....|
|       [-0.68261015, 0.9...|
|       [0.68261015, -0.7...|
|       [NaN, 0.68261015,...|
|       [-0.68261015, 1.0...|
|       [-0.68261015, 0.8...|
|       [-0.68261015, 0.0...|
|       [0.68261015, -1.0...|
|       [-0.68261015, -1....|
|       [-0.68261015, -0....|
|       [-0.68261015, -0....|
+---------------------------+
only showing top 20 rows



                                                                                

In [17]:
# Pull each offset-corrected array column to the driver and store it as a
# 2-D float64 matrix (one row per date).  Every row of a column must have the
# same length for the conversion to succeed.

M_unscaled_dict = {}

for column_name in [
    #'corrected_offset_volatility',
    'corrected_offset_return', 'corrected_offset_volume', # 'corrected_offset_volatility',
]:
    rows_as_lists = sdf_arrays.select(column_name).toPandas()[column_name].to_list()
    M_unscaled_dict[column_name] = np.array(rows_as_lists, dtype = np.float64)

                                                                                

In [18]:
# Build the supervised-learning samples.
#
# For every date (one row per date in M_unscaled_dict) and every signal:
#   1. trim the trailing NaN padding,
#   2. linearly interpolate the remaining NaN gaps,
#   3. cut the signal into overlapping windows: `n_back` values as input (X),
#      followed by `n_forward` values as target (y_full); y holds the
#      [min, max] of each target window.
#
# NOTE(review): as before, the last window of a date starts `n_back` elements
# earlier than strictly necessary (the number of windows is
# len(signal) - 2 * n_back - n_forward).  That is kept unchanged so the data
# set stays the same -- confirm whether it is intended.

n_back = config['n_back']
n_forward = config['n_forward']

X_list = {}
y_list = {}
y_full_list = {}
window_counts = {}

for signal_name in [
    'corrected_offset_return', 'corrected_offset_volume',
]:
    X_list[signal_name] = []
    y_list[signal_name] = []
    y_full_list[signal_name] = []
    window_counts[signal_name] = []

    for day_values in M_unscaled_dict[signal_name]:

        finite_positions = np.flatnonzero(~np.isnan(day_values))
        if finite_positions.size == 0:
            # Nothing to interpolate from; np.interp() would raise on this row.
            window_counts[signal_name].append(0)
            continue

        # Trim the trailing NaN padding.  Copy so that the interpolation below
        # does not write into M_unscaled_dict.
        signal = day_values[:finite_positions[-1] + 1].copy()

        # https://stackoverflow.com/questions/6518811/interpolate-nan-values-in-a-numpy-array
        nans, x = nan_helper(signal)
        signal[nans] = np.interp(x(nans), x(~nans), signal[~nans])

        n_windows = max(0, len(signal) - 2 * n_back - n_forward)
        window_counts[signal_name].append(n_windows)
        if n_windows == 0:
            continue

        # Row s of `windows` is signal[s : s + n_back + n_forward].
        windows = np.lib.stride_tricks.sliding_window_view(signal, n_back + n_forward)[:n_windows]
        back = windows[:, :n_back]
        forward = windows[:, n_back:]

        X_list[signal_name].append(back)
        y_list[signal_name].append(
            np.column_stack([forward.min(axis = 1), forward.max(axis = 1)])
        )
        y_full_list[signal_name].append(forward)

# Both signals must yield the same windows for the same dates; otherwise the
# rows of X_all and X_volume_all would not describe the same samples.
if window_counts['corrected_offset_return'] != window_counts['corrected_offset_volume']:
    raise ValueError('return and volume signals produced different window counts per date')

if len(X_list['corrected_offset_return']) == 0:
    raise ValueError('no date is long enough to produce a single window')

X_all = np.concatenate(X_list['corrected_offset_return'])
X_volume_all = np.concatenate(X_list['corrected_offset_volume'])
y_all = np.concatenate(y_list['corrected_offset_return'])
y_forward_all = np.concatenate(y_full_list['corrected_offset_return'])
row_count_all = X_all.shape[0]

In [19]:
# Look-back windows of the return signal (one row per sample).
X_all

array([[ 0.68261015,  0.91768098,  1.1117444 , ..., -1.54086685,
        -0.32889849,  1.03203094],
       [ 0.91768098,  1.1117444 ,  0.241974  , ..., -0.32889849,
         1.03203094,  0.40129489],
       [ 1.1117444 ,  0.241974  , -0.7741307 , ...,  1.03203094,
         0.40129489,  1.14176559],
       ...,
       [ 0.79034519,  0.71972829,  0.65465862, ..., -1.83234978,
         0.07136966,  0.13814551],
       [ 0.71972829,  0.65465862, -0.66465914, ...,  0.07136966,
         0.13814551, -0.43700972],
       [ 0.65465862, -0.66465914, -0.86472833, ...,  0.13814551,
        -0.43700972, -0.05362904]])

In [20]:
# [min, max] of each look-ahead window of the return signal.
y_all

array([[-2.70081162,  1.80636013],
       [-2.70081162,  1.80636013],
       [-2.70081162,  1.80636013],
       ...,
       [-1.97096598,  1.20855021],
       [-1.97096598,  1.20855021],
       [-1.97096598,  1.22635496]])

In [21]:
# Mean of each look-ahead window.
np.mean(y_forward_all, axis=1)

array([-0.04465225, -0.06476689, -0.06433952, ..., -0.04953714,
       -0.02290949,  0.01975664])

In [22]:

# Shuffle all samples with one shared permutation so X, volume and targets stay aligned.
# A seeded generator makes the shuffle, and therefore the train/val/test split,
# reproducible; set config['random_seed'] to change it.
#
# NOTE(review): this cell overwrites its own inputs, so running it twice
# permutes the already-shuffled arrays again.
# NOTE(review): consecutive windows overlap heavily, and shuffling before the
# split puts overlapping windows into different groups -- confirm that this is
# acceptable for evaluation.
rng = np.random.default_rng(config.get('random_seed', 42))
indices = rng.permutation(X_all.shape[0])

shuffled_indices = indices

X_all = X_all[indices, :]
X_volume_all = X_volume_all[indices, :]
y_all = y_all[indices]
y_forward_all = y_forward_all[indices]


#scaled_dict['y_all_scaled'] = scaled_dict['y_all_scaled'][indices, :]
#scaled_dict['y_forward_all_scaled'] = scaled_dict['y_forward_all_scaled'][indices, :]
        
#scaled_dict['X_all_scaled'] = scaled_dict['X_all_scaled'][indices, :]
#scaled_dict['X_all_scaled_mean'] = scaled_dict['X_all_scaled_mean'][indices, :]
#scaled_dict['X_all_scaled_std'] = scaled_dict['X_all_scaled_std'][indices, :]
    
#scaled_dict['X_volume_all_scaled'] = scaled_dict['X_volume_all_scaled'][indices, :]
#scaled_dict['X_volume_all_scaled_mean'] = scaled_dict['X_volume_all_scaled_mean'][indices, :]
#scaled_dict['X_volume_all_scaled_std'] = scaled_dict['X_volume_all_scaled_std'][indices, :]
    
##self.scaled_dict['MSS_all_scaled'] = self.scaled_dict['MSS_all_scaled'][indices, :]
##self.scaled_dict['MSS_all_scaled_mean'] = self.scaled_dict['MSS_all_scaled_mean'][indices, :]
##self.scaled_dict['MSS_all_scaled_std'] = self.scaled_dict['MSS_all_scaled_std'][indices, :]

In [23]:
# Assemble the model input tensor of shape (n_samples, n_timepoints, n_features).
# Feature 0 is the return window, feature 1 the volume window; the feature
# count now follows from this list instead of being hard coded.
feature_arrays = [X_all, X_volume_all]

n_features = len(feature_arrays)
n_samples = X_all.shape[0]
n_timepoints = config['n_back']

# Stacking along a new last axis replaces the per-sample copy loop.
M = np.stack(feature_arrays, axis = -1).astype(np.float64, copy = False)

if M.shape != (n_samples, n_timepoints, n_features):
    raise ValueError('unexpected feature tensor shape: ' + str(M.shape))

In [24]:
# Optionally thin the samples by keeping every `modulus_integer`-th row
# (positions 0, modulus_integer, 2 * modulus_integer, ...).
if config['reduce_vector_sizes']:
    indices_modulus = np.arange(M.shape[0]) % config['modulus_integer']
    indices_modulus_selected = np.flatnonzero(indices_modulus == 0)
else:
    indices_modulus_selected = np.arange(M.shape[0])

# Apply the same row selection to the inputs and to both target arrays.
M_after_modulus_operation = M[indices_modulus_selected]
y_after_modulus_operation = y_all[indices_modulus_selected]
y_forward_after_modulus_operation = y_forward_all[indices_modulus_selected]

In [25]:
# Cut the (already shuffled) samples into consecutive train / val / test slices.
# Each group's size is its configured fraction of the row count, truncated to an
# integer, so a few trailing rows may end up in no group.
row_count = M_after_modulus_operation.shape[0]

train_val_test_dict = {}
position = 0
for group in ('train', 'val', 'test'):
    n = int(row_count * config['train_val_test_split'][group])
    stop = position + n

    train_val_test_dict[group] = dict(
        M = M_after_modulus_operation[position:stop],
        y = y_after_modulus_operation[position:stop],
        y_forward = y_forward_after_modulus_operation[position:stop],
        n = n,
        position = position,
    )

    position = stop

In [26]:
# Persist the three splits in one pickle file.
# NOTE(review): the path is relative to the notebook's working directory and
# does not use config['output_directory'] -- confirm that this is intended.
with open('output/booger.pickled', 'wb') as fff:
    pickle.dump(train_val_test_dict, fff)

In [27]:
# Shape of the full tensor: (samples, timepoints, features).
M.shape

(3777202, 180, 2)

In [28]:
# Shape of the training inputs after thinning and splitting.
train_val_test_dict['train']['M'].shape

(94430, 180, 2)

In [29]:
# Shape of the training targets ([min, max] per sample).
train_val_test_dict['train']['y'].shape

(94430, 2)