In [1]:
import pandas as pd
import numpy as np
import os

# Set CUDA_VISIBLE_DEVICES to '-1' before anything else runs. Presumably this is
# meant to keep any CUDA-based library imported later on the CPU; nothing in the
# visible cells uses a GPU -- TODO confirm it is still needed.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'


In [2]:
# Load the raw table. Later cells read its 'date', 'ticker', 'close' and
# 'volume' columns.
df = pd.read_csv('data/data.csv')

In [3]:
# Parse the 'date' column in place so it can be compared with, and looked up by,
# the timestamps produced by generate_workdays().
df['date'] = pd.to_datetime(df['date'])



In [4]:
def compute_features(dataframe):
    """Collapse a row-per-observation table into one row per ticker.

    Args:
        dataframe: frame with at least the columns ``ticker``, ``date``,
            ``close`` and ``volume``.

    Returns:
        A DataFrame with one row per ticker and the columns ``ticker``,
        ``date``, ``close``, ``volume`` (each of the last three holding the
        list of that ticker's values) plus ``features``, a dict mapping every
        date of the ticker to ``{'close': ..., 'volume': ...}``. If a ticker
        has the same date more than once, the last occurrence wins.
    """
    by_ticker = dataframe.groupby(['ticker'])

    # One two-column frame (ticker, <field>) per collected field.
    per_field = [
        by_ticker[field].apply(list).to_frame().reset_index()
        for field in ('date', 'close', 'volume')
    ]

    # Stitch the per-field frames back together on the ticker key.
    combined = per_field[0]
    for extra in per_field[1:]:
        combined = combined.merge(extra, how='left', on='ticker')

    def _row_to_features(row):
        # Pair up the parallel lists of one ticker into a date-keyed lookup.
        lookup = {}
        for day, close, volume in zip(row['date'], row['close'], row['volume']):
            lookup[day] = {'close': close, 'volume': volume}
        return lookup

    combined['features'] = combined.apply(_row_to_features, axis=1)
    return combined

In [5]:
# One row per ticker; generate_data() reads this frame's 'features' column as a
# module-level global.
final_df = compute_features(df)

In [6]:
# Tickers in final_df row order; generate_data() uses its length to iterate the
# rows of final_df.
ticker_array = final_df['ticker'].values

In [7]:
import datetime

def check_workday(date):
    """Return True when ``date`` falls on Monday through Friday.

    ``date`` must provide ``weekday()`` (as ``datetime.date`` and pandas
    ``Timestamp`` do, numbering Monday as 0 and Sunday as 6). Holidays are
    not taken into account.
    """
    day_index = date.weekday()
    return 0 <= day_index <= 4

def generate_workdays(startime, endtime):
    """List every workday between two dates, both ends included.

    Both arguments are parsed with ``pd.to_datetime``. The range is walked in
    one-day steps starting at ``startime``, and each day is kept when
    ``check_workday`` accepts it.

    Returns:
        A list of timestamps in ascending order; empty when ``startime`` is
        after ``endtime``.
    """
    one_day = datetime.timedelta(days=1)
    last_day = pd.to_datetime(endtime)

    def _calendar_days(cursor):
        # Yield consecutive days until the end of the range is passed.
        while cursor <= last_day:
            yield cursor
            cursor = cursor + one_day

    return [day for day in _calendar_days(pd.to_datetime(startime)) if check_workday(day)]

def generate_data(star_time, end_time, predict_time):
    """Build the input window and the target returns for every ticker.

    Reads the notebook-level globals ``final_df`` (its ``features`` column,
    one date-keyed dict per row) and ``ticker_array`` (its length gives the
    number of rows to process).

    Args:
        star_time: first day of the input window (inclusive).
        end_time: last day of the input window (inclusive); must support
            ``+ datetime.timedelta``.
        predict_time: last day of the prediction window (inclusive). The
            prediction window starts the day after ``end_time``.

    Returns:
        ``(x, y)``, two lists with one entry per ticker:

        * ``x[i]`` has one ``[close, volume]`` pair per workday of the input
          window. A day without data repeats the previous pair; days before
          the first available observation are ``[0, 0]``.
        * ``y[i]`` has the day-over-day relative change of ``close`` across
          the workdays of the prediction window (one fewer entry than there
          are workdays). Missing days are forward-filled the same way, and a
          change is reported as ``0.0`` when either of the two closes is 0.
    """
    days = generate_workdays(star_time, end_time)
    step = datetime.timedelta(days=1)
    predict_days = generate_workdays(end_time + step, predict_time)

    # Resolve the column once per call and the row once per ticker instead of
    # repeating the pandas lookups for every single day in the loops below.
    features_column = final_df['features']

    x = []
    y = []
    for i in range(len(ticker_array)):
        ticker_features = features_column[i]

        values = []
        for d in days:
            if d in ticker_features:
                quote = ticker_features[d]
                values.append([quote['close'], quote['volume']])
            elif values:
                # Forward-fill with a copy so rows never share one list object.
                values.append(list(values[-1]))
            else:
                values.append([0, 0])
        x.append(values)

        return_values = []
        for d in predict_days:
            if d in ticker_features:
                return_values.append(ticker_features[d]['close'])
            elif return_values:
                return_values.append(return_values[-1])
            else:
                return_values.append(0)

        rate_values = []
        for previous, current in zip(return_values, return_values[1:]):
            if previous != 0 and current != 0:
                rate_values.append(current / previous - 1)
            else:
                # No usable price on one side of the pair: report no change.
                rate_values.append(0.0)
        y.append(rate_values)
    return x, y
    

In [8]:
# Window geometry: star_time..end_time spans 90 days (91 calendar days with both
# ends included) and the prediction window covers the 28 days after end_time.
max_length = datetime.timedelta(days=90)
predict_lenght = datetime.timedelta(days=28)
step = datetime.timedelta(days=1)

min_time, max_time = df['date'].min(), df['date'].max()

# Start with the latest possible window and slide it back in time.
predict_time = max_time
end_time = max_time - predict_lenght
star_time = end_time - max_length

X, Y = [], []

while star_time >= min_time:
    x, y = generate_data(star_time, end_time, predict_time)
    X.append(x)
    Y.append(y)

    # Move the whole window back so that it starts on the previous workday:
    # 3 days from a Monday, 2 days from a Sunday, otherwise 1 day.
    weekday = star_time.weekday()
    k = {0: 3, 6: 2}.get(weekday, 1)
    predict_time -= k*step
    end_time -= k*step
    star_time -= k*step

KeyboardInterrupt: 

In [None]:
import pickle


In [None]:
# Persist every collected input window (X) as a single NumPy array.
with open('data/input.pkl', 'wb') as f:
    pickle.dump(np.array(X), f)

In [None]:
# Persist the matching targets (Y) as a single NumPy array.
with open('data/output.pkl', 'wb') as f:
    pickle.dump(np.array(Y), f)

In [None]:
# Number of windows collected (Y gets one entry per iteration of the window loop).
len(Y)