In [None]:
from itertools import chain
import pandas as pd
import numpy as np
import datetime
import time
import os
from pathlib import Path
from alpha_vantage.timeseries import TimeSeries
import sys

In [2]:
# Absolute path of the "alphadata" directory, anchored at the current working
# directory; later cells list and read the per-ticker files stored in it.
ALPHA_VANTAGE_DIR_PATH = Path.cwd() / "alphadata"

In [4]:
def generate_monthly_stats(df):
    """Summarise one month of price rows into a flat dict of statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to one month, with ``Open``, ``High``, ``Low``,
        ``Close`` and ``Volume`` columns. Rows are used in their existing
        order: the first row supplies the month's open and the last row its
        close (assumes the caller passes them in chronological order).

    Returns
    -------
    dict
        Month-level open/high/low/close/volume plus statistics of the
        row-to-row log returns of ``Close`` (overall, and for the first and
        second positional half of the rows). When ``df`` has no rows, every
        value is NaN so the month still yields a dict with the same keys
        instead of raising ``IndexError``.
    """
    stat_names = (
        "Open",
        "High",
        "Low",
        "Close",
        "Volume",
        "first_half_log_return_mean",
        "first_half_log_return_std",
        "second_half_log_return_mean",
        "second_half_log_return_std",
        "first_second_half_log_return_diff",
        "log_return_mean",
        "log_return_std",
        "log_return_min",
        "log_return_max",
        "month_log_return",
        "pct_bull",
    )
    # A month-end grouper can hand over a month with no rows (a gap in the
    # data); `.iloc[0]` / `.iloc[-1]` below would raise on it.
    if len(df) == 0:
        return dict.fromkeys(stat_names, np.nan)

    # Row-to-row log return of the close. The first element is always NaN
    # (nothing to diff against); pandas reductions skip it.
    log_return = np.log(df["Close"]).diff()
    # Positional split point; the first half contains the leading NaN.
    half_way_point = len(df) // 2
    first_half = log_return.iloc[:half_way_point]
    second_half = log_return.iloc[half_way_point:]

    first_open = df["Open"].iloc[0]
    last_close = df["Close"].iloc[-1]

    return {
        "Open": first_open,
        "High": df["High"].max(),
        "Low": df["Low"].min(),
        "Close": last_close,
        "Volume": df["Volume"].sum(),
        "first_half_log_return_mean": first_half.mean(),
        "first_half_log_return_std": first_half.std(),
        "second_half_log_return_mean": second_half.mean(),
        "second_half_log_return_std": second_half.std(),
        # Second-half cumulative log return minus first-half cumulative.
        "first_second_half_log_return_diff": second_half.sum() - first_half.sum(),
        "log_return_mean": log_return.mean(),
        "log_return_std": log_return.std(),
        "log_return_min": log_return.min(),
        "log_return_max": log_return.max(),
        # Open-to-close log return over the whole month.
        "month_log_return": np.log(last_close / first_open),
        # Share of rows with a positive log return. The leading NaN compares
        # as False, so the denominator is the full row count.
        "pct_bull": (log_return > 0).mean(),
    }

In [5]:
# One entry per name found in the data directory. os.listdir() returns names
# in arbitrary order, so they are sorted to make the processing order the
# same on every run.
tickers = sorted(os.listdir(ALPHA_VANTAGE_DIR_PATH))

In [6]:
# Slippage charged per trade, expressed as a fraction (0.005 = 0.5%).
slippage = 0.005

In [33]:
# Accumulator keyed by ticker; entries are stored as `dict_dfs[t] = df`.
dict_dfs = {}

In [34]:
# Build one monthly-statistics frame per ticker and keep it in `dict_dfs`.
# The names `temp` and `df` are kept on purpose: later cells inspect the
# values left over from the last iteration.
for t in tickers:
    csv_path = ALPHA_VANTAGE_DIR_PATH / t
    # Skip anything in the data directory that is not a regular file
    # (read_csv would raise on a sub-directory).
    if not csv_path.is_file():
        continue

    # generate_monthly_stats returns a dict, so apply() yields a Series
    # holding one stats dict per month-end bin.
    temp = (
        pd.read_csv(csv_path, index_col=0, parse_dates=True)
        .groupby(pd.Grouper(freq="1M"))
        .apply(generate_monthly_stats)
    )
    # Expand the dicts into columns: one row per month, one column per stat.
    df = pd.DataFrame(list(temp), index=temp.index)

    # Following month's return, scaled by (1 - slippage) / (1 + slippage) and
    # converted back to a log return. The last month has no successor and is
    # NaN. (Presumably models paying slippage on both the buy and the sell.)
    df["next_month_log_return"] = np.log(
        np.exp(df["month_log_return"].shift(-1)) * (1 - slippage) / (1 + slippage)
    )
    dict_dfs[t] = df

In [44]:
# Column labels are taken from the keys of the first element of `temp`
# (presumably one month's stats dict).
cols = [*temp.iloc[0].keys()]
# Empty frame that carries only those column labels.
df = pd.DataFrame(columns=cols)

In [47]:
# Turn the elements of `temp` into rows of a DataFrame, reusing temp's index
# as the row index.
df = pd.DataFrame(data=[*temp], index=temp.index)

In [None]:
    # NOTE(review): every line in this cell is indented and it uses `continue`,
    # so it only makes sense as the body of a loop over `t`; the loop header is
    # not part of this cell and the cell cannot run on its own.

    # this stock is not available on alpha vantage:
    # skip `t` when "<t>.csv" does not exist in the data directory.
    if not (ALPHA_VANTAGE_DIR_PATH / f"{t}.csv").is_file():
        continue
    # Read the ticker's CSV (first column as a parsed date index), bin the rows
    # with a "1M" Grouper and compute one stats result per bin.
    temp = (
        pd.read_csv(ALPHA_VANTAGE_DIR_PATH / f"{t}.csv", index_col=0, parse_dates=True)
        .groupby(pd.Grouper(freq="1M"))
        .apply(generate_monthly_stats)
    )
    # Debug output of the raw per-month result.
    print(temp)
    # NOTE(review): `temp` is computed above, but the next two lines operate on
    # `df`, which this cell never assigns -- confirm `df` is meant to be built
    # from `temp` first; as written the same `df` object would be stored for
    # every `t`.
    # Next month's return scaled by (1 - slippage) / (1 + slippage), expressed
    # as a log return; shift(-1) leaves the last row NaN.
    df["next_month_log_return"] = np.log(np.exp(df['month_log_return'].shift(-1))*(1 - slippage) / (1 + slippage))
    dict_dfs[t] = df

In [56]:
# Each row shows the following row's month_log_return; the last row has no
# successor and is NaN.
df["month_log_return"].shift(periods=-1)

date
2019-03-31   -0.018264
2019-04-30    0.037967
2019-05-31   -0.012765
2019-06-30   -0.001963
2019-07-31   -0.002455
2019-08-31   -0.003922
2019-09-30    0.003937
2019-10-31   -0.007874
2019-11-30   -0.021979
2019-12-31    0.000000
2020-01-31    0.000000
2020-02-29         NaN
Freq: M, Name: month_log_return, dtype: float64

date
2019-03-31   -0.028264
2019-04-30    0.027967
2019-05-31   -0.022766
2019-06-30   -0.011963
2019-07-31   -0.012455
2019-08-31   -0.013922
2019-09-30   -0.006063
2019-10-31   -0.017874
2019-11-30   -0.031979
2019-12-31   -0.010000
2020-01-31   -0.010000
2020-02-29         NaN
Freq: M, Name: month_log_return, dtype: float64

In [None]:
# Display the next_month_log_return column of `df`.
df.loc[:, "next_month_log_return"]