In [1]:
from itertools import chain
import pandas as pd
import numpy as np
import datetime
import time
import os
from pathlib import Path
from alpha_vantage.timeseries import TimeSeries
import sys
from tqdm import tqdm

In [2]:
ALPHA_VANTAGE_DIR_PATH = Path("alphadata").absolute()
# GET TICKERS
tickers = os.listdir(ALPHA_VANTAGE_DIR_PATH)

In [3]:
slippage = .005 # 0.5% slippage per trade
dict_dfs = dict()

In [4]:
def generate_monthly_stats(df):
    #print(df)
    log_return = df["Close"].apply(np.log).diff()
    half_way_point = len(df) // 2

    return {
        "Open": df["Open"].iloc[0],
        "High": df["High"].max(),
        "Low": df["Low"].min(),
        "Close": df["Close"].iloc[-1],
        "Volume": df["Volume"].sum(),
        "first_half_log_return_mean": log_return.iloc[:half_way_point].mean(),
        "first_half_log_return_std": log_return.iloc[:half_way_point].std(),
        "second_half_log_return_mean": log_return.iloc[half_way_point:].mean(),
        "second_half_log_return_std": log_return.iloc[half_way_point:].std(),
        "first_second_half_log_return_diff": (
            log_return.iloc[half_way_point:].sum()
            - log_return.iloc[:half_way_point].sum()
        ),
        "log_return_mean": log_return.mean(),
        "log_return_std": log_return.std(),
        "log_return_min": log_return.min(),
        "log_return_max": log_return.max(),
        "month_log_return": np.log(df["Close"].iloc[-1] / df["Open"].iloc[0]),
        "pct_bull": (log_return > 0).mean()
    }

In [6]:
s = datetime.datetime.now()

for t in tqdm(tickers):
    # t = tickers[0]
    try:
        temp = (
                pd.read_csv(ALPHA_VANTAGE_DIR_PATH / f"{t}", index_col=0, parse_dates=True)
                .groupby(pd.Grouper(freq="1M"))
                .apply(generate_monthly_stats)
            )


        df = pd.DataFrame()
        for i in range(len(temp)):
            df = df.append(pd.DataFrame.from_dict(temp.iloc[i],orient='index').T, ignore_index=True)
        # df = df.append(pd.DataFrame.from_dict(temp.iloc[1],orient='index').T, ignore_index=True)
        df.set_index = temp.index

        df.index = list(temp.index)

        df["next_month_log_return"] = np.log(
                np.exp(df["month_log_return"].shift(-1)) * (1 - slippage) / (1 + slippage)
            )
        dict_dfs[t] = df
    except:
        print(f"Skipping: {t}")

e = datetime.datetime.now()

print(e-s)

 27%|██▋       | 244/901 [11:45<33:11,  3.03s/it]

Skipping: PSFT.csv


 40%|███▉      | 359/901 [16:51<18:03,  2.00s/it]

Skipping: APOL.csv


 46%|████▌     | 411/901 [19:20<28:49,  3.53s/it]

Skipping: USBC.csv


 52%|█████▏    | 465/901 [21:54<21:34,  2.97s/it]

Skipping: CPWR.csv


 92%|█████████▏| 833/901 [38:23<02:56,  2.60s/it]

Skipping: NVLS.csv


 93%|█████████▎| 837/901 [38:33<02:53,  2.71s/it]

Skipping: DNB.csv


 95%|█████████▍| 854/901 [39:12<02:28,  3.15s/it]

Skipping: NE.csv


100%|██████████| 901/901 [41:16<00:00,  2.75s/it]

0:41:16.423575





In [8]:
dict_dfs[t].to_csv(t)

In [9]:
for ticker in tqdm(dict_dfs):
    dict_dfs[ticker].to_csv(f'tickers_summary/{ticker}')

100%|██████████| 894/894 [00:21<00:00, 42.34it/s]
