In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [31]:
# log

import os
from datetime import datetime
from inspect import currentframe, getframeinfo


class MeowLogger(object):
    def __init__(self):
        self.logf = None

    def __del__(self):
        if self.logf is not None:
            self.logf.close()

    def __header(self, pid):
        now = datetime.now()
        frameInfo = getframeinfo(currentframe().f_back.f_back)
        if pid:
            return "[\033[90m{}|\033[0m{}:{}|{}] ".format(now.strftime("%Y-%m-%dT%H:%M:%S.%f"), os.path.basename(frameInfo.filename), frameInfo.lineno, os.getpid())
        return "[\033[90m{}|\033[0m{}:{}] ".format(now.strftime("%Y-%m-%dT%H:%M:%S.%f"), os.path.basename(frameInfo.filename), frameInfo.lineno)

    def setLogFile(self, filename):
        if self.logf is not None:
            self.logf.close()
        self.logf = open(filename, "w")

    def log(self, content, muted=False):
        if muted:
            return
        if self.logf is not None:
            self.logf.write(content + "\n")
            self.logf.flush()
            return
        print(content)

    def inf(self, line, pid=False, muted=False):
        self.log(self.__header(pid) + line, muted)

    def grey(self, line, pid=False, muted=False):
        self.log("{}\033[90m{}\033[0m".format(self.__header(pid), line), muted)

    def red(self, line, pid=False, muted=False):
        self.log("{}\033[91m{}\033[0m".format(self.__header(pid), line), muted)

    def green(self, line, pid=False, muted=False):
        self.log("{}\033[92m{}\033[0m".format(self.__header(pid), line), muted)

    def yellow(self, line, pid=False, muted=False):
        self.log("{}\033[93m{}\033[0m".format(self.__header(pid), line), muted)

    def blue(self, line, pid=False, muted=False):
        self.log("{}\033[94m{}\033[0m".format(self.__header(pid), line), muted)

    def pink(self, line, pid=False, muted=False):
        self.log("{}\033[95m{}\033[0m".format(self.__header(pid), line), muted)

    def cyan(self, line, pid=False, muted=False):
        self.log("{}\033[96m{}\033[0m".format(self.__header(pid), line), muted)


log = MeowLogger()

In [32]:
# calendar
import bisect

class Calendar(object):
    def __init__(self, base_dir):
        calendarFile = os.path.join(base_dir, "resources", "calendar")
        with open(calendarFile) as f:
            tokens = f.read().splitlines()
            self.tradingDays = sorted([int(x) for x in tokens])
            self.tradingDaySet = set(self.tradingDays)

    def isTradingDay(self, date):
        if not isinstance(date, int):
            date = int(date)
        return date in self.tradingDaySet

    def toTradingDay(self, date):
        if not isinstance(date, int):
            date = int(date)
        index = bisect.bisect_left(self.tradingDays, date)
        return self.tradingDays[index]

    def next(self, date):
        if not isinstance(date, int):
            date = int(date)
        index = bisect.bisect_right(self.tradingDays, date)
        if index >= len(self.tradingDays):
            return None
        return self.tradingDays[index]

    def prev(self, date):
        if not isinstance(date, int):
            date = int(date)
        index = bisect.bisect_left(self.tradingDays, date)
        if index == 0:
            return None
        return self.tradingDays[index - 1]

    def shift(self, date, n):
        if not isinstance(date, int):
            date = int(date)
        if not isinstance(n, int):
            log.red("Invalid shift n: {}".format(n))
            return None

        index = bisect.bisect_left(self.tradingDays, date)
        if index == 0:
            log.red("Failed to shift for date {}, n={}".format(date, n))
            return None
        return self.tradingDays[index + n]

    def prevn(self, date, n):
        if not isinstance(date, int):
            date = int(date)
        if not isinstance(n, int) or n < 1:
            log.red("Invalid prevn: date={},n={}".format(date, n))
            return None

        index = bisect.bisect_left(self.tradingDays, date)
        if index == 0:
            log.red("Failed to find prev trading day for date {}".format(date))
            return None
        if index < n:
            log.yellow("Not enough days for prevn: date={},n={},index={}".format(date, n, index))

        return self.tradingDays[max(index - n, 0) : index]

    def nextn(self, date, n):
        if not isinstance(date, int):
            date = int(date)
        if not isinstance(n, int) or n < 1:
            log.red("Invalid nextn: date={},n={}".format(date, n))
            return None

        index = bisect.bisect_right(self.tradingDays, date)
        if index >= len(self.tradingDays):
            log.red("Failed to find next trading day for date {}".format(date))
            return None
        if index + n > len(self.tradingDays):
            log.yellow("Not enough days for next: date={},n={},index={}".format(date, n, index))

        return self.tradingDays[index: min(index + n, len(self.tradingDays))]

    def range(self, startDate, endDate):
        if not isinstance(startDate, int):
            startDate = int(startDate)
        if not isinstance(endDate, int):
            endDate = int(endDate)
        if startDate > endDate:
            log.red("Invalid range - startDate is larger than endDate: startDate={},endDate={}".format(startDate, endDate))
            return None

        startIndex = bisect.bisect_left(self.tradingDays, startDate)
        if (startIndex == len(self.tradingDays)):
            log.red("No valid trading days found within the range [{}, {})".format(startDate, endDate))
            return None

        endIndex = bisect.bisect_right(self.tradingDays, endDate)
        return self.tradingDays[startIndex : endIndex]

calendar = Calendar(base_dir)

In [33]:
# data loader
class MeowDataLoader(object):
    def __init__(self, h5dir, base_dir):
        self.h5dir = h5dir
        self.calendar = Calendar(base_dir)


    def loadDates(self, dates):
        if len(dates) == 0:
            raise ValueError("Dates empty")
        log.inf("Loading data of {} dates from {} to {}...".format(len(dates), min(dates), max(dates)))
        return pd.concat(self.loadDate(x) for x in dates)


    def loadDate(self, date):
        if not self.calendar.isTradingDay(date):
            raise ValueError("Not a trading day: {}".format(date))
        h5File = os.path.join(self.h5dir, "{}.h5".format(date))
        df = pd.read_hdf(h5File)
        df.loc[:, "date"] = date
        precols = ["symbol", "interval", "date"]
        df = df[precols + [x for x in df.columns if x not in precols]] # re-arrange columns
        return df

In [43]:
# mdl:baseline

from sklearn.linear_model import Ridge

class MeowModel(object):
    def __init__(self, cacheDir):
        self.estimator = Ridge(
            alpha=0.5,
            random_state=None,
            fit_intercept=False,
            tol=1e-8
        )

    def fit(self, xdf, ydf):
        self.estimator.fit(
            X=xdf.values,
            y=ydf.values,
        )
        log.inf("Done fitting")

    def predict(self, xdf):
        return self.estimator.predict(xdf.to_numpy())

In [35]:
# mdl:decision tree

from sklearn.tree import DecisionTreeRegressor

class MeowDecisionTreeModel(MeowModel):
    def __init__(self, cacheDir):
        self.estimator = DecisionTreeRegressor(
            max_depth=10,  # 可以调整这个参数
            random_state=42
        )

In [36]:
# eval

class MeowEvaluator(object):
    def __init__(self, cacheDir):
        self.cacheDir = cacheDir
        self.predictionCol = "forecast"
        self.ycol = "fret12"

    def eval(self, ydf):
        ydf = ydf.replace([np.inf, -np.inf], np.nan).fillna(0)
        pcor = ydf[[self.predictionCol, self.ycol]].corr().to_numpy()[0, 1]
        r2 = 1 - ((ydf[self.predictionCol] - ydf[self.ycol]) ** 2).sum() / ydf[self.ycol].var() / ydf.shape[0]
        mse = ((ydf[self.predictionCol] - ydf[self.ycol]) ** 2).sum() / ydf.shape[0]
        log.inf("Meow evaluation summary: Pearson correlation={:.4f}, R2={:.5f}, MSE={:.2f}".format(pcor, r2, mse))

In [37]:
# features

class MeowFeatureGenerator(object):
    @classmethod
    def featureNames(cls):
        return [
            "ob_imb0",
            "ob_imb4",
            "ob_imb9",
            "trade_imb",
            "trade_imbema5",
            "lagret12",
        ]

    def __init__(self, cacheDir):
        self.cacheDir = cacheDir
        self.ycol = "fret12"
        self.mcols = ["symbol", "date", "interval"]

    def genFeatures(self, df):
        log.inf("Generating {} features from raw data...".format(len(self.featureNames())))
        df.loc[:, "ob_imb0"] = (df["asize0"] - df["bsize0"]) / (df["asize0"] + df["bsize0"])
        df.loc[:, "ob_imb4"] = (df["asize0_4"] - df["bsize0_4"]) / (df["asize0_4"] + df["bsize0_4"])
        df.loc[:, "ob_imb9"] = (df["asize5_9"] - df["bsize5_9"]) / (df["asize5_9"] + df["bsize5_9"])
        df.loc[:, "trade_imb"] = (df["tradeBuyQty"] - df["tradeSellQty"]) / (df["tradeBuyQty"] + df["tradeSellQty"])
        df.loc[:, "trade_imbema5"] = df["trade_imb"].ewm(halflife=5).mean()
        df.loc[:, "bret12"] = (df["midpx"] - df["midpx"].shift(12)) / df["midpx"].shift(12) # backward return
        cxbret = df.groupby("interval")[["bret12"]].mean().reset_index().rename(columns={"bret12": "cx_bret12"})
        df = df.merge(cxbret, on="interval", how="left")
        df.loc[:, "lagret12"] = df["bret12"] - df["cx_bret12"]
        xdf = df[self.mcols + self.featureNames()].set_index(self.mcols)
        ydf = df[self.mcols + [self.ycol]].set_index(self.mcols)
        return xdf.fillna(0), ydf.fillna(0)


In [38]:
# meow

class MeowEngine(object):
    def __init__(self, h5dir, cacheDir, base_dir):
        self.calendar = Calendar(base_dir)
        self.h5dir = h5dir
        if not os.path.exists(h5dir):
            raise ValueError("Data directory not exists: {}".format(self.h5dir))
        if not os.path.isdir(h5dir):
            raise ValueError("Invalid data directory: {}".format(self.h5dir))
        self.cacheDir = cacheDir # this is not used in sample code
        self.dloader = MeowDataLoader(h5dir=h5dir, base_dir=base_dir)
        self.featGenerator = MeowFeatureGenerator(cacheDir=cacheDir)
        self.model = MeowModel(cacheDir=cacheDir)
        self.evaluator = MeowEvaluator(cacheDir=cacheDir)

    def fit(self, startDate, endDate):
        dates = self.calendar.range(startDate, endDate)
        rawData = self.dloader.loadDates(dates)
        log.inf("Running model fitting...")
        xdf, ydf = self.featGenerator.genFeatures(rawData)
        self.model.fit(xdf, ydf)

    def predict(self, xdf):
        return self.model.predict(xdf)

    def eval(self, startDate, endDate):
        log.inf("Running model evaluation...")
        dates = self.calendar.range(startDate, endDate)
        rawData = self.dloader.loadDates(dates)
        xdf, ydf = self.featGenerator.genFeatures(rawData)
        ydf.loc[:, "forecast"] = self.predict(xdf)
        self.evaluator.eval(ydf)



In [44]:
# main function:baseline
# if __name__ == "__main__":
base_dir = "D:\\00-homework\\lectures\\00-machine_learning\\meow"
engine = MeowEngine(h5dir="archive", cacheDir=None, base_dir=base_dir)
engine.fit(20230601, 20231130)
engine.eval(20231201, 20231229)

[[90m2024-05-27T20:19:48.883854|[0m<ipython-input-33-13b053d43d53>:11] Loading data of 123 dates from 20230601 to 20231130...
[[90m2024-05-27T20:20:56.821930|[0m<ipython-input-38-70f028236a4e>:20] Running model fitting...
[[90m2024-05-27T20:20:56.931768|[0m<ipython-input-37-b3b46447fb4a>:21] Generating 6 features from raw data...


MemoryError: Unable to allocate 4.27 GiB for an array with shape (67, 8548841) and data type float64

In [41]:
!pip install tables

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting tables
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/0f/c5/755a6ac9baf2a9729b75d413f1590a86081cdcc9796a9f3d54a6ae5c0639/tables-3.7.0-cp36-cp36m-win_amd64.whl (6.9 MB)
Collecting numexpr>=2.6.2
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/09/07/294921d4427e5ef8c224b1184beede0e920c71f1e2f7bc116b5a84ff0677/numexpr-2.8.1-cp36-cp36m-win_amd64.whl (88 kB)
Installing collected packages: numexpr, tables
Successfully installed numexpr-2.8.1 tables-3.7.0


In [None]:
# main function:decision tree

if __name__ == "__main__":
    engine = MeowEngine(h5dir="archive", cacheDir=None, model_class=MeowDecisionTreeModel)
    engine.fit(20230601, 20231130)
    engine.eval(20231201, 20231229)