In [None]:
#Working with QDA - a nonlinear LDA

In [24]:
import pandas as pd
import yfinance as yf

%matplotlib inline

tickers = ["F", "TM", "GM", "TSLA"]

first_date = '2009-01-01'
last_date = '2024-08-17'

# 从 Yahoo Finance 获取数据
stock_panel = yf.download(tickers, start=first_date, end=last_date)

# 获取收盘价数据并删除缺失值
stock_df = stock_panel['Close'].dropna()

# 计算180天后的股票是否上涨
classes = (stock_df.shift(-180) > stock_df).astype(int)

# 将 stock_panel 的数据用于进一步处理
X = stock_panel.stack().reset_index()
X = X.rename(columns={'level_1': 'Ticker', 0: 'Close'})

# 处理 classes 数据
classes = classes.stack().reset_index()
classes = classes.rename(columns={'level_1': 'Ticker', 0: 'is_higher'})

# 合并数据框架并删除缺失值
data = pd.merge(X, classes, on=['Date', 'Ticker']).dropna()

import patsy

# 构建设计矩阵，公式中 - 1 表示不包括截距项
X = patsy.dmatrix("Open + High + Low + Close + Volume + is_higher - 1", data, return_type='dataframe')

# 显示前几行数据
X.head()



[*********************100%%**********************]  4 of 4 completed


Unnamed: 0,Open,High,Low,Close,Volume,is_higher
0,16.77,16.870001,16.049999,16.120001,256937900.0,0.0
1,35.0,35.990002,33.889999,34.189999,457044300.0,0.0
2,77.360001,77.510002,76.830002,77.290001,989100.0,0.0
3,2.044667,2.049333,1.928,1.992667,14341500.0,0.0
4,16.02,16.379999,15.83,16.280001,130323600.0,0.0


In [26]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
qda = QDA()

qda.fit(X.iloc[:, :-1], X.iloc[:, -1])
predictions = qda.predict(X.iloc[:, :-1])
predictions.sum()

11926.0

In [30]:
from sklearn.metrics import classification_report
print (classification_report(X.iloc[:, -1].values, predictions))

              precision    recall  f1-score   support

         0.0       0.63      0.17      0.27      6995
         1.0       0.51      0.90      0.65      6837

    accuracy                           0.53     13832
   macro avg       0.57      0.53      0.46     13832
weighted avg       0.57      0.53      0.46     13832



In [32]:
from sklearn.model_selection import ShuffleSplit
import scipy.stats as sp

shuffle_split_inst = ShuffleSplit()

for test, train in shuffle_split_inst.split(X):
      train_set = X.iloc[train]
      train_close = train_set.Close

      train_0 = train_close[~train_set.is_higher.astype(bool)]
      train_1 = train_close[train_set.is_higher.astype(bool)]

      test_set = X.iloc[test]
      test_close = test_set.Close.values

ll_0 = sp.norm.pdf(test_close, train_0.mean())
ll_1 = sp.norm.pdf(test_close, train_1.mean())

(ll_0 > ll_1).mean()

0.06515102827763496