In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing
import yfinance as yf # Yahoo finance API

In [10]:
import math
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import classification_report, confusion_matrix, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LogisticRegression

In [12]:
# Daily FRCOY bars from Yahoo Finance (auto-adjusted prices); the Volume
# column is not used anywhere below, so it is dropped straight away.
df = yf.download(
    'FRCOY',
    start='1990-01-01',
    end='2022-07-01',
    interval='1d',
    progress=False,
    auto_adjust=True,
).drop(columns=['Volume'])
# Target: 1 when the close is above the previous row's close, otherwise 0.
# The first row has no predecessor (NaN), so its comparison is False -> 0.
previous_close = df['Close'].shift(1)
df['y'] = np.where(df['Close'] > previous_close, 1, 0)
df

Unnamed: 0_level_0,Open,High,Low,Close,y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-05,6.120000,6.233333,6.110000,6.110000,0
2010-01-06,6.123333,6.123333,6.106667,6.106667,0
2010-01-07,6.050000,6.050000,6.000000,6.036667,0
2010-01-08,6.166667,6.166667,5.966667,5.966667,0
2010-01-11,6.033333,6.033333,5.940000,5.940000,0
...,...,...,...,...,...
2022-06-24,17.610001,17.629999,17.526667,17.629999,1
2022-06-27,18.126667,18.126667,17.500000,17.500000,0
2022-06-28,17.176666,17.756666,17.176666,17.610001,1
2022-06-29,17.583332,17.583332,17.403334,17.440001,0


In [13]:
# Lag each price column by one row, so every row carries the previous row's
# Open/High/Low/Close as its predictors.
features = []
for source_column in ['Open', 'High', 'Low', 'Close']:
    lagged_column = source_column + '_1'
    df[lagged_column] = df[source_column].shift(1)
    features.append(lagged_column)
# The shift leaves NaNs in the first row; drop every incomplete row.
df.dropna(inplace=True)
df[features]

Unnamed: 0_level_0,Open_1,High_1,Low_1,Close_1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-06,6.120000,6.233333,6.110000,6.110000
2010-01-07,6.123333,6.123333,6.106667,6.106667
2010-01-08,6.050000,6.050000,6.000000,6.036667
2010-01-11,6.166667,6.166667,5.966667,5.966667
2010-01-12,6.033333,6.033333,5.940000,5.940000
...,...,...,...,...
2022-06-24,17.520000,17.600000,17.379999,17.549999
2022-06-27,17.610001,17.629999,17.526667,17.629999
2022-06-28,18.126667,18.126667,17.500000,17.500000
2022-06-29,17.176666,17.756666,17.176666,17.610001


In [17]:
# Target stays a pandas Series; the predictors become a plain NumPy array.
y = df['y']
X = df[features].to_numpy()
X

array([[ 6.11999989,  6.23333311,  6.11000013,  6.11000013],
       [ 6.12333298,  6.12333298,  6.10666704,  6.10666704],
       [ 6.05000019,  6.05000019,  6.        ,  6.03666687],
       ...,
       [18.12666702, 18.12666702, 17.5       , 17.5       ],
       [17.17666626, 17.75666618, 17.17666626, 17.61000061],
       [17.58333206, 17.58333206, 17.40333366, 17.44000053]])

In [None]:
# Row position that separates the first 90% of the samples from the rest
# (int() truncates towards zero).
index = int(0.9 * len(y))

In [None]:
# Positional split without shuffling: rows before `index` are used for
# training, rows from `index` onwards are held out for testing.
X_train, X_test = X[:index], X[index:]
y_train, y_test = y.iloc[:index], y.iloc[index:]

In [None]:
# --- Hyper-parameter search for the logistic regression -------------------
# `penalty` lists the candidate penalties for reference only: the grid below
# searches just 'l2', which is the penalty accepted by every solver in
# `solver` ('l1' / 'elasticnet' are only supported by a subset of them).
penalty = ['l1', 'l2', 'elasticnet']
# Inverse regularisation strength, one value per decade from 1e-8 to 1e7.
# The list previously had 0 where 1e0 belongs; C must be strictly positive,
# so C=0 is rejected by LogisticRegression and those candidates were wasted.
CC = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0,
      1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7]
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

# Iteration caps 10, 30, ..., 490.
max_iter = list(range(10, 500, 20))
param_grid = dict(penalty=['l2'], C=CC, solver=solver, max_iter=max_iter)

# Time-ordered cross-validation: folds are taken in sequence, not shuffled.
cv = TimeSeriesSplit(n_splits=5)
lr = LogisticRegression()

# Exhaustive search over the grid, ranking candidates by ROC AUC.
grid = GridSearchCV(estimator=lr, param_grid=param_grid, scoring='roc_auc', cv=cv, verbose=1)
grid.fit(X_train, y_train)

In [None]:
# Show the parameter combination that scored best in the grid search.
grid.best_params_

In [None]:
# Re-run the search on a single fixed configuration (presumably the best
# parameters reported by the full search -- confirm against its output).
param_grid = {
    'penalty': ['l2'],
    'C': [0.001],
    'solver': ['saga'],
    'max_iter': [70],
}
grid = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    verbose=1,
)
grid.fit(X_train, y_train)

In [None]:
# Take the best estimator found by the fitted search and use it to predict
# the class labels of the held-out test rows.
model = grid.best_estimator_
y_pred = model.predict(X_test)

In [None]:
# Evaluate the predictions against the held-out labels.
# The confusion matrix must compare y_test with y_pred: comparing y_test with
# itself always yields a perfect diagonal and says nothing about the model.
# Both results are printed explicitly: only a cell's last expression is
# displayed automatically, and the report is a multi-line string that is
# unreadable when shown as a raw repr.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))