In [10]:
import pandas as pd
from fredapi import Fred
import yfinance as yf
from ta import add_all_ta_features

# Setup
fred_api_key = '1e074c3898d6261abc56ff5962358644'  # FRED API key
fred = Fred(api_key=fred_api_key)
stock_symbol = 'AAPL'  # Stock symbol for Apple Inc.

# Define the economic indicators to fetch from FRED
indicators = {
    'DFF': 'Fed Funds Rate',
    'DGS10': '10-Yr Treasury',
    'VIXCLS': 'VIX',
    'DCOILWTICO': 'WTI Crude Oil',
    'DCOILBRENTEU': 'Brent Crude Oil',
    'DTWEXBGS': 'Trade Weighted USD Index'
}

# Fetch Economic Indicators from FRED
fred_data = pd.DataFrame()
for code, name in indicators.items():
    series = fred.get_series(code, observation_start='2018-01-01', observation_end='2024-01-01')
    fred_data[name] = series

fred_data.index = pd.to_datetime(fred_data.index)  # Ensure the index is datetime for easier alignment

# Fetch Stock Data from Yahoo Finance
aapl_data = yf.download(stock_symbol, start='2018-01-01', end='2024-01-01')

aapl_data = add_all_ta_features(
            aapl_data,
            open="Open",
            high="High",
            low="Low",
            close="Close",
            volume="Volume",
            fillna=True,
        )
aapl_data["Target"] = (aapl_data["Close"].shift(-1) > aapl_data["Close"]).astype(int)

aapl_data["return"] = aapl_data["Close"].pct_change()

aapl_data = aapl_data.dropna()  # Drop any rows with NaN values

# Ensure the index is datetime for easier alignment
aapl_data.index = pd.to_datetime(aapl_data.index)

# Aligning Data by Date Without Filling Missing Data
# To align all columns from AAPL data with FRED data, we merge on the index (date) without filling missing data
aligned_data = pd.merge(fred_data, aapl_data, left_index=True, right_index=True, how='inner')

# List of additional symbols to fetch from Yahoo Finance
additional_symbols = ['^GSPC', 'GC=F', 'CL=F']

# Fetch additional data and align it
for symbol in additional_symbols:
    # Fetch data for each symbol
    symbol_data = yf.download(symbol, start='2018-01-01', end='2024-01-01')
    # Ensure the index is datetime for easier alignment
    symbol_data.index = pd.to_datetime(symbol_data.index)
    # Merge with the aligned_data DataFrame
    aligned_data = pd.merge(aligned_data, symbol_data[['Close']], left_index=True, right_index=True, how='inner', suffixes=('', f'_{symbol}'))


# After merging all additional data, check for NaN values and drop any rows with NaN values to ensure 100% alignment
aligned_data.dropna(inplace=True)

# Validation
if aligned_data.isnull().any().any():
    print("There are NaN values in the DataFrame.")
else:
    print("Data is 100% aligned with no NaN values.")

# Display the shape of the final DataFrame and a few rows to verify
print(f"Shape of the DataFrame after aligning and cleaning: {aligned_data.shape}")
print(aligned_data.head())



  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
[*********************100%%**********************]  1 of 1 completed
  self._psar[i] = high2
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
[*********************100%%**********************]  1 of 1 completed
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
[*********************100%%**********************]  1 of 1 completed
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
[*********************100%%**********************]  1 of 1 completed

Data is 100% aligned with no NaN values.
Shape of the DataFrame after aligning and cleaning: (1459, 103)
            Fed Funds Rate  10-Yr Treasury    VIX  WTI Crude Oil  \
2018-01-03            1.42            2.44   9.15          61.61   
2018-01-04            1.42            2.46   9.22          61.98   
2018-01-05            1.42            2.47   9.22          61.49   
2018-01-08            1.42            2.49   9.52          61.73   
2018-01-09            1.42            2.55  10.08          62.92   

            Brent Crude Oil  Trade Weighted USD Index       Open       High  \
2018-01-03            67.85                  109.6751  43.132500  43.637501   
2018-01-04            68.73                  109.4779  43.134998  43.367500   
2018-01-05            68.01                  109.3496  43.360001  43.842499   
2018-01-08            68.48                  109.5678  43.587502  43.902500   
2018-01-09            69.08                  109.9022  43.637501  43.764999   

           




In [11]:
import pandas as pd
import yfinance as yf
from fredapi import Fred
from ta import add_all_ta_features

def setup_fred(api_key):
    """Setup FRED API client."""
    return Fred(api_key=api_key)

def fetch_fred_series(fred, series_code, start_date, end_date):
    """Fetch a time series from FRED."""
    try:
        series = fred.get_series(series_code, observation_start=start_date, observation_end=end_date)
        return pd.Series(series, name=series_code)
    except Exception as e:
        print(f"Error fetching {series_code} from FRED: {e}")
        return pd.Series(name=series_code)

def fetch_and_prepare_stock_data(symbol, start_date, end_date):
    """Fetch stock data from Yahoo Finance and prepare it with technical analysis features."""
    stock_data = yf.download(symbol, start=start_date, end=end_date)
    stock_data = add_all_ta_features(
        stock_data,
        open="Open", high="High", low="Low", close="Close", volume="Volume", fillna=True
    )
    stock_data["Target"] = (stock_data["Close"].shift(-1) > stock_data["Close"]).astype(int)
    stock_data["return"] = stock_data["Close"].pct_change()
    return stock_data.dropna()

def align_data(*dataframes):
    """Align multiple dataframes by their index."""
    aligned_df = pd.concat(dataframes, axis=1, join='inner')
    aligned_df.dropna(inplace=True)  # Ensure no NaN values after alignment
    return aligned_df

# Configuration and setup
fred_api_key = '1e074c3898d6261abc56ff5962358644'  # Replace with your FRED API key
fred = setup_fred(fred_api_key)
stock_symbol = 'AAPL'
start_date, end_date = '2018-01-01', '2024-01-01'

# Economic indicators to fetch
indicators = ['DFF', 'DGS10', 'VIXCLS', 'DCOILWTICO', 'DCOILBRENTEU', 'DTWEXBGS']

# Fetch and prepare economic indicators data
fred_data = pd.concat([fetch_fred_series(fred, code, start_date, end_date) for code in indicators], axis=1)

# Fetch and prepare stock data
aapl_data = fetch_and_prepare_stock_data(stock_symbol, start_date, end_date)

# Fetch additional data and align all
additional_symbols = ['^GSPC', 'GC=F', 'CL=F']
additional_data = [yf.download(symbol, start=start_date, end=end_date)['Close'].rename(symbol) for symbol in additional_symbols]

# Aligning all data
aligned_data = align_data(fred_data, aapl_data, *additional_data)

# Validation and display
if aligned_data.isnull().any().any():
    print("There are NaN values in the DataFrame.")
else:
    print("Data is 100% aligned with no NaN values.")

print(f"Shape of the DataFrame after aligning and cleaning: {aligned_data.shape}")
print(aligned_data.head())


  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
[*********************100%%**********************]  1 of 1 completed
  self._psar[i] = high2
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
[*********************100%%**********************]  1 of 1 completed
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
[*********************100%%**********************]  1 of 1 completed
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
[*********************100%%**********************]  1 of 1 completed

Data is 100% aligned with no NaN values.
Shape of the DataFrame after aligning and cleaning: (1459, 103)
             DFF  DGS10  VIXCLS  DCOILWTICO  DCOILBRENTEU  DTWEXBGS  \
2018-01-03  1.42   2.44    9.15       61.61         67.85  109.6751   
2018-01-04  1.42   2.46    9.22       61.98         68.73  109.4779   
2018-01-05  1.42   2.47    9.22       61.49         68.01  109.3496   
2018-01-08  1.42   2.49    9.52       61.73         68.48  109.5678   
2018-01-09  1.42   2.55   10.08       62.92         69.08  109.9022   

                 Open       High        Low      Close  ...  \
2018-01-03  43.132500  43.637501  42.990002  43.057499  ...   
2018-01-04  43.134998  43.367500  43.020000  43.257500  ...   
2018-01-05  43.360001  43.842499  43.262501  43.750000  ...   
2018-01-08  43.587502  43.902500  43.482498  43.587502  ...   
2018-01-09  43.637501  43.764999  43.352501  43.582500  ...   

            momentum_pvo_hist  momentum_kama  others_dr  others_dlr  \
2018-01-03        




In [12]:
from statsmodels.tsa.stattools import adfuller
from fracdiff.sklearn import FracdiffStat

def apply_fractional_differencing(
    df: pd.DataFrame, threshold: float = 0.05
) -> pd.DataFrame:
    f_diff = FracdiffStat()
    for column in df.columns.difference(["Target"]):
        p_value = adfuller(df[column])[1]
        if p_value > threshold:
            transformed_column = f_diff.fit_transform(df[[column]])
            df[column + "_fdiff"] = transformed_column.squeeze()
    return df

# Apply fractional differencing
aligned_data = apply_fractional_differencing(aligned_data)

  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[column + "_fdiff"] = transformed_column.squeeze()
  df[colum

In [13]:
print(aligned_data.head())

             DFF  DGS10  VIXCLS  DCOILWTICO  DCOILBRENTEU  DTWEXBGS  \
2018-01-03  1.42   2.44    9.15       61.61         67.85  109.6751   
2018-01-04  1.42   2.46    9.22       61.98         68.73  109.4779   
2018-01-05  1.42   2.47    9.22       61.49         68.01  109.3496   
2018-01-08  1.42   2.49    9.52       61.73         68.48  109.5678   
2018-01-09  1.42   2.55   10.08       62.92         69.08  109.9022   

                 Open       High        Low      Close  ...  \
2018-01-03  43.132500  43.637501  42.990002  43.057499  ...   
2018-01-04  43.134998  43.367500  43.020000  43.257500  ...   
2018-01-05  43.360001  43.842499  43.262501  43.750000  ...   
2018-01-08  43.587502  43.902500  43.482498  43.587502  ...   
2018-01-09  43.637501  43.764999  43.352501  43.582500  ...   

            volatility_dcl_fdiff  volatility_dcm_fdiff  volatility_kcc_fdiff  \
2018-01-03             42.314999             42.976250             43.023333   
2018-01-04              6.611719  

In [14]:
from sklearn.preprocessing import MinMaxScaler

def apply_min_max_normalization(data):
    features = data.drop(["Target"], axis=1)
    scaler = MinMaxScaler()
    normalized_features = scaler.fit_transform(features)
    normalized_columns = [f"{col}_Norm" for col in features.columns]
    normalized_df = pd.DataFrame(
        normalized_features, columns=normalized_columns, index=data.index
    )
    data_normalized = pd.concat([data, normalized_df], axis=1)
    return data_normalized

# Apply Min-Max normalization
aligned_data_normalized = apply_min_max_normalization(aligned_data)

In [15]:
aligned_data_normalized.head()

Unnamed: 0,DFF,DGS10,VIXCLS,DCOILWTICO,DCOILBRENTEU,DTWEXBGS,Open,High,Low,Close,...,volatility_dcl_fdiff_Norm,volatility_dcm_fdiff_Norm,volatility_kcc_fdiff_Norm,volatility_kch_fdiff_Norm,volatility_kcl_fdiff_Norm,volume_adi_fdiff_Norm,volume_nvi_fdiff_Norm,volume_obv_fdiff_Norm,volume_vpt_fdiff_Norm,volume_vwap_fdiff_Norm
2018-01-03,1.42,2.44,9.15,61.61,67.85,109.6751,43.1325,43.637501,42.990002,43.057499,...,1.0,1.0,1.0,1.0,1.0,0.474525,1.0,0.439474,0.528494,1.0
2018-01-04,1.42,2.46,9.22,61.98,68.73,109.4779,43.134998,43.3675,43.02,43.2575,...,0.231973,0.172354,0.150158,0.145777,0.167278,0.509224,0.301977,0.523347,0.533631,0.176818
2018-01-05,1.42,2.47,9.22,61.49,68.01,109.3496,43.360001,43.842499,43.262501,43.75,...,0.171971,0.11967,0.095392,0.093162,0.10466,0.552931,0.274056,0.55031,0.542999,0.107855
2018-01-08,1.42,2.49,9.52,61.73,68.48,109.5678,43.587502,43.9025,43.482498,43.587502,...,0.148845,0.097609,0.073331,0.07051,0.080754,0.425776,0.262326,0.419046,0.529829,0.079465
2018-01-09,1.42,2.55,10.08,62.92,69.08,109.9022,43.637501,43.764999,43.352501,43.5825,...,0.136379,0.085834,0.060997,0.058346,0.066844,0.480548,0.260349,0.384891,0.530713,0.063734


In [16]:
from sklearn.model_selection import train_test_split

x = aligned_data_normalized.drop(["Target"], axis=1)
y = aligned_data_normalized["Target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=False, test_size=0.1)

In [17]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

tscv = TimeSeriesSplit(n_splits=5)
rf = RandomForestClassifier(n_estimators=500)
rfecv = RFECV(estimator=rf, cv=tscv, scoring='accuracy')
rfecv.fit(x_train, y_train)

# Extracting optimal features
optimal_features = x_train.columns[rfecv.support_]
print(f"Optimal number of features: {len(optimal_features)}")
print(f"Optimal features: {list(optimal_features)}")

# Saving optimal features to CSV
optimal_features_df = pd.DataFrame(optimal_features, columns=["Feature"])
optimal_features_df.to_csv("optimal_features.csv", index=False)

Optimal number of features: 282
Optimal features: ['DFF', 'DGS10', 'VIXCLS', 'DCOILWTICO', 'DCOILBRENTEU', 'DTWEXBGS', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'volume_adi', 'volume_obv', 'volume_cmf', 'volume_fi', 'volume_em', 'volume_sma_em', 'volume_vpt', 'volume_vwap', 'volume_mfi', 'volume_nvi', 'volatility_bbm', 'volatility_bbh', 'volatility_bbl', 'volatility_bbw', 'volatility_bbp', 'volatility_bbhi', 'volatility_kcc', 'volatility_kch', 'volatility_kcl', 'volatility_kcw', 'volatility_kcp', 'volatility_kchi', 'volatility_kcli', 'volatility_dcl', 'volatility_dch', 'volatility_dcm', 'volatility_dcw', 'volatility_dcp', 'volatility_atr', 'volatility_ui', 'trend_macd', 'trend_macd_signal', 'trend_macd_diff', 'trend_sma_fast', 'trend_sma_slow', 'trend_ema_fast', 'trend_ema_slow', 'trend_vortex_ind_pos', 'trend_vortex_ind_neg', 'trend_vortex_ind_diff', 'trend_trix', 'trend_mass_index', 'trend_dpo', 'trend_kst', 'trend_kst_sig', 'trend_kst_diff', 'trend_ichimoku_conv', 'tren

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Evaluating the model on the test set
y_pred = rfecv.predict(x_test)

# Calculating performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

accuracy, precision, recall, f1, conf_matrix

(0.4657534246575342,
 0.6666666666666666,
 0.05,
 0.09302325581395349,
 array([[64,  2],
        [76,  4]], dtype=int64))