In [1]:
import numpy as np
import yfinance as yf
import warnings
import pandas as pd
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

def ST_labels(data, delta):
    """
    Calculate the stop-loss adjusted label.

    Parameters:
    - data: DataFrame containing historical asset prices.
    - delta: Maximum tolerance level for stop-loss trading.

    Returns:
    - Index of rows where the label is 1 (BUY).
    """
    return data[
        (data["Close"] / data["Close"].shift(1) > 1) & 
        ((data["Low"] / data["Close"].shift(1) - 1) * 100 >= -delta)
    ].index

def moving_average(series: pd.Series, window: int) -> pd.Series:
    return series.rolling(window).mean()

def exponential_moving_average(series: pd.Series, window: int) -> pd.Series:
    return series.ewm(span=window, adjust=False).mean()

def rate_of_change(series: pd.Series, window: int) -> pd.Series:
    shifted = series.shift(window)
    return (series - shifted) / (shifted + 1e-9)

def rsi(series: pd.Series, window: int) -> pd.Series:
    diff = series.diff(1)
    gain = diff.clip(lower=0)
    loss = diff.clip(upper=0).abs()
    avg_gain = gain.rolling(window).mean()
    avg_loss = loss.rolling(window).mean()
    rs = avg_gain / (avg_loss + 1e-9)
    return 100 - (100 / (1 + rs))

def macd_signal(series: pd.Series, short_window: int = 12, long_window: int = 26, signal_window: int = 9):
    ema_short = exponential_moving_average(series, short_window)
    ema_long = exponential_moving_average(series, long_window)
    macd_line = ema_short - ema_long
    signal_line = exponential_moving_average(macd_line, signal_window)
    return macd_line, signal_line

def stochastic_oscillator(high: pd.Series, low: pd.Series, close: pd.Series, window: int = 14):
    highest_high = high.rolling(window).max()
    lowest_low = low.rolling(window).min()
    k = (close - lowest_low) / (highest_high - lowest_low + 1e-9) * 100
    d = k.rolling(3).mean()
    return k, d

def williams_r(high: pd.Series, low: pd.Series, close: pd.Series, window: int = 14):
    highest_high = high.rolling(window).max()
    lowest_low = low.rolling(window).min()
    wr = (highest_high - close) / (highest_high - lowest_low + 1e-9) * -100
    return wr

def bollinger_bands(series: pd.Series, window: int = 20, num_std: float = 2.0):
    m_avg = series.rolling(window).mean()
    m_std = series.rolling(window).std(ddof=0)
    upper_band = m_avg + num_std * m_std
    lower_band = m_avg - num_std * m_std
    return m_avg, upper_band, lower_band

def cci(high: pd.Series, low: pd.Series, close: pd.Series, window: int = 20):
    typical_price = (high + low + close) / 3
    ma = typical_price.rolling(window).mean()
    md = (typical_price - ma).rolling(window).apply(lambda x: np.mean(np.abs(x)))
    cci_val = (typical_price - ma) / (md * 0.015 + 1e-9)
    return cci_val

def create_features_and_labels(df: pd.DataFrame) -> pd.DataFrame:
    df = df.sort_values('Date').reset_index(drop=True)

    features_df = df.copy()

    features_df['NextClose'] = features_df['Close'].shift(-1)
    features_df['Y'] = np.where(features_df['NextClose'] > features_df['Close'], 'BUY', 'SELL')
    features_df.dropna(subset=['NextClose'], inplace=True)

    for delta in [2, 3, 4, 5]:
        buy_index = ST_labels(features_df, delta)
        features_df[f'Y_{delta}'] = np.where(features_df.index.isin(buy_index), 'BUY', 'SELL')

    period_list = list(range(2, 31))
    technical_features = []

    for w in period_list:
        col_sma = f'SMA_{w}'
        features_df[col_sma] = moving_average(features_df['Close'], w)
        technical_features.append(col_sma)

        col_ema = f'EMA_{w}'
        features_df[col_ema] = exponential_moving_average(features_df['Close'], w)
        technical_features.append(col_ema)

        col_roc = f'ROC_{w}'
        features_df[col_roc] = rate_of_change(features_df['Close'], w)
        technical_features.append(col_roc)

        col_rsi = f'RSI_{w}'
        features_df[col_rsi] = rsi(features_df['Close'], w)
        technical_features.append(col_rsi)

        col_wr = f'WR_{w}'
        features_df[col_wr] = williams_r(features_df['High'], features_df['Low'], features_df['Close'], w)
        technical_features.append(col_wr)

        col_cci = f'CCI_{w}'
        features_df[col_cci] = cci(features_df['High'], features_df['Low'], features_df['Close'], w)
        technical_features.append(col_cci)

        col_bb_mid = f'BBmid_{w}'
        col_bb_up = f'BBup_{w}'
        col_bb_dn = f'BBdn_{w}'
        bb_mid, bb_up, bb_dn = bollinger_bands(features_df['Close'], w)
        features_df[col_bb_mid] = bb_mid
        features_df[col_bb_up] = bb_up
        features_df[col_bb_dn] = bb_dn
        technical_features += [col_bb_mid, col_bb_up, col_bb_dn]

        col_sto_k = f'StoK_{w}'
        col_sto_d = f'StoD_{w}'
        sto_k, sto_d = stochastic_oscillator(features_df['High'], features_df['Low'], features_df['Close'], w)
        features_df[col_sto_k] = sto_k
        features_df[col_sto_d] = sto_d
        technical_features += [col_sto_k, col_sto_d]

    macd_line, signal_line = macd_signal(features_df['Close'], 12, 26, 9)
    features_df['MACD'] = macd_line
    features_df['MACD_Signal'] = signal_line
    technical_features += ['MACD', 'MACD_Signal']

    for w in period_list:
        col_vol_sma = f'Volume_SMA_{w}'
        features_df[col_vol_sma] = moving_average(features_df['Volume'], w)
        technical_features.append(col_vol_sma)

        col_vol_ema = f'Volume_EMA_{w}'
        features_df[col_vol_ema] = exponential_moving_average(features_df['Volume'], w)
        technical_features.append(col_vol_ema)

    for w in period_list:
        col_range = f'Range_{w}'
        features_df[col_range] = (features_df['High'] - features_df['Low']).rolling(w).mean()
        technical_features.append(col_range)

        col_close_std = f'CloseStd_{w}'
        features_df[col_close_std] = features_df['Close'].rolling(w).std()
        technical_features.append(col_close_std)

    features_df.dropna(subset=technical_features, inplace=True)

    label_cols = ['Y', 'Y_2', 'Y_3', 'Y_4', 'Y_5']
    result_df = features_df[['Date'] + label_cols].join(
        features_df[technical_features]
    )
    return result_df

if __name__ == "__main__":
    start_date = "2020-01-01"
    end_date = "2025-01-31"

    tickers = [
        'BIIB', 'BA', 'AXP', 'SLB', 'COP', 'AVGO', 'TMO', 'NEE', 'NKE', 'MO',
        'WBA', 'QCOM', 'COST', 'ACN', 'CVS', 'T', 'CVX', 'HD', 'DUK', 'CL',
        'MMM', 'CSCO', 'BAC', 'LOW', 'BLK', 'MDLZ', 'PM', 'UNH', 'VZ', 'CAT',
        'NVDA', 'FDX', 'RTX', 'AIG', 'TMUS', 'INTC', 'PEP', 'TGT', 'GD', 'GS',
        'MDT', 'IBM', 'DIS', 'ORCL', 'COF', 'MSFT', 'KO', 'BKNG', 'V', 'LLY',
        'ADBE', 'AMZN', 'SBUX', 'BMY', 'MRK', 'XOM', 'F', 'JNJ', 'USB', 'AMT',
        'EXC', 'AAPL', 'SPG', 'TXN', 'PFE', 'PG', 'LMT', 'MCD', 'NFLX', 'UNP',
        'HON', 'C', 'GOOG', 'AMGN', 'JPM', 'MA', 'CMCSA', 'ABT', 'SO',
        'GILD', 'MET', 'MS', 'EMR', 'UPS', 'CRM', 'DHR', 'GOOGL', 'GE', 'WFC',
        'WMT'
    ]

    all_results = []

    for ticker in tickers:
        try:
            df = yf.download(
                ticker,
                start=start_date,
                end=end_date,
                group_by='column'
            )

            # MultiIndex인 경우 컬럼명 재설정
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = ['_'.join(col).strip() for col in df.columns.values]

            # 불러온 DataFrame에 필요한 컬럼명 매핑
            possible_cols = list(df.columns)
            rename_dict = {}
            for col in ['Open','High','Low','Close','Volume']:
                alt_name = col + '_' + ticker
                if alt_name in possible_cols:
                    rename_dict[alt_name] = col

            df.rename(columns=rename_dict, inplace=True)

            required_cols = {'Open','High','Low','Close','Volume'}
            if not required_cols.issubset(df.columns):
                continue

            df.reset_index(inplace=True)
            result_df = create_features_and_labels(df)
            result_df['Stock'] = ticker
            all_results.append(result_df)

        except Exception as e:
            print(e)
            continue

    if all_results:
        final_df = pd.concat(all_results, axis=0).reset_index(drop=True)
        print(final_df.head())
    else:
        print("There are No data")


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

        Date     Y   Y_2   Y_3   Y_4   Y_5       SMA_2       EMA_2     ROC_2  \
0 2020-03-26  SELL   BUY   BUY   BUY   BUY  294.960007  297.221886  0.104095   
1 2020-03-27   BUY  SELL  SELL  SELL  SELL  300.845001  296.907295  0.041301   
2 2020-03-30   BUY   BUY   BUY   BUY   BUY  306.440002  309.722435  0.036696   
3 2020-03-31  SELL  SELL   BUY   BUY   BUY  316.255005  314.160815  0.066150   
4 2020-04-01   BUY  SELL  SELL  SELL  SELL  304.824997  300.233598 -0.072312   

        RSI_2  ...  CloseStd_26   Range_27  CloseStd_27   Range_28  \
0  100.000000  ...    20.419658  17.487775    21.175643  17.217140   
1   70.905846  ...    19.440730  17.729999    20.089591  17.323569   
2   70.293797  ...    18.435967  18.205184    19.212699  17.929999   
3  100.000000  ...    18.047460  18.483702    18.262901  17.938570   
4    1.070205  ...    17.580483  18.947777    17.786678  18.648927   

   CloseStd_28   Range_29  CloseStd_29   Range_30  CloseStd_30  Stock  
0    21.566385  16.746549 

In [3]:
final_df.to_csv("./dataset/SNP.csv", index =False)

In [5]:
df_raw = final_df.copy()

In [6]:
df_raw[(df_raw['Date'] >= '2020-01-01') & (df_raw['Date'] <= '2022-12-31')]

Unnamed: 0,Date,Y,Y_2,Y_3,Y_4,Y_5,SMA_2,EMA_2,ROC_2,RSI_2,...,CloseStd_26,Range_27,CloseStd_27,Range_28,CloseStd_28,Range_29,CloseStd_29,Range_30,CloseStd_30,Stock
0,2020-03-26,SELL,BUY,BUY,BUY,BUY,294.960007,297.221886,0.104095,100.000000,...,20.419658,17.487775,21.175643,17.217140,21.566385,16.746549,21.694580,16.517665,21.828505,BIIB
1,2020-03-27,BUY,SELL,SELL,SELL,SELL,300.845001,296.907295,0.041301,70.905846,...,19.440730,17.729999,20.089591,17.323569,20.862351,17.067929,21.274301,16.617998,21.425392,BIIB
2,2020-03-30,BUY,BUY,BUY,BUY,BUY,306.440002,309.722435,0.036696,70.293797,...,18.435967,18.205184,19.212699,17.929999,19.827545,17.530688,20.569068,17.276665,20.966701,BIIB
3,2020-03-31,SELL,SELL,BUY,BUY,BUY,316.255005,314.160815,0.066150,100.000000,...,18.047460,18.483702,18.262901,17.938570,18.994445,17.682068,19.578269,17.304331,20.290805,BIIB
4,2020-04-01,BUY,SELL,SELL,SELL,SELL,304.824997,300.233598,-0.072312,1.070205,...,17.580483,18.947777,17.786678,18.648927,18.023623,18.116895,18.770400,17.862998,19.371189,BIIB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109095,2022-12-23,BUY,BUY,BUY,BUY,BUY,46.597221,46.656928,-0.009712,14.571861,...,1.120585,0.684315,1.099195,0.725904,1.082542,0.757157,1.235751,0.765641,1.263944,WMT
109096,2022-12-27,SELL,BUY,BUY,BUY,BUY,46.650751,46.657137,0.002300,99.999998,...,1.156899,0.662584,1.135575,0.674475,1.114487,0.714969,1.097347,0.745545,1.238991,WMT
109097,2022-12-28,BUY,SELL,SELL,SELL,SELL,46.248451,46.112153,-0.017250,1.562966,...,1.234204,0.671567,1.215038,0.670901,1.192814,0.682094,1.171323,0.720985,1.152774,WMT
109098,2022-12-29,SELL,BUY,BUY,BUY,BUY,45.979168,46.116501,-0.011543,25.443705,...,1.274099,0.669230,1.263036,0.662298,1.245135,0.661974,1.222878,0.673092,1.201647,WMT


In [7]:
df_raw[(df_raw['Date'] >= '2024-01-01')]

Unnamed: 0,Date,Y,Y_2,Y_3,Y_4,Y_5,SMA_2,EMA_2,ROC_2,RSI_2,...,CloseStd_26,Range_27,CloseStd_27,Range_28,CloseStd_28,Range_29,CloseStd_29,Range_30,CloseStd_30,Stock
948,2024-01-02,SELL,BUY,BUY,BUY,BUY,263.239990,264.946910,0.027362,83.162813,...,12.296138,5.175553,12.378082,5.098927,12.503269,5.022067,12.573234,4.994331,12.703425,BIIB
949,2024-01-03,SELL,SELL,SELL,SELL,SELL,265.569992,263.935632,0.018008,67.624823,...,12.427345,5.286293,12.525816,5.166068,12.622842,5.092411,12.761217,5.018330,12.844005,BIIB
950,2024-01-04,SELL,SELL,SELL,SELL,SELL,261.854996,261.498543,-0.027754,0.000000,...,12.111362,5.355552,12.450845,5.274997,12.566483,5.159308,12.679484,5.088331,12.832376,BIIB
951,2024-01-05,BUY,SELL,SELL,SELL,SELL,259.080002,259.086184,-0.021068,0.000000,...,11.474793,5.332959,12.020840,5.326782,12.374602,5.249997,12.506037,5.138997,12.633951,BIIB
952,2024-01-08,SELL,SELL,BUY,BUY,BUY,257.930008,258.348735,-0.008837,4.000244,...,11.026099,5.459626,11.372743,5.398925,11.929246,5.390686,12.295797,5.314330,12.441672,BIIB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109615,2025-01-23,BUY,BUY,BUY,BUY,BUY,93.520000,93.549773,0.007843,100.000000,...,1.469652,1.489868,1.477393,1.479065,1.524600,1.478640,1.542417,1.507180,1.531626,WMT
109616,2025-01-24,BUY,BUY,BUY,BUY,BUY,94.285000,94.356592,0.016411,100.000000,...,1.501293,1.487039,1.523552,1.492016,1.525181,1.481511,1.562961,1.481019,1.575060,WMT
109617,2025-01-27,SELL,BUY,BUY,BUY,BUY,96.080002,96.385532,0.038269,100.000000,...,1.743071,1.524076,1.778913,1.528216,1.780216,1.531601,1.767721,1.520127,1.784154,WMT
109618,2025-01-28,BUY,SELL,SELL,SELL,SELL,97.345001,96.988511,0.026699,95.999978,...,1.907443,1.505557,1.959766,1.509287,1.974917,1.513794,1.964041,1.517548,1.943260,WMT


In [8]:
df_raw.columns[:5]

Index(['Date', 'Y', 'Y_2', 'Y_3', 'Y_4'], dtype='object')

In [11]:
df_raw[['Y', 'Y_2', 'Y_3', 'Y_4']] = df_raw[['Y', 'Y_2', 'Y_3', 'Y_4']].apply(lambda x: x.map({'SELL': 0, 'BUY': 1}))

In [12]:
df_raw

Unnamed: 0,Y,Y_2,Y_3,Y_4
0,0,1,1,1
1,1,0,0,0
2,1,1,1,1
3,0,0,1,1
4,1,0,0,0
...,...,...,...,...
109615,1,1,1,1
109616,1,1,1,1
109617,0,1,1,1
109618,1,0,0,0
