In [1]:
import yfinance as yf
import datetime
import pandas as pd

In [2]:
# Ticker symbols to build datasets for; later cells pick them by position.
tickers = ['AAPL', 'JNJ']
# Date window, as "YYYY-MM-DD" strings, passed through to the price-history query.
start_day = "2019-11-10"
end_day = "2024-11-10"

In [3]:
def scrape_stock_data(stock_symbol, start_day, end_day):
    """Download price history for one ticker and keep only the close prices.

    Args:
        stock_symbol: Ticker symbol handed to ``yf.Ticker``.
        start_day: Start of the requested window, forwarded as ``start``.
        end_day: End of the requested window, forwarded as ``end``.

    Returns:
        A single-column DataFrame holding the ``Close`` column of the
        history frame, indexed as returned by ``Ticker.history``.
    """
    price_history = yf.Ticker(stock_symbol).history(start=start_day, end=end_day)
    # Double brackets keep the result a DataFrame rather than a Series.
    return price_history[['Close']]

In [4]:
# Function to create buy/sell/hold labels with look-ahead logic
def create_labels(stock_data):
    """Label each day as buy (1), sell (-1) or hold (0) using the next day's close.

    The rule walks the rows in order while tracking whether a position is
    currently held:

    * not holding and tomorrow's close is higher -> ``1`` (buy), now holding
    * holding and tomorrow's close is lower      -> ``-1`` (sell), no longer holding
    * anything else (including equal or NaN closes) -> ``0`` (hold)

    Because every label depends on the *following* row, the labels use
    look-ahead information, and the final row (which has no following row)
    is dropped from the result.

    Args:
        stock_data: DataFrame with a ``Close`` column, ordered oldest to
            newest. It is not modified.

    Returns:
        A new DataFrame containing every row of ``stock_data`` except the
        last, with an integer ``Label`` column added (or overwritten if one
        already exists).
    """
    # Read the closes positionally once. This avoids a label-based lookup per
    # row and keeps working when the index contains duplicate labels.
    closes = stock_data['Close'].tolist()

    labels = []
    holding_stock = False
    # Pair each day with the next one; the last day has no pair and gets no label.
    for current_close, next_close in zip(closes, closes[1:]):
        signal = 0
        if next_close > current_close and not holding_stock:
            # Price rises tomorrow and we are flat: buy today.
            signal = 1
            holding_stock = True
        elif next_close < current_close and holding_stock:
            # Price falls tomorrow and we are holding: sell today.
            signal = -1
            holding_stock = False
        labels.append(signal)

    # Copy so the caller's frame is left untouched and the returned frame is
    # independent of it; the last row is excluded because it has no next day.
    labeled = stock_data.iloc[:-1].copy()
    # The Series shares the frame's index, so values are assigned row by row
    # without realignment; the explicit dtype keeps an empty result integer.
    labeled['Label'] = pd.Series(labels, index=labeled.index, dtype='int64')

    return labeled

In [5]:
def construct_dataset(stock_symbol, start_date, end_date, news_api_key=None):
    """Fetch close prices for a ticker and attach buy/sell/hold labels.

    Args:
        stock_symbol: Ticker symbol to download.
        start_date: Start of the date window passed to ``scrape_stock_data``.
        end_date: End of the date window passed to ``scrape_stock_data``.
        news_api_key: Accepted but not used by this function.

    Returns:
        The frame produced by ``create_labels`` from the downloaded closes.
    """
    close_prices = scrape_stock_data(stock_symbol, start_date, end_date)
    return create_labels(close_prices)

In [6]:
# Build the labeled close-price frame for the first ticker (AAPL) and preview its first rows.
aapl_df = construct_dataset(tickers[0], start_day, end_day)
print(aapl_df.head())

                               Close  Label
Date                                       
2019-11-11 00:00:00-05:00  63.549580      0
2019-11-12 00:00:00-05:00  63.491398      1
2019-11-13 00:00:00-05:00  64.099754     -1
2019-11-14 00:00:00-05:00  63.656219      1
2019-11-15 00:00:00-05:00  64.412422      0


In [7]:
# aapl_df.to_csv('../data/AAPL_FINANCIAL')

In [8]:
# Build the labeled close-price frame for the second ticker (JNJ) and preview its first rows.
jnj_df = construct_dataset(tickers[1], start_day, end_day)
print(jnj_df.head())

                                Close  Label
Date                                        
2019-11-11 00:00:00-05:00  115.011147      0
2019-11-12 00:00:00-05:00  114.462151      0
2019-11-13 00:00:00-05:00  114.392441      0
2019-11-14 00:00:00-05:00  114.122292      1
2019-11-15 00:00:00-05:00  117.590576     -1


In [9]:
# jnj_df.to_csv('../data/JNJ_FINANCIAL')