In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import json
import requests
from yahoofinancials import YahooFinancials

In [4]:
class StockDataProcessor:
    """Build a per-ticker feature table from Yahoo Finance downloads.

    ``process_data`` downloads the ticker's price history, derives moving
    averages, MACD, RSI, rolling volatility and daily returns, then attaches
    the adjusted close of ^TNX, ^VIX, the three most correlated symbols found
    in the S&P 500 CSV, ^IXIC and ^GSPC.  Every externally downloaded series
    is matched to the ticker's rows by index label, never by position.
    """

    def __init__(self, ticker, start_date, end_date, sp500_file='../data/sp500_historical_data.csv'):
        """Store the query parameters and load the S&P 500 history CSV.

        Args:
            ticker: Symbol to download; also looked up among the CSV's
                ``Symbol`` values by ``get_similar_stocks``.
            start_date: Passed as ``start`` to every ``yf.download`` call.
            end_date: Passed as ``end`` to every ``yf.download`` call.
            sp500_file: Path read with ``pd.read_csv``.  ``highest_correlation``
                expects it to provide ``Date``, ``Symbol`` and ``Adj Close``
                columns.
        """
        self.ticker = ticker
        self.start_date = start_date
        self.end_date = end_date
        self.sp500 = pd.read_csv(sp500_file)
        # Filled in by process_data(); save_data() requires it to be set.
        self.stock_data = None

    def _download(self, symbol):
        """Download ``symbol`` from Yahoo Finance over the configured date range."""
        return yf.download(symbol, start=self.start_date, end=self.end_date)

    @staticmethod
    def _attach_aligned(stock_data, column, series):
        """Store ``series`` in ``stock_data[column]``, matched by index label.

        Rows of ``stock_data`` whose label is absent from ``series`` receive
        NaN; labels that exist only in ``series`` are ignored.  A positional
        assignment would instead raise on a length mismatch or silently put
        values on the wrong rows when the two indexes differ.

        Returns:
            The same ``stock_data`` object, modified in place.
        """
        stock_data[column] = series.reindex(stock_data.index)
        return stock_data

    def get_stock_data(self):
        """Get historical stock data for the given ticker."""
        return self._download(self.ticker)

    def calculate_technical_indicators(self, stock_data):
        """Add SMA, MACD and RSI columns computed from ``Adj Close``.

        Adds ``SMA_50``, ``SMA_200``, ``MACD``, ``MACD_signal`` and ``RSI_14``
        to ``stock_data`` in place and returns it.
        """
        adj_close = stock_data['Adj Close']

        # Simple moving averages; NaN until a full window is available.
        stock_data['SMA_50'] = adj_close.rolling(window=50).mean()
        stock_data['SMA_200'] = adj_close.rolling(window=200).mean()

        # MACD: 12-span EMA minus 26-span EMA, with a 9-span EMA signal line.
        stock_data['MACD'] = adj_close.ewm(span=12, adjust=False).mean() - adj_close.ewm(span=26, adjust=False).mean()
        stock_data['MACD_signal'] = stock_data['MACD'].ewm(span=9, adjust=False).mean()

        # RSI over 14 rows, using plain rolling means of gains and losses.
        delta = adj_close.diff(1)
        gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
        rs = gain / loss
        stock_data['RSI_14'] = 100 - (100 / (1 + rs))

        return stock_data

    def calculate_volatility(self, stock_data):
        """Add ``Volatility``: the 20-row rolling std of the daily percentage change."""
        stock_data['Volatility'] = stock_data['Adj Close'].pct_change().rolling(window=20).std()
        return stock_data

    def calculate_returns(self, stock_data):
        """Add ``Daily_Return``: the row-over-row percentage change of ``Adj Close``."""
        stock_data['Daily_Return'] = stock_data['Adj Close'].pct_change()
        return stock_data

    def get_bond_interest_rate(self, stock_data):
        """Attach the ^TNX adjusted close as ``interest_rate``, aligned by date.

        Dates present in ``stock_data`` but missing from the ^TNX download
        are left as NaN.
        """
        interest_rate = self._download('^TNX')
        return self._attach_aligned(stock_data, 'interest_rate', interest_rate['Adj Close'])

    def get_bond_volatility_rate(self, stock_data):
        """Attach the ^VIX adjusted close as ``vix``, aligned by date.

        NOTE: ^VIX is the CBOE equity-market volatility index rather than a
        bond measure; the method name is kept for backward compatibility.
        Dates missing from the ^VIX download are left as NaN.
        """
        vix = self._download('^VIX')
        return self._attach_aligned(stock_data, 'vix', vix['Adj Close'])

    def highest_correlation(self, df, target_stock):
        """Find the top 3 correlated stocks with the target stock.

        Args:
            df: Long-format frame with ``Date``, ``Symbol`` and ``Adj Close``
                columns, one row per (date, symbol) pair.
            target_stock: Value of ``Symbol`` to correlate against; it must
                match the spelling used in ``df``.

        Returns:
            The three symbols with the highest correlation to
            ``target_stock`` (the target itself excluded), upper-cased and
            ordered from most to least correlated.
        """
        pivot_df = df.pivot(index='Date', columns='Symbol', values='Adj Close')
        corr_matrix = pivot_df.corr()
        top_3_corr = corr_matrix[target_stock].drop(target_stock).sort_values(ascending=False).head(3)
        return [symbol.upper() for symbol in top_3_corr.index.tolist()]

    def get_similar_stocks(self, stock_data):
        """Attach the adjusted close of the three most correlated symbols.

        Each series is stored under the lower-cased symbol name and aligned
        to ``stock_data`` by date.
        """
        for symbol in self.highest_correlation(self.sp500, self.ticker):
            historical_data = self._download(symbol)
            self._attach_aligned(stock_data, symbol.lower(), historical_data['Adj Close'])
        return stock_data

    def get_index_data(self, stock_data):
        """Attach ^IXIC as ``nsdq`` and ^GSPC as ``spy``, aligned by date.

        Dates missing from either index download are left as NaN.
        """
        nsdq = self._download('^IXIC')
        spy = self._download('^GSPC')
        self._attach_aligned(stock_data, 'nsdq', nsdq['Adj Close'])
        self._attach_aligned(stock_data, 'spy', spy['Adj Close'])
        return stock_data

    def process_data(self):
        """Run the complete pipeline and store the result in ``self.stock_data``."""
        # Step 1: Get historical stock data
        self.stock_data = self.get_stock_data()

        # Step 2: Calculate technical indicators
        self.stock_data = self.calculate_technical_indicators(self.stock_data)

        # Step 3: Calculate volatility and returns
        self.stock_data = self.calculate_volatility(self.stock_data)
        self.stock_data = self.calculate_returns(self.stock_data)

        # Step 4: Add bond interest rate and volatility data
        self.stock_data = self.get_bond_interest_rate(self.stock_data)
        self.stock_data = self.get_bond_volatility_rate(self.stock_data)

        # Step 5: Add similar stocks and index data
        self.stock_data = self.get_similar_stocks(self.stock_data)
        self.stock_data = self.get_index_data(self.stock_data)

        # Step 6: Drop unused columns
        self.stock_data = self.stock_data.drop(['High', 'Low', 'Close'], axis=1)

    def save_data(self):
        """Write ``self.stock_data`` to ``../data/stock_data_<ticker>.csv``.

        ``process_data`` must have been called first; otherwise
        ``self.stock_data`` is still ``None`` and this raises ``AttributeError``.
        """
        filename = f'../data/stock_data_{self.ticker}.csv'
        self.stock_data.to_csv(filename)
        print(f"Data saved to {filename}")

In [7]:
# Driver cell: build and save the feature table for one ticker.
# The three settings stay as module-level names so later cells can reuse them.
ticker, start_date, end_date = 'JNJ', '2018-01-01', '2024-08-01'

# Constructing the processor also loads the default S&P 500 CSV.
processor = StockDataProcessor(ticker, start_date, end_date)
processor.process_data()  # runs every download and derives all feature columns
processor.save_data()     # writes the CSV and prints its path

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Data saved to ../data/stock_data_JNJ.csv



