In [None]:
import os
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from queue import Queue

import pandas as pd
import requests
from scipy.stats import zscore

In [None]:
# Polygon API setup
# The key is read from the POLYGON_API_KEY environment variable so it never has
# to be pasted into (and saved with) the notebook. When the variable is not set
# it falls back to the same empty placeholder as before.
api_key = os.environ.get('POLYGON_API_KEY', '')
base_url = 'https://api.polygon.io/v2'

In [None]:
def get_sp500_symbols():
    """Return the S&P 500 ticker symbols as listed on Wikipedia.

    Reads the HTML tables of the "List of S&P 500 companies" page (network
    access required) and takes the ``Symbol`` column of the first table.

    Returns:
        list: The values of that ``Symbol`` column, in page order.
    """
    wiki_page = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    page_tables = pd.read_html(wiki_page, header=0)
    constituents = page_tables[0]
    return constituents['Symbol'].tolist()

In [None]:
# Load the S&P 500 universe from Wikipedia (needs network access).
# NOTE: `symbols` is rebound to the Dow 30 list by the example-usage cell further down.
symbols = get_sp500_symbols()

In [None]:
def get_dow30_symbols():
    """Return the Dow Jones Industrial Average ticker symbols from Wikipedia.

    Reads the HTML tables of the "Dow Jones Industrial Average" page (network
    access required) and takes the ``Symbol`` column of the second table.

    Returns:
        list: The values of that ``Symbol`` column, in page order.
    """
    wiki_page = "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average"
    page_tables = pd.read_html(wiki_page, header=0)
    # The components are read from the table at position 1.
    components = page_tables[1]
    return components['Symbol'].tolist()

In [None]:
# Example usage
# NOTE: this rebinds `symbols`, replacing the S&P 500 list loaded earlier with
# the Dow 30 list; the download cell below iterates over whatever `symbols`
# holds at that point.
symbols = get_dow30_symbols()
print(symbols)

# Open question: add alternative (non-price) data, e.g. the social sentiment fetched in the next cell?

In [None]:
import requests
import pandas as pd

def get_historical_sentiment(ticker, api_key, page=10):
    """Fetch one page of historical social-sentiment records for ``ticker``.

    Queries the Financial Modeling Prep v4 ``historical/social-sentiment``
    endpoint and wraps the JSON payload in a DataFrame.

    Args:
        ticker: Symbol to query, e.g. ``"NVDA"``.
        api_key: Financial Modeling Prep API key.
        page: Result page to request (default 10).

    Returns:
        pandas.DataFrame built from the JSON payload, or ``None`` when the
        request fails (connection error, timeout or non-200 status). A short
        failure message is printed in that case.
    """
    url = "https://financialmodelingprep.com/api/v4/historical/social-sentiment"
    # Let requests assemble the query string so every value is URL-encoded.
    query = {"symbol": ticker, "apikey": api_key, "page": page}

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, params=query, timeout=30)
    except requests.RequestException as exc:
        # Only the exception type is printed: the full message can contain the
        # request URL, which includes the API key.
        print(f"Failed to fetch data: {type(exc).__name__}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        return None

    # Convert the data to a pandas DataFrame for easier handling
    return pd.DataFrame(response.json())

# Example usage
# The FMP key gets its own name: the Polygon downloader (`fetch_data_for_ticker`)
# reads the module-level `api_key`, so rebinding that name here would make the
# Polygon requests go out with the FMP key.
# Set the FMP_API_KEY environment variable, or replace the empty fallback.
fmp_api_key = os.environ.get("FMP_API_KEY", "")
ticker = "NVDA"
df = get_historical_sentiment(ticker, fmp_api_key)
print(df)


In [None]:
# Preview the first rows of the sentiment frame (fails if the fetch above returned None).
df.head()

In [None]:
# Line plot of the 'stocktwitsSentiment' column against the frame's row index.
df['stocktwitsSentiment'].plot(figsize=(15, 5), title=f"StockTwits sentiment for {ticker}")

In [None]:
# Define the function to fetch data for a single ticker
def fetch_data_for_ticker(ticker, start_date, end_date, df):
    """Download daily bars for ``ticker`` from Polygon and store them in ``df``.

    The period is requested in consecutive windows of at most 1830 days. For
    every bar returned, the volume (``v``) and close (``c``) are written into
    ``df`` at the row key ``(ticker, bar timestamp)``.

    Args:
        ticker: Symbol to download.
        start_date: ``datetime`` at which the download starts.
        end_date: ``datetime`` at which the download ends.
        df: DataFrame modified in place; rows are keyed by
            ``(ticker, timestamp)`` and the ``volume`` / ``price`` columns
            are filled.

    Returns:
        None. Errors are printed; after two failed requests the ticker is
        abandoned and whatever was already written stays in ``df``.
    """
    window = timedelta(days=1830)
    attempts = 0
    start = start_date
    while start < end_date:
        # Ask only for the current window. Requesting start..end_date on every
        # pass would download the overlapping tail again for each window.
        window_end = min(start + window, end_date)
        try:
            start_str = start.strftime('%Y-%m-%d')
            end_str = window_end.strftime('%Y-%m-%d')
            print(f"Fetching {ticker} data from {start_str} to {end_str}")

            response = requests.get(
                f"{base_url}/aggs/ticker/{ticker}/range/1/day/{start_str}/{end_str}",
                params={'apiKey': api_key},
                timeout=30,  # never let a stalled connection block the worker
            )
            response.raise_for_status()

            # A window without bars may carry no 'results' entry at all; treat
            # that as "nothing to store" rather than as a failed request.
            data = response.json().get('results') or []
            for day in data:
                # 't' is a millisecond Unix timestamp.
                # NOTE(review): fromtimestamp() converts to the machine's local
                # timezone, so the stored timestamps depend on where this runs.
                date = datetime.fromtimestamp(day['t'] / 1000)
                df.loc[(ticker, date), 'volume'] = day['v']
                df.loc[(ticker, date), 'price'] = day['c']

            # Continue with the day after the window that was just stored.
            start = window_end + timedelta(days=1)

        except Exception as e:
            # `start` is left untouched, so the same window is retried once.
            print(f"Error fetching data for {ticker}: {e}")
            attempts += 1
            if attempts >= 2:
                print(f"Skipping {ticker} after 2 failed attempts.")
                break

In [None]:
# Define the time period
end_date = datetime.now()
start_date = end_date - timedelta(days=15*365)

# Create an empty DataFrame with MultiIndex
columns = ['volume', 'price']
index = pd.MultiIndex(levels=[[],[]], codes=[[],[]], names=['ticker', 'timestamp'])
df = pd.DataFrame(columns=columns, index=index)

# Kept so the name stays defined; nothing in this notebook ever puts items on
# the queue, so there is nothing to drain from it.
results_queue = Queue()


def _download_ticker_frame(ticker):
    """Fetch one ticker into its own empty frame and return that frame."""
    ticker_frame = pd.DataFrame(columns=columns, index=index)
    fetch_data_for_ticker(ticker, start_date, end_date, ticker_frame)
    return ticker_frame


# Fetch in parallel. Each worker fills a private frame, so no DataFrame is
# grown by two threads at once. Materialising the map with list() also
# re-raises any exception that escaped a worker instead of dropping it.
with ThreadPoolExecutor(max_workers=2) as executor:
    ticker_frames = list(executor.map(_download_ticker_frame, symbols))

# Combine the per-ticker frames; `df` stays the empty frame when nothing was fetched.
filled_frames = [frame for frame in ticker_frames if not frame.empty]
if filled_frames:
    df = pd.concat(filled_frames)

# Save the DataFrame
df.to_csv('historical_stock_data.csv')

In [None]:
# Plot the price series of (at most) the first five symbols in the frame and
# print each symbol as it is drawn. `count` ends up as the number plotted.
count = 0
first_symbols = df.index.get_level_values(0).unique()[:5]
for count, symbol in enumerate(first_symbols, start=1):
    df.loc[symbol, 'price'].plot()
    print('Symbol:', symbol)

In [None]:
import os
# The project root differs per machine, so it can be overridden through the
# ALPACA_RL_TRADER_DIR environment variable instead of editing this cell.
#   linux example:   /home/joe/Python/tradingbot/gym/alpaca_rl_trader
#   windows default: C:/Users/JoeBa/Documents/python/alpaca_rl_trader/
project_dir = os.environ.get(
    'ALPACA_RL_TRADER_DIR', 'C:/Users/JoeBa/Documents/python/alpaca_rl_trader/')

# Work from the project root (presumably so the project-local `data` package
# imported below resolves). Raises if the directory does not exist.
os.chdir(project_dir)

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
from fracdiff import fdiff

In [None]:
from data.data_loader import load_data, SplitOption

In [None]:
# symbols = ['MMM', 'AXP', 'AMGN', 'AAPL', 'BA', 'CAT', 'CVX', 'CSCO', 'KO', 'DIS', 'GS', 'HD', 'HON',
#            'IBM', 'INTC', 'JNJ', 'JPM', 'MCD', 'MRK', 'MSFT', 'NKE', 'PG', 'CRM', 'TRV', 'UNH', 'VZ', 'V', 'WMT']

# Get test data
# NOTE(review): `load_data` / `SplitOption` are project-local. What `ratio`
# means under NO_SPLIT and what an empty `symbols` list selects cannot be seen
# from here (presumably "every symbol in the table") — confirm in data_loader.
# The cells below treat the result as a dict keyed by symbol (e.g. 'X:ETHUSD').
data = load_data(
    ratio=0.8, split_option=SplitOption.NO_SPLIT, symbols=[], table_name='crypto_data_hourly')

In [None]:
# List the keys stored under the 'X:ETHUSD' entry (later cells read 'f_*' names from these).
data['X:ETHUSD'].keys()

In [None]:
# Draw the 'f_vmar' series of every symbol, one figure per symbol
import matplotlib.pyplot as plt

for symbol, symbol_data in data.items():
    fig, ax = plt.subplots()
    ax.plot(symbol_data['f_vmar'])
    ax.set_title(symbol)

# Next: check that the features are unique, i.e. not highly correlated with each other

In [None]:

# Rebuild one DataFrame per ticker from the `data` dictionary and tag each with
# its ticker, so the frames can later be combined under a MultiIndex.
reconstructed_dfs = [
    pd.DataFrame(col_data).assign(ticker=ticker)
    for ticker, col_data in data.items()
]

# Stack the per-ticker frames on top of each other.
concatenated_df = pd.concat(reconstructed_dfs)

# Index by (ticker, original index) and sort by that MultiIndex.
multiindex_df = concatenated_df.set_index(
    ['ticker', concatenated_df.index]).sort_index()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only the feature columns, i.e. those whose name starts with 'f_'
f_columns = list(filter(lambda name: name.startswith('f_'), multiindex_df.columns))

# Pairwise correlation between those features
feature_frame = multiindex_df[f_columns]
correlation_matrix = feature_frame.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=ax)
plt.show()


In [None]:
# Absolute correlation above which two features are treated as redundant
threshold = 0.8

# Boolean mask over the strict upper triangle: entry (i, j) with i < j is True
# when features i and j are correlated beyond the threshold (NaN never is).
high_corr_pairs = np.triu(correlation_matrix.abs().to_numpy() > threshold, k=1)

# For every such pair the earlier feature (row i) is the one marked for removal;
# a different rule for choosing which of the two to drop could be used instead.
to_drop = set(correlation_matrix.columns[high_corr_pairs.any(axis=1)])

In [None]:
# what features need to be dropped
# Show the set of feature names flagged as highly correlated with a later feature.
to_drop

In [None]:
# Scratch check: natural log of a value just above 1 (approximately 1e-5).
np.log(1.00001)

In [None]:
def scale_action_vector(action_vector):
    """Map tanh-range actions onto [0, 1] with a dead zone at the low end.

    The interval [-0.7, 1] is rescaled linearly onto [0, 1]. Inputs below -0.7
    (the dead zone down to -1) are clipped to 0 and inputs above 1 to 1.

    Args:
        action_vector: NumPy array of raw actions.

    Returns:
        NumPy array of the same shape with values in [0, 1].
    """
    lower, upper = -0.7, 1
    span = upper - lower
    rescaled = (action_vector - lower) / span
    return np.clip(rescaled, 0, 1)

def bin_action_values(action_vector):
    """Snap every action to the nearest value on a 0.025-spaced grid.

    The grid starts at 0 and stops short of 1, so a single action can never be
    binned to 100% utilization of the portfolio.

    Args:
        action_vector: NumPy array of actions; it is flattened before binning.

    Returns:
        1-D NumPy array with one binned value per input element.
    """
    grid = np.arange(0, 1.0, 0.025)
    # Column vector minus grid broadcasts to an (n_actions, n_bins) distance table.
    distances = np.abs(action_vector.reshape(-1, 1) - grid)
    nearest = distances.argmin(axis=1)
    return grid[nearest]

def normalize_action_vector(action_vector):
    """Rescale the action vector so that its entries sum to 1.

    A vector whose entries sum to 0 cannot be normalised (e.g. every action
    was scaled to 0). Dividing would produce NaN/inf, so an all-zero vector
    of the same shape is returned instead.

    Args:
        action_vector: Array-like of action weights.

    Returns:
        NumPy array of the same shape: ``action_vector / sum(action_vector)``,
        or all zeros when the sum is 0.
    """
    values = np.asarray(action_vector)
    total = np.sum(values)
    if total == 0:
        # `values / 1` only supplies the dtype true division would have produced.
        return np.zeros_like(values / 1)

    # scale sum of action vector to 1
    return values / total

In [None]:
# Example usage
#actions_vector = np.array([-1, -1, -1, -1, 0, 0.1, 1])  # Sample vector
actions_vector = np.array([-1, -0.8, -0.7, -0.5, 0, 0.5, 1])  # Sample vector
# Pipeline: dead-zone scaling to [0, 1] -> normalise to sum 1 -> snap to the 0.025 grid.
# NOTE(review): the names are misleading — `scaled_binned_actions` holds the
# normalised (not yet binned) weights and `bin_binned_actions` the binned ones.
# Because binning happens after normalising, the printed weights need not sum to exactly 1.
actions_vector = scale_action_vector(actions_vector)
scaled_binned_actions = normalize_action_vector(actions_vector)
bin_binned_actions = bin_action_values(scaled_binned_actions)
print(bin_binned_actions)

In [None]:
# Scaled actions (output of scale_action_vector; the raw sample was overwritten above).
actions_vector

In [None]:
# Normalised weights (output of normalize_action_vector, before binning).
scaled_binned_actions

In [None]:
# Final weights after snapping to the 0.025 grid.
bin_binned_actions

In [None]:
    
class TradingBot:
    """Builds decayed, frame-stacked observations from the module-level ``data``.

    ``data`` is read as ``data[symbol][feature_name][index]``. Each call to
    :meth:`next_observation` adds one frame to a bounded history in which older
    frames are scaled down by ``decay_factor`` on every step.
    """

    def __init__(self, decay_factor=0.9, frame_stack=10):
        """Store the decay and history settings and start with an empty history.

        Args:
            decay_factor: Multiplier applied to every stored frame each time a
                new frame is added.
            frame_stack: Maximum number of frames kept in the history.
        """
        # Past frames, oldest first, already decayed
        self.observations = []
        self.frame_stack = frame_stack
        self.decay_factor = decay_factor
        # Feature keys read from `data` for every symbol, in observation order
        self.features = [
            'f_percentage_change_zscore',
            'f_dollar_volume_zscore',
            'f_sma',
            'f_fractional_difference_price',
            'f_vmar',
            'f_cumulative_return',
        ]

    def next_observation(self, index=None):
        """Build the frame for row ``index`` and return the stacked history.

        Args:
            index: Position used to index every feature series in ``data``.

        Returns:
            float32 array of shape ``(num_symbols, 2 + num_features, frames)``
            where ``frames`` is the number of frames currently kept (at most
            ``frame_stack``). Along the feature axis the first two entries are
            the unrealized P/L and holdings placeholders, followed by
            ``self.features`` in order.
        """
        symbol_names = list(data.keys())
        n_symbols = len(symbol_names)

        # No positions are tracked here, so both rows are all zeros.
        unrealized_pl_array = np.zeros(n_symbols, dtype=np.float32)
        holdings_array = np.zeros(n_symbols, dtype=np.float32)

        # One row per quantity, each holding that quantity for every symbol
        rows = [unrealized_pl_array, holdings_array]
        for feature_name in self.features:
            values = [data[symbol][feature_name][index] for symbol in symbol_names]
            rows.append(np.array(values, dtype=np.float32))

        # (2 + num_features, num_symbols) -> clip to [-3, 3] -> (num_symbols, 2 + num_features)
        current_observation = np.clip(np.stack(rows, axis=0), -3, 3).T

        # Age every stored frame by one step, then add the new one and keep
        # only the most recent `frame_stack` frames.
        history = [past * self.decay_factor for past in self.observations]
        history.append(current_observation)
        self.observations = history[-self.frame_stack:]

        # Frames become the last axis of the returned array.
        return np.stack(self.observations, axis=-1)

In [None]:
# Fresh bot with the default settings (decay_factor=0.9, frame_stack=10) and an empty frame history.
tb = TradingBot()

In [None]:
# Build the observation for row 212; this also adds that frame to tb's history.
# (Nothing is plotted here — the plotting happens in a later cell.)
x = tb.next_observation(index=212)

In [None]:
# Shape is (num_symbols, 2 + num_features, frames kept so far).
x.shape

In [None]:
# Row cursor for the stepping/plotting cell below, which increments it before each use.
count = 212

In [None]:
import matplotlib.pyplot as plt
# Step one row forward and rebuild the stacked observation.
# NOTE: every run of this cell advances `count` and adds a frame to tb's history.
count = count + 1
x = tb.next_observation(index=count)

# x is indexed as [symbol, feature, frame] (e.g. shape (15, 8, 10));
# pick the symbol to inspect, here the first one.
symbol_index = 0
num_features, num_frames = x.shape[1], x.shape[2]

# One stacked subplot per feature
fig, axs = plt.subplots(num_features, 1, figsize=(10, 20))

frames = range(num_frames)
for i, ax in enumerate(axs):
    # Trace of feature i across the kept frames for the chosen symbol
    ax.plot(frames, x[symbol_index, i, :], marker='o')
    ax.set_title(f'Feature {i+1} over Time for Symbol {symbol_index+1}')
    ax.set_xlabel('Frame')
    ax.set_ylabel('Feature Value')

plt.tight_layout()
plt.show()
