In [27]:
from utils import misc as misc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm



In [28]:
# Load the GME price history through the project helper and display it.
# The rendered frame below shows date, open, high, low, close, adjclose,
# volume and ticker columns.
df = misc.get_price_data('GME')
df

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,2002-02-13,2.406250,2.515000,2.381250,2.512500,1.691667,76216000,GME
1,2002-02-14,2.543750,2.548750,2.481250,2.500000,1.683250,11021600,GME
2,2002-02-15,2.500000,2.506250,2.462500,2.487500,1.674834,8389600,GME
3,2002-02-19,2.475000,2.475000,2.343750,2.387500,1.607504,7410400,GME
4,2002-02-20,2.400000,2.468750,2.381250,2.468750,1.662209,6892800,GME
...,...,...,...,...,...,...,...,...
5329,2023-04-17,22.270000,22.680000,22.139999,22.280001,22.280001,2066600,GME
5330,2023-04-18,22.139999,22.320000,21.500000,21.610001,21.610001,2748700,GME
5331,2023-04-19,21.280001,21.870001,20.959999,21.309999,21.309999,2539500,GME
5332,2023-04-20,20.879999,21.570000,20.059999,20.219999,20.219999,2977400,GME


In [29]:
# Look at the 'open' column on its own (a float64 Series, per the output below).
print(df['open'])


0        2.406250
1        2.543750
2        2.500000
3        2.475000
4        2.400000
          ...    
5329    22.270000
5330    22.139999
5331    21.280001
5332    20.879999
5333    20.200001
Name: open, Length: 5334, dtype: float64


In [30]:

# Stack every run of `window_size` consecutive 'open' values as one row of a
# 2-D array: one row per possible window start.
window_size = 5
rolling_array = np.array([
    df['open'].values[start:start + window_size]
    for start in range(len(df) - window_size + 1)
])
# Show the windows, their (n_windows, window_size) shape, and the source length.
print(rolling_array, rolling_array.shape, len(df), sep='\n')

[[ 2.40625     2.54375005  2.5         2.4749999   2.4000001 ]
 [ 2.54375005  2.5         2.4749999   2.4000001   2.46000004]
 [ 2.5         2.4749999   2.4000001   2.46000004  2.48125005]
 ...
 [22.5        22.73999977 22.27000046 22.13999939 21.28000069]
 [22.73999977 22.27000046 22.13999939 21.28000069 20.87999916]
 [22.27000046 22.13999939 21.28000069 20.87999916 20.20000076]]
(5330, 5)
5334


In [32]:
# Horizons, counted in rows of the price frame, for which the per-horizon
# features (returns, volatility, RSI, moving average) are generated in the
# per-ticker loop.
time_period = [5, 10, 15, 30]

def returns_over_time_period(df, t):
    """Add the simple return of ``close`` over the last ``t`` rows as ``returns<t>``.

    The value at row ``i`` is ``close[i] / close[i - t] - 1``. The first ``t``
    rows have no reference price and are NaN.

    The earlier implementation took ``pct_change().iloc[-1]`` inside a
    ``t + 1``-row rolling window. That is only the change between the last two
    rows of the window, so every ``returns<t>`` column held the one-row return
    regardless of ``t``.

    Args:
        df (pd.DataFrame): Frame with a numeric ``close`` column. Modified in place.
        t (int): Look-back distance in rows.

    Returns:
        pd.DataFrame: The same ``df`` object with the new column added.
    """
    close = df['close']
    # Ratio against the price t rows back; NaN wherever either endpoint is missing.
    df['returns' + str(t)] = close / close.shift(t) - 1
    return df


def volatility_over_time_period(df, t):
    """Add the rolling population standard deviation of ``returns1`` as ``volatility<t>``.

    The window spans ``t + 1`` rows and uses ``ddof=0``, which is what
    ``np.std`` computed in the earlier ``rolling(...).apply(np.std)`` form. The
    built-in rolling ``std`` gives the same statistic without calling back
    into Python once per window. Rows whose window is incomplete or contains a
    NaN are NaN.

    Args:
        df (pd.DataFrame): Frame that already has a ``returns1`` column.
            Modified in place.
        t (int): Horizon in rows. For ``t <= 1`` no column is added.

    Returns:
        pd.DataFrame: The same ``df`` object.

    Raises:
        KeyError: If ``t > 1`` and ``df`` has no ``returns1`` column.
    """
    if t > 1:
        df['volatility' + str(t)] = df['returns1'].rolling(t + 1).std(ddof=0)
    return df


def labelling(df):
    """Bucket ``returns1`` into three classes and store them in ``labels``.

    A normal distribution is fitted to the non-missing ``returns1`` values and
    half of its fitted standard deviation is used as the band width:
    ``1`` above ``+0.5 * std``, ``-1`` below ``-0.5 * std``, ``0`` otherwise.
    The band is centred on zero; the fitted mean is not used. Rows where
    ``returns1`` is missing receive no label (NaN after index alignment).

    Args:
        df (pd.DataFrame): Frame with a ``returns1`` column. Modified in place.

    Returns:
        pd.DataFrame: The same ``df`` object with ``labels`` added.
    """
    observed = df['returns1'].dropna()
    _, sigma = norm.fit(observed)
    band = 0.5 * sigma

    def to_class(value):
        # Strict inequalities: a return exactly on the band edge is neutral.
        if value > band:
            return 1
        if value < -band:
            return -1
        return 0

    df['labels'] = observed.map(to_class)
    return df


def moving_average(df, t):
    """Attach the ``t``-row simple moving average of ``close`` as column ``ma<t>``.

    Rows that do not yet have ``t`` observations behind them are NaN.

    Args:
        df (pd.DataFrame): Frame with a numeric ``close`` column. Modified in place.
        t (int): Number of rows in the averaging window.

    Returns:
        pd.DataFrame: The same ``df`` object with the new column added.
    """
    column = 'ma' + str(t)
    windows = df['close'].rolling(window=t)
    df[column] = windows.mean()
    return df


def calculate_rsi(df, t=14):
    """Add a simple-moving-average RSI of ``close`` and a threshold-crossing label.

    Two columns are written:

    * ``rsi<t>``: ``100 - 100 / (1 + avg_gain / avg_loss)``, where the averages
      are plain rolling means of the up-moves and down-moves over ``t`` rows.
    * ``rsi<t>label``: ``1.0`` on the row where RSI moves from below 30 to 30
      or above, ``-1.0`` on the row where it moves from above 70 to 70 or
      below, and ``0.0`` everywhere else.

    The first row has no price change. That missing change is kept as NaN, so
    the first RSI value appears at row ``t``, once ``t`` real changes exist.
    Previously the missing change was counted as a zero move, which produced
    a diluted RSI one row early. Any other missing change likewise makes the
    windows that contain it NaN.

    A window with no down-moves gives RSI 100; a window with no movement at
    all gives NaN (0 / 0).

    Args:
        df (pd.DataFrame): Frame with a numeric ``close`` column. Modified in place.
        t (int): Rolling window length in rows. Defaults to 14.

    Returns:
        pd.DataFrame: The same ``df`` object with both columns added.
    """
    delta = df['close'].diff()
    # clip() preserves NaN, unlike where(cond, 0), which replaced it with 0.
    gain = delta.clip(lower=0)
    loss = (-delta).clip(lower=0)

    avg_gain = gain.rolling(window=t).mean()
    avg_loss = loss.rolling(window=t).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    df['rsi' + str(t)] = rsi

    # Detect the rows on which RSI comes back through the 30 / 70 thresholds.
    previous = rsi.shift(1)
    crosses_30 = (previous < 30) & (rsi >= 30)
    crosses_70 = (previous > 70) & (rsi <= 70)

    # Float zeros by default. Comparisons against NaN are False, so rows with
    # an undefined RSI keep the neutral 0 without any extra filling.
    labels = pd.Series(0.0, index=rsi.index)
    labels[crosses_30] = 1
    labels[crosses_70] = -1

    df['rsi' + str(t) + 'label'] = labels
    return df

def generate_bollinger_bands(df, window=20, num_std=1):
    """
    Flag rows where ``close`` breaks out of the previous row's Bollinger band.

    The bands are the rolling mean of ``close`` plus/minus ``num_std`` rolling
    standard deviations. The current close and the previous close are both
    compared with the band values of the previous row:

    * ``-1`` when close moves from at-or-below the upper band to above it,
    * ``1`` when close moves from at-or-above the lower band to below it,
    * ``0`` otherwise, including rows where the band is not yet defined.

    Only the signal column is stored; the band values themselves are not
    written to ``df``.

    Args:
        df (pd.DataFrame): Frame with a numeric ``close`` column. Modified in place.
        window (int): Size of the rolling window for the mean and standard deviation.
        num_std (float): Number of standard deviations between the mean and each band.

    Returns:
        pd.DataFrame: The same ``df`` object with a ``bb_signals`` column added.
    """
    close = df['close']
    centre = close.rolling(window=window).mean()
    spread = num_std * close.rolling(window=window).std()

    # Every comparison uses the band as it stood one row earlier.
    prev_upper = (centre + spread).shift(1)
    prev_lower = (centre - spread).shift(1)
    prev_close = close.shift(1)

    breaks_above = (close > prev_upper) & (prev_close <= prev_upper)
    breaks_below = (close < prev_lower) & (prev_close >= prev_lower)

    signals = pd.Series(0, index=df.index)
    signals[breaks_above] = -1
    signals[breaks_below] = 1

    df['bb_signals'] = signals
    return df


# Earlier single-ticker driver, kept disabled. The per-ticker loop in a later
# cell performs these steps for every symbol in its stock list.
# NOTE(review): the frame displayed under this cell already has a 'returns5'
# column although nothing active here adds it, and the execution counter skips
# from 30 to 32 -- presumably a since-removed cell ran
# returns_over_time_period(df, 5). Confirm with Restart Kernel -> Run All.
# # Running the functions to generate indicators
# for t in time_period:
#     df = returns_over_time_period(df, t)
#     df = volatility_over_time_period(df, t)

# for t in range(2, 30):
#     df = moving_average(df, t)

# df = calculate_rsi(df)
# df = labelling(df)
# df['labels_shifted'] = df['labels'].shift(-1)

# Display the current state of the frame.
df

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker,returns5
0,2002-02-13,2.406250,2.515000,2.381250,2.512500,1.691667,76216000,GME,
1,2002-02-14,2.543750,2.548750,2.481250,2.500000,1.683250,11021600,GME,
2,2002-02-15,2.500000,2.506250,2.462500,2.487500,1.674834,8389600,GME,
3,2002-02-19,2.475000,2.475000,2.343750,2.387500,1.607504,7410400,GME,
4,2002-02-20,2.400000,2.468750,2.381250,2.468750,1.662209,6892800,GME,
...,...,...,...,...,...,...,...,...,...
5329,2023-04-17,22.270000,22.680000,22.139999,22.280001,22.280001,2066600,GME,-0.008014
5330,2023-04-18,22.139999,22.320000,21.500000,21.610001,21.610001,2748700,GME,-0.030072
5331,2023-04-19,21.280001,21.870001,20.959999,21.309999,21.309999,2539500,GME,-0.013883
5332,2023-04-20,20.879999,21.570000,20.059999,20.219999,20.219999,2977400,GME,-0.051150


In [33]:
# Summary statistics for the numeric columns of the current frame.
df.describe()

Unnamed: 0,open,high,low,close,adjclose,volume,returns5
count,5334.0,5334.0,5334.0,5334.0,5334.0,5334.0,5329.0
mean,8.839545,9.131511,8.550709,8.821232,7.409569,14566950.0,0.001541
std,10.349391,10.996241,9.725501,10.262652,10.445252,29603950.0,0.050818
min,0.7125,0.735,0.6425,0.7,0.638794,260000.0,-0.6
25%,3.375,3.468125,3.2825,3.378125,2.795769,5901400.0,-0.015495
50%,5.815,5.905,5.70625,5.8075,4.066694,9860400.0,0.000164
75%,9.74,9.9425,9.529375,9.72875,6.950881,15342100.0,0.015931
max,94.927498,120.75,72.877502,86.877502,86.877502,788631600.0,1.348358


In [34]:
stock_list = ['SPY', 'AAPL', 'MSFT', 'GME', 'AMC', 'BBBY', 'TSLA', 'PLTR']

# Build one feature file per ticker. `df`, `stock` and `t` remain bound to the
# last iteration's values once the loop finishes.
for stock in stock_list:
    # One-row returns come first: the volatility and labelling steps read 'returns1'.
    df = misc.get_price_data(stock).pipe(returns_over_time_period, 1)

    # Per-horizon indicators, always added in the same order so that every
    # exported file has the same column layout.
    for t in time_period:
        df = (
            df.pipe(returns_over_time_period, t)
            .pipe(volatility_over_time_period, t)
            .pipe(calculate_rsi, t)
            .pipe(moving_average, t)
        )

    df = df.pipe(generate_bollinger_bands, window=20, num_std=1).pipe(labelling)

    # Pair each row with the label of the row that follows it.
    df['labels_shifted'] = df['labels'].shift(-1)

    # Drop rows with any missing value (indicator warm-up rows and the final
    # row, which has no following label), then keep rows dated on or before
    # the cutoff.
    df = df.dropna()
    df = df[df['date'] <= '2023-03-15']

    df.to_csv('datasets/stock_price_series/' + stock + '.csv')



In [35]:
# Column layout of the exported feature frame. `df` here is whatever the
# per-ticker loop left bound on its last iteration.
df.columns.values

array(['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume',
       'ticker', 'returns1', 'returns5', 'volatility5', 'rsi5',
       'rsi5label', 'ma5', 'returns10', 'volatility10', 'rsi10',
       'rsi10label', 'ma10', 'returns15', 'volatility15', 'rsi15',
       'rsi15label', 'ma15', 'returns30', 'volatility30', 'rsi30',
       'rsi30label', 'ma30', 'bb_signals', 'labels', 'labels_shifted'],
      dtype=object)