### YFinance 10 Yr Data Set

#### Imports

In [1]:
import numpy as np
import pandas as pd

#### Technical Indicators Price Data

In [2]:
# Load the CSV file into a DataFrame
df = pd.read_csv('../data/raw/yfinance_prices.csv', parse_dates=['date'])
df = df.sort_values('date').set_index('date')

# 1. Basic returns
df['ret_1d'] = df['close'].pct_change()                   # simple daily return
df['log_ret_1d'] = np.log(df['close'] / df['close'].shift(1))  # log return (to monitor compounding)

# 2. Moving averages + price/MA ratios
ma_windows = [5, 10, 20, 50, 200]

for w in ma_windows:
    df[f'ma_{w}'] = df['close'].rolling(w).mean()
    df[f'close_ma_{w}_ratio'] = df['close'] / df[f'ma_{w}']

# 3. Rolling volatility (std of returns)
vol_windows = [10, 20, 30]

for w in vol_windows:
    df[f'vol_{w}d'] = df['ret_1d'].rolling(w).std()

# 4. ATR (Average True Range, 14-day)
high_low = df['high'] - df['low']
high_close_prev = (df['high'] - df['close'].shift(1)).abs()
low_close_prev = (df['low'] - df['close'].shift(1)).abs()

true_range = pd.concat([high_low, high_close_prev, low_close_prev], axis=1).max(axis=1)
df['atr_14'] = true_range.rolling(14).mean()

# 5. Bollinger Bands (20-day, 2 std)
bb_window = 20

df['bb_mid_20'] = df['close'].rolling(bb_window).mean()
df['bb_std_20'] = df['close'].rolling(bb_window).std()
df['bb_upper_20'] = df['bb_mid_20'] + 2 * df['bb_std_20']
df['bb_lower_20'] = df['bb_mid_20'] - 2 * df['bb_std_20']
df['bb_width_20'] = (df['bb_upper_20'] - df['bb_lower_20']) / df['bb_mid_20']

# 6. RSI (Relative Strength Index, 14-day)
def compute_rsi(series: pd.Series, window: int = 14) -> pd.Series:
    delta = series.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)

    avg_gain = gain.rolling(window).mean()
    avg_loss = loss.rolling(window).mean()

    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

df['rsi_14'] = compute_rsi(df['close'], window=14)

# 7. MACD (12, 26, 9)
ema_12 = df['close'].ewm(span=12, adjust=False).mean()
ema_26 = df['close'].ewm(span=26, adjust=False).mean()

df['macd'] = ema_12 - ema_26
df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
df['macd_hist'] = df['macd'] - df['macd_signal']

# 8. Clean dataset for modeling (drop initial NaNs from rolling windows)
features_df = df.dropna().copy()

features_df.head()

# Display the first few rows with moving averages
df.head(50)

Unnamed: 0_level_0,ticker,open,high,low,close,volume,ret_1d,log_ret_1d,ma_5,close_ma_5_ratio,...,atr_14,bb_mid_20,bb_std_20,bb_upper_20,bb_lower_20,bb_width_20,rsi_14,macd,macd_signal,macd_hist
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,QQQ,95.662562,96.06822,94.445575,94.906555,31314600,,,,,...,,,,,,,,0.0,0.0,0.0
2015-01-05,QQQ,94.491703,94.602341,93.247058,93.514427,36521300,-0.014668,-0.014777,,,...,,,,,,,,-0.111053,-0.022211,-0.088842
2015-01-06,QQQ,93.652698,93.809429,91.845657,92.260536,66205500,-0.013409,-0.013499,,,...,,,,,,,,-0.29682,-0.077133,-0.219688
2015-01-07,QQQ,92.869027,93.671128,92.647752,93.44986,37577400,0.012891,0.012809,,,...,,,,,,,,-0.344107,-0.130527,-0.21358
2015-01-08,QQQ,94.242738,95.422846,94.141322,95.238457,40212600,0.01914,0.018959,93.873967,1.014535,...,,,,,,,,-0.234553,-0.151333,-0.083221
2015-01-09,QQQ,95.533494,95.561152,94.058353,94.611534,41410100,-0.006583,-0.006604,93.814963,1.008491,...,,,,,,,,-0.196059,-0.160278,-0.035781
2015-01-12,QQQ,94.795924,94.89734,93.385327,93.625038,34129800,-0.010427,-0.010482,93.837085,0.99774,...,,,,,,,,-0.24236,-0.176694,-0.065666
2015-01-13,QQQ,94.537769,95.533488,92.841359,93.597366,56162300,-0.000296,-0.000296,94.104451,0.994611,...,,,,,,,,-0.278081,-0.196972,-0.081109
2015-01-14,QQQ,92.656992,93.560516,92.260549,93.081093,51354200,-0.005516,-0.005531,94.030698,0.989901,...,,,,,,,,-0.344083,-0.226394,-0.117689
2015-01-15,QQQ,93.514386,93.661896,91.762661,91.873299,51005400,-0.012976,-0.013061,93.357666,0.9841,...,,,,,,,,-0.488221,-0.278759,-0.209462


### Sentiment Data

In [5]:
# Load the CSV file into a DataFrame
df_sentiment = pd.read_csv('../data/raw/gdelt_sentiment_QQQ.csv', parse_dates=['date'])
#df_sentiment = df_sentiment.sort_values('date').set_index('date')
df_sentiment.head(15)

Unnamed: 0,date,ticker,sentiment_score,sentiment_std,sentiment_min,sentiment_max,sentiment_positive,sentiment_negative,sentiment_neutral,news_count
0,2017-01-02,QQQ,-0.1589,0.275223,-0.4767,0.0,0.0,0.102333,0.897667,3
1,2017-01-03,QQQ,0.071327,0.258975,-0.6369,0.6486,0.067878,0.019102,0.913041,49
2,2017-01-04,QQQ,0.105075,0.221087,-0.2732,0.5859,0.071786,0.013786,0.914429,28
3,2017-01-05,QQQ,0.084076,0.317081,-0.4404,0.7351,0.086619,0.051095,0.862286,21
4,2017-01-06,QQQ,0.010221,0.225347,-0.5859,0.5267,0.049643,0.043929,0.906429,14
5,2017-01-07,QQQ,0.37585,0.436833,0.0,0.8126,0.18675,0.02775,0.7855,4
6,2017-01-08,QQQ,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3
7,2017-01-09,QQQ,0.152804,0.261446,-0.5106,0.5849,0.10176,0.01352,0.88476,25
8,2017-01-10,QQQ,0.1075,0.337307,-0.6908,0.6705,0.0853,0.0573,0.85745,20
9,2017-01-11,QQQ,0.134979,0.342094,-0.6808,0.6124,0.106474,0.049684,0.843842,19
