In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb


In [2]:
INPUT_FOLDER = "enrich"
OUTPUT_FOLDER = ""

In [3]:
key = "us_shareproce_joined_companies"

data = pd.read_csv(f"data/{INPUT_FOLDER}/{key}.csv")
data.info(show_counts=True)
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)  # Set Date as the index


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5459003 entries, 0 to 5459002
Data columns (total 11 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Date          5459003 non-null  object 
 1   Industry      5459003 non-null  object 
 2   Sector        5459003 non-null  object 
 3   Ticker        5459003 non-null  object 
 4   Open          5459003 non-null  float64
 5   High          5459003 non-null  float64
 6   Low           5459003 non-null  float64
 7   Close         5459003 non-null  float64
 8   Volume        5459003 non-null  int64  
 9   Dividend      5459003 non-null  float64
 10  Company Name  5459003 non-null  object 
dtypes: float64(5), int64(1), object(5)
memory usage: 458.1+ MB


In [4]:
ticker = 'AAPL'
last_date = last_date = data.index.max()
sixty_days_prior = last_date - pd.DateOffset(days=100)

# Filter to Ticker and time window
df_ticker = data[data['Ticker'] == ticker]
df_ticker = df_ticker[(df_ticker.index > sixty_days_prior) & (df_ticker.index <= last_date)]

# Rename target column
df_ticker = df_ticker.rename(columns={'Close': 'target'})


In [5]:
df_ticker.head()

Unnamed: 0_level_0,Industry,Sector,Ticker,Open,High,Low,target,Volume,Dividend,Company Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-12-26,Computer Hardware,Technology,AAPL,193.61,193.89,192.83,193.05,28919310,0.0,APPLE INC
2023-12-27,Computer Hardware,Technology,AAPL,192.49,193.5,191.09,193.15,47899806,0.0,APPLE INC
2023-12-28,Computer Hardware,Technology,AAPL,194.14,194.66,193.17,193.58,34049898,0.0,APPLE INC
2023-12-29,Computer Hardware,Technology,AAPL,193.9,194.4,191.72,192.53,42672148,0.0,APPLE INC
2024-01-02,Computer Hardware,Technology,AAPL,187.15,188.44,183.88,185.64,82488674,0.0,APPLE INC


In [6]:
df_ticker['year'] = df_ticker.index.year
df_ticker['month'] = df_ticker.index.month
df_ticker['day'] = df_ticker.index.day
df_ticker['dayofweek'] = df_ticker.index.dayofweek

df_ticker['lag1'] = df_ticker['target'].shift(1)
df_ticker['log_return'] = np.log(df_ticker['target'] / df_ticker['target'].shift(1))
df_ticker['MA_10'] = df_ticker['target'].rolling(10).mean()
df_ticker['MA_50'] = df_ticker['target'].rolling(50).mean()
df_ticker['Volatility'] = df_ticker['target'].rolling(10).std()

def compute_rsi(series, window=14):
    delta = series.diff()
    gain = delta.where(delta > 0, 0).rolling(window).mean()
    loss = -delta.where(delta < 0, 0).rolling(window).mean()
    rs = gain / (loss + 1e-10)
    return 100 - (100 / (1 + rs))

df_ticker['RSI'] = compute_rsi(df_ticker['target'])
df_ticker['BB_Upper'] = df_ticker['MA_10'] + 2 * df_ticker['Volatility']
df_ticker['BB_Lower'] = df_ticker['MA_10'] - 2 * df_ticker['Volatility']


In [7]:
df_ticker = df_ticker.drop(columns=['Industry', 'Sector', 'Ticker_y',"Company Name","Ticker"], errors='ignore').dropna()


In [8]:
df_ticker.head()

Unnamed: 0_level_0,Open,High,Low,target,Volume,Dividend,year,month,day,dayofweek,lag1,log_return,MA_10,MA_50,Volatility,RSI,BB_Upper,BB_Lower
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-03-07,169.15,170.73,168.49,169.0,71765061,0.0,2024,3,7,3,169.12,-0.00071,177.148,185.4606,5.744245,18.274979,188.63649,165.65951
2024-03-08,169.0,173.7,168.94,170.73,76267041,0.0,2024,3,8,4,169.0,0.010185,175.969,185.0142,5.729059,25.466102,187.427119,164.510881
2024-03-11,172.94,174.38,172.05,172.75,58929918,0.0,2024,3,11,0,170.73,0.011762,175.128,184.6062,5.494864,32.287897,186.117728,164.138272
2024-03-12,173.15,174.03,171.01,173.23,59544927,0.0,2024,3,12,1,172.75,0.002775,174.188,184.1992,4.833082,31.516877,183.854165,164.521835
2024-03-13,172.77,173.19,170.76,171.13,51948951,0.0,2024,3,13,2,173.23,-0.012197,173.159,183.7712,4.17252,23.133117,181.50404,164.81396


In [9]:
X = df_ticker.drop(columns=['target']).dropna()
y = df_ticker.loc[X.index, 'target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=5)
model.fit(X_train, y_train)

# Evaluate
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("Train MSE:", mean_squared_error(y_train, y_train_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))


Train MSE: 0.00012817019906174548
Test MSE: 4.032162731876421
