In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Load the dataset (replace with your actual file path)
df = pd.read_csv('data/2024-06-01_2024-09-01/aggregated/percent_return.csv')

# Extract stock names
stock_names = df.columns[1:]  # Assuming the first column is the date

print(df) #every 5th row is a friday

          date         A       AAL      AAPL      ABBV      ABNB       ABT  \
0   2024-06-03  0.889681 -0.603454  0.469502 -0.792589  0.349934  0.153790   
1   2024-06-04 -0.481854 -0.346620 -0.264434  0.295955  0.864075  0.081875   
2   2024-06-05  2.123945  0.955688  0.124639  0.634790 -1.486685 -0.237806   
3   2024-06-06  0.086028 -1.295334 -0.733231  0.933931  1.023988  0.416045   
4   2024-06-07  0.593907  1.232397  1.033843 -0.664010  0.314642  3.035119   
..         ...       ...       ...       ...       ...       ...       ...   
58  2024-08-26 -0.439370 -2.109303  0.185217 -0.060739 -0.975609 -0.035481   
59  2024-08-27  0.441351 -0.877194  0.898230 -1.045511  1.248911  0.035380   
60  2024-08-28 -0.332086 -0.196661 -0.627410 -0.061380  1.318964 -0.176832   
61  2024-08-29  0.323778  1.464849 -0.134730 -0.418371 -1.328594 -0.476612   
62  2024-08-30  0.534605  1.239277 -0.516965  0.542892  0.316397  0.238935   

        ACGL       ACN      ADBE  ...       WTW        WY      

In [2]:
# Prepare the data for Monday to Thursday (features) and Friday (target)
X = []
y = []

# Iterate over the dataset in chunks of 5 rows (representing one week)
for i in range(0, len(df), 5):
    if i + 4 >= len(df):  # Prevent going out of bounds
        break
    # Monday to Thursday data (features)
    X.extend(df.iloc[i:i+4, 1:].T.values.tolist()) # rows are companies
    #X += df.iloc[i:i+4, 1:].T.values
    # Friday data (target)
    y.extend(df.iloc[i+4, 1:].T.values.tolist())  # Friday returns as target

# Convert to numpy arrays
X = np.array(X)
y = np.array(y)

# Train-test split (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
print(y)
print(y_train)

[0.59390747 1.23239743 1.03384276 ... 0.         0.27238552 0.06594702]
[ 1.31331505 -1.14554792  1.2670157  ... -2.4526709   0.2984898
 -0.735158  ]


In [4]:
print(X) #many arrays of size 4
print(X_train[0])
print(X_train)
X_test.shape

[[ 0.8896813  -0.48185403  2.12394546  0.08602796]
 [-0.60345385 -0.34662012  0.9556877  -1.2953335 ]
 [ 0.46950243 -0.26443394  0.12463948 -0.73323124]
 ...
 [ 0.11606898  0.56747597  0.92105531 -0.59833719]
 [-1.05824727 -1.32656765  2.16462254 -1.49587749]
 [-0.74347986 -0.67608389 -1.07186192  0.28723171]]
[-0.73710197 -0.70710398  3.48648896 -2.57812927]
[[-0.73710197 -0.70710398  3.48648896 -2.57812927]
 [-1.33112282  0.02159564 -0.54288461 -2.12261179]
 [ 0.1646311  -0.18898568 -0.74940558 -2.14013712]
 ...
 [-1.77858887 -0.60129332  0.64930454  0.69509968]
 [-1.65513288 -0.49024986 -0.20232736  0.0897815 ]
 [ 3.20242991  1.38331594  1.52063763 -0.10624886]]


(1203, 4)

In [5]:
from sklearn.metrics import mean_squared_error

# Train an XGBoost regression:squared error model - fine tuning parameters; importance to task
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.04, max_depth = 4)
model.fit(X_train, y_train)

# Evaluate the model on the test set
predictions = model.predict(X_test)


# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"RMSE: {rmse}")

RMSE: 1.40396558616338


In [6]:
from sklearn.dummy import DummyRegressor

# Baseline model that predicts the mean
baseline_model = DummyRegressor(strategy="mean")
baseline_model.fit(X_train, y_train)
baseline_predictions = baseline_model.predict(X_test)

# Calculate RMSE for baseline
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_predictions))
print("Baseline RMSE:", baseline_rmse)
print("Model Improvement:", baseline_rmse - rmse)


Baseline RMSE: 1.448624667762448
Model Improvement: 0.04465908159906795


In [None]:
# Define the parameter grid based on suggestions - not included: subsample(default 1), sampling_method(default uniform)
param_dist = {
    'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2],
    'gamma': [0, 2, 4, 6, 8, 10],
    'max_depth': [3, 4, 5, 6, 7],
    'min_child_weight': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'max_delta_step': [0, 2, 4, 6, 8, 10],
    'lambda': [0, 1, 2, 4, 6],
    'n_estimators': [100, 150, 200, 250, 300],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

from sklearn.model_selection import GridSearchCV

# Initialize model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Set up GridSearchCV
grid_search = GridSearchCV(
    xgb_model, param_grid=param_dist,
    scoring='neg_root_mean_squared_error',
    cv=5, n_jobs=-1
)

# Fit the search
grid_search.fit(X_train, y_train)

# Output best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best RMSE:", -grid_search.best_score_)

best_model = -grid_search.best_estimator_