In [150]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import os

# Load the returns table from the current working directory. The melt below
# requires a "date" column; every other column is treated as one ticker symbol.
filename = os.path.join(os.getcwd(), "percent_return_threeYear.csv")
df = pd.read_csv(filename)

# Symbol -> sector lookup; must provide "Symbol" and "Sector" columns
# (used by the merge and the groupby below).
filenameSector = os.path.join(os.getcwd(), "sectorsCompany.csv")
dfSector = pd.read_csv(filenameSector)

# Reshape wide (one column per symbol) -> long (one row per date/symbol pair),
# then attach each symbol's sector. The inner join drops symbols that have no
# entry in the sector table.
stacked_df = df.melt(id_vars=["date"], var_name="Symbol", value_name="Data")
merged_df = pd.merge(stacked_df, dfSector, on="Symbol", how="inner")

sector_groups = merged_df.groupby("Sector")
# Long-format frame per sector, with the now-redundant "Sector" column removed.
sector_dfs = {sector: group.drop(columns=["Sector"]) for sector, group in sector_groups}

# Wide-format frame per sector: a "date" column followed by one column per symbol.
dfSectors = {}

# The loop variable is deliberately NOT called `df`: reusing that name would
# rebind the raw frame loaded above to the last sector's long-format frame and
# silently change what later cells see when they read `df`.
for sector, sector_long_df in sector_dfs.items():
    pivoted_df = sector_long_df.pivot_table(
        index="date", columns="Symbol", values="Data", aggfunc="first"
    ).reset_index()
    dfSectors[sector] = pivoted_df

print(len(dfSectors))  # number of sector frames built (11 in the recorded run)

# Convenience aliases, one per sector. A missing sector name raises KeyError.
df_tech = dfSectors['Technology']
df_basicMaterials = dfSectors['Basic Materials']
df_communicationServices = dfSectors['Communication Services']
df_consumerCyclical = dfSectors['Consumer Cyclical']
df_consumerDefensive = dfSectors['Consumer Defensive']
df_energy = dfSectors['Energy']
df_healthcare = dfSectors['Healthcare']
df_industrials = dfSectors['Industrials']
df_real_estate = dfSectors['Real Estate']
df_utilities = dfSectors['Utilities']
df_financialServices = dfSectors['Financial Services']


11


In [139]:
# Dimensions (rows, columns) of whatever frame `df` is bound to when this cell runs.
df.shape

(1006, 493)

In [199]:
def build_weekly_samples(sector_df, days_per_week=5):
    """Turn a wide per-sector frame into (features, target) arrays.

    The frame is cut into consecutive, non-overlapping windows of
    ``days_per_week`` rows. For every window and every symbol column, the
    first ``days_per_week - 1`` values form one feature row and the last value
    is its target. Trailing rows that do not fill a whole window are dropped.

    Parameters
    ----------
    sector_df : pandas.DataFrame
        First column is skipped (the "date" column in this notebook); every
        remaining column is treated as one symbol's series.
    days_per_week : int, default 5
        Rows per window; must be at least 2.

    Returns
    -------
    features : numpy.ndarray, shape (n_windows * n_symbols, days_per_week - 1)
    targets : numpy.ndarray, shape (n_windows * n_symbols,)
        Rows are ordered window by window and, within a window, symbol by
        symbol in column order.

    Raises
    ------
    ValueError
        If ``days_per_week`` is smaller than 2.
    """
    if days_per_week < 2:
        raise ValueError("days_per_week must be at least 2")

    values = sector_df.iloc[:, 1:].to_numpy()      # (n_rows, n_symbols)
    n_symbols = values.shape[1]
    n_windows = len(values) // days_per_week       # incomplete tail is ignored

    # (window, day-in-window, symbol)
    windows = values[: n_windows * days_per_week].reshape(
        n_windows, days_per_week, n_symbols
    )
    # Put the symbol axis before the day axis so each output row is one
    # symbol's leading days within one window.
    features = windows[:, :-1, :].transpose(0, 2, 1).reshape(
        n_windows * n_symbols, days_per_week - 1
    )
    targets = windows[:, -1, :].reshape(n_windows * n_symbols)
    return features, targets


# Call build_weekly_samples with any other sector frame (df_tech, df_energy, ...)
# to build the per-sector model inputs instead of copy-pasting this cell.
# NOTE(review): windows are cut purely by row count, which presumes every five
# consecutive rows are one Monday-Friday trading week; shortened (holiday) weeks
# would shift the alignment -- confirm against the "date" column.
X, y = build_weekly_samples(df_financialServices)

# Train-test split (80% training, 20% testing).
# NOTE(review): this is a shuffled split over time-ordered samples, so rows from
# the same window can land in both train and test -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [200]:
# Dump the target vector built in the previous cell (NumPy abbreviates long arrays with "...").
print(y)


[ -6.41951183 -11.87933395 -11.60719056 ...  -0.50351207   0.21964489
   0.33913448]


In [201]:
# Eyeball the inputs: the full feature matrix (rows of four values), the first
# training row on its own, and then the whole training matrix.
for preview in (X, X_train[0], X_train):
    print(preview)

# Last expression of the cell: number of training targets.
y_train.shape

[[ -3.90664276  -3.74174451  -5.52781224  -7.35206823]
 [ -7.85557429  -8.97170169 -10.67704956  -9.06064071]
 [ -9.23615876  -8.38814381 -11.90581004 -10.70725759]
 ...
 [ -0.07946497  -1.09601965  -1.36186914   1.21841673]
 [ -0.1800716   -0.69364491  -0.16630175  -0.65898605]
 [ -1.88296658   0.8016432    0.38276051  -0.05171342]]
[-6.50953095 -3.49555009 -3.91276777 -7.53743451]
[[ -6.50953095  -3.49555009  -3.91276777  -7.53743451]
 [ -0.38637407  -3.21955444  -3.23582131  -1.93537966]
 [ -0.56999523  -3.43180574  -2.09577462  -3.73654988]
 ...
 [ -8.07991369  -9.78880011  -7.77502001  -8.15644529]
 [ -8.71299205  -8.10887118 -10.20830255  -8.54714609]
 [ -9.9899424   -8.08064182 -11.47602296  -2.10588012]]


(10612,)

In [202]:
from sklearn.dummy import DummyRegressor

# Hand-picked settings for the XGBoost regressor (squared-error objective).
xgb_params = {
    "objective": "reg:squarederror",
    "n_estimators": 300,
    "learning_rate": 0.007,
    "max_depth": 5,
}
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
predictions = model.predict(X_test)

# Reference point: a regressor that always answers with the mean of y_train.
baseline_model = DummyRegressor(strategy="mean")
baseline_model.fit(X_train, y_train)
baseline_predictions = baseline_model.predict(X_test)



In [203]:
### Model evaluation
from sklearn.metrics import mean_squared_error, r2_score

# Compute every metric first, then report them in one place.

# Root-mean-squared error on the test set: fitted model vs. mean baseline.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_predictions))

# Coefficient of determination for the same two sets of predictions.
model_r2 = r2_score(y_test, predictions)
baseline_r2 = r2_score(y_test, baseline_predictions)

print(f"Model RMSE: {rmse}")
print("Baseline RMSE:", baseline_rmse)
# Positive value means the model's error is lower than the baseline's.
print("Model Improvement:", baseline_rmse - rmse)

print(f"Model R^2 Score: {model_r2}")
print(f"Baseline R^2 Score: {baseline_r2}")

Model RMSE: 1.7064150024672238
Baseline RMSE: 4.300053989616015
Model Improvement: 2.5936389871487915
Model R^2 Score: 0.8425011553466777
Baseline R^2 Score: -0.00012864865095818523


In [204]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
import numpy as np

# Candidate values for each hyper-parameter; the search samples combinations
# from these lists.
param_dist = {
    'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2],
    'gamma': list(range(0, 11, 2)),            # 0, 2, ..., 10
    'max_depth': list(range(3, 8)),            # 3 .. 7
    'min_child_weight': list(range(1, 11)),    # 1 .. 10
    'max_delta_step': list(range(0, 11, 2)),   # 0, 2, ..., 10
    'lambda': [0, 1, 2, 4, 6],
    'n_estimators': list(range(100, 301, 50)), # 100, 150, ..., 300
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
}

# Base estimator whose parameters are overridden by each sampled combination.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# 100 sampled settings, each scored by 5-fold cross-validated R^2 on the
# training data, using all available cores; fixed seed for reproducibility.
search_options = dict(
    n_iter=100,
    scoring='r2',
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    **search_options,
)
random_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score.
print("Best Parameters:", random_search.best_params_)
print("Best R^2 Score:", random_search.best_score_)

# Estimator refitted with the best parameters found by the search.
best_model = random_search.best_estimator_

y_pred = best_model.predict(X_test)

# Spread of the tuned model's test-set predictions.
std_predicted_values = np.std(y_pred)
print("Standard Deviation of Predicted Values:", std_predicted_values)


Best Parameters: {'subsample': 0.5, 'n_estimators': 250, 'min_child_weight': 10, 'max_depth': 7, 'max_delta_step': 8, 'learning_rate': 0.02, 'lambda': 4, 'gamma': 10}
Best R^2 Score: 0.8608815421040038
Standard Deviation of Predicted Values: 3.9319568
