# <b><font color="#03396c">JPMC TEAM #1</font></b>
## <b>Price Prediction using XGBOOST</b>
In this notebook, we will create dataframes of the S&P 500 stocks' weekly price percentage changes for the year 2022. We will then use the clusters we found in our cluster exploration to test our hypothesis: "Training the XGBoost model on clusters of stocks, as opposed to all the stocks at once, will produce more accurate price predictions."

In [None]:
!pip install ipykernel

In [None]:
!pip install --upgrade xgboost

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import yfinance as yf

In [None]:
# Read the S&P 500 constituents table (first table on the Wikipedia page).
data = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#S%26P_500_component_stocks')
table_symbol = data[0]
table_industry = data[0]['GICS Sector']

# Tickers excluded from the analysis.
symbols_to_remove = ['BF.B', 'BRK.B', 'KVUE', 'VLTO','SPY']

# Positions (in the unfiltered table) of the excluded tickers.
remove = [i for i, symbol in enumerate(table_symbol.Symbol.values)
          if symbol in symbols_to_remove]

# Filter symbol/sector PAIRS in one pass so the two lists stay aligned.
# Deleting by precomputed index from a shrinking list shifts later positions,
# and list.remove(value) deletes the first entry with that sector name rather
# than the entry at the intended position.
kept_pairs = [
    (symbol, industry)
    for symbol, industry in zip(table_symbol.Symbol.values, table_industry.values)
    if symbol not in symbols_to_remove
]

#these 2 lists are in order with one another
symbols = [symbol for symbol, _industry in kept_pairs]
industries = [industry for _symbol, industry in kept_pairs] ##Industry Sector

print(len(symbols))

symbol_industry = dict(zip(symbols,industries))
print(len(symbol_industry))

In [None]:
# Daily AAPL history for calendar year 2022.
# `period` is intentionally not passed: the explicit start/end dates already
# define the window, and supplying period together with start and end is
# contradictory (NOTE(review): recent yfinance releases reject that
# combination - confirm against the installed version).
tickers = yf.Tickers('AAPL')

APPLdf = tickers.tickers['AAPL'].history(start="2022-01-01", end="2022-12-31")
APPLdf.head(10)

In [None]:
# Keep only the closing price, then cut each index label down to the text
# before the first space (the date part of the timestamp string).
drop_cols = ['Open', 'Low', 'High','Volume', 'Dividends', 'Stock Splits']
close_only = APPLdf.drop(columns=drop_cols)
date_labels = close_only.index.astype(str).str.split(' ').str[0]
close_only.index = date_labels
APPLdf = close_only
APPLdf

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One x-axis tick per calendar month present in the data, labelled with the
# abbreviated month name.
unique_months = pd.to_datetime(APPLdf.index).to_period('M').unique()
month_starts = unique_months.to_timestamp()
month_labels = unique_months.strftime('%b')

# Single-panel figure holding the closing-price line.
fig = make_subplots(rows=1, cols=1, subplot_titles=('Price of APPL for 2022',))
close_trace = go.Scatter(x=APPLdf.index, y=APPLdf['Close'], name='Close Price')
fig.add_trace(close_trace, row=1, col=1)
fig.update_xaxes(tickmode='array', tickvals=month_starts, ticktext=month_labels)

fig.show()

In [None]:
# Dates this notebook treats as 2022 market holidays.
# Every date is zero-padded 'YYYY-MM-DD': the labels are later mixed into
# string indexes and sorted lexically, where an unpadded day such as
# '2022-07-4' would sort after '2022-07-29' instead of before '2022-07-05'.
holidays = ["2022-01-17","2022-02-21","2022-04-15", "2022-05-30", "2022-06-20", "2022-07-04", "2022-09-05", "2022-11-24","2022-12-26"]
len(holidays)

In [None]:
# One row per holiday date: index named 'Date', a single all-NaN 'Close'
# column, so the rows can be concatenated with a price frame.
holiday_index = pd.Index(holidays, name='Date')
holiday_df = pd.DataFrame({'Close': float('nan')}, index=holiday_index)

# Show the column dtypes of the placeholder frame.
holiday_df.dtypes

In [None]:
# Look up the AAPL row labelled "2022-11-25".
APPLdf.loc["2022-11-25"]

In [None]:
# Stack the AAPL closes and the NaN holiday placeholders into one frame.
frames = [APPLdf, holiday_df]
combined = pd.concat(frames)

# Parse the labels to timestamps so the sort below is chronological.
combined.index = pd.to_datetime(combined.index)
result_df = combined.sort_index()

# APPL1: AAPL closes with a NaN row on each holiday date.
APPL1 = result_df
APPL1


In [None]:
# The index must hold timestamps so week number and weekday can be read off.
APPL1.index = pd.to_datetime(APPL1.index)

# Empty grid: one row per week number 1..52, one column per weekday.
week_labels = [f'APPL Week {week}' for week in range(1, 53)]
day_labels = ['Day 1', 'Day 2', 'Day 3', 'Day 4', 'Day 5']
new_df = pd.DataFrame(index=week_labels, columns=day_labels)

# Write each available close into its (week, weekday) cell; rows whose close
# is NaN leave their cell empty.
for stamp, close in APPL1['Close'].items():
    if pd.isna(close):
        continue
    new_df.at[f'APPL Week {stamp.week}', f'Day {stamp.dayofweek + 1}'] = close

# Show the weeks that have at least one empty cell.
APPL = new_df
nan_rows = new_df[new_df.isna().any(axis=1)]
nan_rows

In [None]:
# The index must hold timestamps so week number and weekday can be read off.
APPL1.index = pd.to_datetime(APPL1.index)

# Empty grid: one row per week number 1..52, one column per weekday.
week_labels = [f'APPL Week {week}' for week in range(1, 53)]
day_labels = ['Day 1', 'Day 2', 'Day 3', 'Day 4', 'Day 5']
new_df = pd.DataFrame(index=week_labels, columns=day_labels)

# Write each available close into its (week, weekday) cell.
for stamp, close in APPL1['Close'].items():
    if pd.isna(close):
        continue
    new_df.at[f'APPL Week {stamp.week}', f'Day {stamp.dayofweek + 1}'] = close

# Keep only complete weeks, then also discard week 1.
new_df = new_df.dropna()
APPL = new_df.drop(['APPL Week 1'], axis=0)
APPL


In [None]:
# Day-over-day percentage change of the AAPL close.
# The forward fill is explicit (as in the other pct_change calls in this
# notebook) instead of relying on pct_change's implicit padding, which pandas
# has deprecated. A holiday row therefore shows a 0 change, and the following
# trading day is compared against the last real close.
APPLPCT1 = APPL1.ffill().pct_change()
APPLPCT1

In [None]:
# Reuse APPL1's index, parsed as timestamps, for the percentage-change frame.
APPLPCT1.index = pd.to_datetime(APPL1.index)

# Empty grid: one row per week number 1..52, one column per weekday.
pc_week_labels = [f'APPL PC Week {week}' for week in range(1, 53)]
day_labels = ['Day 1', 'Day 2', 'Day 3', 'Day 4', 'Day 5']
new_df = pd.DataFrame(index=pc_week_labels, columns=day_labels)

# Write each available percentage change into its (week, weekday) cell.
for stamp, change in APPLPCT1['Close'].items():
    if pd.isna(change):
        continue
    new_df.at[f'APPL PC Week {stamp.week}', f'Day {stamp.dayofweek + 1}'] = change

# Exact zeros are turned into missing values so the dropna below removes the
# weeks that contain them.
# NOTE(review): this also discards any genuine 0% trading day - confirm intended.
new_df.replace(0, pd.NA, inplace=True)
new_df = new_df.dropna()

APPLPCT = new_df
APPLPCT.shape

In [None]:
# Daily MMM history for calendar year 2022.
# `period` is not passed: start/end already define the window, and giving
# all three is contradictory (NOTE(review): recent yfinance releases reject
# that combination - confirm against the installed version).
tickers = yf.Tickers('MMM')

MMMdf = tickers.tickers['MMM'].history(start="2022-01-01", end="2022-12-31")
drop_cols = ['Open', 'Low', 'High','Volume', 'Dividends', 'Stock Splits']
MMMdf = MMMdf.drop(columns=drop_cols)
# Cut each index label down to the text before the first space (the date part).
MMMdf.index = MMMdf.index.astype(str).str.split(' ').str[0]

# Add the NaN holiday placeholder rows.
frames = [MMMdf, holiday_df]
result_df = pd.concat(frames)

# The index stays as strings here, so this sort is lexical; it is only
# chronological when every label is a zero-padded 'YYYY-MM-DD' date.
result_df = result_df.sort_index()

MMM1 = result_df
# Forward-fill across the holiday rows before taking the daily percentage change.
pct = MMM1.ffill().pct_change()
pct

In [None]:
# Daily percentage change of MMM1 after forward-filling missing rows.
pct = MMM1.ffill().pct_change()
pct

In [None]:
# Ticker universe: the 'Symbol' column of the first table on Wikipedia's
# list of S&P 500 companies.
sp500_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
sp500 = pd.read_html(sp500_url)[0]
sp500_tickers = sp500['Symbol'].tolist()

# NOTE: no tickers are removed from the list here; every symbol is attempted by the helper below.
# Function to fetch data and create price percentage DataFrame
def get_price_percentage_df(ticker):
    """
    Download 2022 daily closes for ``ticker`` and return their day-over-day
    percentage change.

    Parameters:
    - ticker: symbol passed to ``yf.download``.

    Returns:
    - A one-column DataFrame whose column is named after ``ticker`` and whose
      index holds the date part of each timestamp as a string. The first row
      is NaN because it has no previous close.
    - An empty DataFrame (after printing the error) if anything fails.
    """
    try:
        stock_data = yf.download(ticker, start="2022-01-01", end="2022-12-31")
        # Select 'Close' directly instead of dropping the other fields by
        # name: dropping a column that is not present (e.g. 'Adj Close')
        # raises KeyError, which made every ticker come back empty.
        close = stock_data['Close']
        # When the columns are a (field, ticker) MultiIndex the selection is a
        # one-column DataFrame rather than a Series.
        if isinstance(close, pd.DataFrame):
            close = close.iloc[:, 0]
        # Keep only the text before the first space of each index label.
        close.index = close.index.astype(str).str.split(' ').str[0]
        result_df = close.ffill().pct_change().to_frame(name=f"{ticker}")
        return result_df

    except Exception as e:
        # Best effort: report the failure and let the caller skip this ticker.
        print(f"Error fetching data for {ticker}: {e}")
        return pd.DataFrame()

# <b>Create the cluster DataFrames</b>

In [None]:

# Build one percentage-change frame per ticker, skipping tickers whose
# download came back empty.
fetched_frames = (get_price_percentage_df(symbol) for symbol in sp500_tickers)
pct_dfs = [frame for frame in fetched_frames if not frame.empty]

# Place the per-ticker columns side by side, aligned on the shared date index.
sp500_pct_df = pd.concat(pct_dfs, axis=1)

sp500_pct_df

In [None]:
# Load the cluster labels and keep the rows up to and including index label 496.
df_labels = pd.read_csv('df_labels.csv')
df_labels = df_labels.loc[:496]
comp = list(df_labels['Company'])

# Keep only the columns whose ticker appears in the label file.
has_label = sp500_pct_df.columns.isin(comp)
sp500_pct_df = sp500_pct_df.loc[:, has_label]
sp500_pct_df


In [None]:
# Dates this notebook treats as 2022 market holidays.
# Every date is zero-padded 'YYYY-MM-DD': these labels are merged into a
# string index that is sorted lexically, where '2022-07-4' / '2022-09-5'
# would land at the end of their month and shift the 5-row weekly chunks.
holidays = ["2022-01-17","2022-02-21","2022-04-15", "2022-05-30", "2022-06-20", "2022-07-04", "2022-09-05", "2022-11-24","2022-12-26"]
columns = sp500_pct_df.columns

# All-NaN placeholder rows, one per holiday, with the same columns as
# sp500_pct_df and an index named 'Date'.
holiday_df = pd.DataFrame(index=pd.Index(holidays, name='Date'), columns=columns)
holiday_df.shape

In [None]:
# Append the holiday placeholder rows and re-sort by the index labels; both
# names end up bound to the same sorted frame.
sp500_pct_df = pd.concat([sp500_pct_df, holiday_df], axis=0).sort_index()
result_df = sp500_pct_df

In [None]:
# Transposed view for inspection: one row per ticker, one column per date.
sp500_pct_df.T

In [None]:
# Values of the first ticker (first row of the transposed frame).
company_values = sp500_pct_df.T.iloc[0].values

# Empty 52 x 5 grid labelled "MMM Week 1" .. "MMM Week 52".
weeks = [f"MMM Week {i}" for i in range(1, 53)]
MMMweeks = pd.DataFrame(index=weeks, columns=["Day 1", "Day 2", "Day 3", "Day 4", "Day 5"])

# Fill the grid with consecutive chunks of five values, one chunk per week.
for week_number, start in enumerate(range(0, len(company_values), 5), start=1):
    MMMweeks.loc[f"MMM Week {week_number}"] = company_values[start:start + 5]

#MMMweeks

In [None]:
#MMMdf.pct_change().head(20)

In [None]:
dfs = []

# Transpose once: one row per stock. The transpose does not depend on the
# loop variable, so recomputing it for every stock was wasted work.
pct_by_stock = sp500_pct_df.T

# Build a (week x day) frame for each stock.
for stock in pct_by_stock.index:
    # Daily values for the current stock, in index order.
    stock_values = pct_by_stock.loc[stock].values

    # Row labels "<stock> Week 1" .. "<stock> Week 52".
    weeks = [f"{stock} Week {i}" for i in range(1, 53)]

    # Empty grid to receive the values.
    stock_df = pd.DataFrame(index=weeks, columns=["Day 1", "Day 2", "Day 3", "Day 4", "Day 5"])

    # Consecutive chunks of five values become one week each.
    for i in range(0, len(stock_values), 5):
        week_number = (i // 5) + 1
        week_name = f"{stock} Week {week_number}"
        stock_df.loc[week_name] = stock_values[i:i + 5]

    # Tag every row with its stock before the frame is collected.
    stock_df['Stock'] = stock
    dfs.append(stock_df)

# Stack all stocks into one long frame.
result_df = pd.concat(dfs)

# Drop weeks with any missing day.
result_df = result_df.dropna()

result_df


In [None]:
# Cluster sizes noted by the authors: cluster1 = 391, cluster2 = 27, cluster3 = 79
# Reload the labels, keeping the rows up to and including index label 496.
df_labels = pd.read_csv('df_labels.csv').loc[:496]

# One label sub-frame per cluster id (1, 2, 3).
cluster1, cluster2, cluster3 = (
    df_labels[df_labels['Cluster_Label'] == label] for label in (1, 2, 3)
)
df_labels

In [None]:
# Scale 20874 by the fraction 79/497.
# NOTE(review): 79 and 497 look like cluster 3's size and the number of
# labelled companies, and 20874 a total row count - confirm and name these.
b = 79/497
c = b * 20874
c

In [None]:
# Company tickers belonging to each cluster.
cluster1_comp = list(cluster1['Company'])
cluster2_comp = list(cluster2['Company'])
cluster3_comp = list(cluster3['Company'])

# Subset of the weekly frame for each cluster, selected through the 'Stock' column.
in_cluster1 = result_df['Stock'].isin(cluster1_comp)
in_cluster2 = result_df['Stock'].isin(cluster2_comp)
in_cluster3 = result_df['Stock'].isin(cluster3_comp)

cluster1_df = result_df[in_cluster1]
cluster2_df = result_df[in_cluster2]
cluster3_df = result_df[in_cluster3]

In [None]:
# Remove the 'Stock' helper column so only the numeric day columns remain.
# Each frame is rebound to a new object instead of being mutated in place:
# the cluster frames are filtered slices of result_df, and an in-place drop
# on them triggers SettingWithCopyWarning. errors='ignore' makes the cell
# safe to re-run after the column is already gone.
cluster1_df = cluster1_df.drop(columns=['Stock'], errors='ignore')
cluster2_df = cluster2_df.drop(columns=['Stock'], errors='ignore')
cluster3_df = cluster3_df.drop(columns=['Stock'], errors='ignore')
result_df = result_df.drop(columns=['Stock'], errors='ignore')
result_df

In [None]:
# Display the weekly frame for cluster 2.
cluster2_df

In [None]:
# List of eleven week numbers.
# NOTE(review): presumably the weeks removed from each stock (52 - 11 = 41);
# the list is not referenced by any other visible cell - confirm.
dropweeks = [1,3,8,15,22,25,26,27,36,47,52]
len(dropweeks)

### <b>There are 41 weeks of trading data per company</b>

In [None]:
# From here on sp500_pct_df refers to the weekly (week x day) frame in result_df.
sp500_pct_df = result_df

In [None]:
# Inspect the column dtypes before the float conversion.
sp500_pct_df.dtypes

In [None]:

# Cast the full frame and the three cluster frames to float64.
sp500_pct_df, cluster1_df, cluster2_df, cluster3_df = (
    frame.astype('float64')
    for frame in (sp500_pct_df, cluster1_df, cluster2_df, cluster3_df)
)

In [None]:
# Confirm the column dtypes of the cluster 1 frame.
cluster1_df.dtypes

# <b>XGBOOST MODEL</b>

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error


def train_xgboost_regressor(X, y, params=None, test_size=0.2, random_state=42):
    """
    Fit an XGBoost regressor on a random train/test split and score it on the
    held-out part.

    Parameters:
    - X: feature table
    - y: target values
    - params: optional dict of keyword arguments for xgb.XGBRegressor;
      the library defaults are used when None
    - test_size: forwarded to train_test_split
    - random_state: forwarded to train_test_split

    Returns a 4-tuple, in this order:
    - model: the fitted regressor
    - y_pred: predictions for the test split
    - mse: mean squared error of y_pred against the test targets
    - y_test: the test targets
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Default-configured regressor unless explicit parameters were given.
    if params is None:
        model = xgb.XGBRegressor()
    else:
        model = xgb.XGBRegressor(**params)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return model, y_pred, mean_squared_error(y_test, y_pred), y_test
#X_train, X_test, y_train, y_test


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

def xgboost_tuned(X, y, params=None, test_size=0.2, random_state=42):
    """
    Grid-search an XGBoost regressor on the training split and score the best
    estimator on the held-out part.

    Parameters:
    - X: feature table
    - y: target values
    - params: optional dict of keyword arguments for the base xgb.XGBRegressor
    - test_size: forwarded to train_test_split
    - random_state: forwarded to train_test_split

    Returns a 5-tuple, in this order:
    - best_model: best estimator found by the grid search
    - y_pred: its predictions for the test split
    - mse: mean squared error of y_pred against the test targets
    - best_params: the winning parameter combination
    - y_test: the test targets
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Candidate values explored by the search.
    param_grid = {
        #'gamma': [0.001, 0.005, 0.01, 0.02],
        'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
        'max_depth': [2, 3, 5, 7, 8, 10],
        'n_estimators': [100, 200, 300, 400]
    }

    # Base estimator handed to the search.
    if params is None:
        model = xgb.XGBRegressor()
    else:
        model = xgb.XGBRegressor(**params)

    # 5-fold cross-validated search scored by negative MSE.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Evaluate the winner on the untouched test split.
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    return best_model, y_pred, mse, best_params, y_test


In [None]:
# Target is the 'Day 5' column; the features are all remaining columns.
X, y = sp500_pct_df.drop(columns='Day 5'), sp500_pct_df['Day 5']

In [None]:
# Train the untuned model on all stocks.
# Note: this rebinds `sp500` (earlier the Wikipedia table) to the result
# tuple (model, y_pred, mse, y_test).
sp500 = train_xgboost_regressor(X,y)
mse_sp500 = sp500[2]
mse_sp500

In [None]:
# Grid-searched model on all stocks; the result tuple is
# (best_model, y_pred, mse, best_params, y_test).
sp500_tuned = xgboost_tuned(X,y)
mse_sp500_tuned = sp500_tuned[2]
mse_sp500_tuned

In [None]:
# Parameter combination selected by the grid search (index 3 of the tuple).
best_params = sp500_tuned[3]
best_params

In [None]:
import matplotlib.pyplot as plt

# Untuned vs. grid-searched model on the full S&P 500 frame.
model_names = ['XGBoost', 'Tuned XGBoost']
mse_values = [mse_sp500, mse_sp500_tuned]

fig, ax = plt.subplots()
ax.bar(model_names, mse_values, color=['#E5E4E2', '#01477bff'], width=0.8)

# Print each bar's value just above it.
for position, height in enumerate(mse_values):
    ax.text(position, height + 0.000005, f'{height:.10f}', ha='center', va='bottom')

ax.set_xlabel('Models')
ax.set_ylabel('Mean Squared Error (MSE)')
ax.set_title('Comparison of MSE between SP500 XGBoost and Tuned XGBoost')

# Leave headroom above the tallest bar for its label.
ax.set_ylim(0.00, max(mse_values) + 0.00005)

plt.show()

In [None]:
# Pull predictions and held-out targets out of the result tuples:
# untuned tuple -> y_pred at 1, y_test at 3; tuned tuple -> y_pred at 1, y_test at 4.
y_pred, y_test = sp500[1], sp500[3]
y_pred_tuned, y_test_tuned = sp500_tuned[1], sp500_tuned[4]

In [None]:
# Side-by-side table of the held-out targets and the untuned model's predictions.
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
comparison_df
# TODO(review): the original note said "combine the 2 dataframes", but no second frame is built here.

In [None]:
# R^2 of the untuned model's predictions on the held-out targets.
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

In [None]:
# Compare the mean absolute error of the untuned and tuned S&P 500 models.
# The MAE values are computed here from the held-out targets and predictions;
# previously `mae` / `mae_tuned` were never defined, so the cell raised NameError.
mae = float(np.mean(np.abs(np.asarray(y_test) - np.asarray(y_pred))))
mae_tuned = float(np.mean(np.abs(np.asarray(y_test_tuned) - np.asarray(y_pred_tuned))))

mae_values = [mae, mae_tuned]
model_names = ['MAE', 'Tuned MAE']
plt.title('Comparison of MAE between SP500 XGBoost and Tuned XGBoost')
plt.xlabel('Models')
plt.ylabel('Mean Absolute Error (MAE)')
plt.bar(model_names, mae_values, color=['#E5E4E2', '#01477bff'])
# Print each bar's value just above it.
for i, value in enumerate(mae_values):
    plt.text(i, value + 0.000005, f'{value:.10f}', ha='center', va='bottom')
# Keep the original axis ceiling, but grow it if a bar would otherwise be clipped.
plt.ylim(0.00, max(0.025 + 0.0025, max(mae_values) * 1.1))
plt.show()


In [None]:
# Compare the root mean squared error of the untuned and tuned S&P 500 models.
# RMSE is the square root of the MSE values already computed; previously
# `rmse` / `rmse_tuned` were never defined, so the cell raised NameError.
rmse = float(np.sqrt(mse_sp500))
rmse_tuned = float(np.sqrt(mse_sp500_tuned))

rmse_values = [rmse, rmse_tuned]
model_names = ['RMSE', 'Tuned RMSE']
plt.title('Comparison of RMSE between SP500 XGBoost and Tuned XGBoost')
plt.xlabel('Models')
plt.ylabel('Root Mean Squared Error (RMSE)')
plt.bar(model_names, rmse_values, color=['#7393B3', '#01477bff'])
# Print each bar's value just above it.
for i, value in enumerate(rmse_values):
    plt.text(i, value + 0.000005, f'{value:.10f}', ha='center', va='bottom')
# Keep the original axis ceiling, but grow it if a bar would otherwise be clipped.
plt.ylim(0.00, max(0.025 + 0.0025, max(rmse_values) * 1.1))
plt.show()

In [None]:
# Show the grid-search winner for the all-stock model (same value as computed earlier).
best_params = sp500_tuned[3]    
best_params

# <b>Making DataFrames With Our Clusters</b>

In [None]:
# Find the entries of `symbols` that have no row in the cluster-label file.
# The label tickers are collected once into a set; the original rebuilt the
# full list for every symbol inside the loop.
labelled_companies = set(df_labels['Company'])

# Positions in `symbols` of the tickers without a label.
remove = [i for i, symbol in enumerate(symbols) if symbol not in labelled_companies]

# The tickers at those positions.
remove_symbols = [symbols[i] for i in remove]
remove_symbols

In [None]:
# Label rows of the companies assigned to cluster 1.
cluster1

In [None]:
# Label rows of the companies assigned to cluster 2.
cluster2

In [None]:
# Label rows of the companies assigned to cluster 3.
cluster3

In [None]:
# Weekly frame used for the all-stock model.
sp500_pct_df

In [None]:
# Cluster 1: 'Day 5' is the target, the remaining columns are the features.
X1, y1 = cluster1_df.drop(columns='Day 5'), cluster1_df['Day 5']

# Untuned model.
cluster_1_model = train_xgboost_regressor(X1, y1)
cluster_1_mse = cluster_1_model[2]
print(f'Cluster 1 MSE: {round(cluster_1_mse, 8)}')

# Grid-searched model.
cluster_1_model_tuned = xgboost_tuned(X1, y1)
cluster_1_mse_tuned = cluster_1_model_tuned[2]
print(f'Cluster 1 Tuned MSE: {round(cluster_1_mse_tuned, 8)}')

In [None]:
# Cluster 2: 'Day 5' is the target, the remaining columns are the features.
X2, y2 = cluster2_df.drop(columns='Day 5'), cluster2_df['Day 5']

# Untuned model.
cluster_2_model = train_xgboost_regressor(X2, y2)
cluster_2_mse = cluster_2_model[2]
print(f'Cluster 2 MSE: {round(cluster_2_mse, 8)}')

# Grid-searched model.
cluster_2_model_tuned = xgboost_tuned(X2, y2)
cluster_2_mse_tuned = cluster_2_model_tuned[2]
print(f'Cluster 2 Tuned MSE: {round(cluster_2_mse_tuned, 8)}')

In [None]:
# Cluster 3: 'Day 5' is the target, the remaining columns are the features.
y3 = cluster3_df['Day 5']
X3 = cluster3_df.drop(columns='Day 5')

# Untuned model.
cluster_3_model = train_xgboost_regressor(X3,y3)
cluster_3_mse = cluster_3_model[2]
print('Cluster 3 MSE: ' + str(round(cluster_3_mse, 8)))

# Grid-searched model.
cluster_3_model_tuned = xgboost_tuned(X3,y3)
# `cluster_3_mse_tuned` follows the naming used for clusters 1 and 2;
# `cluster_3_msetuned` is kept because later cells refer to that spelling.
cluster_3_mse_tuned = cluster_3_model_tuned[2]
cluster_3_msetuned = cluster_3_mse_tuned
print('Cluster 3 Tuned MSE: ' + str(round(cluster_3_msetuned, 8)))

In [None]:
# Winning grid-search parameters for each cluster (index 3 of each tuned result tuple).
params1, params2, params3 = (
    tuned[3]
    for tuned in (cluster_1_model_tuned, cluster_2_model_tuned, cluster_3_model_tuned)
)
for chosen in (params1, params2, params3):
    print(chosen)

In [None]:
# Untuned and tuned MSE side by side for each of the three clusters.
mse_by_model = {
    'Cluster 1 MSE': cluster_1_mse,
    'Cluster 1 Tuned MSE': cluster_1_mse_tuned,
    'Cluster 2 MSE': cluster_2_mse,
    'Cluster 2 Tuned MSE': cluster_2_mse_tuned,
    'Cluster 3 MSE': cluster_3_mse,
    'Cluster 3 Tuned MSE': cluster_3_msetuned,
}
model_names = list(mse_by_model)
mse_values = list(mse_by_model.values())

fig, ax = plt.subplots()
ax.set_title('Comparison of MSE between the 3 Clusters')
ax.set_xlabel('Cluster Models')
ax.set_ylabel('Mean Squared Error (MSE)')
# Alternate the two blues: lighter for untuned, darker for tuned.
ax.bar(model_names, mse_values, color=['#7393B3', '#01477bff'] * 3, width=0.7)
# Print each bar's value just above it.
for position, height in enumerate(mse_values):
    ax.text(position, height + 0.000005, f'{height:.10f}', ha='center', va='bottom')
ax.set_ylim(0.00, max(mse_values) + 0.00005)
plt.xticks(rotation=15)
plt.show()

In [None]:
# Untuned MSE of the three cluster models.
mse_by_model = {
    'Cluster 1 MSE': cluster_1_mse,
    'Cluster 2 MSE': cluster_2_mse,
    'Cluster 3 MSE': cluster_3_mse,
}
model_names = list(mse_by_model)
mse_values = list(mse_by_model.values())

fig, ax = plt.subplots()
ax.set_title('MSE between the 3 Clusters')
ax.set_xlabel('Cluster Models')
ax.set_ylabel('Mean Squared Error (MSE)')
ax.bar(model_names, mse_values, color=['#7393B3'] * 3, width=0.7)
# Print each bar's value just above it.
for position, height in enumerate(mse_values):
    ax.text(position, height + 0.000005, f'{height:.10f}', ha='center', va='bottom')
# Fixed axis ceiling.
ax.set_ylim(0.00, 0.0007 + 0.00005)
plt.show()

In [None]:
# Tuned MSE of the three cluster models.
mse_by_model = {
    'Cluster 1 Tuned MSE': cluster_1_mse_tuned,
    'Cluster 2 Tuned MSE': cluster_2_mse_tuned,
    'Cluster 3 Tuned MSE': cluster_3_msetuned,
}
model_names = list(mse_by_model)
mse_values = list(mse_by_model.values())

fig, ax = plt.subplots()
ax.set_title('MSE between the 3 Clusters, Tuned')
ax.set_xlabel('Cluster Models')
ax.set_ylabel('Mean Squared Error (MSE)')
ax.bar(model_names, mse_values, color=['#01477bff'] * 3, width=0.7)
# Print each bar's value just above it.
for position, height in enumerate(mse_values):
    ax.text(position, height + 0.000005, f'{height:.10f}', ha='center', va='bottom')
# Fixed axis ceiling.
ax.set_ylim(0.00, 0.0007 + 0.00005)
plt.show()

In [None]:
# Untuned all-stock model next to the three untuned cluster models.
mse_by_model = {
    'XGBoost': mse_sp500,
    'Cluster 1': cluster_1_mse,
    'Cluster 2': cluster_2_mse,
    'Cluster 3': cluster_3_mse,
}
model_names = list(mse_by_model)
mse_values = list(mse_by_model.values())

fig, ax = plt.subplots()
ax.bar(model_names, mse_values, color=['#99CCFF', '#6A5ACD', '#9370DB', '#DCD0FF'])
# Print each bar's value just above it.
for position, height in enumerate(mse_values):
    ax.text(position, height + 0.000005, f'{height:.10f}', ha='center', va='bottom')

ax.set_xlabel('Models: Original XGBoost and Clustered XGBoost')
ax.set_ylabel('Mean Squared Error (MSE)')
ax.set_title('Comparison of MSE between XGBoost and Clustered XGBoost')

# Leave headroom above the tallest bar for its label.
ax.set_ylim(0.00, max(mse_values) + 0.00005)

plt.show()


In [None]:
# plot the mse values for each cluster compared to the original sp500 model mse
# MSE values
mse_values = [mse_sp500,mse_sp500_tuned, cluster_1_mse, cluster_2_mse, cluster_3_mse]
# plot the mse values for each cluster compared to the original sp500 model mse

# Model names
model_names = ['XGBoost','XGBoost Tuned', 'Cluster 1', 'Cluster 2', 'Cluster 3']

# Creating a bar plot
plt.bar(model_names, mse_values, color=['blue', 'purple', 'orange', 'green', 'red',], width=0.6)
for i, value in enumerate(mse_values):
    plt.text(i, value + 0.000005, f'{value:.10f}', ha='center', va='bottom')

plt.xlabel('Models: Original XGBoost and Clustered XGBoost')
plt.ylabel('Mean Squared Error (MSE)')
plt.title('Comparison of Mean Squared Errors between XGBoost, XGBoost Tuned, and Clustered XGBoost')

# Adjusting the y-axis range
plt.ylim(0.00, max(mse_values) + 0.00005)

# Display the plot
plt.show()

In [None]:
# Untuned vs. tuned MSE for each cluster, one colour family per cluster.
mse_by_model = {
    'Cluster 1': cluster_1_mse,
    'Cluster 1 Tuned': cluster_1_mse_tuned,
    'Cluster 2': cluster_2_mse,
    'Cluster 2 Tuned': cluster_2_mse_tuned,
    'Cluster 3': cluster_3_mse,
    'Cluster 3 Tuned': cluster_3_msetuned,
}
model_names = list(mse_by_model)
mse_values = list(mse_by_model.values())

fig, ax = plt.subplots()
ax.bar(model_names, mse_values, color=['blue', 'lightblue', 'green', 'lightgreen', 'orange', 'coral'])
# Print each bar's value just above it.
for position, height in enumerate(mse_values):
    ax.text(position, height + 0.000005, f'{height:.10f}', ha='center', va='bottom')

ax.set_xlabel('Models')
ax.set_ylabel('Mean Squared Error (MSE)')
ax.set_title('MSE values vs tuned values per cluster')

# Leave headroom above the tallest bar for its label.
ax.set_ylim(0.00, max(mse_values) + 0.00005)

plt.xticks(rotation=45)
plt.show()

In [None]:
# MSE of every tuned model: all stocks plus the three clusters.
mse_by_model = {
    'XGBoost Tuned': mse_sp500_tuned,
    'Cluster 1 Tuned': cluster_1_mse_tuned,
    'Cluster 2 Tuned': cluster_2_mse_tuned,
    'Cluster 3 Tuned': cluster_3_msetuned,
}
model_names = list(mse_by_model)
mse_values = list(mse_by_model.values())

fig, ax = plt.subplots()
ax.bar(model_names, mse_values, color=['purple', '#4169E1', '#008080', '#89CFF0'])
# Print each bar's value just above it.
for position, height in enumerate(mse_values):
    ax.text(position, height + 0.000005, f'{height:.10f}', ha='center', va='bottom')

ax.set_xlabel('Models')
ax.set_ylabel('Mean Squared Error (MSE)')
ax.set_title('MSE Values for All Tuned Models')

# Leave headroom above the tallest bar for its label.
ax.set_ylim(0.00, max(mse_values) + 0.00005)

plt.show()

In [None]:
# Unweighted mean of the three cluster MSEs, untuned and tuned.
untuned_cluster_mses = (cluster_1_mse, cluster_2_mse, cluster_3_mse)
tuned_cluster_mses = (cluster_1_mse_tuned, cluster_2_mse_tuned, cluster_3_msetuned)

cluster_average = (untuned_cluster_mses[0] + untuned_cluster_mses[1] + untuned_cluster_mses[2])/3
print(f'Average MSE for Clustered XGBoost: {round(cluster_average, 8)}')

cluster_average_tuned = (tuned_cluster_mses[0] + tuned_cluster_mses[1] + tuned_cluster_mses[2])/3
print(f'Average Tuned MSE for Clustered XGBoost: {round(cluster_average_tuned, 8)}')

In [None]:
# All-stock model vs. the average cluster model, each untuned and tuned.
mse_by_model = {
    'XGBoost': mse_sp500,
    'XGBoost Tuned': mse_sp500_tuned,
    'Clustered XGBoost': cluster_average,
    'Clustered XGBoost Tuned': cluster_average_tuned,
}
model_names = list(mse_by_model)
mse_values = list(mse_by_model.values())

fig, ax = plt.subplots()
ax.bar(model_names, mse_values, color=['#7393B3', '#01477bff'] * 2)
# Print each bar's value just above it.
for position, height in enumerate(mse_values):
    ax.text(position, height + 0.000005, f'{height:.10f}', ha='center', va='bottom')

ax.set_xlabel('Models')
ax.set_ylabel('Mean Squared Error (MSE)')
ax.set_title('MSE for XGBoost and Average Clustered XGBoost: Nontuned/Tuned')

# Leave headroom above the tallest bar for its label.
ax.set_ylim(0.00, max(mse_values) + 0.00005)
# Tilt the tick labels slightly.
plt.xticks(rotation=10)
plt.show()

In [None]:
# Number of labelled companies per cluster, ordered by cluster id, as a
# one-column frame named 'Company Count'.
cluster_counts = (
    df_labels['Cluster_Label']
    .value_counts()
    .sort_index()
    .to_frame(name='Company Count')
)
# Show it as a single row with one column per cluster.
cluster_counts.T

In [None]:
# Untuned all-stock model vs. the average cluster model (untuned and tuned).
mse_by_model = {
    'XGBoost': mse_sp500,
    'Cluster-XGBoost': cluster_average,
    'Tuned Cluster-XGBoost': cluster_average_tuned,
}
model_names = list(mse_by_model)
mse_values = list(mse_by_model.values())

fig, ax = plt.subplots()
ax.bar(model_names, mse_values, color=['purple', '#4169E1', '#C8B4E9'])
# Print each bar's value just above it.
for position, height in enumerate(mse_values):
    ax.text(position, height + 0.000005, f'{height:.10f}', ha='center', va='bottom')

ax.set_xlabel('Models')
ax.set_ylabel('Mean Squared Error (MSE)')
ax.set_title('MSE for XGBoost and Average Clustered XGBoost: Nontuned/Tuned')

# Leave headroom above the tallest bar for its label.
ax.set_ylim(0.00, max(mse_values) + 0.00005)
plt.show()

In [None]:
# Untuned all-stock model vs. the untuned average cluster model.
mse_by_model = {
    'XGBoost': mse_sp500,
    'Cluster-XGBoost': cluster_average,
}
model_names = list(mse_by_model)
mse_values = list(mse_by_model.values())

fig, ax = plt.subplots()
ax.bar(model_names, mse_values, color=['#E5E4E2', '#01477bff'])
# Print each bar's value just above it.
for position, height in enumerate(mse_values):
    ax.text(position, height + 0.000005, f'{height:.10f}', ha='center', va='bottom')

ax.set_xlabel('Models')
ax.set_ylabel('Mean Squared Error (MSE)')
ax.set_title('MSE for XGBoost and Average Cluster-XGBoost')

# Leave headroom above the tallest bar for its label.
ax.set_ylim(0.00, max(mse_values) + 0.00005)
plt.show()

In [None]:
# MAPE values: Mean Absolute Percentage Error
# The MAPE of each model is computed here from its result tuple; previously
# none of the *_mape variables were defined, so the cell raised NameError.
def _mape(y_true, y_hat):
    """Mean of |y_true - y_hat| / |y_true|; the denominator is floored at machine epsilon to avoid dividing by zero."""
    y_true = np.asarray(y_true, dtype='float64')
    y_hat = np.asarray(y_hat, dtype='float64')
    floor = np.finfo(np.float64).eps
    return float(np.mean(np.abs(y_true - y_hat) / np.maximum(np.abs(y_true), floor)))

# Untuned result tuples hold y_test at index 3, tuned ones at index 4;
# both hold y_pred at index 1.
mape = _mape(sp500[3], sp500[1])
mape_tuned = _mape(sp500_tuned[4], sp500_tuned[1])
cluster_1_mape = _mape(cluster_1_model[3], cluster_1_model[1])
cluster_1_mape_tuned = _mape(cluster_1_model_tuned[4], cluster_1_model_tuned[1])
cluster_2_mape = _mape(cluster_2_model[3], cluster_2_model[1])
cluster_2_mape_tuned = _mape(cluster_2_model_tuned[4], cluster_2_model_tuned[1])
cluster_3_mape = _mape(cluster_3_model[3], cluster_3_model[1])
cluster_3_mape_tuned = _mape(cluster_3_model_tuned[4], cluster_3_model_tuned[1])

mape_values = [mape, mape_tuned, cluster_1_mape, cluster_1_mape_tuned, cluster_2_mape, cluster_2_mape_tuned, cluster_3_mape, cluster_3_mape_tuned]
model_names = ['XGBoost', 'XGBoost Tuned', 'Cluster 1', 'Cluster 1 Tuned', 'Cluster 2', 'Cluster 2 Tuned', 'Cluster 3', 'Cluster 3 Tuned']

# Create a bar plot
plt.bar(model_names, mape_values, color=['#E5E4E2', '#01477bff', '#E5E4E2', '#01477bff', '#E5E4E2', '#01477bff', '#E5E4E2', '#01477bff'])
# Print each bar's value just above it.
for i, value in enumerate(mape_values):
    plt.text(i, value + 0.000005, f'{value:.10f}', ha='center', va='bottom')

plt.xlabel('Models')
plt.ylabel('Mean Absolute Percentage Error (MAPE)')
plt.title('Comparison of MAPE between XGBoost and Clustered XGBoost')

# Headroom proportional to the tallest bar: MAPE is not on the tiny scale of
# the MSE plots, so a fixed additive margin would clip the labels.
plt.ylim(0.00, max(mape_values) * 1.1)

# Display the plot
plt.show()
