# Importing Libraries

In [97]:
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, f1_score
import time
import pickle 
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score , precision_score, recall_score, f1_score

warnings.simplefilter(action='ignore', category=FutureWarning)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Reading Files

In [98]:
crude_oil_prices = pd.read_csv('../InputData/crude_oil_prices.csv')
federal_rates = pd.read_csv('../InputData/effective_federal_funds_rate.csv')
corridor_rates = pd.read_csv('../InputData/egyptian_corridor_interest_rates.csv')
housing_index = pd.read_csv('../InputData/housing_index.csv')
inflation_mom = pd.read_csv('../InputData/inflation_month_on_month.csv')
inflation_yoy = pd.read_csv('../InputData/inflation_year_on_year.csv')
news_data = pd.read_csv('../InputData/news.csv')
stock_prices = pd.read_csv('../InputData/stocks_prices_and_volumes.csv')
vix_indices = pd.read_csv('../InputData/vix_index.csv')
vixeem_indices = pd.read_csv('../InputData/vxeem_index.csv')
gold_prices = pd.read_csv('../InputData/intraday_gold.csv')
gold_prices_target = pd.read_csv('../InputData/target_gold.csv')
sentiment_data = pd.read_csv('./cleaned_sentiment_scores.csv')

In [99]:
stock_prices.head()

Unnamed: 0,Date,stock_0_food_beverages_and_tobacco_close_price,stock_10_non-banking_financial_services_close_price,stock_11_real_estate_close_price,stock_12_non-banking_financial_services_close_price,stock_13_it_&_media_close_price,stock_14_it_&_media_close_price,stock_1_energy_and_support_services_close_price,stock_2_basic_resources_close_price,stock_3_basic_resources_close_price,...,stock_14_it_&_media_volume,stock_1_energy_and_support_services_volume,stock_2_basic_resources_volume,stock_3_basic_resources_volume,stock_4_banks_volume,stock_5_banks_volume,stock_6_real_estate_volume,stock_7_real_estate_volume,stock_8_real_estate_volume,stock_9_non-banking_financial_services_volume
0,2020-01-01,,,,,,,,,,...,,,,,,,,,,
1,2020-01-02,15.24,2.23,8.15,8.2,,8.0,3.64,21.05,9.8,...,48800.0,445065.0,13815.0,2264947.0,395.0,284645.0,241598.0,2390823.0,90156.0,22926.0
2,2020-01-03,,,,,,,,,,...,,,,,,,,,,
3,2020-01-04,,,,,,,,,,...,,,,,,,,,,
4,2020-01-05,14.84,2.1,7.58,8.17,,7.62,3.32,19.74,8.88,...,264848.0,2740470.0,152334.0,1777066.0,88832.0,837444.0,3759516.0,16720687.0,1250162.0,1222827.0


## NLP

In [100]:
predictions = []
y_trues = []
# Convert the column to Date Object
sentiment_data['date'] = pd.to_datetime(sentiment_data['date'], errors='coerce')

gold_prices['Timestamp'] = pd.to_datetime(gold_prices['Timestamp'] , errors='coerce')

# Convert the column to Date Object
gold_prices['Date'] = gold_prices['Timestamp'].dt.date
sentiment_data['Date'] = sentiment_data['date'].dt.date

#Convert Column to Date Object
gold_daily = gold_prices.groupby('Date').last().reset_index()
sentiment_data = sentiment_data.groupby('Date').mean().reset_index()

sentiment_data = sentiment_data.sort_values(by='Date').reset_index(drop=True)
gold_daily = gold_daily.sort_values(by='Date').reset_index(drop=True)

    
# Keep only the 'Date' column and the specific stock column
sentiment_data_filtered = sentiment_data[['Date', 'sentiment_score']]
first_valid_index = sentiment_data_filtered['sentiment_score'].first_valid_index()
sentiment_data_filtered = sentiment_data_filtered.loc[first_valid_index:].reset_index(drop=True)


#Get Start and End Date relative to each dataset
start_date = max(sentiment_data_filtered['Date'].min(), gold_daily['Date'].min())
end_date = min(sentiment_data_filtered['Date'].max(), gold_daily['Date'].max())

#Filter the data based on the start and end date
sentiment_data_filtered = sentiment_data_filtered[(sentiment_data_filtered['Date'] >= start_date) & (sentiment_data_filtered['Date'] <= end_date)]
gold_daily = gold_daily[(gold_daily['Date'] >= start_date) & (gold_daily['Date'] <= end_date)]
# stock_prices_filtered[stock_prices_column] = stock_prices_filtered[stock_prices_column].interpolate(method='time')
# gold_daily['24K'] = gold_daily['24K'].interpolate(method='time')


#Create Log Columns
sentiment_data_filtered['Log_Sentiment_Score'] = np.log(sentiment_data_filtered['sentiment_score'] + 1)
gold_daily['Log_Gold_Price'] = np.log(gold_daily['24K'] + 1)

window_size = 30
num_std_devs = 2

gold_daily['Rolling_Mean_Gold'] = gold_daily['Log_Gold_Price'].rolling(window=window_size).mean()
gold_daily['Rolling_Std_Gold'] = gold_daily['Log_Gold_Price'].rolling(window=window_size).std()

gold_daily['Rolling_Upper_Threshold_Gold'] = gold_daily['Rolling_Mean_Gold'] + num_std_devs  * gold_daily['Rolling_Std_Gold']
gold_daily['Rolling_Lower_Threshold_Gold'] = gold_daily['Rolling_Mean_Gold'] - num_std_devs  * gold_daily['Rolling_Std_Gold']

sentiment_data_filtered['Rolling_Mean_Score'] = sentiment_data_filtered['Log_Sentiment_Score'].rolling(window=window_size).mean()
sentiment_data_filtered['Rolling_Std_Score'] = sentiment_data_filtered['Log_Sentiment_Score'].rolling(window=window_size).std()

sentiment_data_filtered['Rolling_Upper_Threshold_Score'] = sentiment_data_filtered['Rolling_Mean_Score'] + num_std_devs  * sentiment_data_filtered['Rolling_Std_Score']
sentiment_data_filtered['Rolling_Lower_Threshold_Score'] = sentiment_data_filtered['Rolling_Mean_Score'] - num_std_devs  * sentiment_data_filtered['Rolling_Std_Score']

outlier_mask_gold = (gold_daily['Log_Gold_Price'] < gold_daily['Rolling_Lower_Threshold_Gold']) | (gold_daily['Log_Gold_Price'] > gold_daily['Rolling_Upper_Threshold_Gold'])

outlier_mask_score = (sentiment_data_filtered['Log_Sentiment_Score'] < sentiment_data_filtered['Rolling_Lower_Threshold_Score']) | (sentiment_data_filtered['Log_Sentiment_Score'] > sentiment_data_filtered['Rolling_Upper_Threshold_Score'])

#Set the outliers to NaN
gold_daily.loc[outlier_mask_gold, 'Log_Gold_Price'] = np.nan
sentiment_data_filtered.loc[outlier_mask_score, 'Log_Stock_Price'] = np.nan


sentiment_data_filtered['Log_Sentiment_Score'] = sentiment_data_filtered['Log_Sentiment_Score'].shift(1)

#Merge the data
merged_data = pd.merge(sentiment_data_filtered, gold_daily, on='Date', how='outer')
#Remove duplicate dates
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
# merged_data = merged_data.drop_duplicates(subset='Date')

filtered_data = merged_data[['Date', 'Log_Sentiment_Score', 'Log_Gold_Price']]

filtered_data = filtered_data.set_index('Date')

filtered_data = filtered_data.reindex(date_range)
filtered_data['Log_Sentiment_Score'] = filtered_data['Log_Sentiment_Score'].interpolate(method='time')
filtered_data['Log_Gold_Price'] = filtered_data['Log_Gold_Price'].interpolate(method='time')

filtered_data = filtered_data.dropna(subset=['Log_Sentiment_Score', 'Log_Gold_Price'])

train_size = 0.8
split_index = int(len(filtered_data) * train_size)
train = filtered_data.iloc[:split_index].copy()
test = filtered_data.iloc[split_index:].copy()
y_trues.append(np.exp(test['Log_Gold_Price']))
df = pd.DataFrame()
df['y'] = train['Log_Gold_Price']
df['ds'] = train.index
df['Log_Sentiment_Score'] = train['Log_Sentiment_Score']
model = Prophet()
model.add_regressor('Log_Sentiment_Score')
model.fit(df)

test_size = len(test)
future_periods = test_size
prophet_test = test.reset_index().rename(columns={'index':'ds','Log_Gold_Price':'y'})
test_fcst = model.predict(prophet_test)
test_fcst['Predicted_Gold_Price'] = np.exp(test_fcst['yhat'])
predictions.append(test_fcst)

23:08:06 - cmdstanpy - INFO - Chain [1] start processing
23:08:07 - cmdstanpy - INFO - Chain [1] done processing


## Stock

In [101]:
# Convert the column to Date Object
stock_prices['Date'] = pd.to_datetime(stock_prices['Date']).dt.date

gold_prices['Timestamp'] = pd.to_datetime(gold_prices['Timestamp'] , errors='coerce')

# Convert the column to Date Object
gold_prices['Date'] = gold_prices['Timestamp'].dt.date

#Convert Column to Date Object
gold_daily = gold_prices.groupby('Date').last().reset_index()

stock_prices = stock_prices.sort_values(by='Date').reset_index(drop=True)
gold_daily = gold_daily.sort_values(by='Date').reset_index(drop=True)
for stock_prices_column in stock_prices.columns:
    if stock_prices_column == 'Date':
        continue
    
    # Keep only the 'Date' column and the specific stock column
    stock_prices_filtered = stock_prices[['Date', stock_prices_column]]
    first_valid_index = stock_prices_filtered[stock_prices_column].first_valid_index()
    stock_prices_filtered = stock_prices_filtered.loc[first_valid_index:].reset_index(drop=True)
 

    
    #Get Start and End Date relative to each dataset
    start_date = max(stock_prices_filtered['Date'].min(), gold_daily['Date'].min())
    end_date = min(stock_prices_filtered['Date'].max(), gold_daily['Date'].max())
    
    #Filter the data based on the start and end date
    stock_prices_filtered = stock_prices_filtered[(stock_prices_filtered['Date'] >= start_date) & (stock_prices_filtered['Date'] <= end_date)]
    gold_daily = gold_daily[(gold_daily['Date'] >= start_date) & (gold_daily['Date'] <= end_date)]
    # stock_prices_filtered[stock_prices_column] = stock_prices_filtered[stock_prices_column].interpolate(method='time')
    # gold_daily['24K'] = gold_daily['24K'].interpolate(method='time')

    
    #Create Log Columns
    stock_prices_filtered['Log_Stock_Price'] = np.log(stock_prices_filtered[stock_prices_column] + 1)
    gold_daily['Log_Gold_Price'] = np.log(gold_daily['24K'] + 1)

    window_size = 30
    num_std_devs = 2
    
    gold_daily['Rolling_Mean_Gold'] = gold_daily['Log_Gold_Price'].rolling(window=window_size).mean()
    gold_daily['Rolling_Std_Gold'] = gold_daily['Log_Gold_Price'].rolling(window=window_size).std()
    
    gold_daily['Rolling_Upper_Threshold_Gold'] = gold_daily['Rolling_Mean_Gold'] + num_std_devs  * gold_daily['Rolling_Std_Gold']
    gold_daily['Rolling_Lower_Threshold_Gold'] = gold_daily['Rolling_Mean_Gold'] - num_std_devs  * gold_daily['Rolling_Std_Gold']
    
    stock_prices_filtered['Rolling_Mean_Stock'] = stock_prices_filtered['Log_Stock_Price'].rolling(window=window_size).mean()
    stock_prices_filtered['Rolling_Std_Stock'] = stock_prices_filtered['Log_Stock_Price'].rolling(window=window_size).std()
    
    stock_prices_filtered['Rolling_Upper_Threshold_Stock'] = stock_prices_filtered['Rolling_Mean_Stock'] + num_std_devs  * stock_prices_filtered['Rolling_Std_Stock']
    stock_prices_filtered['Rolling_Lower_Threshold_Stock'] = stock_prices_filtered['Rolling_Mean_Stock'] - num_std_devs  * stock_prices_filtered['Rolling_Std_Stock']
    
    outlier_mask_gold = (gold_daily['Log_Gold_Price'] < gold_daily['Rolling_Lower_Threshold_Gold']) | (gold_daily['Log_Gold_Price'] > gold_daily['Rolling_Upper_Threshold_Gold'])
    
    outlier_mask_stock = (stock_prices_filtered['Log_Stock_Price'] < stock_prices_filtered['Rolling_Lower_Threshold_Stock']) | (stock_prices_filtered['Log_Stock_Price'] > stock_prices_filtered['Rolling_Upper_Threshold_Stock'])
    
    #Set the outliers to NaN
    gold_daily.loc[outlier_mask_gold, 'Log_Gold_Price'] = np.nan
    stock_prices_filtered.loc[outlier_mask_stock, 'Log_Stock_Price'] = np.nan
    
    
    stock_prices_filtered['Log_Stock_Price'] = stock_prices_filtered['Log_Stock_Price'].shift(1)
    
    #Merge the data
    merged_data = pd.merge(stock_prices_filtered, gold_daily, on='Date', how='outer')
    #Remove duplicate dates
    date_range = pd.date_range(start=start_date, end=end_date, freq='D')
    # merged_data = merged_data.drop_duplicates(subset='Date')
    
    filtered_data = merged_data[['Date', 'Log_Stock_Price', 'Log_Gold_Price']]
    
    filtered_data = filtered_data.set_index('Date')
    filtered_data = filtered_data.reindex(date_range)
    filtered_data['Log_Stock_Price'] = filtered_data['Log_Stock_Price'].interpolate(method='time')
    filtered_data['Log_Gold_Price'] = filtered_data['Log_Gold_Price'].interpolate(method='time')
    
    filtered_data = filtered_data.dropna(subset=['Log_Stock_Price', 'Log_Gold_Price'])
    
    train_size = 0.8
    split_index = int(len(filtered_data) * train_size)
    train = filtered_data.iloc[:split_index].copy()
    test = filtered_data.iloc[split_index:].copy()
    y_trues.append(np.exp(test['Log_Gold_Price']))
    df = pd.DataFrame()
    df['y'] = train['Log_Gold_Price']
    df['ds'] = train.index
    df['Log_Stock_Price'] = train['Log_Stock_Price']
    model = Prophet()
    model.add_regressor('Log_Stock_Price')
    model.fit(df)
    
    test_size = len(test)
    future_periods = test_size
    prophet_test = test.reset_index().rename(columns={'index':'ds','Log_Gold_Price':'y'})
    test_fcst = model.predict(prophet_test)
    test_fcst['Predicted_Gold_Price'] = np.exp(test_fcst['yhat'])
    predictions.append(test_fcst)
    
predictions

23:08:08 - cmdstanpy - INFO - Chain [1] start processing
23:08:08 - cmdstanpy - INFO - Chain [1] done processing
23:08:09 - cmdstanpy - INFO - Chain [1] start processing
23:08:10 - cmdstanpy - INFO - Chain [1] done processing
23:08:10 - cmdstanpy - INFO - Chain [1] start processing
23:08:11 - cmdstanpy - INFO - Chain [1] done processing
23:08:11 - cmdstanpy - INFO - Chain [1] start processing
23:08:12 - cmdstanpy - INFO - Chain [1] done processing
23:08:12 - cmdstanpy - INFO - Chain [1] start processing
23:08:13 - cmdstanpy - INFO - Chain [1] done processing
23:08:13 - cmdstanpy - INFO - Chain [1] start processing
23:08:13 - cmdstanpy - INFO - Chain [1] done processing
23:08:13 - cmdstanpy - INFO - Chain [1] start processing
23:08:14 - cmdstanpy - INFO - Chain [1] done processing
23:08:14 - cmdstanpy - INFO - Chain [1] start processing
23:08:14 - cmdstanpy - INFO - Chain [1] done processing
23:08:14 - cmdstanpy - INFO - Chain [1] start processing
23:08:15 - cmdstanpy - INFO - Chain [1]

[            ds     trend  yhat_lower  yhat_upper  trend_lower  trend_upper  \
 0   2023-03-13  7.591756    7.584070    7.628581     7.591756     7.591756   
 1   2023-03-14  7.593935    7.584594    7.629853     7.593935     7.593935   
 2   2023-03-15  7.596113    7.586034    7.628217     7.596113     7.596113   
 3   2023-03-16  7.598291    7.586806    7.627261     7.598291     7.598291   
 4   2023-03-17  7.600469    7.584482    7.632487     7.600469     7.600469   
 ..         ...       ...         ...         ...          ...          ...   
 287 2023-12-25  8.216905    6.587368   10.137320     6.455524     9.976595   
 288 2023-12-26  8.219083    6.585333   10.125750     6.449704     9.984461   
 289 2023-12-27  8.221261    6.578022   10.145472     6.438939     9.992328   
 290 2023-12-28  8.223440    6.585621   10.158831     6.429457    10.000451   
 291 2023-12-29  8.225618    6.579160   10.162611     6.425115    10.008781   
 
      Log_Sentiment_Score  Log_Sentiment_Score_low

In [102]:
len(predictions)

31

## Ensemble Learning

### Calculating Weights

In [196]:
def calculate_weights(y_true, predictions_array):
    """
    Calculate weights for each model based on their F1 scores.
    
    Args:
    y_true (list): List of true values
    predictions_array (list): List of DataFrames, each containing model predictions
    
    Returns:
    list: Weights for each model
    """
    f1_scores = []
    for i, pred_df in enumerate(predictions_array):
        y_true_buckets = pd.qcut(y_true[i], 4, labels=False)
        y_pred_buckets = pd.qcut(pred_df['Predicted_Gold_Price'], 4, labels=False)
        f1 = f1_score(y_true_buckets, y_pred_buckets, average='weighted')
        f1_scores.append(f1)
    
    # Normalize F1 scores to get weights
    total=0
    for score in f1_scores:
        print(score)
        total += score
    weights = [f1 / total for f1 in f1_scores]
    
    return weights


In [188]:
def calculate_weights_2(y_true, predictions_array):
    """
    Calculate weights for each model based on their mean absolute error.
    
    Args:
    y_true (pd.Series): True values
    predictions_array (list): List of DataFrames, each containing model predictions
    
    Returns:
    list: Weights for each model
    """
    maes = []
    i = 0
    for pred_df in predictions_array:
        mae = mean_absolute_error(y_true[i], pred_df['Predicted_Gold_Price'])
        maes.append(mae)
        i+=1
    
    # Invert MAEs so that lower error gets higher weight
    weights = [1 / mae for mae in maes]
    print(weights)
    # Normalize weights
    total=0
    for weight in weights:
        total += weight
    normalized_weights = [w / total for w in weights]
    
    return normalized_weights

In [197]:
# Calculate weights
weights = calculate_weights(y_trues, predictions)

# Print each weight
for i, weight in enumerate(weights, start=1):
    print(f"Weight for Model {i}: {weight:.4f}")

0.4090459299735903
0.4120314864805633
0.4120314864805633
0.4120314864805633
0.4120314864805633
0.5580967134810466
0.7383704169615011
0.5089595715169005
0.39127911096357254
0.5775951017191157
0.6893691657955096
0.6641390978209911
0.7194282103473078
0.6330890210912224
0.6767167618387577
0.6268720170756494
0.6954269834475302
0.6891371807900885
0.6146913483182057
0.6893691657955096
0.6329317760247865
0.6453619014616996
0.5525834296702027
0.6830813044351844
0.6645283278928481
0.7142138862924039
0.6893691657955096
0.6395360688151002
0.6332423835634255
0.5646971236771273
0.6396088674569688
Weight for Model 1: 0.0220
Weight for Model 2: 0.0222
Weight for Model 3: 0.0222
Weight for Model 4: 0.0222
Weight for Model 5: 0.0222
Weight for Model 6: 0.0300
Weight for Model 7: 0.0397
Weight for Model 8: 0.0274
Weight for Model 9: 0.0210
Weight for Model 10: 0.0311
Weight for Model 11: 0.0371
Weight for Model 12: 0.0357
Weight for Model 13: 0.0387
Weight for Model 14: 0.0341
Weight for Model 15: 0.0364

In [198]:
sum = 0
for weight in weights:
    sum+=weight
sum

1.0

In [199]:
def weighted_ensemble_predictions(predictions_array, weights):
    """
    Combine predictions from multiple models using weighted averaging.
    
    Args:
    predictions_array (list): List of DataFrames, each containing model predictions
    weights (list): List of weights corresponding to each model
    
    Returns:
    pd.DataFrame: Combined predictions
    """
    # Ensure number of prediction DataFrames matches number of weights
    assert len(predictions_array) == len(weights), "Number of prediction DataFrames must match number of weights"
    
    # Ensure all prediction DataFrames have the same index
    # for i in range(1, len(predictions_array)):
    #     assert predictions_array[0].index.equals(predictions_array[i].index), f"Prediction DataFrame {i} has a different index"
    
    # Calculate the weighted average of all predictions
    ensemble_pred = pd.Series(0, index=predictions_array[0].index)
    for predictions, weight in zip(predictions_array, weights):
        ensemble_pred += weight * predictions['Predicted_Gold_Price']
    
    # Create a new DataFrame with the ensemble predictions
    ensemble_df = pd.DataFrame({
        'ds': predictions_array[0]['ds'],
        'Predicted_Gold_Price': ensemble_pred
    })
    
    return ensemble_df

In [205]:
def averaging_ensemble_predictions(predictions_array):
    """
    Combine predictions from multiple models using simple averaging.
    
    Args:
    predictions_array (list): List of DataFrames, each containing model predictions
    
    Returns:
    pd.DataFrame: Combined predictions
    """
    # Ensure all prediction DataFrames have the same index
    # for i in range(1, len(predictions_array)):
    #     assert predictions_array[0].index.equals(predictions_array[i].index), f"Prediction DataFrame {i} has a different index"
    
    # Calculate the average of all predictions
    ensemble_pred = pd.Series(0, index=predictions_array[0].index)
    for predictions in predictions_array:
        ensemble_pred += predictions['Predicted_Gold_Price']
    
    ensemble_pred /= len(predictions_array)
    
    # Create a new DataFrame with the ensemble predictions
    ensemble_df = pd.DataFrame({
        'ds': predictions_array[0]['ds'],
        'Predicted_Gold_Price': ensemble_pred
    })
    
    return ensemble_df


In [206]:
# Use the function to create weighted ensemble predictions
weighted_ensemble_predictions = weighted_ensemble_predictions(
    predictions_array=predictions,
    weights=weights
)

In [207]:
from sklearn.metrics import mean_absolute_percentage_error


def evaluate_model(y_true, y_pred):
    """
    Evaluate model performance using multiple metrics.
    
    Args:
    y_true (pd.Series): True values
    y_pred (pd.Series): Predicted values
    
    Returns:
    dict: Dictionary containing evaluation metrics
    """

    rmse = np.sqrt(mean_squared_error(y_true[:160], y_pred[:160]))
    mae = mean_absolute_error(y_true[:160], y_pred[:160])
    mape = mean_absolute_percentage_error(y_true[:160], y_pred[:160])


    mda = np.mean(np.sign(np.diff(y_true)) == np.sign(np.diff(y_pred)))

    y_true_buckets = pd.qcut(y_true[:160], 4, labels=False) 
    y_pred_buckets = pd.qcut(y_pred[:160], 4, labels=False)

    f1 = f1_score(y_true_buckets, y_pred_buckets, average='weighted')

    
    return {
        'Mean Directional Accuracy':mda,
        'Bucketized F1 Score': f1,
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape
    }

In [150]:
y_trues

[2023-03-13    2137.270546
 2023-03-14    2151.285804
 2023-03-15    2165.392968
 2023-03-16    2179.592640
 2023-03-17    2193.885428
                  ...     
 2023-12-25    3458.533651
 2023-12-26    3482.851054
 2023-12-27    3507.339437
 2023-12-28    3532.000000
 2023-12-29    3578.000000
 Freq: D, Name: Log_Gold_Price, Length: 292, dtype: float64,
 2023-03-14    2151.285804
 2023-03-15    2165.392968
 2023-03-16    2179.592640
 2023-03-17    2193.885428
 2023-03-18    2208.271941
                  ...     
 2023-12-26    3482.851054
 2023-12-27    3507.339437
 2023-12-28    3532.000000
 2023-12-29    3578.000000
 2023-12-30    3635.000000
 Freq: D, Name: Log_Gold_Price, Length: 292, dtype: float64,
 2023-03-14    2151.285804
 2023-03-15    2165.392968
 2023-03-16    2179.592640
 2023-03-17    2193.885428
 2023-03-18    2208.271941
                  ...     
 2023-12-26    3482.851054
 2023-12-27    3507.339437
 2023-12-28    3532.000000
 2023-12-29    3578.000000
 2023-12-30   

In [209]:
# Evaluate the weighted ensemble model
weighted_ensemble_performance = evaluate_model(y_trues[0], weighted_ensemble_predictions['Predicted_Gold_Price'])

# Print the results
print("Weighted Ensemble Model Performance:")
for metric, value in weighted_ensemble_performance.items():
    print(f"{metric}: {value}")




Weighted Ensemble Model Performance:
Mean Directional Accuracy: 0.25773195876288657
Bucketized F1 Score: 0.2793598444410553
RMSE: 303.45706170819955
MAE: 259.11801906212645
MAPE: 0.10287391855745205


In [None]:
gold_prices_target['Date'] = pd.to_datetime(gold_prices_target['date'])
gold_prices_target['Date'] = gold_prices_target['Date'].dt.date
gold_prices_target.drop(columns=['date'], inplace=True)
gold_prices_target.set_index('Date' , inplace=True)

In [None]:
crude_oil_prices.head()

In [None]:
crude_oil_prices['Date'] = pd.to_datetime(crude_oil_prices['Date'])
crude_oil_prices['Date'] = crude_oil_prices['Date'].dt.date
crude_oil_prices.set_index('Date' , inplace=True)

In [None]:
features_df = pd.DataFrame(crude_oil_prices['Europe Brent Crude Oil (Dollars per Barrel)'].pct_change() * 100).shift(-1)
X = features_df[['Europe Brent Crude Oil (Dollars per Barrel)']]
X

# EDA

In [None]:
gold_prices_target.describe()

In [None]:
px.line(gold_prices_target)

In [None]:
crude_oil_prices.describe()

In [None]:
px.line(crude_oil_prices)

# Feature Selection

In [None]:
features_df = pd.merge(gold_prices_target, (crude_oil_prices['Europe Brent Crude Oil (Dollars per Barrel)'].pct_change() * 100).shift(-1), on='Date', how='left')
features_df.fillna(0, inplace=True)
features_df

In [None]:
px.line(features_df)

# Modelling

In [None]:
X = features_df[['Europe Brent Crude Oil (Dollars per Barrel)']]  
y = features_df['pct_change']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

## Validation

In [None]:
start_time = time.time()
y_pred = model.predict(X_test)
end_time = time.time()
inference_time = end_time - start_time

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

mda = np.mean(np.sign(np.diff(y_test)) == np.sign(np.diff(y_pred)))

y_test_buckets = pd.qcut(y_test, 4, labels=False) 
y_pred_buckets = pd.qcut(y_pred, 4, labels=False)

f1 = f1_score(y_test_buckets, y_pred_buckets, average='weighted')

print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Directional Accuracy: {mda}')
print(f'Bucketized F1 Score: {f1}')
print(f'Inference Time (seconds): {inference_time}')


## Visualize the results

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(features_df.index, y, label='Actual Target Variable', color='blue')
plt.plot(X_test.index, y_pred, label='Predicted Target Variable', color='red')
plt.title('Linear Regression Model')
plt.xlabel('Date')
plt.ylabel('Target Variable')
plt.legend()
plt.show()

# Saving Model

In [None]:
with open('../Pickles/linear_regression_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Loading Model

In [None]:
with open('../Pickles/linear_regression_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

y_pred = loaded_model.predict(X_test)
output_df = pd.DataFrame({
    'date': X_test.index,
    'prediction': y_pred.flatten()
})
output_df