In [9]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


In [10]:
# Make a pandas function that takes in the date and creates a new column specifying the contract month
# A contract month starts the second Friday of the previous month and ends the second Friday of the current month.
# Ex: Feburary contract starts the second Friday of January and ends the second Friday of February.
from datetime import datetime, timedelta
def get_second_friday(year, month):
    """
    Given a year and month, returns the date of the second Friday of that month.
    """
    # Start from the first day of the month
    first_day = datetime(year, month, 1)
    # Find the first Friday in the month
    first_friday = first_day + timedelta(days=(4 - first_day.weekday() + 7) % 7)
    # Add 7 days to get the second Friday
    second_friday = first_friday + timedelta(days=7)
    return second_friday

def determine_contract_month(date):
    """
    Determine the contract month based on the given date.
    A contract month starts the second Friday of the previous month and ends the second Friday of the current month.
    """
    year = date.year
    month = date.month

    # Get the second Friday of the current month
    current_second_friday = get_second_friday(year, month)

    # Get the second Friday of the previous month
    if month == 1:  # If January, go to December of the previous year
        previous_second_friday = get_second_friday(year - 1, 12)
    else:
        previous_second_friday = get_second_friday(year, month - 1)

    # Determine the contract month
    if date < current_second_friday and date >= previous_second_friday:
        contract_month = current_second_friday.strftime("%B")
    else:
        # If date is on or after the current second Friday, contract month is the next month
        contract_month = (current_second_friday + timedelta(days=30)).strftime("%B")

    return contract_month

def add_contract_month_column(df, date_column):
    """
    Adds a new column 'Contract_Month' to the DataFrame indicating the contract month for each date.
    """
    df['contract_month'] = df[date_column].apply(determine_contract_month)
    return df

In [11]:
train_data_df = pd.read_csv("data/train/merged.csv")
train_data_df['Date'] = pd.to_datetime(train_data_df['Date'])
add_contract_month_column(train_data_df, 'Date')
train_data_df

Unnamed: 0,Date,crude,feeder,lean_hogs,cattle,london_coffee,lumber,oats,orange,us_cocoa,us_coffee_c,soybean_oil,soybeans,sugar_11,wheat,job_index,contract_month
0,2020-02-03,50.28,136.65,56.30,121.67,1302.0,422.8,300.25,98.30,2734.0,272.05,30.65,877.00,14.89,555.50,99.97,February
1,2020-02-04,49.80,137.50,56.75,121.62,1290.0,412.7,308.00,97.45,2757.0,272.05,31.10,879.50,14.71,557.25,100.03,February
2,2020-02-05,50.92,135.68,57.10,120.75,1284.0,428.3,304.50,97.30,2790.0,272.05,31.69,880.00,14.73,562.00,100.12,February
3,2020-02-06,51.14,135.90,57.70,121.12,1297.0,433.2,304.75,98.20,2849.0,272.05,31.61,881.00,14.74,556.25,100.24,February
4,2020-02-07,50.55,135.20,57.10,121.33,1290.0,440.2,303.00,98.35,2898.0,272.05,31.34,882.00,14.92,558.75,100.20,February
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1306,2024-10-21,70.04,244.25,78.28,187.00,4503.0,526.0,379.50,468.15,7388.5,251.70,42.39,989.75,21.83,572.25,109.82,November
1307,2024-10-22,72.09,249.43,79.13,188.05,4425.0,523.5,383.00,487.00,7186.0,249.85,43.69,1000.50,21.73,576.00,109.67,November
1308,2024-10-23,70.77,243.82,80.18,187.63,4443.0,525.0,380.50,500.10,6977.5,252.35,43.39,1005.00,22.34,578.50,109.57,November
1309,2024-10-24,70.19,249.40,78.65,189.30,4337.0,529.5,376.75,506.95,6732.0,245.45,44.33,1005.00,22.20,581.50,109.43,November


In [12]:
chunk_starts = train_data_df['contract_month'].ne(train_data_df['contract_month'].shift()).cumsum()
data_chunk_groups = list(train_data_df.groupby(chunk_starts))

def train_model(model, num_past_months=3, data_chunk_groups=data_chunk_groups):
    predictions_list = []
    model_name = model.__class__.__name__
    print(f"Training {model_name} model")

    for i in range(num_past_months, len(data_chunk_groups)):
        past_months = data_chunk_groups[i-num_past_months:i]
        past_months = pd.concat([group for _, group in past_months])
        past_months = past_months.reset_index(drop=True)

        _, curr_months = data_chunk_groups[i]
        # curr_months = pd.concat([group for _, group in curr_months])
        # curr_months = curr_months.reset_index(drop=True)

        # Get the top 3 most correlated futures
        correlation_matrix = past_months.drop(['contract_month'], axis=1).corr()
        correlation_matrix = correlation_matrix['job_index']
        correlation_matrix = correlation_matrix.drop(['job_index', 'Date']).dropna()

        futures = correlation_matrix.abs().sort_values(ascending=False).head(3).index
        futures = futures.tolist()

        # Run Regression
        X = past_months[futures]
        X_base = np.arange(len(past_months)).reshape(-1, 1)
        y = past_months['job_index']
        # Convert to log returns
        X = np.log(X) - np.log(X.shift(1))
        y = np.log(y) - np.log(y.shift(1))
        X = X.dropna()
        y = y.dropna()
        X_base = X_base[-len(y):]

        model.fit(X, y)
        prediction_df = pd.DataFrame(columns=['Date', 'job_index', 'prediction'])
        prediction_df['Date'] = curr_months['Date']
        prediction_df['job_index'] = curr_months['job_index']
        yhat = model.predict(curr_months[futures])
        prediction_df['prediction'] = np.exp(yhat + np.log(curr_months['job_index'].shift(1)))

        model.fit(X_base, y)
        xhat = np.arange(len(curr_months)).reshape(-1, 1)
        yhat = model.predict(xhat)
        prediction_df['baseline'] = np.exp(yhat + np.log(curr_months['job_index'].shift(1)))

        predictions_list.append(prediction_df)

    # Merge all predictions using date
    predictions_df = pd.concat(predictions_list)
    predictions_list = []

    return model, predictions_df

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

def loss(prediction_df, col1='job_index', col2='prediction'):
    '''MSE'''
    return np.mean((prediction_df[col1] - prediction_df[col2])**2)

test_filepath = "data/test/"

list_of_models = [RandomForestRegressor(), GradientBoostingRegressor(), AdaBoostRegressor(), BaggingRegressor(), SVR(), KNeighborsRegressor()]
for model in list_of_models:
    num_past_months = 3
    model, predictions = train_model(model, num_past_months=num_past_months)
    # model, baseline_pred = train_baseline(model, num_past_months=num_past_months)
    predictions.to_csv(f"{test_filepath}{model.__class__.__name__}_{num_past_months}_predictions.csv", index=False)
    # baseline_pred.to_csv(f"{test_filepath}{model.__class__.__name__}_{num_past_months}_baseline_predictions.csv", index=False)
    print(f"{model.__class__.__name__} Loss: {loss(predictions)}")
    print(f"{model.__class__.__name__} [BASELINE] Loss: {loss(predictions, col2='baseline')}")

Training RandomForestRegressor model
RandomForestRegressor Loss: 0.3145311766929341
RandomForestRegressor [BASELINE] Loss: 0.18191666505998966
Training GradientBoostingRegressor model
GradientBoostingRegressor Loss: 0.3323335055962404
GradientBoostingRegressor [BASELINE] Loss: 0.2028681956991292
Training AdaBoostRegressor model
AdaBoostRegressor Loss: 0.3848953868901313
AdaBoostRegressor [BASELINE] Loss: 0.20613293644650124
Training BaggingRegressor model
BaggingRegressor Loss: 0.33081935931611445
BaggingRegressor [BASELINE] Loss: 0.1857212042369269
Training SVR model
SVR Loss: 0.2623129387639819
SVR [BASELINE] Loss: 0.2623129387639819
Training KNeighborsRegressor model
KNeighborsRegressor Loss: 0.12846436075943798
KNeighborsRegressor [BASELINE] Loss: 0.1635314564365186


In [29]:
# Visualize output from all models using plotly with selectable boxes for each regressor
model_dict = {}
for model in list_of_models:
    predictions = pd.read_csv(f"{test_filepath}{model.__class__.__name__}_{num_past_months}_predictions.csv")
    predictions['Date'] = pd.to_datetime(predictions['Date'])
    model_dict[f"{model.__class__.__name__}"] = predictions

# Use Plotly to create a dropdown menu where you can select which model to look at. It will pull up the correct model and then show the regression results based on the prediction csv stored in model_dict.
fig = go.Figure()
for model in list_of_models:
    model_name = model.__class__.__name__
    predictions = model_dict[f"{model_name}"]
    # Fill in NaN
    predictions.fillna(method='ffill', inplace=True)
    fig.add_trace(go.Scatter(x=predictions['Date'], y=predictions['prediction'], mode='lines', name=f"{model_name}"))
    fig.add_trace(go.Scatter(x=predictions['Date'], y=predictions['baseline'], mode='lines', name=f"{model_name} [BASELINE]"))
fig.add_trace(go.Scatter(x=predictions['Date'], y=predictions['job_index'], mode='lines', name=f"Actual Job Index"))

fig.update_layout(title="Predictions vs Actual Job Index", xaxis_title="Date", yaxis_title="Job Index")
fig.show()