In [14]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go


In [15]:
futures_df = pd.read_csv('data/clean/all.csv')
futures_df

Unnamed: 0,Date,crude,feeder,lean_hogs,cattle,london_coffee,lumber,oats,orange,us_cocoa,us_coffee_c,soybean_oil,soybeans,sugar_11,wheat
0,2020-01-01,61.33,145.50,71.25,125.97,,404.9,,,,,34.82,943.50,,559.50
1,2020-01-02,60.95,144.65,71.55,125.78,1380.0,406.7,298.00,100.75,2532.0,,35.24,944.25,13.13,560.25
2,2020-01-03,62.82,143.35,68.55,124.72,1372.0,403.6,290.75,99.40,2519.0,,35.08,930.50,13.31,554.50
3,2020-01-06,63.04,147.43,68.62,127.28,1352.0,397.0,294.25,99.00,2484.0,,34.52,932.75,13.73,550.00
4,2020-01-07,62.51,145.90,69.22,126.53,1364.0,395.4,293.25,99.65,2533.0,,34.74,935.00,13.59,550.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335,2024-10-28,67.38,246.95,80.63,190.18,4502.0,535.5,379.75,530.95,6906.0,252.35,42.69,986.00,21.96,558.75
1336,2024-10-29,67.21,244.03,82.82,189.30,4398.0,543.0,384.00,511.50,7264.5,248.10,42.80,979.00,22.08,570.50
1337,2024-10-30,68.61,241.97,84.38,188.40,4453.0,553.5,390.75,505.35,7388.0,249.60,43.81,991.25,22.22,573.25
1338,2024-10-31,69.26,241.32,83.80,193.00,4369.0,549.5,394.50,529.75,7338.5,245.90,45.14,994.50,22.74,570.50


In [16]:
job_df = pd.read_csv("data/clean/indeed_job_index.csv")
job_df

Unnamed: 0,Date,Country,Value
0,2020-02-01,United States,100.00
1,2020-02-02,United States,99.98
2,2020-02-03,United States,99.97
3,2020-02-04,United States,100.03
4,2020-02-05,United States,100.12
...,...,...,...
1724,2024-10-21,United States,109.82
1725,2024-10-22,United States,109.67
1726,2024-10-23,United States,109.57
1727,2024-10-24,United States,109.43


In [17]:
futures_df['Date'] = pd.to_datetime(futures_df['Date'])
job_df['Date'] = pd.to_datetime(job_df['Date'])

futures_df = futures_df.sort_values(by='Date')
job_df = job_df.sort_values(by='Date')

futures_df = futures_df.reset_index(drop=True)
job_df = job_df.reset_index(drop=True)

In [18]:
# Calculate correlation matrix between futures and job index
# Use a sliding window of 3 months in the past to predict next month

start_date = max(job_df['Date'].iloc[0], futures_df['Date'].iloc[0])
end_date = min(job_df['Date'].iloc[-1], futures_df['Date'].iloc[-1])

futures_df = futures_df[(futures_df['Date'] >= start_date) & (futures_df['Date'] <= end_date)]
job_df = job_df[(job_df['Date'] >= start_date) & (job_df['Date'] <= end_date)]
job_df = job_df[['Date', 'Value']]
job_df = job_df.rename(columns={'Value': 'job_index'})

# Join the two dataframes on Date, and use backfill to fill in missing values
df = pd.merge(futures_df, job_df, on='Date', how='inner')
df = df.fillna(method='bfill')

print(start_date, end_date)
print(len(df))
df.to_csv('data/train/merged.csv', index=False)
df

2020-02-01 00:00:00 2024-10-25 00:00:00
1311


Unnamed: 0,Date,crude,feeder,lean_hogs,cattle,london_coffee,lumber,oats,orange,us_cocoa,us_coffee_c,soybean_oil,soybeans,sugar_11,wheat,job_index
0,2020-02-03,50.28,136.65,56.30,121.67,1302.0,422.8,300.25,98.30,2734.0,272.05,30.65,877.00,14.89,555.50,99.97
1,2020-02-04,49.80,137.50,56.75,121.62,1290.0,412.7,308.00,97.45,2757.0,272.05,31.10,879.50,14.71,557.25,100.03
2,2020-02-05,50.92,135.68,57.10,120.75,1284.0,428.3,304.50,97.30,2790.0,272.05,31.69,880.00,14.73,562.00,100.12
3,2020-02-06,51.14,135.90,57.70,121.12,1297.0,433.2,304.75,98.20,2849.0,272.05,31.61,881.00,14.74,556.25,100.24
4,2020-02-07,50.55,135.20,57.10,121.33,1290.0,440.2,303.00,98.35,2898.0,272.05,31.34,882.00,14.92,558.75,100.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1306,2024-10-21,70.04,244.25,78.28,187.00,4503.0,526.0,379.50,468.15,7388.5,251.70,42.39,989.75,21.83,572.25,109.82
1307,2024-10-22,72.09,249.43,79.13,188.05,4425.0,523.5,383.00,487.00,7186.0,249.85,43.69,1000.50,21.73,576.00,109.67
1308,2024-10-23,70.77,243.82,80.18,187.63,4443.0,525.0,380.50,500.10,6977.5,252.35,43.39,1005.00,22.34,578.50,109.57
1309,2024-10-24,70.19,249.40,78.65,189.30,4337.0,529.5,376.75,506.95,6732.0,245.45,44.33,1005.00,22.20,581.50,109.43


In [19]:
# Make a pandas function that takes in the date and creates a new column specifying the contract month
# A contract month starts the second Friday of the previous month and ends the second Friday of the current month.
# Ex: Feburary contract starts the second Friday of January and ends the second Friday of February.
from datetime import datetime, timedelta
def get_second_friday(year, month):
    """
    Given a year and month, returns the date of the second Friday of that month.
    """
    # Start from the first day of the month
    first_day = datetime(year, month, 1)
    # Find the first Friday in the month
    first_friday = first_day + timedelta(days=(4 - first_day.weekday() + 7) % 7)
    # Add 7 days to get the second Friday
    second_friday = first_friday + timedelta(days=7)
    return second_friday

def determine_contract_month(date):
    """
    Determine the contract month based on the given date.
    A contract month starts the second Friday of the previous month and ends the second Friday of the current month.
    """
    year = date.year
    month = date.month

    # Get the second Friday of the current month
    current_second_friday = get_second_friday(year, month)

    # Get the second Friday of the previous month
    if month == 1:  # If January, go to December of the previous year
        previous_second_friday = get_second_friday(year - 1, 12)
    else:
        previous_second_friday = get_second_friday(year, month - 1)

    # Determine the contract month
    if date < current_second_friday and date >= previous_second_friday:
        contract_month = current_second_friday.strftime("%B")
    else:
        # If date is on or after the current second Friday, contract month is the next month
        contract_month = (current_second_friday + timedelta(days=30)).strftime("%B")

    return contract_month

def add_contract_month_column(df, date_column):
    """
    Adds a new column 'Contract_Month' to the DataFrame indicating the contract month for each date.
    """
    df['contract_month'] = df[date_column].apply(determine_contract_month)
    return df

In [20]:
# Third Friday of every month is when futures expires, roll over the week before (Second Friday)
# Calculate the past 3 months correlation data, and take the top 3 most correlated futures
# Then use these correlated futures and the past 3 months of data to plot linear regression for each day.

regression_df = pd.DataFrame(columns=['Date', 'job_index', 'futures1_name', 'futures1_price', 'futures2_name', 'futures2_price', 'futures3_name', 'futures3_price', 'prediction'])
regression_df['Date'] = df['Date']
regression_df["job_index"] = df["job_index"]
add_contract_month_column(regression_df, 'Date')

Unnamed: 0,Date,job_index,futures1_name,futures1_price,futures2_name,futures2_price,futures3_name,futures3_price,prediction,contract_month
0,2020-02-03,99.97,,,,,,,,February
1,2020-02-04,100.03,,,,,,,,February
2,2020-02-05,100.12,,,,,,,,February
3,2020-02-06,100.24,,,,,,,,February
4,2020-02-07,100.20,,,,,,,,February
...,...,...,...,...,...,...,...,...,...,...
1306,2024-10-21,109.82,,,,,,,,November
1307,2024-10-22,109.67,,,,,,,,November
1308,2024-10-23,109.57,,,,,,,,November
1309,2024-10-24,109.43,,,,,,,,November


In [21]:
# Loop through each chunk in contract_month
from sklearn.linear_model import LinearRegression

chunk_starts = regression_df['contract_month'].ne(regression_df['contract_month'].shift()).cumsum()
data_chunk_groups = list(df.groupby(chunk_starts))

num_past_months = 3

regression_modified_chunks = []
for i in range(num_past_months, len(data_chunk_groups)):
    past_months = data_chunk_groups[i-num_past_months:i]
    past_months = pd.concat([group for _, group in past_months])
    past_months = past_months.reset_index(drop=True)

    # Get the top 3 most correlated futures
    correlation_matrix = past_months.corr()
    correlation_matrix = correlation_matrix['job_index']
    correlation_matrix = correlation_matrix.drop(['job_index', 'Date']).dropna()

    futures = correlation_matrix.abs().sort_values(ascending=False).head(3).index
    futures = futures.tolist()

    modified_chunk_df = pd.DataFrame(columns=regression_df.columns)
    modified_chunk_df['Date'] = past_months['Date']
    modified_chunk_df['job_index'] = past_months['job_index']
    for i in range(3):
        modified_chunk_df[f'futures{i+1}_name'] = futures[i]
        modified_chunk_df[f'futures{i+1}_price'] = past_months[futures[i]]

    # Y = past_3_months['job_index']
    modified_chunk_df['prediction'] = 0
    add_contract_month_column(modified_chunk_df, 'Date')

    regression_modified_chunks.append(modified_chunk_df)

modified_df = pd.concat(regression_modified_chunks, ignore_index=True)
modified_df


Unnamed: 0,Date,job_index,futures1_name,futures1_price,futures2_name,futures2_price,futures3_name,futures3_price,prediction,contract_month
0,2020-02-03,99.97,lumber,422.80,orange,98.30,sugar_11,14.89,0,February
1,2020-02-04,100.03,lumber,412.70,orange,97.45,sugar_11,14.71,0,February
2,2020-02-05,100.12,lumber,428.30,orange,97.30,sugar_11,14.73,0,February
3,2020-02-06,100.24,lumber,433.20,orange,98.20,sugar_11,14.74,0,February
4,2020-02-07,100.20,lumber,440.20,orange,98.35,sugar_11,14.92,0,February
...,...,...,...,...,...,...,...,...,...,...
3778,2024-10-04,112.42,lean_hogs,84.03,feeder,249.27,orange,459.15,0,October
3779,2024-10-07,112.24,lean_hogs,83.97,feeder,249.15,orange,471.50,0,October
3780,2024-10-08,112.24,lean_hogs,84.13,feeder,250.30,orange,464.10,0,October
3781,2024-10-09,112.25,lean_hogs,83.93,feeder,248.68,orange,466.55,0,October


In [22]:
modified_df.to_csv("data/train_data/train_data.csv", index=False)

OSError: Cannot save file into a non-existent directory: 'data\train_data'