In [1]:
import numpy as np
import os
import pandas as pd
from datetime import date
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from datetime import datetime
from dateutil.relativedelta import relativedelta
import warnings
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
import numpy as np


In [2]:
#Keeping Track of Input Variables
path = '/Users/angelobenedicto/Documents/Commodities Hedger/Commodities'
target = 'CHK'
today = date.today()
hedge_period = 12
tts = 0.2

In [3]:
#Get List of Commodities
def list_csv_files(folder_path):
    csv_files = []
    for file in os.listdir(folder_path):
        if file.endswith('.csv'):
            csv_files.append(file)
    return csv_files

comms = list_csv_files(path)

In [4]:
#Create Total Comm DF
total_comm_df = pd.read_csv(f'/Users/angelobenedicto/Documents/Commodities Hedger/Commodities/{comms[0]}')

for item in range(len(comms)):
    if item == 0:
        pass
    else:
        comm_df = pd.read_csv(f'/Users/angelobenedicto/Documents/Commodities Hedger/Commodities/{comms[item]}')
        total_comm_df = total_comm_df.merge(comm_df, how='outer', on='DATE')

#Create Date Time For Time esries Analysis
total_comm_df['DATE'] = pd.to_datetime(total_comm_df['DATE'])

#Fetch Manually Maintained Code Lookup
comm_lookup = pd.read_excel('/Users/angelobenedicto/Documents/Commodities Hedger/Commodities/CommLookUp.xlsx')

#Get List of Columns
comms = list(total_comm_df.columns)
comms.remove('DATE')

#Get FRED Short Code
short_codes = pd.DataFrame(comms).merge(comm_lookup[['FRED Ticker', 'Code']], how='left', left_on=0, right_on='FRED Ticker')
short_codes = list(short_codes['Code'])

#Add 'DATE' to column names
short_codes.insert(0, 'DATE')

#Rename Columns to Short Codes
total_comm_df.columns=short_codes

In [5]:
# Determine the complete monthly date range from the earliest to the latest date in total_comm_df
date_range = pd.date_range(start=total_comm_df['DATE'].min(), end=total_comm_df['DATE'].max(), freq='MS')

# Set the 'DATE' column as the index for reindexing
total_comm_df.set_index('DATE', inplace=True)

# Reindex the dataframe with the complete monthly date range, filling missing values as NaN
continuous_total_comm_df = total_comm_df.reindex(date_range)

# If you want to have 'DATE' as a column instead of the index
continuous_total_comm_df.reset_index(inplace=True)
continuous_total_comm_df.rename(columns={'index': 'DATE'}, inplace=True)

#Turn DF into Time Series
continuous_total_comm_df.set_index('DATE', inplace=True)

#Filter Out Dates Without Target
start_date = continuous_total_comm_df[[target]].dropna().index.min()
end_date = continuous_total_comm_df[[target]].dropna().index.max()
continuous_total_comm_df = continuous_total_comm_df[start_date:end_date]

In [6]:
#Quick Check For Missing Dates

# Assuming continuous_total_comm_df is your DataFrame
start_date = continuous_total_comm_df.index.min()
end_date = continuous_total_comm_df.index.max()

# Generate a complete range of monthly dates from start to end
complete_date_range = pd.date_range(start=start_date, end=end_date, freq='MS')

# Find missing dates by checking which dates in the complete range are not in the DataFrame's index
missing_dates = complete_date_range.difference(continuous_total_comm_df.index)

print("Missing Dates:")
print(missing_dates)

#Drop all NaN Values
continuous_total_comm_df.dropna(inplace=True)

Missing Dates:
DatetimeIndex([], dtype='datetime64[ns]', freq='MS')


In [7]:
# Convert object columns that should be numeric
for col in continuous_total_comm_df.columns:  # List other columns if needed
    continuous_total_comm_df[col] = pd.to_numeric(continuous_total_comm_df[col], errors='coerce')

# Function to impute values based on the described logic
def impute_values(df, column):
    for i, value in enumerate(df[column]):
        if pd.isnull(value):  # Check if value is NaN (formerly ".")
            prev_val = df[column].iloc[:i].dropna().tail(1).values  # Value before the NaN
            next_val = df[column].iloc[i+1:].dropna().head(1).values  # Value after the NaN

            if prev_val.size > 0 and next_val.size > 0:
                # Average of the period before and the period after
                df.at[df.index[i], column] = np.mean([prev_val[0], next_val[0]])
            else:
                # Impute with the mean of that year
                year = df.index[i].year
                yearly_mean = df[column][df.index.year == year].mean()
                df.at[df.index[i], column] = yearly_mean

# Apply the imputation function to each column that requires it
for col in continuous_total_comm_df.columns:
    if continuous_total_comm_df[col].dtype == 'float64':  # Assuming you want to apply this to float columns
        impute_values(continuous_total_comm_df, col)

In [8]:
#Get Rolling Values
for comm in continuous_total_comm_df.columns:
    for period in [3, 6, 9, 12]:
        continuous_total_comm_df[f'{comm} Rolling {period}M'] = continuous_total_comm_df[comm].rolling(window=period).mean()
        
#Get Rolling Growth Values
for comm in continuous_total_comm_df.columns:
    for period in [3, 6, 9, 12]:
        continuous_total_comm_df[f'{comm} M-{period}'] = continuous_total_comm_df[comm].shift(+period)
        continuous_total_comm_df[f'{comm} M+{period} Growth'] = continuous_total_comm_df[comm] / continuous_total_comm_df[f'{comm} M-{period}'] - 1

#Drop NaN Values
continuous_total_comm_df.dropna(inplace=True)

#Create 3M Future Value of Target
continuous_total_comm_df[f'{target} Future Price'] = continuous_total_comm_df[target].shift(-hedge_period)

#Get Price Increases
continuous_total_comm_df['Increase'] = np.where(continuous_total_comm_df[f'{target} Future Price'] > continuous_total_comm_df[target], 1, 0)

#Create lin_reg_df
lin_reg_df = continuous_total_comm_df.drop('Increase', axis=1)

#Drop Target Future Price
continuous_total_comm_df.drop(f'{target} Future Price', axis=1, inplace=True)

  continuous_total_comm_df[f'{comm} M-{period}'] = continuous_total_comm_df[comm].shift(+period)
  continuous_total_comm_df[f'{comm} M+{period} Growth'] = continuous_total_comm_df[comm] / continuous_total_comm_df[f'{comm} M-{period}'] - 1
  continuous_total_comm_df[f'{comm} M-{period}'] = continuous_total_comm_df[comm].shift(+period)
  continuous_total_comm_df[f'{comm} M+{period} Growth'] = continuous_total_comm_df[comm] / continuous_total_comm_df[f'{comm} M-{period}'] - 1
  continuous_total_comm_df[f'{comm} M-{period}'] = continuous_total_comm_df[comm].shift(+period)
  continuous_total_comm_df[f'{comm} M+{period} Growth'] = continuous_total_comm_df[comm] / continuous_total_comm_df[f'{comm} M-{period}'] - 1
  continuous_total_comm_df[f'{comm} M-{period}'] = continuous_total_comm_df[comm].shift(+period)
  continuous_total_comm_df[f'{comm} M+{period} Growth'] = continuous_total_comm_df[comm] / continuous_total_comm_df[f'{comm} M-{period}'] - 1
  continuous_total_comm_df[f'{comm} M-{perio

In [9]:
def evaluate_model(model_name, y_true, y_pred):
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    print(f"{model_name} F1 Score: {f1}")
    print(f"{model_name} Confusion Matrix:")
    print(f"True Negatives (TN): {tn}")
    print(f"False Positives (FP): {fp}")
    print(f"False Negatives (FN): {fn}")
    print(f"True Positives (TP): {tp}\n")
    return f1

def run_multi_model(val_date, run_gradient_boost):
    val_date_ts = datetime.strptime(val_date, '%Y-%m')
    val_date_plus_one_month = val_date_ts + relativedelta(months=+1)
    val_date_next = val_date_plus_one_month.strftime('%Y-%m')

    df = continuous_total_comm_df[:val_date]
    X = df.drop('Increase', axis=1)
    y = df['Increase']
    scaler = StandardScaler().fit(X)
    X_scaled = scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=tts, random_state=42)

    f1_scores = []
    
    print('Test Set Results')
    print('')
    
    # Logistic Regression
    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train, y_train)
    y_pred_lr = lr_model.predict(X_test)
    f1_lr = evaluate_model("Logistic Regression", y_test, y_pred_lr)
    f1_scores.append(["Logistic Regression", f1_lr])

    # XGBoost
    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_test)
    f1_xgb = evaluate_model("XGBoost", y_test, y_pred_xgb)
    f1_scores.append(["XGBoost", f1_xgb])

    if run_gradient_boost:
        # Gradient Boosting with Grid Search
        parameters = {'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7], 'n_estimators': [100, 200]}
        gb_model = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=parameters, scoring='f1', cv=5)
        gb_model.fit(X_train, y_train)
        best_gb_model = gb_model.best_estimator_
        y_pred_gb = best_gb_model.predict(X_test)
        f1_gb = evaluate_model("Gradient Boosting (Best)", y_test, y_pred_gb)
        f1_scores.append(["Gradient Boosting (Best)", f1_gb])

    # Random Forest
    rf_model = RandomForestClassifier()
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)
    f1_rf = evaluate_model("Random Forest", y_test, y_pred_rf)
    f1_scores.append(["Random Forest", f1_rf])

    top_model, top_f1 = max(f1_scores, key=lambda x: x[1])

    forecast_model = {
        "Logistic Regression": lr_model,
        "XGBoost": xgb_model,
        "Random Forest": rf_model
    }[top_model]

    val_df = continuous_total_comm_df[val_date_next:'2023-02']
    X_val = val_df.drop('Increase', axis=1)
    y_val = val_df['Increase']
    X_val_scaled = scaler.transform(X_val)
    y_val_preds = forecast_model.predict(X_val_scaled)
    
    print('Hold Out Set Results:')
    print('')

    f1_val = evaluate_model(top_model, y_val, y_val_preds)

    return y_val_preds, y_val, val_df


In [10]:
y_val_preds, y_val, val_df = run_multi_model(val_date='2022-02', run_gradient_boost=False)  

Test Set Results

Logistic Regression F1 Score: 0.912621359223301
Logistic Regression Confusion Matrix:
True Negatives (TN): 12
False Positives (FP): 6
False Negatives (FN): 3
True Positives (TP): 47

XGBoost F1 Score: 0.9019607843137256
XGBoost Confusion Matrix:
True Negatives (TN): 12
False Positives (FP): 6
False Negatives (FN): 4
True Positives (TP): 46

Random Forest F1 Score: 0.8775510204081632
Random Forest Confusion Matrix:
True Negatives (TN): 13
False Positives (FP): 5
False Negatives (FN): 7
True Positives (TP): 43

Hold Out Set Results:

Logistic Regression F1 Score: 0.8571428571428571
Logistic Regression Confusion Matrix:
True Negatives (TN): 0
False Positives (FP): 0
False Negatives (FN): 3
True Positives (TP): 9



In [56]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer, r2_score

# Load the lin_reg_df and prepare it
lin_reg_df = lin_reg_df.reset_index(drop=True)  # Assuming lin_reg_df is already defined in your environment
lin_reg_df.dropna(subset=['CHK Future Price'], inplace=True)  # Drop rows with missing target variable

# Define features and target
X = lin_reg_df.drop(['CHK Future Price', 'DATE'], axis=1)  # 'DATE' is non-numeric and dropped
y = lin_reg_df['CHK Future Price']

# Initialize and cross-validate the linear regression model using sklearn
model_sklearn = LinearRegression()
kf = KFold(n_splits=5, shuffle=True, random_state=42)

def rmse_score(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def mape_score(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Define scorers
r2_scorer = make_scorer(r2_score)
rmse_scorer = make_scorer(rmse_score)
mape_scorer = make_scorer(mape_score)

# Efficiently compute scores in a single call per metric
r2_scores = cross_val_score(model_sklearn, X, y, cv=kf, scoring=r2_scorer)
rmse_scores = cross_val_score(model_sklearn, X, y, cv=kf, scoring=rmse_scorer)
mape_scores = cross_val_score(model_sklearn, X, y, cv=kf, scoring=mape_scorer)

print("Cross-Validated R² Scores:", r2_scores)
print("Cross-Validated RMSE Scores:", rmse_scores)
print("Cross-Validated MAPE Scores:", mape_scores)

# Fit the OLS model using statsmodels for detailed statistics
X_with_const = sm.add_constant(X)  # Add a constant to the model for the intercept
model_sm = sm.OLS(y, X_with_const).fit()

# Extract coefficients and p-values, calculate 1% impacts
coefficients = model_sm.params
p_values = model_sm.pvalues
one_percent_increases = X.mean() * 0.01
impact_on_target = one_percent_increases * coefficients.drop('const')  # Exclude intercept for percentage calculations

# Create and display the results DataFrame
impact_df = pd.DataFrame({
    'Feature': impact_on_target.index,
    '1% Increase Impact on CHK Future Price': impact_on_target.values,
    'P-Value': p_values.drop('const').values  # Exclude intercept's p-value
}).sort_values(by='1% Increase Impact on CHK Future Price', ascending=False)

impact_df[impact_df['P-Value'] > 0.05].head(60)


Cross-Validated R² Scores: [0.75911257 0.81483404 0.76609406 0.78155283 0.48374682]
Cross-Validated RMSE Scores: [0.13344031 0.11549493 0.11830034 0.11831872 0.20275877]
Cross-Validated MAPE Scores: [ 9.0742879   7.40972259  6.57757448  7.38930639 10.35608178]


Unnamed: 0,Feature,1% Increase Impact on CHK Future Price,P-Value
311,PAY Rolling 9M M+6 Growth,1.96658,0.144621
314,PAY Rolling 9M M-12,0.488217,0.419653
306,PAY Rolling 6M M-12,0.431294,0.352184
301,PAY Rolling 6M M+3 Growth,0.357604,0.194096
297,PAY Rolling 3M M+9 Growth,0.326148,0.295376
292,PAY Rolling 3M M-3,0.322448,0.145688
299,PAY Rolling 3M M+12 Growth,0.308272,0.237786
320,PAY Rolling 12M M-9,0.30437,0.507921
354,ELE Rolling 12M M-12,0.271718,0.457072
330,ELE Rolling 3M M-12,0.263233,0.053978
