In [95]:
# Import cell
import pathlib
from collections import defaultdict
import json
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import numpy as np
import os
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

###  regress targetting returnss, and take regressor from ´company´ to ´similarity_score´ but also : company, Sector, DJIA previous year return.

In [96]:
def train_model_and_save_predictions(train_end_year, prediction_year, data_path, output_folder):
    """
    Trains a regression model using data up to train_end_year, writes the regression summary to a file
    in the output folder, computes MSE on both the full test set and the re-prediction for prediction_year,
    predicts returns for prediction_year, ranks the stocks, and saves the predictions.
    
    Parameters:
      train_end_year: int - last year to include in training data.
      prediction_year: int - the year for which predictions are made.
      data_path: str - path to the merged data CSV.
      output_folder: str - folder where the predictions and summary file will be saved.
    
    Returns:
      results: the fitted regression model.
    """
    # Load the merged data
    df = pd.read_csv(data_path)

    # Keep all columns including Company, Sector, and DJIA_Return_Previous_Year
    columns_to_keep = [
        "Year", "Company", "sentiment_score_positive", "sentiment_score_negative",
        "sentiment_score_polarity", "sentiment_score_subjectivity", "similarity_score",
        "DJIA_Return_Previous_Year", "Yearly_Return", "Sector"
    ]
    df = df[columns_to_keep]

    # One-hot encode the 'Company' column
    encoder_company = OneHotEncoder(sparse_output=False, drop='first')
    company_encoded = encoder_company.fit_transform(df[['Company']])
    company_encoded_df = pd.DataFrame(company_encoded, columns=encoder_company.get_feature_names_out(['Company']))

    # One-hot encode the 'Sector' column
    encoder_sector = OneHotEncoder(sparse_output=False, drop='first')
    sector_encoded = encoder_sector.fit_transform(df[['Sector']])
    sector_encoded_df = pd.DataFrame(sector_encoded, columns=encoder_sector.get_feature_names_out(['Sector']))

    # Concatenate the encoded columns with the original dataframe
    df = pd.concat([df.reset_index(drop=True), 
                    company_encoded_df.reset_index(drop=True), 
                    sector_encoded_df.reset_index(drop=True)], axis=1)

    # Define target and features
    target = "Yearly_Return"
    features = [
        "sentiment_score_positive", "sentiment_score_negative",
        "sentiment_score_polarity", "sentiment_score_subjectivity", "similarity_score",
        "DJIA_Return_Previous_Year"
    ] + list(company_encoded_df.columns) + list(sector_encoded_df.columns)

    # Sort data by Year to maintain time order
    df.sort_values("Year", inplace=True)

    # Split data: training on data up to train_end_year and testing on later years
    train_data = df[df["Year"] <= train_end_year]
    test_data  = df[df["Year"] > train_end_year]

    X_train = train_data[features]
    y_train = train_data[target]

    # Build and train the model
    X_train_const = sm.add_constant(X_train)
    model = sm.OLS(y_train, X_train_const)
    results = model.fit()

    # Write the regression summary to a file in the output folder
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    summary_file = os.path.join(output_folder, f"regression_summary_{prediction_year}.txt")
    with open(summary_file, "w") as f:
        f.write(results.summary().as_text())
    print(f"Regression summary saved to {summary_file}")

    # Evaluate on full test set (years > train_end_year)
    X_test = test_data[features]
    y_test = test_data[target]
    X_test_const = sm.add_constant(X_test)
    y_pred = results.predict(X_test_const)
    mse_test = mean_squared_error(y_test, y_pred)
    print(f"MSE on full test set (years > {train_end_year}): {mse_test:.4f}")

    # Predict for the specific prediction_year
    prediction_data = df[df["Year"] == prediction_year].copy()
    X_pred = prediction_data[features]
    # Align columns with training data
    X_pred = X_pred.reindex(columns=X_train.columns, fill_value=0)
    X_pred_const = sm.add_constant(X_pred, has_constant='add')
    predicted_returns = results.predict(X_pred_const)
    prediction_data['Predicted_Return'] = predicted_returns

    # Calculate MSE for the re-prediction on prediction_year
    mse_prediction = mean_squared_error(prediction_data[target], prediction_data['Predicted_Return'])
    print(f"MSE on re-prediction for {prediction_year}: {mse_prediction:.4f}")

    # Keep only necessary columns and add rankings
    prediction_data = prediction_data[['Year', 'Company', 'Yearly_Return', 'Predicted_Return']]
    prediction_data['Predicted_Return_Rank'] = prediction_data['Predicted_Return'].rank(method='min', ascending=False)
    prediction_data['Yearly_Return_Rank'] = prediction_data['Yearly_Return'].rank(method='min', ascending=False)

    # Save predictions sorted by predicted and actual return ranks
    file_pred = os.path.join(output_folder, f"{prediction_year}_by_predicted.csv")
    file_actual = os.path.join(output_folder, f"{prediction_year}_by_actual.csv")
    prediction_data.sort_values(by='Predicted_Return_Rank').to_csv(file_pred, index=False)
    prediction_data.sort_values(by='Yearly_Return_Rank').to_csv(file_actual, index=False)
    print(f"Predictions for {prediction_year} saved successfully in {output_folder}!")

    return results

def compute_strategy_returns(prediction_file, djia_file, year):
    """
    Computes the long-short strategy return using the predictions file,
    and compares it to the DJIA return for the given year.
    
    Parameters:
      prediction_file: str - path to the predictions CSV sorted by predicted returns.
      djia_file: str - path to the DJIA returns CSV.
      year: int - the year for which to compute the returns.
      
    Returns:
      A tuple (strategy_return, djia_return).
    """
    # Load the predictions file
    df_pred = pd.read_csv(prediction_file)
    n = len(df_pred)
    n_quartile = int(np.ceil(n * 0.25))
    
    # Sort by Predicted_Return_Rank (lowest rank = highest predicted return)
    df_pred_sorted = df_pred.sort_values(by='Predicted_Return_Rank')
    long_positions = df_pred_sorted.head(n_quartile)
    short_positions = df_pred_sorted.tail(n_quartile)
    
    long_return = long_positions['Yearly_Return'].mean()
    short_return = short_positions['Yearly_Return'].mean()
    
    # Long-short portfolio return
    strategy_return = long_return - short_return

    # Load DJIA data and extract the DJIA return for the year
    df_djia = pd.read_csv(djia_file)
    df_djia['Year'] = pd.to_datetime(df_djia['Date']).dt.year
    djia_year = df_djia[df_djia['Year'] == year]
    if djia_year.empty:
        raise ValueError(f"No DJIA return data found for {year}.")
    djia_return = djia_year.iloc[0]['^DJI']
    
    # Output the results
    print(f"{year} Trading Strategy Results:")
    print(f"Strategy Return: {strategy_return * 100:.2f}%")
    print(f"DJIA Return: {djia_return * 100:.2f}%")
    print(f"Excess Return (Strategy vs DJIA): {(strategy_return - djia_return) * 100:.2f}%")
    
    return strategy_return, djia_return

In [100]:
# Example usage:
# For a simulation in 2021 (from the perspective of someone at end of 2020):
model_results_2021 = train_model_and_save_predictions(
    train_end_year=2020,
    prediction_year=2021,
    data_path="trading_strat_data/merged_data.csv",
    output_folder="trading_strat_data/2021_returns_predictions"
)
compute_strategy_returns(
    prediction_file="trading_strat_data/2021_returns_predictions/2021_by_predicted.csv",
    djia_file="trading_strat_data/yearly_djia_returns.csv",
    year=2021
)

Regression summary saved to trading_strat_data/2021_returns_predictions/regression_summary_2021.txt
MSE on full test set (years > 2020): 0.1263
MSE on re-prediction for 2021: 0.0552
Predictions for 2021 saved successfully in trading_strat_data/2021_returns_predictions!
2021 Trading Strategy Results:
Strategy Return: 29.28%
DJIA Return: 20.23%
Excess Return (Strategy vs DJIA): 9.05%


(np.float64(0.29283545349172885), np.float64(0.2023038738497948))

In [101]:
# Example usage:
# For a simulation in 2022 (from the perspective of someone at end of 2021):
model_results_2022 = train_model_and_save_predictions(
    train_end_year=2021,
    prediction_year=2022,
    data_path="trading_strat_data/merged_data.csv",
    output_folder="trading_strat_data/2022_returns_predictions"
)
compute_strategy_returns(
    prediction_file="trading_strat_data/2022_returns_predictions/2022_by_predicted.csv",
    djia_file="trading_strat_data/yearly_djia_returns.csv",
    year=2022
)

Regression summary saved to trading_strat_data/2022_returns_predictions/regression_summary_2022.txt
MSE on full test set (years > 2021): 0.1450
MSE on re-prediction for 2022: 0.1697
Predictions for 2022 saved successfully in trading_strat_data/2022_returns_predictions!
2022 Trading Strategy Results:
Strategy Return: -40.18%
DJIA Return: -9.40%
Excess Return (Strategy vs DJIA): -30.78%


(np.float64(-0.4017931980924885), np.float64(-0.0939675574098246))

In [102]:
# Example usage:
# For a simulation in 2023 (from the perspective of someone at end of 2022):
model_results_2023 = train_model_and_save_predictions(
    train_end_year=2022,
    prediction_year=2023,
    data_path="trading_strat_data/merged_data.csv",
    output_folder="trading_strat_data/2023_returns_predictions"
)
compute_strategy_returns(
    prediction_file="trading_strat_data/2023_returns_predictions/2023_by_predicted.csv",
    djia_file="trading_strat_data/yearly_djia_returns.csv",
    year=2023
)

Regression summary saved to trading_strat_data/2023_returns_predictions/regression_summary_2023.txt
MSE on full test set (years > 2022): 0.1467
MSE on re-prediction for 2023: 0.1621
Predictions for 2023 saved successfully in trading_strat_data/2023_returns_predictions!
2023 Trading Strategy Results:
Strategy Return: 61.29%
DJIA Return: 13.74%
Excess Return (Strategy vs DJIA): 47.55%


(np.float64(0.6128866478553259), np.float64(0.1374069585310986))

In [103]:
# Example usage:
# For a simulation in 2024 (from the perspective of someone at end of 2023):
model_results_2024 = train_model_and_save_predictions(
    train_end_year=2023,
    prediction_year=2024,
    data_path="trading_strat_data/merged_data.csv",
    output_folder="trading_strat_data/2024_returns_predictions"
)
compute_strategy_returns(
    prediction_file="trading_strat_data/2024_returns_predictions/2024_by_predicted.csv",
    djia_file="trading_strat_data/yearly_djia_returns.csv",
    year=2024
)

Regression summary saved to trading_strat_data/2024_returns_predictions/regression_summary_2024.txt


ValueError: shapes (30,43) and (44,) not aligned: 43 (dim 1) != 44 (dim 0)