# Exploratory Data Analysis
> **Warning!** Please run `01_cleaning.ipynb` first if you haven't already

In [6]:
import numpy as np
import pandas as pd
from functions.constants import BM_NAME, STARTDATE, ENDDATE, N_THRESHOLD_BPS,DATA_DIR 

In [7]:
active_returns_path = DATA_DIR + BM_NAME + "_active_returns.csv"
active_returns = pd.read_csv(active_returns_path, index_col=0, parse_dates=True)
print("Loaded active returns from", active_returns_path)
active_returns_thresholded_path = DATA_DIR + BM_NAME + "_active_returns_thresholded_" + str(N_THRESHOLD_BPS) + "bps.csv"
active_returns_thresholded = pd.read_csv(active_returns_thresholded_path, index_col=0, parse_dates=True)
print("Loaded active returns thresholded from", active_returns_thresholded_path)

Loaded active returns from ./../data/SP500_active_returns.csv
Loaded active returns thresholded from ./../data/SP500_active_returns_thresholded_100bps.csv


### Previewing the thresholded data 

In [8]:
TEST_TICKER = "GS UN" # Goldman Sachs--also try "AAPL UW" and "JPM UN"
TEST_PERIODS = ["1b", "1w", "1m", "1q", "1y"]
period_columns = ["active_returns_" + period for period in TEST_PERIODS]
test_ticker_df = active_returns_thresholded[active_returns_thresholded["Ticker"] == TEST_TICKER]
test_ticker_df
#True and False counts for each period 
true_counts = test_ticker_df[period_columns].sum()
false_counts = len(test_ticker_df) - true_counts
print("Col Name           || True Count || False Count")
for col in period_columns:
    print(f"{col:<18} || {true_counts[col]:<10} || {false_counts[col]}")

Col Name           || True Count || False Count
active_returns_1b  || 408.0      || 2044.0
active_returns_1w  || 800.0      || 1652.0
active_returns_1m  || 806.0      || 1646.0
active_returns_1q  || 805.0      || 1647.0
active_returns_1y  || 801.0      || 1651.0


In [9]:
# Hyperparameter for test period
TEST_PERIOD = "1b"  # This can be set to different periods like "1b", "1w", "1m", "1q", "1y"
shift_bizdays = 1
#for our given TEST_TICKER let us construct a simple strategy that forecasts 1 if yesterday's active_returns_1b was 1, 0 if it was 0
test_ticker_df = active_returns_thresholded[active_returns_thresholded["Ticker"] == TEST_TICKER]
test_ticker_df = test_ticker_df[["Ticker", "Date", f"active_returns_{TEST_PERIOD}"]] 
#soort in ascending by Ticker first then Date
test_ticker_df = test_ticker_df.sort_values(["Ticker", "Date"])
test_ticker_df[f"active_returns_{TEST_PERIOD}_forecast_dumb"] = test_ticker_df[f"active_returns_{TEST_PERIOD}"].shift(shift_bizdays)
#drop row where forecast column is NaN
test_ticker_df = test_ticker_df.dropna()
#measure precision and recall of this dumb model, get f1 score and accuracy
true_positive = len(test_ticker_df[(test_ticker_df[f"active_returns_{TEST_PERIOD}"] == 1) & (test_ticker_df[f"active_returns_{TEST_PERIOD}_forecast_dumb"] == 1)])
false_positive = len(test_ticker_df[(test_ticker_df[f"active_returns_{TEST_PERIOD}"] == 0) & (test_ticker_df[f"active_returns_{TEST_PERIOD}_forecast_dumb"] == 1)])
true_negative = len(test_ticker_df[(test_ticker_df[f"active_returns_{TEST_PERIOD}"] == 0) & (test_ticker_df[f"active_returns_{TEST_PERIOD}_forecast_dumb"] == 0)])
false_negative = len(test_ticker_df[(test_ticker_df[f"active_returns_{TEST_PERIOD}"] == 1) & (test_ticker_df[f"active_returns_{TEST_PERIOD}_forecast_dumb"] == 0)])
precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)

print(f"Dumb Momentum Model that forecasts +1 if previous period's active_returns_{TEST_PERIOD} was +1, 0 if it was 0. Specific to", TEST_TICKER)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", 2 * precision * recall / (precision + recall))
print("Accuracy:", (true_positive + true_negative) / (true_positive + true_negative + false_positive + false_negative))
print("True Positive:", true_positive)
print("False Positive:", false_positive)
print("True Negative:", true_negative)


Dumb Momentum Model that forecasts +1 if previous period's active_returns_1b was +1, 0 if it was 0. Specific to GS UN
Precision: 0.19852941176470587
Recall: 0.19852941176470587
F1 Score: 0.19852941176470587
Accuracy: 0.7331701346389229
True Positive: 81
False Positive: 327
True Negative: 1716


In [10]:
TEST_PERIOD = "1b"  # This can be set to different periods like "1b", "1w", "1m", "1q", "1y"
shift_bizdays = 1
# Initialize counters for the global confusion matrix
global_true_positive = 0
global_false_positive = 0
global_true_negative = 0
global_false_negative = 0

for ticker in active_returns_thresholded["Ticker"].unique():
    # Filter the data for the current ticker
    ticker_df = active_returns_thresholded[active_returns_thresholded["Ticker"] == ticker]
    ticker_df = ticker_df[["Ticker", "Date", f"active_returns_{TEST_PERIOD}"]]  # Use the TEST_PERIOD here
    
    # Sort by Ticker and Date
    ticker_df = ticker_df.sort_values(["Ticker", "Date"])
    
    # Create forecast column based on the TEST_PERIOD
    ticker_df[f"active_returns_{TEST_PERIOD}_forecast_dumb"] = ticker_df[f"active_returns_{TEST_PERIOD}"].shift(shift_bizdays)
    
    # Drop rows with NaN in forecast column
    ticker_df = ticker_df.dropna()
    
    global_true_positive += len(ticker_df[(ticker_df[f"active_returns_{TEST_PERIOD}"] == 1) & (ticker_df[f"active_returns_{TEST_PERIOD}_forecast_dumb"] == 1)])
    global_false_positive += len(ticker_df[(ticker_df[f"active_returns_{TEST_PERIOD}"] == 0) & (ticker_df[f"active_returns_{TEST_PERIOD}_forecast_dumb"] == 1)])
    global_true_negative += len(ticker_df[(ticker_df[f"active_returns_{TEST_PERIOD}"] == 0) & (ticker_df[f"active_returns_{TEST_PERIOD}_forecast_dumb"] == 0)])
    global_false_negative += len(ticker_df[(ticker_df[f"active_returns_{TEST_PERIOD}"] == 1) & (ticker_df[f"active_returns_{TEST_PERIOD}_forecast_dumb"] == 0)])

# Calculate overall precision, recall, F1 score, and accuracy
precision = global_true_positive / (global_true_positive + global_false_positive) if (global_true_positive + global_false_positive) != 0 else 0
recall = global_true_positive / (global_true_positive + global_false_negative) if (global_true_positive + global_false_negative) != 0 else 0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (global_true_positive + global_true_negative) / (global_true_positive + global_true_negative + global_false_positive + global_false_negative)

# Print the overall metrics
print(f"Dumb Momentum Model Forecasting Across All Tickers for period {TEST_PERIOD}:")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("Accuracy:", accuracy)
print("True Positive:", global_true_positive)
print("False Positive:", global_false_positive)
print("True Negative:", global_true_negative)
print("False Negative:", global_false_negative)

Dumb Momentum Model Forecasting Across All Tickers for period 1b:
Precision: 0.24611793285380806
Recall: 0.24620671324422844
F1 Score: 0.24616231504418062
Accuracy: 0.6778748155700639
True Positive: 64841
False Positive: 198614
True Negative: 770879
False Negative: 198519


In [13]:
active_returns.dtypes

Ticker                object
Date                  object
active_returns_1b    float64
active_returns_1w    float64
active_returns_1m    float64
active_returns_1q    float64
active_returns_1y    float64
dtype: object

In [37]:
def featurize(df, period_chosen, num_trailing_periods, reduce_rows=True):
    '''
    Create features for forecasting active returns based on specified periods.

    This function generates a new DataFrame containing lagged features of active returns
    for a specified period. It can also reduce the number of rows based on the chosen period 
    to retain only the first business day of each week, month, quarter, or year.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input DataFrame containing at least the columns 'Ticker', 'Date', and 
        'active_returns_*' for various periods.

    period_chosen : str
        The period for which to create features. Must be one of:
        - '1b' : 1 business day
        - '1w' : 1 week
        - '1m' : 1 month
        - '1q' : 1 quarter
        - '1y' : 1 year

    num_trailing_periods : int
        The number of trailing periods to create lagged features for.

    reduce_rows : bool, optional
        If True, reduces the DataFrame to keep only the first business day of the week,
        month, quarter, or year based on the selected period. False means it keeps all rows.
        Defaults to True.

    Returns:
    --------
    pandas.DataFrame
        A new DataFrame containing:
        - Ticker
        - Date
        - ar_{period_chosen}_t (the value to be predicted)
        - ar_{period_chosen}_t_minus_{i} (for i in range(1, num_trailing_periods + 1))

    Notes:
    ------
    - The function handles missing data by dropping rows where there are NaN values 
      after creating lagged features.
    '''
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(['Ticker', 'Date'])
    col_name = f'active_returns_{period_chosen}'    
    # Create a new dataframe with only the required columns
    new_df = df[['Ticker', 'Date', col_name]].copy()
    new_df = new_df.rename(columns={col_name: f'ar_{period_chosen}_t'})
    if reduce_rows:
        if period_chosen == '1w':
            new_df = new_df.groupby(['Ticker', pd.Grouper(key='Date', freq='W-MON')]).first().reset_index()
        elif period_chosen == '1m':
            new_df = new_df.groupby(['Ticker', pd.Grouper(key='Date', freq='MS')]).first().reset_index()
        elif period_chosen == '1q':
            new_df = new_df.groupby(['Ticker', pd.Grouper(key='Date', freq='QS')]).first().reset_index()
        elif period_chosen == '1y':
            new_df = new_df.groupby(['Ticker', pd.Grouper(key='Date', freq='AS')]).first().reset_index()
    #create lagged features 
    for i in range(1, num_trailing_periods + 1):
        new_df[f'ar_{period_chosen}_t_minus_{i}'] = new_df.groupby('Ticker')[f'ar_{period_chosen}_t'].shift(i)
    new_df = new_df.dropna() #insufficient trailing periods
    return new_df    

## Example of featurization

In [41]:
featurized_active_returns_weekly = featurize(active_returns, "1w", 10)
featurized_active_returns_weekly

Unnamed: 0,Ticker,Date,ar_1w_t,ar_1w_t_minus_1,ar_1w_t_minus_2,ar_1w_t_minus_3,ar_1w_t_minus_4,ar_1w_t_minus_5,ar_1w_t_minus_6,ar_1w_t_minus_7,ar_1w_t_minus_8,ar_1w_t_minus_9,ar_1w_t_minus_10
10,A UN,2015-03-16,-0.008387,0.008395,0.030798,0.006078,-0.007792,0.012361,0.018158,-0.040724,-0.000753,-0.015568,-0.005181
11,A UN,2015-03-23,0.008647,-0.008387,0.008395,0.030798,0.006078,-0.007792,0.012361,0.018158,-0.040724,-0.000753,-0.015568
12,A UN,2015-03-30,-0.020086,0.008647,-0.008387,0.008395,0.030798,0.006078,-0.007792,0.012361,0.018158,-0.040724,-0.000753
13,A UN,2015-04-06,0.022483,-0.020086,0.008647,-0.008387,0.008395,0.030798,0.006078,-0.007792,0.012361,0.018158,-0.040724
14,A UN,2015-04-13,0.017339,0.022483,-0.020086,0.008647,-0.008387,0.008395,0.030798,0.006078,-0.007792,0.012361,0.018158
...,...,...,...,...,...,...,...,...,...,...,...,...,...
256022,ZTS UN,2024-09-02,-0.013294,-0.055311,-0.020422,0.055157,0.035131,0.000368,0.033582,-0.015599,0.033399,-0.006671,-0.070519
256023,ZTS UN,2024-09-09,0.025291,-0.013294,-0.055311,-0.020422,0.055157,0.035131,0.000368,0.033582,-0.015599,0.033399,-0.006671
256024,ZTS UN,2024-09-16,0.048479,0.025291,-0.013294,-0.055311,-0.020422,0.055157,0.035131,0.000368,0.033582,-0.015599,0.033399
256025,ZTS UN,2024-09-23,-0.002572,0.048479,0.025291,-0.013294,-0.055311,-0.020422,0.055157,0.035131,0.000368,0.033582,-0.015599


In [3]:
featurized_active_returns_thresholded_weekly = featurize(active_returns_thresholded, "1w", 10)
featurized_active_returns_thresholded_weekly

NameError: name 'featurize' is not defined

In [1]:
#write featurized_active_returns_thresholded_weekly to csv, put num trailing periods in filename as
featurized_active_returns_thresholded_weekly.to_csv(DATA_DIR + BM_NAME + "_featurized_active_returns_thresholded_weekly_10_periods.csv")

NameError: name 'featurized_active_returns_thresholded_weekly' is not defined