# Exploratory Data Analysis
> **Warning!** Please run `01_cleaning.ipynb` first if you haven't already.

In [39]:
import numpy as np
import pandas as pd
from functions.constants import BM_NAME, STARTDATE, ENDDATE, N_THRESHOLD_BPS,DATA_DIR 

In [40]:
# Active-returns CSV for the benchmark named by BM_NAME.
active_returns_path = f"{DATA_DIR}{BM_NAME}_active_returns.csv"
active_returns = pd.read_csv(
    active_returns_path,
    index_col=0,
    parse_dates=True,
)
print("Loaded active returns from", active_returns_path)

# Thresholded variant; its file name embeds the N_THRESHOLD_BPS setting.
active_returns_thresholded_path = (
    f"{DATA_DIR}{BM_NAME}_active_returns_thresholded_{N_THRESHOLD_BPS}bps.csv"
)
active_returns_thresholded = pd.read_csv(
    active_returns_thresholded_path,
    index_col=0,
    parse_dates=True,
)
print("Loaded active returns thresholded from", active_returns_thresholded_path)

Loaded active returns from ./../data/SP500_active_returns.csv
Loaded active returns thresholded from ./../data/SP500_active_returns_thresholded_100bps.csv


### Previewing the thresholded data 

In [41]:
TEST_TICKER = "GS UN"  # Goldman Sachs -- also try "AAPL UW" and "JPM UN"
TEST_PERIODS = ["1b", "1w", "1m", "1q", "1y"]
period_columns = [f"active_returns_{period}" for period in TEST_PERIODS]

# Keep only the rows that belong to the ticker under inspection.
test_ticker_df = active_returns_thresholded[active_returns_thresholded["Ticker"] == TEST_TICKER]

# Count each flag value explicitly. Deriving the False count as
# `len(df) - true_count` would silently treat missing flags as False, so
# missing values are tallied on their own instead.
period_flags = test_ticker_df[period_columns]
true_counts = period_flags.eq(1).sum()
false_counts = period_flags.eq(0).sum()
missing_counts = period_flags.isna().sum()

print("Col Name           || True Count || False Count")
for col in period_columns:
    # int() keeps the counts rendering as whole numbers in the table.
    print(f"{col:<18} || {int(true_counts[col]):<10} || {int(false_counts[col])}")

# Only mention missing values when there are some, so the usual output stays compact.
if missing_counts.any():
    print("Missing values per column (excluded from both counts):")
    print(missing_counts[missing_counts > 0].to_string())

Col Name           || True Count || False Count
active_returns_1b  || 408.0      || 2044.0
active_returns_1w  || 800.0      || 1652.0
active_returns_1m  || 806.0      || 1646.0
active_returns_1q  || 805.0      || 1647.0
active_returns_1y  || 801.0      || 1651.0


In [None]:
# Hyperparameters for the single-ticker baseline.
TEST_PERIOD = "1b"  # This can be set to different periods like "1b", "1w", "1m", "1q", "1y"
shift_bizdays = 1  # number of rows the label is lagged by to form the forecast

label_col = f"active_returns_{TEST_PERIOD}"
forecast_col = f"active_returns_{TEST_PERIOD}_forecast_dumb"

# For TEST_TICKER, build a naive persistence strategy: forecast 1 when the
# label `shift_bizdays` rows earlier was 1, and 0 when it was 0.
# NOTE(review): this assumes one row per business day for the ticker -- confirm.
test_ticker_df = active_returns_thresholded[active_returns_thresholded["Ticker"] == TEST_TICKER]
test_ticker_df = test_ticker_df[["Ticker", "Date", label_col]]
# Sort ascending by Ticker, then Date, so a positive row shift looks back in time.
test_ticker_df = test_ticker_df.sort_values(["Ticker", "Date"])
test_ticker_df[forecast_col] = test_ticker_df[label_col].shift(shift_bizdays)
# Drop every row with a missing value: the leading rows that have no forecast,
# plus any row whose Ticker, Date or label is missing.
test_ticker_df = test_ticker_df.dropna()

# Confusion-matrix cells, counted from boolean masks.
actual = test_ticker_df[label_col]
forecast = test_ticker_df[forecast_col]
true_positive = int(((actual == 1) & (forecast == 1)).sum())
false_positive = int(((actual == 0) & (forecast == 1)).sum())
true_negative = int(((actual == 0) & (forecast == 0)).sum())
false_negative = int(((actual == 1) & (forecast == 0)).sum())

# Guard every ratio so a ticker/period with no positives (or no rows at all)
# reports 0 instead of raising ZeroDivisionError.
predicted_positive = true_positive + false_positive
actual_positive = true_positive + false_negative
total = true_positive + true_negative + false_positive + false_negative
precision = true_positive / predicted_positive if predicted_positive != 0 else 0
recall = true_positive / actual_positive if actual_positive != 0 else 0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (true_positive + true_negative) / total if total != 0 else 0

print(f"Dumb Momentum Model that forecasts +1 if previous period's active_returns_{TEST_PERIOD} was +1, 0 if it was 0. Specific to", TEST_TICKER)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("Accuracy:", accuracy)
print("True Positive:", true_positive)
print("False Positive:", false_positive)
print("True Negative:", true_negative)
print("False Negative:", false_negative)


Dumb Model that forecasts +1 if previous period's active_returns_1b was +1, 0 if it was 0. Specific to GS UN
Precision: 0.19852941176470587
Recall: 0.19852941176470587
F1 Score: 0.19852941176470587
Accuracy: 0.7331701346389229
True Positive: 81
False Positive: 327
True Negative: 1716


In [None]:
TEST_PERIOD = "1b"  # This can be set to different periods like "1b", "1w", "1m", "1q", "1y"
shift_bizdays = 1  # number of rows the label is lagged by to form the forecast

label_col = f"active_returns_{TEST_PERIOD}"
forecast_col = f"active_returns_{TEST_PERIOD}_forecast_dumb"

# Build the persistence forecast for every ticker in one pass:
#   1. sort by Ticker then Date so each ticker's rows are in time order,
#   2. lag the label within each ticker (groupby keeps one ticker's history
#      from leaking into the next ticker's first rows),
#   3. drop rows with any missing value (each ticker's leading rows have no
#      forecast; rows with a missing Ticker, Date or label are removed too).
# This replaces re-filtering the full frame once per ticker.
all_tickers_df = (
    active_returns_thresholded[["Ticker", "Date", label_col]]
    .sort_values(["Ticker", "Date"])
    .assign(**{forecast_col: lambda df: df.groupby("Ticker")[label_col].shift(shift_bizdays)})
    .dropna()
)

# Global confusion matrix, pooled across all tickers.
actual = all_tickers_df[label_col]
forecast = all_tickers_df[forecast_col]
global_true_positive = int(((actual == 1) & (forecast == 1)).sum())
global_false_positive = int(((actual == 0) & (forecast == 1)).sum())
global_true_negative = int(((actual == 0) & (forecast == 0)).sum())
global_false_negative = int(((actual == 1) & (forecast == 0)).sum())

# Calculate overall precision, recall, F1 score, and accuracy.
# Every ratio is guarded so an empty or all-negative sample reports 0
# instead of raising ZeroDivisionError.
global_total = (
    global_true_positive + global_true_negative + global_false_positive + global_false_negative
)
precision = global_true_positive / (global_true_positive + global_false_positive) if (global_true_positive + global_false_positive) != 0 else 0
recall = global_true_positive / (global_true_positive + global_false_negative) if (global_true_positive + global_false_negative) != 0 else 0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (global_true_positive + global_true_negative) / global_total if global_total != 0 else 0

# Print the overall metrics
print(f"Dumb Momentum Model Forecasting Across All Tickers for period {TEST_PERIOD}:")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("Accuracy:", accuracy)
print("True Positive:", global_true_positive)
print("False Positive:", global_false_positive)
print("True Negative:", global_true_negative)
print("False Negative:", global_false_negative)