# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import EarlyStopping
from statsmodels.tsa.arima.model import ARIMA
import warnings



# Read the two raw CSV inputs: the hedge-fund balance sheet (quarter-labelled
# rows) and the S&P 500 price history.
hedge_fund, sp500 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'all_hedge_funds_balance_sheet.csv',
        'HistoricalData_1731445986332.csv',
    )
)

# Create a function to map quarterly dates to end-of-quarter dates
def convert_quarter_to_date(quarter):
    """Return the quarter-end date string for a ``'YYYY:Qn'`` label.

    Args:
        quarter: A string with exactly one ``':'`` separating the year from
            the quarter code, e.g. ``'2020:Q3'``.

    Returns:
        The year followed by the month/day suffix of the last day of that
        quarter, e.g. ``'2020-09-30'``.

    Raises:
        ValueError: If ``quarter`` does not split into exactly two parts.
        KeyError: If the quarter code is not one of ``Q1``..``Q4``.
    """
    year_part, quarter_code = quarter.split(':')
    # Month-day suffix of the final calendar day of each quarter.
    suffix_by_quarter = dict(Q1='-03-31', Q2='-06-30', Q3='-09-30', Q4='-12-31')
    return f"{year_part}{suffix_by_quarter[quarter_code]}"

# Convert the hedge fund 'Date' column from 'YYYY:Qn' labels to quarter-end
# timestamps.
hedge_fund['Date'] = hedge_fund['Date'].apply(convert_quarter_to_date)
hedge_fund['Date'] = pd.to_datetime(hedge_fund['Date'], format='%Y-%m-%d')

# Convert S&P500 Dates
sp500['Date'] = pd.to_datetime(sp500['Date'], format='%m/%d/%Y')
sp500['Quarter'] = sp500['Date'].dt.to_period('Q')

# Aggregate S&P500 Data by Quarter.
# 'last' picks the last row of each group in *row order*, so the rows must be
# sorted by date first; otherwise a file stored newest-first would yield the
# earliest close of each quarter instead of the quarter-end close.
sp500_quarterly = (
    sp500.sort_values('Date', kind='mergesort')
    .groupby('Quarter')
    .agg({'Close/Last': 'last'})
    .reset_index()
)

# Add Quarterly Period to Hedge Fund Data
hedge_fund['Quarter'] = hedge_fund['Date'].dt.to_period('Q')

# Merge Hedge Fund with S&P500 Quarterly Data (only quarters present in both)
merged_data = pd.merge(hedge_fund, sp500_quarterly, on='Quarter', how='inner')

# Keep the merged rows in chronological order so that any row-offset logic
# applied to the saved file (e.g. "next quarter" lookups) is well defined.
merged_data = merged_data.sort_values('Date', kind='mergesort').reset_index(drop=True)

# Save the merged data to a new file
merged_data.to_csv('merged_data.csv', index=False)

# Display the first few rows of the merged data
print(merged_data.head())



warnings.filterwarnings("ignore")

# Load the dataset
data = pd.read_csv("merged_data.csv")

# ============================
# Step 1: Data Preprocessing
# ============================

# Ensure 'Date' is in datetime format and the rows are in chronological order:
# the target below is built by looking one row ahead, which is only meaningful
# when consecutive rows are consecutive periods.
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values('Date', kind='mergesort').reset_index(drop=True)

# Create the target variable: 1 when the next row's close is higher than the
# current row's close, else 0.
data['Next_Close'] = data['Close/Last'].shift(-1)
data['Target'] = (data['Next_Close'] > data['Close/Last']).astype(int)

# The most recent row has no following close (Next_Close is NaN). Comparing
# against NaN evaluates to False and would silently label that row 0, so rows
# without a known next close are dropped instead of being mislabeled.
data = (
    data[data['Next_Close'].notna()]
    .drop(columns=['Next_Close'])
    .reset_index(drop=True)
)

features = data.drop(columns=['Date', 'Quarter', 'Close/Last', 'Target'])
y = data['Target']

# Train-test split.
# shuffle=False keeps the split chronological (train = earliest 80%, test =
# latest 20%) so no future rows leak into training and the test set directly
# follows the training set in time.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, y, test_size=0.2, shuffle=False
)

# Feature Scaling.
# The scaler is fitted on the training rows only; the test rows are transformed
# with the training mean/std so test-set statistics do not leak into training.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=features.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=features.columns,
    index=X_test_raw.index,
)

# Full scaled feature matrix, kept for any code that still refers to `X`.
X = pd.concat([X_train, X_test])

# ============================
# Step 2: Define and Evaluate Models
# ============================

# Function to evaluate model performance
def evaluate_model(name, y_test, y_pred):
    """Print and return the classification scores of one model.

    Args:
        name: Label printed in the ``Model:`` header line.
        y_test: True labels.
        y_pred: Predicted labels to compare against ``y_test``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``.
    """
    # Evaluate the scorers in the same order in which they are reported.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    scores = tuple(scorer(y_test, y_pred) for scorer in scorers)
    accuracy, precision, recall, f1 = scores

    summary = (
        f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, "
        f"Recall: {recall:.4f}, F1 Score: {f1:.4f}"
    )
    print(f"Model: {name}")
    print(summary)
    print("-" * 50)
    return scores

# Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
rf_metrics = evaluate_model("Random Forest", y_test, y_pred_rf)

# XGBoost
xgb_clf = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, eval_metric='logloss', use_label_encoder=False)
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)
xgb_metrics = evaluate_model("XGBoost", y_test, y_pred_xgb)

# Logistic Regression
log_reg_all = LogisticRegression(max_iter=200)
log_reg_all.fit(X_train, y_train)
y_pred_logreg_all = log_reg_all.predict(X_test)
log_reg_metrics = evaluate_model("Logistic Regression (All Data)", y_test, y_pred_logreg_all)

# LSTM

# Reshape data for LSTM: insert a length-1 time axis so each row becomes a
# one-step sequence of shape (samples, 1, features).
X_train_lstm = np.expand_dims(X_train.values, axis=1)
X_test_lstm = np.expand_dims(X_test.values, axis=1)
# Plain NumPy labels: Keras' validation_split only accepts arrays/tensors.
y_train_lstm = y_train.to_numpy()

lstm_model = Sequential()
lstm_model.add(LSTM(64, activation='relu', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
lstm_model.add(Dropout(0.3))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Validate on a held-out slice of the *training* data. Using the test set as
# validation data would let early stopping (and restore_best_weights) pick the
# weights that happen to score best on the test set, biasing the reported
# metrics upward.
lstm_model.fit(
    X_train_lstm,
    y_train_lstm,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# predict() returns shape (n, 1); flatten to a 1-D label vector for scoring.
y_pred_lstm = (lstm_model.predict(X_test_lstm) > 0.5).astype(int).ravel()
lstm_metrics = evaluate_model("LSTM", y_test, y_pred_lstm)

# ARIMA
# Fit on the raw label sequence (as a float array, so no pandas index is
# involved) and forecast one value per test row.
# NOTE(review): this forecast is only meaningful when y_test follows y_train
# in time, i.e. when the train/test split is chronological - confirm.
arima_model = ARIMA(y_train.to_numpy(dtype=float), order=(1, 1, 1)).fit()
y_pred_arima = (np.asarray(arima_model.forecast(steps=len(y_test))) > 0.5).astype(int)
arima_metrics = evaluate_model("ARIMA", y_test, y_pred_arima)

# ============================
# Step 3: Results Summary
# ============================

# Map each model's display name to its (accuracy, precision, recall, f1) tuple.
results = {
    "Random Forest": rf_metrics,
    "XGBoost": xgb_metrics,
    "Logistic Regression (All Data)": log_reg_metrics,
    "LSTM": lstm_metrics,
    "ARIMA": arima_metrics,
}

# Labels in the same order as the values inside each metrics tuple.
metric_labels = ("Accuracy", "Precision", "Recall", "F1 Score")

print("\nModel Comparison:")
for model_name, scores in results.items():
    report_lines = [f"{model_name}:"]
    report_lines.extend(
        f" - {label}: {score:.4f}" for label, score in zip(metric_labels, scores)
    )
    report_lines.append("-" * 70)
    print("\n".join(report_lines))