# Stock Performance Forecasting with Sentiment Analysis
## Comprehensive Analysis with Model Summaries

## 1. Initial Setup and Data Loading

In [None]:
# Import all required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Set visualization style.
# The seaborn theme is applied first and the matplotlib style sheet second,
# which is the same order the notebook has always used.
sns.set_style("whitegrid")
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and newer releases no longer ship the old "seaborn" name, so pick whichever
# name this installation actually provides.
_SEABORN_STYLE = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(_SEABORN_STYLE)
plt.rcParams['figure.figsize'] = (12, 6)

# Define file paths
SENTIMENT_BASE_PATH = './sentiment'                       # root of per-topic sentiment CSV folders
STOCK_DATA_PATH = './data/quantitative/tech/AAPL.csv'     # price history for the ticker
OUTPUT_DIR = './results'                                  # tables, summaries and predictions
PLOT_DIR = os.path.join(OUTPUT_DIR, 'plots')              # saved figures
# Creating PLOT_DIR also creates OUTPUT_DIR, since it is the parent directory.
os.makedirs(PLOT_DIR, exist_ok=True)

print("✅ Packages imported and directories created.")

## 2. Data Loading and Preprocessing

In [None]:
def load_sentiment_data(folder, ticker):
    """Return the daily mean sentiment score for one sentiment category.

    Reads ``<SENTIMENT_BASE_PATH>/<folder>/<ticker>_<folder>.csv``, which must
    contain a ``published`` timestamp column and a ``sentiment_score`` column.

    Args:
        folder: Sentiment category; used as the sub-directory name, as part
            of the file name, and as the name of the returned score column.
        ticker: Ticker symbol used as the file-name prefix.

    Returns:
        DataFrame with one row per calendar day and two columns: ``Date``
        (``datetime.date`` objects) and ``<folder>`` (mean score that day).
    """
    csv_path = os.path.join(SENTIMENT_BASE_PATH, folder, f"{ticker}_{folder}.csv")
    raw = pd.read_csv(csv_path)

    # Drop the time-of-day so that all items published on one day share a key.
    raw['Date'] = pd.to_datetime(raw['published']).dt.date

    # Average per day, naming the resulting series after the category so the
    # column already carries its final name when the index is reset.
    daily_mean = raw.groupby('Date')['sentiment_score'].mean()
    return daily_mean.rename(folder).reset_index()

# Load all data sources
print("Loading data...")
ai_df = load_sentiment_data('ai', 'AAPL')
esg_df = load_sentiment_data('esg', 'AAPL')
general_df = load_sentiment_data('general', 'AAPL')

# Merge sentiment data (outer join keeps every day that has at least one score)
sentiment_df = ai_df.merge(esg_df, on='Date', how='outer')
sentiment_df = sentiment_df.merge(general_df, on='Date', how='outer')

# load_sentiment_data returns plain datetime.date objects, while the stock and
# factor frames below are indexed by datetime64 timestamps.  Convert explicitly
# so the later index join does not depend on implicit dtype coercion.
sentiment_df['Date'] = pd.to_datetime(sentiment_df['Date'])

# Create composite features.
# The two pairwise averages are NaN whenever either input is missing, whereas
# 'Combined' averages whichever of the three scores are present on that day.
sentiment_df['AI+General'] = (sentiment_df['ai'] + sentiment_df['general']) / 2
sentiment_df['ESG+General'] = (sentiment_df['esg'] + sentiment_df['general']) / 2
sentiment_df['Combined'] = sentiment_df[['ai', 'esg', 'general']].mean(axis=1)

# Binary indicators for sentiment availability (1 = a score exists that day)
sentiment_df['include_AI'] = sentiment_df['ai'].notna().astype(int)
sentiment_df['include_ESG'] = sentiment_df['esg'].notna().astype(int)
sentiment_df['include_General'] = sentiment_df['general'].notna().astype(int)

# Load stock data
stock_df = pd.read_csv(STOCK_DATA_PATH)
stock_df['Date'] = pd.to_datetime(stock_df['Date'])
stock_df.set_index('Date', inplace=True)
# pct_change compares each row with the previous one, so the rows must be in
# chronological order for the result to be a day-over-day return.
stock_df.sort_index(inplace=True)
stock_df['Returns'] = stock_df['Adj Close'].pct_change()
# The first row has no previous price and is dropped here.
# NOTE(review): these are raw returns, not returns in excess of RF — confirm
# that this is the intended dependent variable for the factor regressions.
stock_df = stock_df[['Returns']].dropna()

# Load Fama-French data
ff_path = "F-F_Research_Data_5_Factors_2x3_daily.CSV"
# NOTE(review): skiprows=3 assumes three descriptive lines precede the header
# row, and that no footer text follows the data — confirm against the file.
ff_df = pd.read_csv(ff_path, skiprows=3)
ff_df.columns = ['Date', 'MKT_RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
ff_df['Date'] = pd.to_datetime(ff_df['Date'].astype(str), format='%Y%m%d')
ff_df.set_index('Date', inplace=True)
ff_df = ff_df / 100  # Convert percentages to decimals

# Merge all datasets: keep only days with both a return and factor data, then
# attach sentiment where available (left join, so days without news stay).
merged_df = stock_df.join(ff_df, how='inner').join(sentiment_df.set_index('Date'), how='left')

# Fill missing values: a day without sentiment gets a score of 0 and an
# availability flag of 0.
sentiment_cols = ['ai', 'esg', 'general', 'AI+General', 'ESG+General', 'Combined']
indicator_cols = ['include_AI', 'include_ESG', 'include_General']
merged_df[sentiment_cols] = merged_df[sentiment_cols].fillna(0)
merged_df[indicator_cols] = merged_df[indicator_cols].fillna(0)

# Define train-test split (chronological: 2018-2023 for fitting, 2024 held out)
train_mask = (merged_df.index >= '2018-01-01') & (merged_df.index <= '2023-12-31')
test_mask = (merged_df.index >= '2024-01-01') & (merged_df.index <= '2024-12-31')

# X_train / X_test still contain every column (including 'Returns'); each
# model selects its own feature subset from them later.
X_train = merged_df.loc[train_mask]
X_test = merged_df.loc[test_mask]
y_train = X_train['Returns']
y_test = X_test['Returns']

print("✅ Data loading and preprocessing complete.")
print(f"Training period: {X_train.index.min()} to {X_train.index.max()}")
print(f"Testing period: {X_test.index.min()} to {X_test.index.max()}")

## 3. Model Training with Statistical Summaries

In [None]:
# Define model configurations.
# Each entry names a model and lists the merged_df columns used as regressors.
model_configs = [
    {'name': 'Fama-French Only', 'features': ['MKT_RF', 'SMB', 'HML']},
    {'name': 'FF + AI+General', 'features': ['MKT_RF', 'SMB', 'HML', 'AI+General', 'include_AI']},
    {'name': 'FF + ESG+General', 'features': ['MKT_RF', 'SMB', 'HML', 'ESG+General', 'include_ESG']},
    {'name': 'FF + Combined', 'features': ['MKT_RF', 'SMB', 'HML', 'Combined', 'include_General']}
]

# Store results
results = []            # one dict of metrics and predictions per model
model_summaries = {}    # model name -> fitted statsmodels results object

for config in model_configs:
    # \033[1m ... \033[0m renders the banner in bold on ANSI-aware outputs.
    print(f"\n\033[1m{'='*60}\n{config['name']} Model\n{'='*60}\033[0m")

    # Prepare data
    X_train_set = X_train[config['features']]
    X_test_set = X_test[config['features']]

    # Statsmodels OLS (statsmodels does not add an intercept on its own)
    X_train_sm = sm.add_constant(X_train_set)
    sm_model = sm.OLS(y_train, X_train_sm).fit()
    model_summaries[config['name']] = sm_model

    # Display full summary
    display(sm_model.summary2())

    # Build the test design matrix with exactly the training columns.
    # add_constant defaults to has_constant='skip', which silently omits the
    # intercept column when some feature happens to be constant in the test
    # window (e.g. an availability flag that is always 0 or always 1); the
    # column count would then no longer match the fitted parameters.  Force
    # the constant and then select the training columns in training order.
    X_test_sm = sm.add_constant(X_test_set, has_constant='add')[X_train_sm.columns]

    # Calculate predictions
    y_train_pred = sm_model.predict(X_train_sm)
    y_test_pred = sm_model.predict(X_test_sm)

    # Store results
    results.append({
        'Model': config['name'],
        'StatsModels_Object': sm_model,
        'Features': config['features'],
        'Train_R2': sm_model.rsquared,
        'Test_R2': r2_score(y_test, y_test_pred),
        'Train_MSE': mean_squared_error(y_train, y_train_pred),
        'Test_MSE': mean_squared_error(y_test, y_test_pred),
        'AIC': sm_model.aic,
        'BIC': sm_model.bic,
        'Predictions': y_test_pred
    })

    # Plot actual vs predicted returns over the test period
    plt.figure(figsize=(12, 6))
    plt.plot(y_test.index, y_test, label='Actual Returns', color='navy')
    plt.plot(y_test.index, y_test_pred, label='Predicted Returns', linestyle='--', color='darkorange')
    plt.title(f"{config['name']} - Actual vs Predicted Returns")
    plt.xlabel('Date')
    plt.ylabel('Daily Returns')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    # Save before show(): the figure must be written while it is still current.
    plt.savefig(os.path.join(PLOT_DIR, f"returns_{config['name'].replace(' ', '_')}.png"))
    plt.show()

print("✅ All models trained and evaluated.")

## 4. Model Comparison and Diagnostics

In [None]:
# Create comprehensive comparison table (scalar metrics only; the fitted
# model objects and prediction series stored in `results` are left out).
results_df = pd.DataFrame(results)[['Model', 'Train_R2', 'Test_R2', 'Train_MSE', 'Test_MSE', 'AIC', 'BIC']]
print("\n\033[1mModel Performance Comparison:\033[0m")
# Reversed colormaps ("_r") are used for the error and information-criterion
# columns, where smaller values are better.
display(
    results_df.style
    .background_gradient(cmap='Blues', subset=['Train_R2', 'Test_R2'])
    .background_gradient(cmap='Reds_r', subset=['Train_MSE', 'Test_MSE'])
    .background_gradient(cmap='Greens_r', subset=['AIC', 'BIC'])
)

# Coefficient comparison: one long-format frame with a row per
# (model, parameter) pair.
coef_comparison = [
    pd.DataFrame({
        'Coefficient': model.params,
        'P-value': model.pvalues,
        'Model': name
    })
    for name, model in model_summaries.items()
]

# reset_index moves the parameter names out of the index into a column
# called 'index', which the bar plot uses for its x axis.
all_coefs = pd.concat(coef_comparison).reset_index()

# Plot coefficients
plt.figure(figsize=(14, 8))
sns.barplot(data=all_coefs, x='index', y='Coefficient', hue='Model')
plt.axhline(0, color='black', linestyle='--')
plt.title('Model Coefficients Comparison', fontsize=16)
plt.xlabel('Features')
plt.ylabel('Coefficient Value')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(os.path.join(PLOT_DIR, "coefficient_comparison.png"))
plt.show()

# Residual diagnostics (computed on the training fit of each model)
for name, model in model_summaries.items():
    print(f"\n\033[1m{'='*60}\n{name} Residual Diagnostics\n{'='*60}\033[0m")

    # Residual plot
    plt.figure(figsize=(12, 6))
    plt.scatter(model.fittedvalues, model.resid, alpha=0.5)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.title(f'{name} - Residuals vs Fitted')
    plt.xlabel('Fitted values')
    plt.ylabel('Residuals')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(PLOT_DIR, f"residuals_{name.replace(' ', '_')}.png"))
    plt.show()

    # QQ plot.
    # qqplot opens a figure of its own unless it is handed an axes, so a
    # separate plt.figure() call would only leave an empty extra figure and
    # its figsize would be ignored.  Create the axes here and pass it in.
    qq_fig, qq_ax = plt.subplots(figsize=(8, 6))
    sm.qqplot(model.resid, line='s', ax=qq_ax)
    qq_ax.set_title(f'{name} - Q-Q Plot')
    qq_fig.tight_layout()
    plt.show()

print("✅ Model comparison and diagnostics complete.")

## 5. Final Outputs and Export

In [None]:
# Save all model summaries to HTML (one file per model).
# The encoding is given explicitly so the output does not depend on the
# platform's default text encoding.
for name, model in model_summaries.items():
    summary_path = os.path.join(OUTPUT_DIR, f"model_summary_{name.replace(' ', '_')}.html")
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write(model.summary().as_html())

# Save predictions: actual test-period returns next to each model's
# predicted returns, one column per model, indexed by date.
pred_df = pd.DataFrame({
    'Date': y_test.index,
    'Actual_Returns': y_test.values,
    **{res['Model']: res['Predictions'] for res in results}
}).set_index('Date')
pred_df.to_csv(os.path.join(OUTPUT_DIR, "predictions.csv"))

# Save results comparison
results_df.to_csv(os.path.join(OUTPUT_DIR, "model_comparison.csv"))
results_df.to_html(os.path.join(OUTPUT_DIR, "model_comparison.html"))

# \033[1;32m ... \033[0m renders the heading in bold green on ANSI-aware outputs.
print("\n\033[1;32mAll outputs saved to results directory:\033[0m")
print("📊 Model summaries (HTML)")
print("📈 Prediction values (CSV)")
print("📉 Model comparison tables (CSV & HTML)")
print("🖼️ Visualizations in /plots folder")