# Import Libraries

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel,RobertaTokenizer, RobertaModel, DistilBertTokenizer, DistilBertModel,DistilBertForSequenceClassification, RobertaTokenizer,  AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.dates as mdates
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
import plotly.graph_objs as go
import plotly.express as px


# BERT

## Load Embedding

In [None]:
# Instantiate the pre-trained BERT encoder and its matching tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Read the pickled (in-sample, out-of-sample) pair of embedded DataFrames
with open('embedding-BERT-AllCompany-NEW.pkl', 'rb') as f:
    loaded_frames = pd.read_pickle(f)
insample_df, outsample_df = loaded_frames

# Sanity check: show the first 3 out-of-sample rows that carry a real headline
has_headline = outsample_df['headline'] != '[No_Headline]'
filtered_df = outsample_df[has_headline]
filtered_df.head(3)

## Accuracy per-companies

In [None]:
# Per-company evaluation: for every company, fit a logistic regression on the
# headline embeddings inside each TimeSeriesSplit fold, store the out-of-fold
# predictions, then score them per calendar year and plot the results.

# Combine insample and outsample data into one panel
df = pd.concat([insample_df, outsample_df])

# Convert 'Date From' to datetime
df['Date From'] = pd.to_datetime(df['Date From'])

# Sort by date: TimeSeriesSplit splits by row position, so rows must be chronological
df = df.sort_values(by='Date From')

# Check available years
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# NOTE: these two values are not read anywhere in this cell; the folds below
# come from TimeSeriesSplit rather than from a fixed-length window.
window_size = 365 * 10  # 10 years in days
prediction_period = 365  # 1 year in days

# Number of TimeSeriesSplit folds (adjust as needed)
n_splits = 5

# Get unique companies
companies = df['companyname'].unique()

# Dictionary to store the per-year metrics DataFrame of each company
company_results_dfs = {}

for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Pre-create the prediction column (NaN = row never landed in a test fold).
    # Predictions are written by position below, so duplicated index labels
    # left over from the concat cannot misalign or break the assignment.
    company_df['predicted'] = np.nan
    predicted_col = company_df.columns.get_loc('predicted')

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    years = []

    # Class labels of the most recently fitted model (used for the confusion matrices)
    class_labels = None

    # TimeSeriesSplit needs more rows than folds; skip tiny companies instead of crashing
    if len(company_df) > n_splits:
        folds = TimeSeriesSplit(n_splits=n_splits).split(company_df)
    else:
        print(f"Skipping model fitting for {company}: only {len(company_df)} rows for {n_splits} splits.")
        folds = []

    for train_index, test_index in folds:
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values
        X_test = np.vstack(test_df['embedding'].values)

        # Logistic regression cannot be fitted on a single class
        if len(np.unique(y_train)) < 2:
            print(f"Skipping a fold for {company}: training labels contain a single class.")
            continue

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)
        class_labels = logistic_model.classes_

        # Store the out-of-fold predictions by position in the company frame
        y_pred = logistic_model.predict(X_test)
        company_df.iloc[test_index, predicted_col] = y_pred

        print(f'Company: {company}')
        print(f'Training window: {train_df["Date From"].min()} to {train_df["Date From"].max()}')
        print(f'Test window: {test_df["Date From"].min()} to {test_df["Date From"].max()}')
        print(f'Predicted years: {test_df["Date From"].dt.year.unique()}')

    # Evaluate the model for the company, one calendar year at a time
    for year in range(2016, 2024):
        year_df = company_df[company_df['Date From'].dt.year == year]

        if year_df['predicted'].isnull().all():
            print(f"Missing predictions for year {year} for company {company} due to insufficient data or missing predictions.")
            continue

        y_test = year_df['Future Return Direction'].values
        y_pred = year_df['predicted'].values

        # Keep only rows that actually received a prediction
        valid_indices = ~np.isnan(y_pred)
        y_test = y_test[valid_indices]
        y_pred = y_pred[valid_indices]

        if len(y_pred) == 0 or len(y_test) == 0:
            print(f"Insufficient valid predictions for year {year} for company {company}.")
            continue

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary', pos_label=1)
        recall = recall_score(y_test, y_pred, average='binary', pos_label=1)
        f1 = f1_score(y_test, y_pred, average='binary', pos_label=1)

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        years.append(year)

        print(f'Company: {company}, Year: {year}')
        print(f'Accuracy: {accuracy:.2f}')
        print(f'Precision: {precision:.2f}')
        print(f'Recall: {recall:.2f}')
        print(f'F1 Score: {f1:.2f}')
        print()

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies,
        'Precision': precisions,
        'Recall': recalls,
        'F1 Score': f1_scores
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

    print(f"Evaluation results for {company}:")
    print(results_df)

    # Plot the metrics over time for the company
    plt.figure(figsize=(10, 6))
    plt.plot(results_df['Year'], results_df['Accuracy'], marker='o', label='Accuracy')
    plt.plot(results_df['Year'], results_df['Precision'], marker='o', label='Precision')
    plt.plot(results_df['Year'], results_df['Recall'], marker='o', label='Recall')
    plt.plot(results_df['Year'], results_df['F1 Score'], marker='o', label='F1 Score')

    plt.title(f'Performance Metrics Over Time for {company}')
    plt.xlabel('Year')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Plot one confusion matrix per evaluated year. Without evaluated years
    # there is nothing to draw, and plt.subplots(0, ...) would raise.
    if years:
        n_plots = len(years)
        n_cols = 4
        n_rows = -(-n_plots // n_cols)  # ceiling division

        fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)
        axes = axes.flatten()

        for i, year in enumerate(years):
            year_df = company_df[company_df['Date From'].dt.year == year]
            has_prediction = year_df['predicted'].notna().values
            y_test = year_df['Future Return Direction'].values[has_prediction]
            # Predictions were stored as floats next to NaN; cast back to the label dtype
            y_pred = year_df['predicted'].values[has_prediction].astype(class_labels.dtype)

            # Fix the label set so the matrix always matches display_labels,
            # even when only one class occurs in this particular year
            cm = confusion_matrix(y_test, y_pred, labels=class_labels)
            disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
            disp.plot(cmap='Blues', ax=axes[i])
            axes[i].set_title(f'{company} {year}')

        # Remove the unused axes of the grid
        for unused_ax in axes[n_plots:]:
            fig.delaxes(unused_ax)

        plt.tight_layout()
        plt.show()
    else:
        print(f"No evaluated years for {company}; skipping confusion matrices.")

# Access the stored DataFrames
for company, results_df in company_results_dfs.items():
    print(f"\nResults for {company}:")
    print(results_df)


In [None]:
# Stack the per-company result frames, keyed by company name
stacked_results = pd.concat(company_results_dfs.values(), keys=company_results_dfs.keys())

# Turn the company key (outer index level) into a regular 'Company' column
all_results_df = stacked_results.reset_index(level=0)
all_results_df = all_results_df.rename(columns={'level_0': 'Company'})

# Write the combined table to CSV
all_results_df.to_csv('[EVAL] BERT_all_company_results.csv', index=False)


## Accuracy all years

In [None]:
# Cross-company summary: compute per-company, per-year metrics from
# out-of-fold logistic-regression predictions, then average them per year.

# Combine insample and outsample data into one panel
df = pd.concat([insample_df, outsample_df])

# Convert 'Date From' to datetime
df['Date From'] = pd.to_datetime(df['Date From'])

# Sort by date: TimeSeriesSplit splits by row position, so rows must be chronological
df = df.sort_values(by='Date From')

# Check available years
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# NOTE: these two values are not read anywhere in this cell; the folds below
# come from TimeSeriesSplit rather than from a fixed-length window.
window_size = 365 * 10  # 10 years in days
prediction_period = 365  # 1 year in days

# Number of TimeSeriesSplit folds (adjust as needed)
n_splits = 5

# Get unique companies
companies = df['companyname'].unique()

# Dictionary to store the per-year metrics DataFrame of each company
company_results_dfs = {}

for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Pre-create the prediction column (NaN = row never landed in a test fold).
    # Predictions are written by position below, so duplicated index labels
    # left over from the concat cannot misalign or break the assignment.
    company_df['predicted'] = np.nan
    predicted_col = company_df.columns.get_loc('predicted')

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    years = []

    # TimeSeriesSplit needs more rows than folds; skip tiny companies instead of crashing
    if len(company_df) > n_splits:
        folds = TimeSeriesSplit(n_splits=n_splits).split(company_df)
    else:
        print(f"Skipping model fitting for {company}: only {len(company_df)} rows for {n_splits} splits.")
        folds = []

    for train_index, test_index in folds:
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values
        X_test = np.vstack(test_df['embedding'].values)

        # Logistic regression cannot be fitted on a single class
        if len(np.unique(y_train)) < 2:
            print(f"Skipping a fold for {company}: training labels contain a single class.")
            continue

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)

        # Store the out-of-fold predictions by position in the company frame
        company_df.iloc[test_index, predicted_col] = logistic_model.predict(X_test)

    # Evaluate the model for the company, one calendar year at a time
    for year in range(2016, 2024):
        year_df = company_df[company_df['Date From'].dt.year == year]

        if year_df['predicted'].isnull().all():
            continue

        y_test = year_df['Future Return Direction'].values
        y_pred = year_df['predicted'].values

        # Keep only rows that actually received a prediction
        valid_indices = ~np.isnan(y_pred)
        y_test = y_test[valid_indices]
        y_pred = y_pred[valid_indices]

        if len(y_pred) == 0 or len(y_test) == 0:
            continue

        # Calculate evaluation metrics
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, average='binary', pos_label=1))
        recalls.append(recall_score(y_test, y_pred, average='binary', pos_label=1))
        f1_scores.append(f1_score(y_test, y_pred, average='binary', pos_label=1))
        years.append(year)

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies,
        'Precision': precisions,
        'Recall': recalls,
        'F1 Score': f1_scores
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

# Combine results of all companies into a single DataFrame
combined_results = pd.concat(company_results_dfs.values(), ignore_index=True)

# Average every metric per year across all companies
average_metrics_per_year = combined_results.groupby('Year').mean().reset_index()

# Print average metrics for each year
print("Average Metrics Per Year:")
print(average_metrics_per_year)

# Plot the average metrics per year
plt.figure(figsize=(10, 6))
sns.set(style='whitegrid')

plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['Accuracy'], marker='o', label='Accuracy')
plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['Precision'], marker='o', label='Precision')
plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['Recall'], marker='o', label='Recall')
plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['F1 Score'], marker='o', label='F1 Score')

plt.title('Average Performance Metrics Over Time Across All Companies')
plt.xlabel('Year')
plt.ylabel('Score')
plt.legend()
plt.grid(True)
plt.show()


## Standard Deviation all years

In [None]:
# Accuracy dispersion: compute per-company yearly accuracy from out-of-fold
# logistic-regression predictions, then take mean and standard deviation per
# year across companies.

# Combine insample and outsample data into one panel
df = pd.concat([insample_df, outsample_df])

# Convert 'Date From' to datetime
df['Date From'] = pd.to_datetime(df['Date From'])

# Sort by date: TimeSeriesSplit splits by row position, so rows must be chronological
df = df.sort_values(by='Date From')

# Check available years
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# Number of TimeSeriesSplit folds (adjust as needed)
n_splits = 5

# Get unique companies
companies = df['companyname'].unique()

# Dictionary to store the per-year accuracy DataFrame of each company
company_results_dfs = {}

for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Pre-create the prediction column (NaN = row never landed in a test fold).
    # Predictions are written by position below, so duplicated index labels
    # left over from the concat cannot misalign or break the assignment.
    company_df['predicted'] = np.nan
    predicted_col = company_df.columns.get_loc('predicted')

    accuracies = []
    years = []

    # TimeSeriesSplit needs more rows than folds; skip tiny companies instead of crashing
    if len(company_df) > n_splits:
        folds = TimeSeriesSplit(n_splits=n_splits).split(company_df)
    else:
        print(f"Skipping model fitting for {company}: only {len(company_df)} rows for {n_splits} splits.")
        folds = []

    for train_index, test_index in folds:
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values
        X_test = np.vstack(test_df['embedding'].values)

        # Logistic regression cannot be fitted on a single class
        if len(np.unique(y_train)) < 2:
            print(f"Skipping a fold for {company}: training labels contain a single class.")
            continue

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)

        # Store the out-of-fold predictions by position in the company frame
        company_df.iloc[test_index, predicted_col] = logistic_model.predict(X_test)

    # Evaluate the model for the company, one calendar year at a time
    for year in range(2016, 2024):
        year_df = company_df[company_df['Date From'].dt.year == year]

        if year_df['predicted'].isnull().all():
            continue

        y_test = year_df['Future Return Direction'].values
        y_pred = year_df['predicted'].values

        # Keep only rows that actually received a prediction
        valid_indices = ~np.isnan(y_pred)
        y_test = y_test[valid_indices]
        y_pred = y_pred[valid_indices]

        if len(y_pred) == 0 or len(y_test) == 0:
            continue

        accuracies.append(accuracy_score(y_test, y_pred))
        years.append(year)

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

# Combine results of all companies into a single DataFrame
combined_results = pd.concat(company_results_dfs.values(), ignore_index=True)

# Mean and standard deviation of accuracy per year across all companies
average_metrics_per_year = combined_results.groupby('Year').agg(
    Accuracy_mean=('Accuracy', 'mean'),
    Accuracy_std=('Accuracy', 'std')
).reset_index()

# Print average metrics for each year
print("Average Metrics Per Year:")
print(average_metrics_per_year)

# Get a color sequence from Plotly's default colors
colors = px.colors.qualitative.Plotly

# Function to make the color more transparent
def get_transparent_color(color, alpha=0.2):
    """Convert a hex color string into a CSS ``rgba(...)`` string.

    Args:
        color: Hex color with or without a leading ``#``. Both the 6-digit
            form (``#RRGGBB``) and the 3-digit shorthand (``#RGB``) are
            accepted; for longer strings only the first six digits are read.
        alpha: Opacity written as the fourth ``rgba`` component.

    Returns:
        A string of the form ``'rgba(R, G, B, alpha)'``.

    Raises:
        ValueError: If the string is too short or contains non-hex digits.
    """
    hex_color = color.lstrip('#')

    # Expand the CSS shorthand: 'abc' -> 'aabbcc'
    if len(hex_color) == 3:
        hex_color = ''.join(digit * 2 for digit in hex_color)

    # Fewer than six digits cannot be split into three channel pairs
    if len(hex_color) < 6:
        raise ValueError(f"Expected a hex color like '#RRGGBB' or '#RGB', got {color!r}")

    red, green, blue = (int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
    return f'rgba({red}, {green}, {blue}, {alpha})'

# Series used by both traces
year_axis = average_metrics_per_year['Year']
mean_accuracy = average_metrics_per_year['Accuracy_mean']
std_accuracy = average_metrics_per_year['Accuracy_std']

# Band outline: upper bound left-to-right, then lower bound right-to-left,
# so that fill='toself' closes the polygon around the mean line
band_x = pd.concat([year_axis, year_axis[::-1]])
band_y = pd.concat([mean_accuracy + std_accuracy,
                    (mean_accuracy - std_accuracy)[::-1]])

# Mean accuracy line
mean_trace = go.Scatter(
    x=year_axis,
    y=mean_accuracy,
    mode='lines',
    name='Accuracy',
    line=dict(color=colors[0], width=2)
)

# Shaded +/- one standard deviation area, drawn in the line color with transparency
std_band_trace = go.Scatter(
    x=band_x,
    y=band_y,
    fill='toself',
    fillcolor=get_transparent_color(colors[0], alpha=0.2),
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip",
    showlegend=False,
    name='Accuracy std dev'
)

# Assemble the figure: mean line first, band second
fig = go.Figure()
fig.add_trace(mean_trace)
fig.add_trace(std_band_trace)

# Titles, axis labels and theme
fig.update_layout(
    title='Average Rolling Window Accuracy Over Time Across All Companies',
    xaxis_title='Year',
    yaxis_title='Accuracy',
    template='plotly_white',
    showlegend=True
)

# Render the plot
fig.show()


## Portfolio

In [None]:
def prepare_data(insample_df, outsample_df):
    """Stack the two samples into one frame ordered by 'Date From'.

    The 'Date From' column is parsed to datetime, the rows are sorted on it,
    and the distinct years present are printed. The input frames are left
    untouched; original index labels are kept.

    Args:
        insample_df: In-sample DataFrame with a 'Date From' column.
        outsample_df: Out-of-sample DataFrame with a 'Date From' column.

    Returns:
        The combined, date-sorted DataFrame.
    """
    combined = pd.concat([insample_df, outsample_df])
    combined['Date From'] = pd.to_datetime(combined['Date From'])
    combined = combined.sort_values(by='Date From')

    print("Years available in the data:", combined['Date From'].dt.year.unique())
    return combined

def rolling_window_analysis(df):
    """Attach rolling-window up-move probabilities to every company row.

    For each company, a logistic regression is fitted on the embeddings of a
    10-calendar-year training window and used to score the following calendar
    year. The window then slides forward by one year, up to and including the
    last year present for that company.

    Args:
        df: DataFrame with a datetime 'Date From' column plus 'companyname',
            'embedding' and 'Future Return Direction' columns.

    Returns:
        A copy of ``df`` with a fresh 0..n-1 index, rows in the input order,
        and a 'predicted_prob' column holding the second column of
        ``predict_proba`` (NaN for rows no window scored). An existing
        'predicted_prob' column is overwritten.
    """
    window_size = 10      # training window length, in calendar years
    validation_size = 1   # scored block length, in calendar years

    # Unique positional labels let predictions be written back by label,
    # which avoids a fragile merge on dates and float returns.
    result = df.reset_index(drop=True)
    predicted_prob = pd.Series(np.nan, index=result.index, dtype=float)

    for _, company_df in result.groupby('companyname', sort=False):
        company_df = company_df.sort_values(by='Date From')
        row_years = company_df['Date From'].dt.year

        if row_years.isna().all():
            continue

        start_year = int(row_years.min())
        end_year = int(row_years.max())

        # The last start is the one whose validation block ends on end_year:
        # start + window_size + validation_size - 1 == end_year
        last_start = end_year - window_size - validation_size + 1

        for train_start in range(start_year, last_start + 1):
            train_end = train_start + window_size
            val_end = train_end + validation_size

            train_df = company_df[(row_years >= train_start) & (row_years < train_end)]
            test_df = company_df[(row_years >= train_end) & (row_years < val_end)]

            if train_df.empty or test_df.empty:
                continue

            # Logistic regression cannot be fitted on a single class
            if train_df['Future Return Direction'].nunique() < 2:
                continue

            X_train = np.vstack(train_df['embedding'].values)
            y_train = train_df['Future Return Direction'].values
            X_test = np.vstack(test_df['embedding'].values)

            logistic_model = LogisticRegression(max_iter=1000)
            logistic_model.fit(X_train, y_train)

            # Probability of the second class in classes_ (the positive class
            # when labels are 0/1)
            predicted_prob.loc[test_df.index] = logistic_model.predict_proba(X_test)[:, 1]

    result['predicted_prob'] = predicted_prob
    return result

def _value_weighted_mean(values, weights):
    """Return the weighted mean of ``values``, ignoring rows where either input is NaN."""
    usable = values.notna() & weights.notna()
    total_weight = weights[usable].sum()
    if total_weight == 0:
        return np.nan
    return (values[usable] * weights[usable]).sum() / total_weight


def construct_portfolio(df, time_period='Week', top_n=5, bottom_n=5,
                        start_year=2016, output_path='BERT_portfolio_returns.csv'):
    """Build weekly long/short portfolios ranked by predicted probability.

    Each week the companies are ranked on 'predicted_prob'. The ``top_n``
    highest form the long leg and the ``bottom_n`` lowest the short leg.
    Rows without a prediction are excluded from the ranking, so they can no
    longer end up in the short leg by sorting last. Log returns are averaged
    equal-weighted and market-cap-weighted, then accumulated over time. The
    Sharpe ratio of every leg is printed, computed on the per-period returns
    and annualised with sqrt(52).

    Note: ``df`` is modified in place ('Date From' is converted to datetime
    and a 'Period' column is added), as before.

    Args:
        df: DataFrame with 'Date From', 'predicted_prob',
            'Weekly Compound Return' and 'market_cap' columns.
        time_period: Only 'Week' is supported.
        top_n: Number of companies in the long leg.
        bottom_n: Number of companies in the short leg.
        start_year: Periods starting before this year are ignored.
        output_path: CSV file to write the result to; ``None`` skips writing.

    Returns:
        DataFrame with one row per period: 'Period', the six per-period
        return columns, the six cumulative columns ('EW L', 'EW S', 'EW LS',
        'VW L', 'VW S', 'VW LS') and the cumulative 'Market' log return.

    Raises:
        ValueError: If ``time_period`` is not 'Week'.
    """
    if time_period != 'Week':
        raise ValueError("Invalid time_period. Use 'Week'.")

    df['Date From'] = pd.to_datetime(df['Date From'])
    df['Period'] = df['Date From'].dt.to_period('W').dt.to_timestamp()

    # Cumulative column -> per-period return column it accumulates
    period_return_of = {
        'EW L': 'Long Return (Eq)',
        'EW S': 'Short Return (Eq)',
        'EW LS': 'Long-Short Return (Eq)',
        'VW L': 'Long Return (Val)',
        'VW S': 'Short Return (Val)',
        'VW LS': 'Long-Short Return (Val)',
    }

    # One window definition for portfolio and benchmark keeps both on the same periods
    in_window = df[df['Period'].dt.year >= start_year]

    portfolio_returns = []
    for period, group in in_window.groupby('Period'):
        # Only rows that actually received a prediction can be ranked
        ranked = group.dropna(subset=['predicted_prob']).sort_values(
            by='predicted_prob', ascending=False)
        if ranked.empty:
            continue

        top_companies = ranked.head(top_n)
        bottom_companies = ranked.tail(bottom_n)

        long_log = np.log1p(top_companies['Weekly Compound Return'])
        short_log = np.log1p(bottom_companies['Weekly Compound Return'])

        # Equal-weighted returns
        long_return_eq = long_log.mean()
        short_return_eq = short_log.mean()

        # Value-weighted returns
        long_return_val = _value_weighted_mean(long_log, top_companies['market_cap'])
        short_return_val = _value_weighted_mean(short_log, bottom_companies['market_cap'])

        portfolio_returns.append({
            'Period': period,
            'Long Return (Eq)': long_return_eq,
            'Short Return (Eq)': short_return_eq,
            'Long-Short Return (Eq)': long_return_eq - short_return_eq,
            'Long Return (Val)': long_return_val,
            'Short Return (Val)': short_return_val,
            'Long-Short Return (Val)': long_return_val - short_return_val
        })

    # Explicit columns keep the frame well-formed when no period qualified
    portfolio_df = pd.DataFrame(
        portfolio_returns, columns=['Period'] + list(period_return_of.values()))
    for cumulative_name, return_name in period_return_of.items():
        portfolio_df[cumulative_name] = portfolio_df[return_name].cumsum()

    # Benchmark: cumulative log of the cross-sectional mean weekly return
    market_returns = in_window.groupby('Period')['Weekly Compound Return'].mean()
    market_cumulative = np.log1p(market_returns).cumsum()
    portfolio_df['Market'] = portfolio_df['Period'].map(market_cumulative)

    for cumulative_name, return_name in period_return_of.items():
        # Sharpe must use per-period returns, not the cumulative series
        period_returns = portfolio_df[return_name].dropna()
        spread = period_returns.std() if len(period_returns) > 1 else np.nan

        if pd.isna(spread) or spread == 0:
            sharpe_ratio = np.nan
        else:
            sharpe_ratio = period_returns.mean() / spread * np.sqrt(52)

        print(f"Metrics for {cumulative_name}:")
        print(f"Sharpe Ratio: {sharpe_ratio}")
        print()

    if output_path is not None:
        portfolio_df.to_csv(output_path, index=False)
        print(f"Portfolio returns saved to '{output_path}'")
    return portfolio_df

def plot_portfolio_returns(portfolio_df, title_suffix=''):
    """Draw the cumulative return curves of every portfolio leg and the market.

    Args:
        portfolio_df: DataFrame with a 'Period' column and the cumulative
            columns 'EW L', 'EW S', 'EW LS', 'VW L', 'VW S', 'VW LS', 'Market'.
        title_suffix: Text inserted into the figure title after 'Cumulative'.
    """
    plt.figure(figsize=(12, 6))

    # One line per series, in legend order
    periods = portfolio_df['Period']
    for series_name in ('EW L', 'EW S', 'EW LS', 'VW L', 'VW S', 'VW LS', 'Market'):
        plt.plot(periods, portfolio_df[series_name], marker='o', markersize=1, label=series_name)

    plt.title(f'Cumulative {title_suffix} Portfolio Returns Over Time')
    plt.xlabel('Period')
    plt.ylabel('Cumulative Log Return')
    plt.legend()
    plt.grid(True)

    # One tick per year, labelled with the four-digit year
    date_axis = plt.gca().xaxis
    date_axis.set_major_locator(mdates.YearLocator())
    date_axis.set_major_formatter(mdates.DateFormatter('%Y'))
    plt.xticks(rotation=45)
    plt.show()

# Weekly example: combine both samples, then attach rolling-window probabilities
df = rolling_window_analysis(prepare_data(insample_df, outsample_df))

# Build the weekly portfolio, keep periods from 2016 onwards, and plot them
portfolio_df_week = construct_portfolio(df, time_period='Week')
from_2016 = portfolio_df_week['Period'].dt.year >= 2016
portfolio_df_week = portfolio_df_week[from_2016]
plot_portfolio_returns(portfolio_df_week, title_suffix='Weekly')

# RoBERTa

## Load Embedding

In [None]:
# Instantiate the pre-trained RoBERTa encoder and its matching tokenizer
model = RobertaModel.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Read the pickled (in-sample, out-of-sample) pair of embedded DataFrames
with open('embedding-RoBERTa-AllCompany-NEW.pkl', 'rb') as f:
    loaded_frames = pd.read_pickle(f)
insample_df, outsample_df = loaded_frames

# Sanity check: show the first 3 out-of-sample rows that carry a real headline
has_headline = outsample_df['headline'] != '[No_Headline]'
filtered_df = outsample_df[has_headline]
filtered_df.head(3)

## Accuracy per-companies

In [None]:
# Per-company evaluation: for every company, fit a logistic regression on the
# headline embeddings inside each TimeSeriesSplit fold, store the out-of-fold
# predictions, then score them per calendar year and plot the results.

# Combine insample and outsample data into one panel
df = pd.concat([insample_df, outsample_df])

# Convert 'Date From' to datetime
df['Date From'] = pd.to_datetime(df['Date From'])

# Sort by date: TimeSeriesSplit splits by row position, so rows must be chronological
df = df.sort_values(by='Date From')

# Check available years
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# NOTE: these two values are not read anywhere in this cell; the folds below
# come from TimeSeriesSplit rather than from a fixed-length window.
window_size = 365 * 10  # 10 years in days
prediction_period = 365  # 1 year in days

# Number of TimeSeriesSplit folds (adjust as needed)
n_splits = 5

# Get unique companies
companies = df['companyname'].unique()

# Dictionary to store the per-year metrics DataFrame of each company
company_results_dfs = {}

for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Pre-create the prediction column (NaN = row never landed in a test fold).
    # Predictions are written by position below, so duplicated index labels
    # left over from the concat cannot misalign or break the assignment.
    company_df['predicted'] = np.nan
    predicted_col = company_df.columns.get_loc('predicted')

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    years = []

    # Class labels of the most recently fitted model (used for the confusion matrices)
    class_labels = None

    # TimeSeriesSplit needs more rows than folds; skip tiny companies instead of crashing
    if len(company_df) > n_splits:
        folds = TimeSeriesSplit(n_splits=n_splits).split(company_df)
    else:
        print(f"Skipping model fitting for {company}: only {len(company_df)} rows for {n_splits} splits.")
        folds = []

    for train_index, test_index in folds:
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values
        X_test = np.vstack(test_df['embedding'].values)

        # Logistic regression cannot be fitted on a single class
        if len(np.unique(y_train)) < 2:
            print(f"Skipping a fold for {company}: training labels contain a single class.")
            continue

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)
        class_labels = logistic_model.classes_

        # Store the out-of-fold predictions by position in the company frame
        y_pred = logistic_model.predict(X_test)
        company_df.iloc[test_index, predicted_col] = y_pred

        print(f'Company: {company}')
        print(f'Training window: {train_df["Date From"].min()} to {train_df["Date From"].max()}')
        print(f'Test window: {test_df["Date From"].min()} to {test_df["Date From"].max()}')
        print(f'Predicted years: {test_df["Date From"].dt.year.unique()}')

    # Evaluate the model for the company, one calendar year at a time
    for year in range(2016, 2024):
        year_df = company_df[company_df['Date From'].dt.year == year]

        if year_df['predicted'].isnull().all():
            print(f"Missing predictions for year {year} for company {company} due to insufficient data or missing predictions.")
            continue

        y_test = year_df['Future Return Direction'].values
        y_pred = year_df['predicted'].values

        # Keep only rows that actually received a prediction
        valid_indices = ~np.isnan(y_pred)
        y_test = y_test[valid_indices]
        y_pred = y_pred[valid_indices]

        if len(y_pred) == 0 or len(y_test) == 0:
            print(f"Insufficient valid predictions for year {year} for company {company}.")
            continue

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary', pos_label=1)
        recall = recall_score(y_test, y_pred, average='binary', pos_label=1)
        f1 = f1_score(y_test, y_pred, average='binary', pos_label=1)

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        years.append(year)

        print(f'Company: {company}, Year: {year}')
        print(f'Accuracy: {accuracy:.2f}')
        print(f'Precision: {precision:.2f}')
        print(f'Recall: {recall:.2f}')
        print(f'F1 Score: {f1:.2f}')
        print()

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies,
        'Precision': precisions,
        'Recall': recalls,
        'F1 Score': f1_scores
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

    print(f"Evaluation results for {company}:")
    print(results_df)

    # Plot the metrics over time for the company
    plt.figure(figsize=(10, 6))
    plt.plot(results_df['Year'], results_df['Accuracy'], marker='o', label='Accuracy')
    plt.plot(results_df['Year'], results_df['Precision'], marker='o', label='Precision')
    plt.plot(results_df['Year'], results_df['Recall'], marker='o', label='Recall')
    plt.plot(results_df['Year'], results_df['F1 Score'], marker='o', label='F1 Score')

    plt.title(f'Performance Metrics Over Time for {company}')
    plt.xlabel('Year')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Plot one confusion matrix per evaluated year. Without evaluated years
    # there is nothing to draw, and plt.subplots(0, ...) would raise.
    if years:
        n_plots = len(years)
        n_cols = 4
        n_rows = -(-n_plots // n_cols)  # ceiling division

        fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)
        axes = axes.flatten()

        for i, year in enumerate(years):
            year_df = company_df[company_df['Date From'].dt.year == year]
            has_prediction = year_df['predicted'].notna().values
            y_test = year_df['Future Return Direction'].values[has_prediction]
            # Predictions were stored as floats next to NaN; cast back to the label dtype
            y_pred = year_df['predicted'].values[has_prediction].astype(class_labels.dtype)

            # Fix the label set so the matrix always matches display_labels,
            # even when only one class occurs in this particular year
            cm = confusion_matrix(y_test, y_pred, labels=class_labels)
            disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
            disp.plot(cmap='Blues', ax=axes[i])
            axes[i].set_title(f'{company} {year}')

        # Remove the unused axes of the grid
        for unused_ax in axes[n_plots:]:
            fig.delaxes(unused_ax)

        plt.tight_layout()
        plt.show()
    else:
        print(f"No evaluated years for {company}; skipping confusion matrices.")

# Access the stored DataFrames
for company, results_df in company_results_dfs.items():
    print(f"\nResults for {company}:")
    print(results_df)


In [None]:
# Stack the per-company result frames, keyed by company name
stacked_results = pd.concat(company_results_dfs.values(), keys=company_results_dfs.keys())

# Turn the company key (outer index level) into a regular 'Company' column
all_results_df = stacked_results.reset_index(level=0)
all_results_df = all_results_df.rename(columns={'level_0': 'Company'})

# Write the combined table to CSV
all_results_df.to_csv('[EVAL] RoBERTa_all_company_results.csv', index=False)


## Accuracy all years

In [None]:
# Combine insample and outsample data into a single frame
df = pd.concat([insample_df, outsample_df])

# Convert 'Date From' to datetime
df['Date From'] = pd.to_datetime(df['Date From'])

# Sort by date so the positional time-series folds below follow calendar order
df = df.sort_values(by='Date From')

# Check available years
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# NOTE: neither setting is read by the TimeSeriesSplit-based loop below;
# they are kept only so these notebook-level names stay defined.
window_size = 365 * 10  # 10 years in days
prediction_period = 365  # 1 year in days

# Get unique companies
companies = df['companyname'].unique()

# Dictionary to store DataFrames for each company
company_results_dfs = {}

# Time-series cross-validation for each company
for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Every row starts without a prediction; rows never used as a test fold stay NaN.
    company_df['predicted'] = np.nan
    predicted_col = company_df.columns.get_loc('predicted')

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    years = []

    # Setup time series cross-validation
    tscv = TimeSeriesSplit(n_splits=5)  # You can adjust the number of splits as needed

    for train_index, test_index in tscv.split(company_df):
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values

        X_test = np.vstack(test_df['embedding'].values)

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)

        # Make predictions
        y_pred = logistic_model.predict(X_test)

        # Ensure the length of predictions and test_df matches
        if len(y_pred) != len(test_df):
            raise ValueError(f"Mismatch between test data length ({len(test_df)}) and predictions ({len(y_pred)})")

        # Write the predictions back by row position. This avoids assigning into
        # the iloc slice (SettingWithCopyWarning) and stays correct even when the
        # concatenated frame carries repeated index labels.
        company_df.iloc[test_index, predicted_col] = y_pred

    # Evaluate the Model for the company
    for year in range(2016, 2024):
        year_df = company_df[company_df['Date From'].dt.year == year]

        # Skip years with no rows or no predictions at all
        if year_df['predicted'].isnull().all():
            continue

        y_test = year_df['Future Return Direction'].values
        y_pred = year_df['predicted'].values

        # Remove NaN values in predictions
        valid_indices = ~np.isnan(y_pred)
        y_test = y_test[valid_indices]
        y_pred = y_pred[valid_indices]

        if len(y_pred) == 0 or len(y_test) == 0:
            continue

        # Calculate evaluation metrics. zero_division=0 keeps the score that the
        # default setting reports for an undefined ratio (0) but without a warning.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary', pos_label=1, zero_division=0)
        recall = recall_score(y_test, y_pred, average='binary', pos_label=1, zero_division=0)
        f1 = f1_score(y_test, y_pred, average='binary', pos_label=1, zero_division=0)

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        years.append(year)

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies,
        'Precision': precisions,
        'Recall': recalls,
        'F1 Score': f1_scores
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

# Combine results of all companies into a single DataFrame
combined_results = pd.concat(company_results_dfs.values(), ignore_index=True)

# Calculate the average of every metric per year across all companies
average_metrics_per_year = combined_results.groupby('Year').mean().reset_index()

# Print average metrics for each year
print("Average Metrics Per Year:")
print(average_metrics_per_year)

# Plot the averaged metrics per year
plt.figure(figsize=(10, 6))
sns.set(style='whitegrid')

plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['Accuracy'], marker='o', label='Accuracy')
plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['Precision'], marker='o', label='Precision')
plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['Recall'], marker='o', label='Recall')
plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['F1 Score'], marker='o', label='F1 Score')

plt.title('Average Performance Metrics Over Time Across All Companies')
plt.xlabel('Year')
plt.ylabel('Score')
plt.legend()
plt.grid(True)
plt.show()


## Standard Deviation all years

In [None]:
# Combine insample and outsample data into a single frame
df = pd.concat([insample_df, outsample_df])

# Convert 'Date From' to datetime
df['Date From'] = pd.to_datetime(df['Date From'])

# Sort by date so the positional time-series folds below follow calendar order
df = df.sort_values(by='Date From')

# Check available years
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# Get unique companies
companies = df['companyname'].unique()

# Dictionary to store DataFrames for each company
company_results_dfs = {}

# Time-series cross-validation for each company
for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Every row starts without a prediction; rows never used as a test fold stay NaN.
    company_df['predicted'] = np.nan
    predicted_col = company_df.columns.get_loc('predicted')

    accuracies = []
    years = []

    # Setup time series cross-validation
    tscv = TimeSeriesSplit(n_splits=5)  # You can adjust the number of splits as needed

    for train_index, test_index in tscv.split(company_df):
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values

        X_test = np.vstack(test_df['embedding'].values)

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)

        # Make predictions
        y_pred = logistic_model.predict(X_test)

        # Ensure the length of predictions and test_df matches
        if len(y_pred) != len(test_df):
            raise ValueError(f"Mismatch between test data length ({len(test_df)}) and predictions ({len(y_pred)})")

        # Write the predictions back by row position. This avoids assigning into
        # the iloc slice (SettingWithCopyWarning) and stays correct even when the
        # concatenated frame carries repeated index labels.
        company_df.iloc[test_index, predicted_col] = y_pred

    # Evaluate the Model for the company
    for year in range(2016, 2024):
        year_df = company_df[company_df['Date From'].dt.year == year]

        # Skip years with no rows or no predictions at all
        if year_df['predicted'].isnull().all():
            continue

        y_test = year_df['Future Return Direction'].values
        y_pred = year_df['predicted'].values

        # Remove NaN values in predictions
        valid_indices = ~np.isnan(y_pred)
        y_test = y_test[valid_indices]
        y_pred = y_pred[valid_indices]

        if len(y_pred) == 0 or len(y_test) == 0:
            continue

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)

        accuracies.append(accuracy)
        years.append(year)

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

# Combine results of all companies into a single DataFrame
combined_results = pd.concat(company_results_dfs.values(), ignore_index=True)

# Mean and standard deviation of accuracy per year across all companies
average_metrics_per_year = combined_results.groupby('Year').agg(
    Accuracy_mean=('Accuracy', 'mean'),
    Accuracy_std=('Accuracy', 'std')
).reset_index()

# Print average metrics for each year
print("Average Metrics Per Year:")
print(average_metrics_per_year)

# Get a color sequence from Plotly's default colors
colors = px.colors.qualitative.Plotly

# Function to make the color more transparent
def get_transparent_color(color, alpha=0.2):
    """Return *color* as an ``rgba(r, g, b, alpha)`` string.

    Args:
        color: Hex colour such as ``'#636EFA'``. The leading ``#`` is optional
            and the three-digit shorthand (``'#6EF'``) is expanded to six digits.
            Characters after the sixth hex digit are ignored.
        alpha: Opacity value written verbatim into the result.

    Returns:
        A string of the form ``'rgba(R, G, B, alpha)'`` with decimal channels.

    Raises:
        ValueError: If *color* is too short or contains non-hex characters.
    """
    hex_color = color.lstrip('#')

    # Expand '#RGB' shorthand to '#RRGGBB'
    if len(hex_color) == 3:
        hex_color = ''.join(digit * 2 for digit in hex_color)

    # Reject truncated input explicitly; a 5-character value would otherwise
    # parse its last channel from a single digit and return a wrong colour.
    if len(hex_color) < 6:
        raise ValueError(f"Expected a hex color like '#RRGGBB', got {color!r}")

    red, green, blue = (int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
    return f'rgba({red}, {green}, {blue}, {alpha})'

# Pull out the series used by the chart and build the +/- one std-dev envelope
year_axis = average_metrics_per_year['Year']
mean_accuracy = average_metrics_per_year['Accuracy_mean']
std_accuracy = average_metrics_per_year['Accuracy_std']
upper_band = mean_accuracy + std_accuracy
lower_band = mean_accuracy - std_accuracy

# Start from an empty figure
fig = go.Figure()

# Trace 1: mean accuracy line
mean_trace = go.Scatter(
    x=year_axis,
    y=mean_accuracy,
    mode='lines',
    name='Accuracy',
    line=dict(color=colors[0], width=2)
)
fig.add_trace(mean_trace)

# Trace 2: shaded band, drawn as a closed outline that runs forward along the
# upper bound and then backward along the lower bound
band_trace = go.Scatter(
    x=pd.concat([year_axis, year_axis[::-1]]),
    y=pd.concat([upper_band, lower_band[::-1]]),
    fill='toself',
    fillcolor=get_transparent_color(colors[0], alpha=0.2),  # same hue as the line, mostly transparent
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip",
    showlegend=False,
    name='Accuracy std dev'
)
fig.add_trace(band_trace)

# Titles, axis labels and theme
fig.update_layout(
    title='Average Rolling Window Accuracy Over Time Across All Companies',
    xaxis_title='Year',
    yaxis_title='Accuracy',
    template='plotly_white',
    showlegend=True
)

# Render the figure
fig.show()


## Portfolio Revision

In [None]:
def prepare_data(insample_df, outsample_df):
    """Stack both samples into one frame ordered by 'Date From'.

    The 'Date From' column is parsed to datetime before sorting, and the
    distinct calendar years found in it are printed. The input frames are
    not modified; the original index labels are kept on the result.
    """
    combined = (
        pd.concat([insample_df, outsample_df])
        .assign(**{'Date From': lambda frame: pd.to_datetime(frame['Date From'])})
        .sort_values(by='Date From')
    )
    print("Years available in the data:", combined['Date From'].dt.year.unique())
    return combined

def rolling_window_analysis(df):
    """Attach rolling-window logistic-regression probabilities to *df*.

    For every company, a model is trained on 10 consecutive calendar years of
    'embedding' vectors against 'Future Return Direction' and then scores the
    following year. The window advances one year at a time up to and including
    the last year present for that company.

    Args:
        df: Frame with 'companyname', 'Date From' (datetime), 'embedding',
            'Future Return Direction' and 'Weekly Compound Return' columns.

    Returns:
        A new frame: *df* left-merged with the predictions on
        ('Date From', 'companyname', 'Weekly Compound Return'), gaining a
        'predicted_prob' column that is NaN where no window scored the row.
        NOTE(review): the merge assumes those three keys identify a row
        uniquely; duplicated keys would multiply rows. Confirm against the data.

    Raises:
        ValueError: If the number of probabilities differs from the test rows.
    """
    window_size = 10      # training span, in calendar years
    validation_size = 1   # scored span, in calendar years
    prediction_columns = ['Date From', 'companyname', 'predicted_prob', 'Weekly Compound Return']

    # Collect per-window frames and concatenate once at the end instead of
    # re-copying a growing DataFrame on every iteration.
    prediction_frames = []

    for company in df['companyname'].unique():
        company_df = df[df['companyname'] == company].sort_values(by='Date From')
        row_years = company_df['Date From'].dt.year
        start_year = row_years.min()
        end_year = row_years.max()

        # The last usable window must validate on end_year itself, i.e. start at
        # end_year - window_size - validation_size + 1; range() excludes its
        # upper bound, hence "+ 2". (The previous "+ 1" never scored end_year.)
        last_start_exclusive = end_year - window_size - validation_size + 2

        for train_start in range(start_year, last_start_exclusive):
            train_end = train_start + window_size
            val_start = train_end
            val_end = train_end + validation_size

            train_df = company_df[(row_years >= train_start) & (row_years < train_end)]
            test_df = company_df[(row_years >= val_start) & (row_years < val_end)]

            if len(test_df) == 0 or len(train_df) == 0:
                continue

            X_train = np.vstack(train_df['embedding'].values)
            y_train = train_df['Future Return Direction'].values
            X_test = np.vstack(test_df['embedding'].values)

            # Train logistic regression model
            logistic_model = LogisticRegression(max_iter=1000)
            logistic_model.fit(X_train, y_train)

            # Second probability column, i.e. logistic_model.classes_[1]
            y_prob = logistic_model.predict_proba(X_test)[:, 1]

            if len(y_prob) != len(test_df):
                raise ValueError(f"Mismatch between test data length ({len(test_df)}) and predictions ({len(y_prob)})")

            # assign() returns a new frame, so nothing is written into the
            # filtered slice of company_df (no SettingWithCopyWarning).
            scored_df = test_df.assign(predicted_prob=y_prob)
            prediction_frames.append(scored_df[prediction_columns])

    if not prediction_frames:
        # No window produced predictions: return the same shape the merge would
        # give (fresh RangeIndex, all-NaN 'predicted_prob') instead of failing
        # on a merge against a column-less frame.
        return df.reset_index(drop=True).assign(predicted_prob=np.nan)

    predictions_df = pd.concat(prediction_frames, ignore_index=True)
    df = df.merge(predictions_df, on=['Date From', 'companyname', 'Weekly Compound Return'], how='left', suffixes=('', '_pred'))
    return df

def construct_portfolio(df, time_period='Week'):
    """Build weekly long/short portfolios ranked by 'predicted_prob'.

    Per week (from 2016 on) the 5 rows with the highest and the 5 with the
    lowest predicted probability form the long and short legs. Equal- and
    value-weighted ('market_cap') mean log returns are recorded per week and
    accumulated into the 'EW *' / 'VW *' columns, alongside a 'Market' column
    holding the cumulative log of the weekly cross-sectional mean return.

    Side effects: converts df['Date From'] to datetime and adds a 'Period'
    column to *df* in place, prints an annualised Sharpe ratio per portfolio,
    and writes the result to 'RoBERTa_portfolio_returns.csv'.

    Args:
        df: Frame with 'Date From', 'predicted_prob', 'Weekly Compound Return'
            and 'market_cap' columns.
        time_period: Only 'Week' is supported.

    Returns:
        DataFrame with one row per week that had at least one prediction.

    Raises:
        ValueError: If *time_period* is not 'Week', or if no week from 2016
            onward contains a prediction.
    """
    if time_period != 'Week':
        raise ValueError("Invalid time_period. Use 'Week'.")

    df['Date From'] = pd.to_datetime(df['Date From'])
    df['Period'] = df['Date From'].dt.to_period('W').dt.to_timestamp()

    num_top_companies = 5
    num_bottom_companies = 5
    portfolio_returns = []

    # Rank only rows that actually received a prediction. sort_values() puts
    # NaN last, so unpredicted rows would otherwise end up in the short leg.
    ranked_df = df.dropna(subset=['predicted_prob'])

    for period, group in ranked_df.groupby('Period'):
        if period.year < 2016:
            continue

        # Sort group by predicted_prob descending
        group_sorted = group.sort_values(by='predicted_prob', ascending=False)

        # Select top and bottom companies
        top_companies = group_sorted.head(num_top_companies)
        bottom_companies = group_sorted.tail(num_bottom_companies)

        long_log_returns = np.log1p(top_companies['Weekly Compound Return'])
        short_log_returns = np.log1p(bottom_companies['Weekly Compound Return'])

        # Equal-weighted returns
        long_return_eq = np.mean(long_log_returns)
        short_return_eq = np.mean(short_log_returns)
        long_short_return_eq = long_return_eq - short_return_eq

        # Value-weighted returns
        long_return_val = np.sum(long_log_returns * top_companies['market_cap']) / np.sum(top_companies['market_cap'])
        short_return_val = np.sum(short_log_returns * bottom_companies['market_cap']) / np.sum(bottom_companies['market_cap'])
        long_short_return_val = long_return_val - short_return_val

        portfolio_returns.append({
            'Period': period,
            'Long Return (Eq)': long_return_eq,
            'Short Return (Eq)': short_return_eq,
            'Long-Short Return (Eq)': long_short_return_eq,
            'Long Return (Val)': long_return_val,
            'Short Return (Val)': short_return_val,
            'Long-Short Return (Val)': long_short_return_val
        })

    if not portfolio_returns:
        raise ValueError("No period from 2016 onward contains a non-null 'predicted_prob'.")

    # Cumulative column -> the per-period return column it accumulates
    period_return_columns = {
        'EW L': 'Long Return (Eq)',
        'EW S': 'Short Return (Eq)',
        'EW LS': 'Long-Short Return (Eq)',
        'VW L': 'Long Return (Val)',
        'VW S': 'Short Return (Val)',
        'VW LS': 'Long-Short Return (Val)',
    }

    portfolio_df = pd.DataFrame(portfolio_returns)
    for cumulative_column, return_column in period_return_columns.items():
        portfolio_df[cumulative_column] = portfolio_df[return_column].cumsum()

    # Market benchmark over the same weeks as the portfolios: filter on the
    # week's own year so a week that starts in late 2015 is excluded here too.
    actual_returns = df[df['Period'].dt.year >= 2016].groupby('Period')['Weekly Compound Return'].mean()
    actual_cumulative_returns = np.log1p(actual_returns).cumsum()
    portfolio_df = portfolio_df.merge(actual_cumulative_returns.rename('Market'), on='Period', how='left')

    for portfolio, return_column in period_return_columns.items():
        # Sharpe ratio is computed from the per-period returns, not from the
        # cumulative curve (whose mean and spread grow with the sample length).
        returns = portfolio_df[return_column]
        returns_std = returns.std()

        if returns.isnull().all() or returns.eq(0).all() or returns_std == 0:
            sharpe_ratio = np.nan
        else:
            sharpe_ratio = returns.mean() / returns_std * np.sqrt(52)

        print(f"Metrics for {portfolio}:")
        print(f"Sharpe Ratio: {sharpe_ratio}")
        print()

    portfolio_df.to_csv('RoBERTa_portfolio_returns.csv', index=False)
    print("Portfolio returns saved to 'RoBERTa_portfolio_returns.csv'")
    return portfolio_df

def plot_portfolio_returns(portfolio_df, title_suffix=''):
    """Draw the cumulative return curves of all portfolios and the market.

    One line is plotted per column ('EW L', 'EW S', 'EW LS', 'VW L', 'VW S',
    'VW LS', 'Market') against 'Period', with yearly ticks on the x axis.
    *title_suffix* is inserted into the figure title.
    """
    plt.figure(figsize=(12, 6))

    periods = portfolio_df['Period']
    for series_name in ('EW L', 'EW S', 'EW LS', 'VW L', 'VW S', 'VW LS', 'Market'):
        plt.plot(periods, portfolio_df[series_name], marker='o', markersize=1, label=series_name)

    plt.title(f'Cumulative {title_suffix} Portfolio Returns Over Time')
    plt.xlabel('Period')
    plt.ylabel('Cumulative Log Return')
    plt.legend()
    plt.grid(True)

    # One major tick per calendar year, labelled with the four-digit year
    x_axis = plt.gca().xaxis
    x_axis.set_major_locator(mdates.YearLocator())
    x_axis.set_major_formatter(mdates.DateFormatter('%Y'))
    plt.xticks(rotation=45)
    plt.show()

# Build the combined dataset, then attach the rolling-window probabilities
df = prepare_data(insample_df, outsample_df)
df = rolling_window_analysis(df)

# Weekly portfolio: construct it, keep the periods from 2016 onward, and plot
portfolio_df_week = construct_portfolio(df, time_period='Week')
from_2016 = portfolio_df_week['Period'].dt.year >= 2016
portfolio_df_week = portfolio_df_week[from_2016]
plot_portfolio_returns(portfolio_df_week, title_suffix='Weekly')

# DistilBERT

## Load Embedding

In [None]:
# Fetch the pretrained DistilBERT tokenizer and model weights
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Read the pickled (insample, outsample) pair of embedded DataFrames.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('embedding-DistilBERT-AllCompany-NEW.pkl', 'rb') as f:
    insample_df, outsample_df = pd.read_pickle(f)

# Preview the first 3 out-of-sample rows whose headline is not the placeholder
has_headline = outsample_df['headline'] != '[No_Headline]'
filtered_df = outsample_df[has_headline]
filtered_df.head(3)

## Accuracy per-companies

In [None]:
# Combine insample and outsample data into a single frame
df = pd.concat([insample_df, outsample_df])

# Convert 'Date From' to datetime
df['Date From'] = pd.to_datetime(df['Date From'])

# Sort by date so the positional time-series folds below follow calendar order
df = df.sort_values(by='Date From')

# Check available years
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# NOTE: neither setting is read by the TimeSeriesSplit-based loop below;
# they are kept only so these notebook-level names stay defined.
window_size = 365 * 10  # 10 years in days
prediction_period = 365  # 1 year in days

# Get unique companies
companies = df['companyname'].unique()

# Dictionary to store DataFrames for each company
company_results_dfs = {}

# Time-series cross-validation for each company
for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Every row starts without a prediction; rows never used as a test fold stay NaN.
    company_df['predicted'] = np.nan
    predicted_col = company_df.columns.get_loc('predicted')

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    years = []

    # Setup time series cross-validation
    tscv = TimeSeriesSplit(n_splits=5)  # You can adjust the number of splits as needed

    for train_index, test_index in tscv.split(company_df):
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values

        X_test = np.vstack(test_df['embedding'].values)

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)

        # Make predictions
        y_pred = logistic_model.predict(X_test)

        # Ensure the length of predictions and test_df matches
        if len(y_pred) != len(test_df):
            raise ValueError(f"Mismatch between test data length ({len(test_df)}) and predictions ({len(y_pred)})")

        # Write the predictions back by row position. This avoids assigning into
        # the iloc slice (SettingWithCopyWarning) and stays correct even when the
        # concatenated frame carries repeated index labels.
        company_df.iloc[test_index, predicted_col] = y_pred

        print(f'Company: {company}')
        print(f'Training window: {train_df["Date From"].min()} to {train_df["Date From"].max()}')
        print(f'Test window: {test_df["Date From"].min()} to {test_df["Date From"].max()}')
        print(f'Predicted years: {test_df["Date From"].dt.year.unique()}')

    # Evaluate the Model for the company
    for year in range(2016, 2024):
        year_df = company_df[company_df['Date From'].dt.year == year]

        # Skip years with no rows or no predictions at all
        if year_df['predicted'].isnull().all():
            print(f"Missing predictions for year {year} for company {company} due to insufficient data or missing predictions.")
            continue

        y_test = year_df['Future Return Direction'].values
        y_pred = year_df['predicted'].values

        # Remove NaN values in predictions
        valid_indices = ~np.isnan(y_pred)
        y_test = y_test[valid_indices]
        y_pred = y_pred[valid_indices]

        if len(y_pred) == 0 or len(y_test) == 0:
            print(f"Insufficient valid predictions for year {year} for company {company}.")
            continue

        # Calculate evaluation metrics. zero_division=0 keeps the score that the
        # default setting reports for an undefined ratio (0) but without a warning.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary', pos_label=1, zero_division=0)
        recall = recall_score(y_test, y_pred, average='binary', pos_label=1, zero_division=0)
        f1 = f1_score(y_test, y_pred, average='binary', pos_label=1, zero_division=0)

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        years.append(year)

        print(f'Company: {company}, Year: {year}')
        print(f'Accuracy: {accuracy:.2f}')
        print(f'Precision: {precision:.2f}')
        print(f'Recall: {recall:.2f}')
        print(f'F1 Score: {f1:.2f}')
        print()

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies,
        'Precision': precisions,
        'Recall': recalls,
        'F1 Score': f1_scores
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

    print(f"Evaluation results for {company}:")
    print(results_df)

    # Plot the metrics over time for the company
    plt.figure(figsize=(10, 6))
    plt.plot(results_df['Year'], results_df['Accuracy'], marker='o', label='Accuracy')
    plt.plot(results_df['Year'], results_df['Precision'], marker='o', label='Precision')
    plt.plot(results_df['Year'], results_df['Recall'], marker='o', label='Recall')
    plt.plot(results_df['Year'], results_df['F1 Score'], marker='o', label='F1 Score')

    plt.title(f'Performance Metrics Over Time for {company}')
    plt.xlabel('Year')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Plot confusion matrices for each year. Skipped when no year was evaluated:
    # a grid with zero rows cannot be created and there is nothing to draw.
    if years:
        n_plots = len(years)
        n_cols = 4
        n_rows = (n_plots // n_cols) + (n_plots % n_cols > 0)

        fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows))
        axes = axes.flatten()

        for i, year in enumerate(years):
            year_df = company_df[company_df['Date From'].dt.year == year]
            y_test = year_df['Future Return Direction'].values
            y_pred = year_df['predicted'].values

            # Remove NaN values in predictions
            valid_indices = ~np.isnan(y_pred)
            y_test = y_test[valid_indices]
            y_pred = y_pred[valid_indices]

            if len(y_pred) > 0 and len(y_test) > 0:
                # Pin the label set so the matrix always has one row/column per
                # entry of display_labels, even if a class is absent this year.
                cm = confusion_matrix(y_test, y_pred, labels=logistic_model.classes_)
                disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=logistic_model.classes_)
                disp.plot(cmap='Blues', ax=axes[i])
                axes[i].set_title(f'{company} {year}')

        # Remove the unused trailing axes of the grid
        for j in range(n_plots, len(axes)):
            fig.delaxes(axes[j])

        plt.tight_layout()
        plt.show()

# Access the stored DataFrames
for company, results_df in company_results_dfs.items():
    print(f"\nResults for {company}:")
    print(results_df)


In [None]:
# Stack the per-company result tables; each dictionary key becomes the outer index level
stacked_results = pd.concat(company_results_dfs.values(), keys=company_results_dfs.keys())

# Promote that outer level to an ordinary column and label it 'Company'
all_results_df = stacked_results.reset_index(level=0)
all_results_df = all_results_df.rename(columns={'level_0': 'Company'})

# Persist the combined table as CSV without the leftover row index
all_results_df.to_csv('[EVAL] DistilBERT_all_company_results.csv', index=False)


## Accuracy all years

In [None]:
# Combine insample and outsample data into a single frame
df = pd.concat([insample_df, outsample_df])

# Convert 'Date From' to datetime
df['Date From'] = pd.to_datetime(df['Date From'])

# Sort by date so the positional time-series folds below follow calendar order
df = df.sort_values(by='Date From')

# Check available years
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# NOTE: neither setting is read by the TimeSeriesSplit-based loop below;
# they are kept only so these notebook-level names stay defined.
window_size = 365 * 10  # 10 years in days
prediction_period = 365  # 1 year in days

# Get unique companies
companies = df['companyname'].unique()

# Dictionary to store DataFrames for each company
company_results_dfs = {}

# Time-series cross-validation for each company
for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Every row starts without a prediction; rows never used as a test fold stay NaN.
    company_df['predicted'] = np.nan
    predicted_col = company_df.columns.get_loc('predicted')

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    years = []

    # Setup time series cross-validation
    tscv = TimeSeriesSplit(n_splits=5)  # You can adjust the number of splits as needed

    for train_index, test_index in tscv.split(company_df):
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values

        X_test = np.vstack(test_df['embedding'].values)

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)

        # Make predictions
        y_pred = logistic_model.predict(X_test)

        # Ensure the length of predictions and test_df matches
        if len(y_pred) != len(test_df):
            raise ValueError(f"Mismatch between test data length ({len(test_df)}) and predictions ({len(y_pred)})")

        # Write the predictions back by row position. This avoids assigning into
        # the iloc slice (SettingWithCopyWarning) and stays correct even when the
        # concatenated frame carries repeated index labels.
        company_df.iloc[test_index, predicted_col] = y_pred

    # Evaluate the Model for the company
    for year in range(2016, 2024):
        year_df = company_df[company_df['Date From'].dt.year == year]

        # Skip years with no rows or no predictions at all
        if year_df['predicted'].isnull().all():
            continue

        y_test = year_df['Future Return Direction'].values
        y_pred = year_df['predicted'].values

        # Remove NaN values in predictions
        valid_indices = ~np.isnan(y_pred)
        y_test = y_test[valid_indices]
        y_pred = y_pred[valid_indices]

        if len(y_pred) == 0 or len(y_test) == 0:
            continue

        # Calculate evaluation metrics. zero_division=0 keeps the score that the
        # default setting reports for an undefined ratio (0) but without a warning.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary', pos_label=1, zero_division=0)
        recall = recall_score(y_test, y_pred, average='binary', pos_label=1, zero_division=0)
        f1 = f1_score(y_test, y_pred, average='binary', pos_label=1, zero_division=0)

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        years.append(year)

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies,
        'Precision': precisions,
        'Recall': recalls,
        'F1 Score': f1_scores
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

# Combine results of all companies into a single DataFrame
combined_results = pd.concat(company_results_dfs.values(), ignore_index=True)

# Calculate the average of every metric per year across all companies
average_metrics_per_year = combined_results.groupby('Year').mean().reset_index()

# Print average metrics for each year
print("Average Metrics Per Year:")
print(average_metrics_per_year)

# Plot the averaged metrics per year
plt.figure(figsize=(10, 6))
sns.set(style='whitegrid')

plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['Accuracy'], marker='o', label='Accuracy')
plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['Precision'], marker='o', label='Precision')
plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['Recall'], marker='o', label='Recall')
plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['F1 Score'], marker='o', label='F1 Score')

plt.title('Average Performance Metrics Over Time Across All Companies')
plt.xlabel('Year')
plt.ylabel('Score')
plt.legend()
plt.grid(True)
plt.show()


## Standard Deviation all years

In [None]:
# Combine insample and outsample data into a single frame
df = pd.concat([insample_df, outsample_df])

# Convert 'Date From' to datetime
df['Date From'] = pd.to_datetime(df['Date From'])

# Sort by date so the positional time-series folds below follow calendar order
df = df.sort_values(by='Date From')

# Check available years
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# Get unique companies
companies = df['companyname'].unique()

# Dictionary to store DataFrames for each company
company_results_dfs = {}

# Time-series cross-validation for each company
for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Every row starts without a prediction; rows never used as a test fold stay NaN.
    company_df['predicted'] = np.nan
    predicted_col = company_df.columns.get_loc('predicted')

    accuracies = []
    years = []

    # Setup time series cross-validation
    tscv = TimeSeriesSplit(n_splits=5)  # You can adjust the number of splits as needed

    for train_index, test_index in tscv.split(company_df):
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values

        X_test = np.vstack(test_df['embedding'].values)

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)

        # Make predictions
        y_pred = logistic_model.predict(X_test)

        # Ensure the length of predictions and test_df matches
        if len(y_pred) != len(test_df):
            raise ValueError(f"Mismatch between test data length ({len(test_df)}) and predictions ({len(y_pred)})")

        # Write the predictions back by row position. This avoids assigning into
        # the iloc slice (SettingWithCopyWarning) and stays correct even when the
        # concatenated frame carries repeated index labels.
        company_df.iloc[test_index, predicted_col] = y_pred

    # Evaluate the Model for the company
    for year in range(2016, 2024):
        year_df = company_df[company_df['Date From'].dt.year == year]

        # Skip years with no rows or no predictions at all
        if year_df['predicted'].isnull().all():
            continue

        y_test = year_df['Future Return Direction'].values
        y_pred = year_df['predicted'].values

        # Remove NaN values in predictions
        valid_indices = ~np.isnan(y_pred)
        y_test = y_test[valid_indices]
        y_pred = y_pred[valid_indices]

        if len(y_pred) == 0 or len(y_test) == 0:
            continue

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)

        accuracies.append(accuracy)
        years.append(year)

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

# Combine results of all companies into a single DataFrame
combined_results = pd.concat(company_results_dfs.values(), ignore_index=True)

# Mean and standard deviation of accuracy per year across all companies
average_metrics_per_year = combined_results.groupby('Year').agg(
    Accuracy_mean=('Accuracy', 'mean'),
    Accuracy_std=('Accuracy', 'std')
).reset_index()

# Print average metrics for each year
print("Average Metrics Per Year:")
print(average_metrics_per_year)

# Get a color sequence from Plotly's default colors
colors = px.colors.qualitative.Plotly

# Function to make the color more transparent
def get_transparent_color(color, alpha=0.2):
    """Convert a hex colour string into a CSS ``rgba(r, g, b, alpha)`` string.

    Parameters
    ----------
    color : str
        Hex colour such as ``'#636EFA'``; the leading ``#`` is optional.
        Three-digit shorthand (``'#abc'``) is expanded to ``'#aabbcc'``.
        For an eight-digit ``RRGGBBAA`` value only the RGB part is used.
    alpha : float, default 0.2
        Opacity written verbatim into the result.

    Returns
    -------
    str
        ``'rgba(R, G, B, alpha)'`` with decimal channel values.

    Raises
    ------
    ValueError
        If ``color`` is not 3, 6 or 8 hexadecimal digits.
    """
    hex_color = color.lstrip('#')
    if len(hex_color) == 3:
        # Shorthand: every digit stands for a doubled pair.
        hex_color = ''.join(digit * 2 for digit in hex_color)
    if len(hex_color) not in (6, 8):
        raise ValueError(f"Expected a 3-, 6- or 8-digit hex colour, got {color!r}")
    try:
        red, green, blue = (int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
    except ValueError:
        raise ValueError(f"Invalid hexadecimal colour: {color!r}") from None
    return f'rgba({red}, {green}, {blue}, {alpha})'

# Build the chart in one go: the yearly mean accuracy as a line, plus a
# closed polygon (upper bound left-to-right, then lower bound right-to-left)
# shaded as the +/- one standard deviation band.
fig = go.Figure(data=[
    go.Scatter(
        x=average_metrics_per_year['Year'],
        y=average_metrics_per_year['Accuracy_mean'],
        mode='lines',
        name='Accuracy',
        line={'color': colors[0], 'width': 2},
    ),
    go.Scatter(
        x=pd.concat([
            average_metrics_per_year['Year'],
            average_metrics_per_year['Year'].iloc[::-1],
        ]),
        y=pd.concat([
            average_metrics_per_year['Accuracy_mean'] + average_metrics_per_year['Accuracy_std'],
            (average_metrics_per_year['Accuracy_mean'] - average_metrics_per_year['Accuracy_std']).iloc[::-1],
        ]),
        fill='toself',
        # Same hue as the line, mostly transparent.
        fillcolor=get_transparent_color(colors[0], alpha=0.2),
        # Fully transparent outline so only the fill is visible.
        line={'color': 'rgba(255,255,255,0)'},
        hoverinfo="skip",
        showlegend=False,
        name='Accuracy std dev',
    ),
])

# Titles, axis labels and theme.
fig.update_layout(
    title='Average Rolling Window Accuracy Over Time Across All Companies',
    xaxis_title='Year',
    yaxis_title='Accuracy',
    template='plotly_white',
    showlegend=True,
)

# Render the figure.
fig.show()


## Portfolio

In [None]:
def prepare_data(insample_df, outsample_df):
    """Stack both samples, parse ``'Date From'`` and sort chronologically.

    The distinct calendar years found in the combined data are printed.
    The inputs are not modified; original index labels are kept, so the
    result may contain repeated labels.

    Returns the combined, date-sorted DataFrame.
    """
    combined = pd.concat([insample_df, outsample_df])
    combined['Date From'] = pd.to_datetime(combined['Date From'])
    combined = combined.sort_values(by='Date From')
    print("Years available in the data:", combined['Date From'].dt.year.unique())
    return combined

def rolling_window_analysis(df):
    """Attach out-of-sample probabilities from a per-company rolling fit.

    For each company a logistic regression is trained on the stacked
    ``embedding`` vectors of a 10-calendar-year window and used to score the
    calendar year that follows; the window then slides forward one year at a
    time, up to and including the last year present for that company.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``companyname``, ``Date From`` (datetime),
        ``embedding`` and ``Future Return Direction``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a fresh ``RangeIndex`` and a ``predicted_prob``
        column holding the probability of the model's second class
        (``classes_[1]``); NaN where no window scored the row. The input
        frame is left untouched.
    """
    window_size = 10      # training span, in calendar years
    validation_size = 1   # scored span, in calendar years

    # A fresh positional index gives every row a unique label, so predictions
    # can be written back row-by-row instead of merged on value columns
    # (a merge multiplies rows whenever those key columns repeat).
    df = df.reset_index(drop=True)
    predicted = pd.Series(np.nan, index=df.index, dtype=float)

    for _, company_df in df.groupby('companyname', sort=False):
        row_year = company_df['Date From'].dt.year
        known_years = row_year.dropna()
        if known_years.empty:
            continue
        start_year = int(known_years.min())
        end_year = int(known_years.max())

        # Latest window start whose validation span still ends at end_year.
        last_start = end_year - window_size - validation_size + 1
        for train_start in range(start_year, last_start + 1):
            train_end = train_start + window_size     # exclusive
            val_end = train_end + validation_size     # exclusive

            train_df = company_df[(row_year >= train_start) & (row_year < train_end)]
            test_df = company_df[(row_year >= train_end) & (row_year < val_end)]
            if len(test_df) == 0 or len(train_df) == 0:
                continue

            # LogisticRegression cannot be fitted on a single class; leave
            # such windows unscored rather than aborting the whole run.
            if train_df['Future Return Direction'].nunique() < 2:
                continue

            X_train = np.vstack(train_df['embedding'].values)
            y_train = train_df['Future Return Direction'].values
            X_test = np.vstack(test_df['embedding'].values)

            logistic_model = LogisticRegression(max_iter=1000)
            logistic_model.fit(X_train, y_train)

            # Column 1 corresponds to classes_[1], the larger of the two labels.
            predicted.loc[test_df.index] = logistic_model.predict_proba(X_test)[:, 1]

    df['predicted_prob'] = predicted
    return df

def construct_portfolio(df, time_period='Week'):
    """Build weekly long/short portfolios ranked by ``predicted_prob``.

    Within every week from 2016 onwards, rows that have a prediction are
    ranked by ``predicted_prob``; the five highest form the long leg and the
    five lowest the short leg. Equal- and value-weighted (by ``market_cap``)
    average log returns are recorded per week, accumulated over time, and a
    'Market' benchmark (cumulative log of the weekly cross-sectional mean
    return) is attached. Annualised Sharpe ratios (zero risk-free rate) of
    the weekly returns are printed and the table is written to
    'DistilBERT_portfolio_returns.csv'.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``Date From``, ``predicted_prob``, ``Weekly Compound Return``
        and ``market_cap``. NOTE: modified in place - ``Date From`` is
        converted to datetime and a ``Period`` column is added.
    time_period : str, default 'Week'
        Only ``'Week'`` is supported.

    Returns
    -------
    pandas.DataFrame
        One row per week: ``Period``, the six weekly return columns, their
        cumulative counterparts (``EW L`` ... ``VW LS``) and ``Market``.

    Raises
    ------
    ValueError
        If ``time_period`` is not ``'Week'``.
    """
    if time_period != 'Week':
        raise ValueError("Invalid time_period. Use 'Week'.")

    df['Date From'] = pd.to_datetime(df['Date From'])
    df['Period'] = df['Date From'].dt.to_period('W').dt.to_timestamp()

    num_top_companies = 5
    num_bottom_companies = 5

    # Cumulative column -> weekly return column it accumulates.
    cumulative_of = {
        'EW L': 'Long Return (Eq)',
        'EW S': 'Short Return (Eq)',
        'EW LS': 'Long-Short Return (Eq)',
        'VW L': 'Long Return (Val)',
        'VW S': 'Short Return (Val)',
        'VW LS': 'Long-Short Return (Val)',
    }
    return_columns = list(cumulative_of.values())

    def _equal_weighted(leg):
        # Plain average of the leg's log returns.
        return np.mean(np.log1p(leg['Weekly Compound Return']))

    def _value_weighted(leg):
        # Market-cap weighted average of the leg's log returns.
        log_returns = np.log1p(leg['Weekly Compound Return'])
        return np.sum(log_returns * leg['market_cap']) / np.sum(leg['market_cap'])

    # Rows without a prediction cannot be ranked. sort_values() places NaN
    # last, so keeping them would silently put them into the short leg.
    ranked = df.dropna(subset=['predicted_prob'])

    portfolio_returns = []
    for period, group in ranked.groupby('Period'):
        if period.year < 2016:
            continue

        group_sorted = group.sort_values(by='predicted_prob', ascending=False)
        top_companies = group_sorted.head(num_top_companies)
        bottom_companies = group_sorted.tail(num_bottom_companies)

        long_return_eq = _equal_weighted(top_companies)
        short_return_eq = _equal_weighted(bottom_companies)
        long_return_val = _value_weighted(top_companies)
        short_return_val = _value_weighted(bottom_companies)

        portfolio_returns.append({
            'Period': period,
            'Long Return (Eq)': long_return_eq,
            'Short Return (Eq)': short_return_eq,
            'Long-Short Return (Eq)': long_return_eq - short_return_eq,
            'Long Return (Val)': long_return_val,
            'Short Return (Val)': short_return_val,
            'Long-Short Return (Val)': long_return_val - short_return_val,
        })

    # Explicit columns and dtypes keep the steps below working even when no
    # week qualified.
    portfolio_df = pd.DataFrame(portfolio_returns, columns=['Period', *return_columns])
    portfolio_df['Period'] = pd.to_datetime(portfolio_df['Period'])
    portfolio_df[return_columns] = portfolio_df[return_columns].astype(float)

    for cumulative_column, return_column in cumulative_of.items():
        portfolio_df[cumulative_column] = portfolio_df[return_column].cumsum()

    actual_returns = df[df['Date From'].dt.year >= 2016].groupby('Period')['Weekly Compound Return'].mean()
    actual_cumulative_returns = np.log1p(actual_returns).cumsum()
    portfolio_df = portfolio_df.merge(actual_cumulative_returns.rename('Market'), on='Period', how='left')

    for portfolio, return_column in cumulative_of.items():
        # The Sharpe ratio is a statistic of the periodic (weekly) returns,
        # not of the cumulative curve.
        returns = portfolio_df[return_column]
        spread = returns.std()

        if returns.isnull().all() or returns.eq(0).all() or spread == 0:
            sharpe_ratio = np.nan
        else:
            sharpe_ratio = returns.mean() / spread * np.sqrt(52)

        print(f"Metrics for {portfolio}:")
        print(f"Sharpe Ratio: {sharpe_ratio}")
        print()

    portfolio_df.to_csv('DistilBERT_portfolio_returns.csv', index=False)
    print("Portfolio returns saved to 'DistilBERT_portfolio_returns.csv'")
    return portfolio_df

def plot_portfolio_returns(portfolio_df, title_suffix=''):
    """Plot every cumulative portfolio series and the market benchmark.

    ``portfolio_df`` must carry a ``Period`` column plus the series
    ``EW L``, ``EW S``, ``EW LS``, ``VW L``, ``VW S``, ``VW LS`` and
    ``Market``. ``title_suffix`` is inserted into the chart title. The x-axis
    gets one tick per year.
    """
    plt.figure(figsize=(12, 6))

    periods = portfolio_df['Period']
    for series_name in ('EW L', 'EW S', 'EW LS', 'VW L', 'VW S', 'VW LS', 'Market'):
        plt.plot(periods, portfolio_df[series_name], marker='o', markersize=1, label=series_name)

    plt.title(f'Cumulative {title_suffix} Portfolio Returns Over Time')
    plt.xlabel('Period')
    plt.ylabel('Cumulative Log Return')
    plt.legend()
    plt.grid(True)

    # Yearly major ticks labelled with the four-digit year.
    date_axis = plt.gca().xaxis
    date_axis.set_major_locator(mdates.YearLocator())
    date_axis.set_major_formatter(mdates.DateFormatter('%Y'))
    plt.xticks(rotation=45)
    plt.show()

# Weekly pipeline: merge the two samples and attach the rolling-window
# probabilities to every row.
df = rolling_window_analysis(prepare_data(insample_df, outsample_df))

# Build the weekly portfolios, keep the weeks from 2016 onwards, and plot.
portfolio_df_week = construct_portfolio(df, time_period='Week')
portfolio_df_week = portfolio_df_week.loc[portfolio_df_week['Period'].dt.year >= 2016]
plot_portfolio_returns(portfolio_df_week, title_suffix='Weekly')

# Distil RoBERTa

## Load Embedding

In [None]:
# DistilRoBERTa tokenizer and encoder weights.
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
model = RobertaModel.from_pretrained('distilroberta-base')

# Unpickle the (in-sample, out-of-sample) pair of pre-embedded frames.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
with open('embedding-DistilRoBERTa-AllCompany-NEW.pkl', 'rb') as embedding_file:
    insample_df, outsample_df = pd.read_pickle(embedding_file)

# Preview the first three out-of-sample rows that carry a real headline.
filtered_df = outsample_df.loc[outsample_df['headline'] != '[No_Headline]']
filtered_df.head(3)

## Accuracy per-companies

In [None]:
# Stack both samples, parse 'Date From' into datetimes and put the rows in
# chronological order so the time-series folds below respect time.
df = (
    pd.concat([insample_df, outsample_df])
    .assign(**{'Date From': lambda frame: pd.to_datetime(frame['Date From'])})
    .sort_values(by='Date From')
)

# Report which calendar years are covered.
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# Nominal window lengths in days (10 years of history, 1 year ahead).
window_size = 365 * 10
prediction_period = 365

# Companies to evaluate, in order of first appearance.
companies = df['companyname'].unique()

# company name -> per-year metrics table, filled by the loop below.
company_results_dfs = dict()

# Per-company evaluation: expanding-window (TimeSeriesSplit) logistic
# regression on the embeddings, then yearly metrics, a metric-over-time chart
# and one confusion matrix per scored year.
for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Start every row as "not predicted". Fold results are written back by
    # position, which avoids assigning into a slice and stays correct when the
    # concatenated samples share index labels.
    company_df['predicted'] = np.nan
    predicted_col = company_df.columns.get_loc('predicted')

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    years = []
    class_labels = None  # classes seen by the most recent fitted model

    # Setup time series cross-validation
    n_splits = 5  # You can adjust the number of splits as needed
    tscv = TimeSeriesSplit(n_splits=n_splits)
    # TimeSeriesSplit raises unless there are more samples than splits, so
    # companies with too little history simply get no folds.
    folds = tscv.split(company_df) if len(company_df) > n_splits else []

    for train_index, test_index in folds:
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        # LogisticRegression cannot be fitted on a single class; skip the fold.
        if train_df['Future Return Direction'].nunique() < 2:
            continue

        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values
        X_test = np.vstack(test_df['embedding'].values)

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)
        class_labels = logistic_model.classes_

        # Store this fold's predictions on the company frame
        company_df.iloc[test_index, predicted_col] = logistic_model.predict(X_test)

        print(f'Company: {company}')
        print(f'Training window: {train_df["Date From"].min()} to {train_df["Date From"].max()}')
        print(f'Test window: {test_df["Date From"].min()} to {test_df["Date From"].max()}')
        print(f'Predicted years: {test_df["Date From"].dt.year.unique()}')

    # Evaluate the Model for the company
    for year in range(2016, 2024):
        year_df = company_df[company_df['Date From'].dt.year == year]
        scored = year_df.dropna(subset=['predicted'])

        if scored.empty:
            print(f"Missing predictions for year {year} for company {company} due to insufficient data or missing predictions.")
            continue

        y_test = scored['Future Return Direction'].values
        y_pred = scored['predicted'].values

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary', pos_label=1)
        recall = recall_score(y_test, y_pred, average='binary', pos_label=1)
        f1 = f1_score(y_test, y_pred, average='binary', pos_label=1)

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        years.append(year)

        print(f'Company: {company}, Year: {year}')
        print(f'Accuracy: {accuracy:.2f}')
        print(f'Precision: {precision:.2f}')
        print(f'Recall: {recall:.2f}')
        print(f'F1 Score: {f1:.2f}')
        print()

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies,
        'Precision': precisions,
        'Recall': recalls,
        'F1 Score': f1_scores
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

    print(f"Evaluation results for {company}:")
    print(results_df)

    # Plot the metrics over time for the company
    plt.figure(figsize=(10, 6))
    for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1 Score'):
        plt.plot(results_df['Year'], results_df[metric_name], marker='o', label=metric_name)

    plt.title(f'Performance Metrics Over Time for {company}')
    plt.xlabel('Year')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Plot confusion matrices for each scored year. Nothing to draw when no
    # year was scored (plt.subplots rejects a zero-row grid).
    if not years:
        continue

    n_cols = 4
    n_rows = -(-len(years) // n_cols)  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, year in zip(axes, years):
        # Every year in `years` has at least one scored row by construction.
        scored = company_df[company_df['Date From'].dt.year == year].dropna(subset=['predicted'])
        # Pin the label set so the matrix shape always matches display_labels,
        # even when a year happens to contain a single class.
        cm = confusion_matrix(scored['Future Return Direction'].values,
                              scored['predicted'].values,
                              labels=class_labels)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
        disp.plot(cmap='Blues', ax=ax)
        ax.set_title(f'{company} {year}')

    # Remove the unused cells of the grid.
    for ax in axes[len(years):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

# Dump every stored per-company metrics table.
for company in company_results_dfs:
    print(f"\nResults for {company}:")
    print(company_results_dfs[company])


In [None]:
# Stack all per-company tables; the company name becomes the outer index
# level, which is then moved into a leading 'Company' column.
all_results_df = pd.concat(
    list(company_results_dfs.values()),
    keys=list(company_results_dfs.keys()),
    names=['Company', None],
).reset_index(level='Company')

# Persist the combined table as CSV (row index omitted).
all_results_df.to_csv('[EVAL] DistilRoBERTa_all_company_results.csv', index=False)


## Accuracy all years

In [None]:
# Stack both samples, parse 'Date From' into datetimes and put the rows in
# chronological order so the time-series folds below respect time.
df = (
    pd.concat([insample_df, outsample_df])
    .assign(**{'Date From': lambda frame: pd.to_datetime(frame['Date From'])})
    .sort_values(by='Date From')
)

# Report which calendar years are covered.
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# Nominal window lengths in days (10 years of history, 1 year ahead).
window_size = 365 * 10
prediction_period = 365

# Companies to evaluate, in order of first appearance.
companies = df['companyname'].unique()

# company name -> per-year metrics table, filled by the loop below.
company_results_dfs = dict()

# Per-company evaluation: expanding-window (TimeSeriesSplit) logistic
# regression on the embeddings, followed by yearly classification metrics.
for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Start every row as "not predicted". Fold results are written back by
    # position, which avoids assigning into a slice and stays correct when the
    # concatenated samples share index labels.
    company_df['predicted'] = np.nan
    predicted_col = company_df.columns.get_loc('predicted')

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    years = []

    # Setup time series cross-validation
    n_splits = 5  # You can adjust the number of splits as needed
    tscv = TimeSeriesSplit(n_splits=n_splits)
    # TimeSeriesSplit raises unless there are more samples than splits, so
    # companies with too little history simply get no folds.
    folds = tscv.split(company_df) if len(company_df) > n_splits else []

    for train_index, test_index in folds:
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        # LogisticRegression cannot be fitted on a single class; skip the fold.
        if train_df['Future Return Direction'].nunique() < 2:
            continue

        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values
        X_test = np.vstack(test_df['embedding'].values)

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)

        # Store this fold's predictions on the company frame
        company_df.iloc[test_index, predicted_col] = logistic_model.predict(X_test)

    # Evaluate the Model for the company
    for year in range(2016, 2024):
        in_year = company_df['Date From'].dt.year == year
        scored = company_df[in_year].dropna(subset=['predicted'])

        # Years without any scored row contribute nothing.
        if scored.empty:
            continue

        y_test = scored['Future Return Direction'].values
        y_pred = scored['predicted'].values

        # Calculate evaluation metrics
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, average='binary', pos_label=1))
        recalls.append(recall_score(y_test, y_pred, average='binary', pos_label=1))
        f1_scores.append(f1_score(y_test, y_pred, average='binary', pos_label=1))
        years.append(year)

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies,
        'Precision': precisions,
        'Recall': recalls,
        'F1 Score': f1_scores
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

# Stack the per-company metric tables into one long frame.
combined_results = pd.concat(list(company_results_dfs.values()), ignore_index=True)

# Cross-company mean of every metric, one row per year.
average_metrics_per_year = combined_results.groupby('Year', as_index=False).mean()

# Show the yearly summary table.
print("Average Metrics Per Year:")
print(average_metrics_per_year)

# One line per averaged metric over the evaluation years.
plt.figure(figsize=(10, 6))
sns.set(style='whitegrid')

for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1 Score'):
    plt.plot(average_metrics_per_year['Year'], average_metrics_per_year[metric_name],
             marker='o', label=metric_name)

plt.title('Average Performance Metrics Over Time Across All Companies')
plt.xlabel('Year')
plt.ylabel('Score')
plt.legend()
plt.grid(True)
plt.show()


## Standard Deviation All Years

In [None]:
# Stack both samples, parse 'Date From' into datetimes and put the rows in
# chronological order so the time-series folds below respect time.
df = (
    pd.concat([insample_df, outsample_df])
    .assign(**{'Date From': lambda frame: pd.to_datetime(frame['Date From'])})
    .sort_values(by='Date From')
)

# Report which calendar years are covered.
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# Companies to evaluate, in order of first appearance.
companies = df['companyname'].unique()

# company name -> per-year accuracy table, filled by the loop below.
company_results_dfs = dict()

# Per-company evaluation: expanding-window (TimeSeriesSplit) logistic
# regression on the embeddings, followed by yearly accuracy.
for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Start every row as "not predicted". Fold results are written back by
    # position, which avoids assigning into a slice and stays correct when the
    # concatenated samples share index labels.
    company_df['predicted'] = np.nan
    predicted_col = company_df.columns.get_loc('predicted')

    accuracies = []
    years = []

    # Setup time series cross-validation
    n_splits = 5  # You can adjust the number of splits as needed
    tscv = TimeSeriesSplit(n_splits=n_splits)
    # TimeSeriesSplit raises unless there are more samples than splits, so
    # companies with too little history simply get no folds.
    folds = tscv.split(company_df) if len(company_df) > n_splits else []

    for train_index, test_index in folds:
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        # LogisticRegression cannot be fitted on a single class; skip the fold.
        if train_df['Future Return Direction'].nunique() < 2:
            continue

        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values
        X_test = np.vstack(test_df['embedding'].values)

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)

        # Store this fold's predictions on the company frame
        company_df.iloc[test_index, predicted_col] = logistic_model.predict(X_test)

    # Evaluate the Model for the company
    for year in range(2016, 2024):
        in_year = company_df['Date From'].dt.year == year
        scored = company_df[in_year].dropna(subset=['predicted'])

        # Years without any scored row contribute nothing.
        if scored.empty:
            continue

        accuracies.append(accuracy_score(scored['Future Return Direction'].values,
                                         scored['predicted'].values))
        years.append(year)

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

# Stack the per-company accuracy tables into one long frame (one row per
# company/year pair).
combined_results = pd.concat(list(company_results_dfs.values()), ignore_index=True)

# Per-year cross-company mean and sample standard deviation of accuracy.
average_metrics_per_year = combined_results.groupby('Year', as_index=False).agg(
    Accuracy_mean=('Accuracy', 'mean'),
    Accuracy_std=('Accuracy', 'std'),
)

# Show the yearly summary table.
print("Average Metrics Per Year:")
print(average_metrics_per_year)

# Plotly's default qualitative palette (hex strings); the first entry is
# used for both the line and its shaded band.
colors = px.colors.qualitative.Plotly

# Function to make the color more transparent
def get_transparent_color(color, alpha=0.2):
    """Convert a hex colour string into a CSS ``rgba(r, g, b, alpha)`` string.

    Parameters
    ----------
    color : str
        Hex colour such as ``'#636EFA'``; the leading ``#`` is optional.
        Three-digit shorthand (``'#abc'``) is expanded to ``'#aabbcc'``.
        For an eight-digit ``RRGGBBAA`` value only the RGB part is used.
    alpha : float, default 0.2
        Opacity written verbatim into the result.

    Returns
    -------
    str
        ``'rgba(R, G, B, alpha)'`` with decimal channel values.

    Raises
    ------
    ValueError
        If ``color`` is not 3, 6 or 8 hexadecimal digits.
    """
    hex_color = color.lstrip('#')
    if len(hex_color) == 3:
        # Shorthand: every digit stands for a doubled pair.
        hex_color = ''.join(digit * 2 for digit in hex_color)
    if len(hex_color) not in (6, 8):
        raise ValueError(f"Expected a 3-, 6- or 8-digit hex colour, got {color!r}")
    try:
        red, green, blue = (int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
    except ValueError:
        raise ValueError(f"Invalid hexadecimal colour: {color!r}") from None
    return f'rgba({red}, {green}, {blue}, {alpha})'

# Build the chart in one go: the yearly mean accuracy as a line, plus a
# closed polygon (upper bound left-to-right, then lower bound right-to-left)
# shaded as the +/- one standard deviation band.
fig = go.Figure(data=[
    go.Scatter(
        x=average_metrics_per_year['Year'],
        y=average_metrics_per_year['Accuracy_mean'],
        mode='lines',
        name='Accuracy',
        line={'color': colors[0], 'width': 2},
    ),
    go.Scatter(
        x=pd.concat([
            average_metrics_per_year['Year'],
            average_metrics_per_year['Year'].iloc[::-1],
        ]),
        y=pd.concat([
            average_metrics_per_year['Accuracy_mean'] + average_metrics_per_year['Accuracy_std'],
            (average_metrics_per_year['Accuracy_mean'] - average_metrics_per_year['Accuracy_std']).iloc[::-1],
        ]),
        fill='toself',
        # Same hue as the line, mostly transparent.
        fillcolor=get_transparent_color(colors[0], alpha=0.2),
        # Fully transparent outline so only the fill is visible.
        line={'color': 'rgba(255,255,255,0)'},
        hoverinfo="skip",
        showlegend=False,
        name='Accuracy std dev',
    ),
])

# Titles, axis labels and theme.
fig.update_layout(
    title='Average Rolling Window Accuracy Over Time Across All Companies',
    xaxis_title='Year',
    yaxis_title='Accuracy',
    template='plotly_white',
    showlegend=True,
)

# Render the figure.
fig.show()


## Portfolio

In [None]:
def prepare_data(insample_df, outsample_df):
    """Stack both samples, parse ``'Date From'`` and sort chronologically.

    The distinct calendar years found in the combined data are printed.
    The inputs are not modified; original index labels are kept, so the
    result may contain repeated labels.

    Returns the combined, date-sorted DataFrame.
    """
    combined = pd.concat([insample_df, outsample_df])
    combined['Date From'] = pd.to_datetime(combined['Date From'])
    combined = combined.sort_values(by='Date From')
    print("Years available in the data:", combined['Date From'].dt.year.unique())
    return combined

def rolling_window_analysis(df):
    """Attach out-of-sample probabilities from a per-company rolling fit.

    For each company a logistic regression is trained on the stacked
    ``embedding`` vectors of a 10-calendar-year window and used to score the
    calendar year that follows; the window then slides forward one year at a
    time, up to and including the last year present for that company.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``companyname``, ``Date From`` (datetime),
        ``embedding`` and ``Future Return Direction``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a fresh ``RangeIndex`` and a ``predicted_prob``
        column holding the probability of the model's second class
        (``classes_[1]``); NaN where no window scored the row. The input
        frame is left untouched.
    """
    window_size = 10      # training span, in calendar years
    validation_size = 1   # scored span, in calendar years

    # A fresh positional index gives every row a unique label, so predictions
    # can be written back row-by-row instead of merged on value columns
    # (a merge multiplies rows whenever those key columns repeat).
    df = df.reset_index(drop=True)
    predicted = pd.Series(np.nan, index=df.index, dtype=float)

    for _, company_df in df.groupby('companyname', sort=False):
        row_year = company_df['Date From'].dt.year
        known_years = row_year.dropna()
        if known_years.empty:
            continue
        start_year = int(known_years.min())
        end_year = int(known_years.max())

        # Latest window start whose validation span still ends at end_year.
        last_start = end_year - window_size - validation_size + 1
        for train_start in range(start_year, last_start + 1):
            train_end = train_start + window_size     # exclusive
            val_end = train_end + validation_size     # exclusive

            train_df = company_df[(row_year >= train_start) & (row_year < train_end)]
            test_df = company_df[(row_year >= train_end) & (row_year < val_end)]
            if len(test_df) == 0 or len(train_df) == 0:
                continue

            # LogisticRegression cannot be fitted on a single class; leave
            # such windows unscored rather than aborting the whole run.
            if train_df['Future Return Direction'].nunique() < 2:
                continue

            X_train = np.vstack(train_df['embedding'].values)
            y_train = train_df['Future Return Direction'].values
            X_test = np.vstack(test_df['embedding'].values)

            logistic_model = LogisticRegression(max_iter=1000)
            logistic_model.fit(X_train, y_train)

            # Column 1 corresponds to classes_[1], the larger of the two labels.
            predicted.loc[test_df.index] = logistic_model.predict_proba(X_test)[:, 1]

    df['predicted_prob'] = predicted
    return df

def construct_portfolio(df, time_period='Week'):
    """Build weekly long/short portfolios ranked by ``predicted_prob``.

    Within every week from 2016 onwards, rows that have a prediction are
    ranked by ``predicted_prob``; the five highest form the long leg and the
    five lowest the short leg. Equal- and value-weighted (by ``market_cap``)
    average log returns are recorded per week, accumulated over time, and a
    'Market' benchmark (cumulative log of the weekly cross-sectional mean
    return) is attached. Annualised Sharpe ratios (zero risk-free rate) of
    the weekly returns are printed and the table is written to
    'DistilRoBERTa_portfolio_returns.csv'.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``Date From``, ``predicted_prob``, ``Weekly Compound Return``
        and ``market_cap``. NOTE: modified in place - ``Date From`` is
        converted to datetime and a ``Period`` column is added.
    time_period : str, default 'Week'
        Only ``'Week'`` is supported.

    Returns
    -------
    pandas.DataFrame
        One row per week: ``Period``, the six weekly return columns, their
        cumulative counterparts (``EW L`` ... ``VW LS``) and ``Market``.

    Raises
    ------
    ValueError
        If ``time_period`` is not ``'Week'``.
    """
    if time_period != 'Week':
        raise ValueError("Invalid time_period. Use 'Week'.")

    df['Date From'] = pd.to_datetime(df['Date From'])
    df['Period'] = df['Date From'].dt.to_period('W').dt.to_timestamp()

    num_top_companies = 5
    num_bottom_companies = 5

    # Cumulative column -> weekly return column it accumulates.
    cumulative_of = {
        'EW L': 'Long Return (Eq)',
        'EW S': 'Short Return (Eq)',
        'EW LS': 'Long-Short Return (Eq)',
        'VW L': 'Long Return (Val)',
        'VW S': 'Short Return (Val)',
        'VW LS': 'Long-Short Return (Val)',
    }
    return_columns = list(cumulative_of.values())

    def _equal_weighted(leg):
        # Plain average of the leg's log returns.
        return np.mean(np.log1p(leg['Weekly Compound Return']))

    def _value_weighted(leg):
        # Market-cap weighted average of the leg's log returns.
        log_returns = np.log1p(leg['Weekly Compound Return'])
        return np.sum(log_returns * leg['market_cap']) / np.sum(leg['market_cap'])

    # Rows without a prediction cannot be ranked. sort_values() places NaN
    # last, so keeping them would silently put them into the short leg.
    ranked = df.dropna(subset=['predicted_prob'])

    portfolio_returns = []
    for period, group in ranked.groupby('Period'):
        if period.year < 2016:
            continue

        group_sorted = group.sort_values(by='predicted_prob', ascending=False)
        top_companies = group_sorted.head(num_top_companies)
        bottom_companies = group_sorted.tail(num_bottom_companies)

        long_return_eq = _equal_weighted(top_companies)
        short_return_eq = _equal_weighted(bottom_companies)
        long_return_val = _value_weighted(top_companies)
        short_return_val = _value_weighted(bottom_companies)

        portfolio_returns.append({
            'Period': period,
            'Long Return (Eq)': long_return_eq,
            'Short Return (Eq)': short_return_eq,
            'Long-Short Return (Eq)': long_return_eq - short_return_eq,
            'Long Return (Val)': long_return_val,
            'Short Return (Val)': short_return_val,
            'Long-Short Return (Val)': long_return_val - short_return_val,
        })

    # Explicit columns and dtypes keep the steps below working even when no
    # week qualified.
    portfolio_df = pd.DataFrame(portfolio_returns, columns=['Period', *return_columns])
    portfolio_df['Period'] = pd.to_datetime(portfolio_df['Period'])
    portfolio_df[return_columns] = portfolio_df[return_columns].astype(float)

    for cumulative_column, return_column in cumulative_of.items():
        portfolio_df[cumulative_column] = portfolio_df[return_column].cumsum()

    actual_returns = df[df['Date From'].dt.year >= 2016].groupby('Period')['Weekly Compound Return'].mean()
    actual_cumulative_returns = np.log1p(actual_returns).cumsum()
    portfolio_df = portfolio_df.merge(actual_cumulative_returns.rename('Market'), on='Period', how='left')

    for portfolio, return_column in cumulative_of.items():
        # The Sharpe ratio is a statistic of the periodic (weekly) returns,
        # not of the cumulative curve.
        returns = portfolio_df[return_column]
        spread = returns.std()

        if returns.isnull().all() or returns.eq(0).all() or spread == 0:
            sharpe_ratio = np.nan
        else:
            sharpe_ratio = returns.mean() / spread * np.sqrt(52)

        print(f"Metrics for {portfolio}:")
        print(f"Sharpe Ratio: {sharpe_ratio}")
        print()

    portfolio_df.to_csv('DistilRoBERTa_portfolio_returns.csv', index=False)
    print("Portfolio returns saved to 'DistilRoBERTa_portfolio_returns.csv'")
    return portfolio_df

def plot_portfolio_returns(portfolio_df, title_suffix=''):
    """Plot every cumulative portfolio curve together with the market curve.

    One line is drawn per cumulative-return column, using the ``'Period'``
    column as the x-axis, and the finished figure is shown immediately.

    Args:
        portfolio_df: DataFrame holding a ``'Period'`` column plus the
            columns ``'EW L'``, ``'EW S'``, ``'EW LS'``, ``'VW L'``,
            ``'VW S'``, ``'VW LS'`` and ``'Market'``.
        title_suffix: Text placed after the word "Cumulative" in the title.
    """
    curve_names = ('EW L', 'EW S', 'EW LS', 'VW L', 'VW S', 'VW LS', 'Market')

    plt.figure(figsize=(12, 6))

    x_values = portfolio_df['Period']
    for curve in curve_names:
        plt.plot(x_values, portfolio_df[curve], marker='o', markersize=1, label=curve)

    plt.title(f'Cumulative {title_suffix} Portfolio Returns Over Time')
    plt.xlabel('Period')
    plt.ylabel('Cumulative Log Return')
    plt.legend()
    plt.grid(True)

    # One major tick per calendar year, labelled with the four-digit year.
    x_axis = plt.gca().xaxis
    x_axis.set_major_locator(mdates.YearLocator())
    x_axis.set_major_formatter(mdates.DateFormatter('%Y'))
    plt.xticks(rotation=45)
    plt.show()

# Run the weekly pipeline on the currently loaded in-sample / out-of-sample
# frames. prepare_data and rolling_window_analysis are the versions defined
# earlier in this section (presumably they combine the frames and attach the
# model predictions -- confirm against their definitions).
df = prepare_data(insample_df, outsample_df)
df = rolling_window_analysis(df)

# Build the weekly portfolios (construct_portfolio also writes
# 'DistilRoBERTa_portfolio_returns.csv'), keep only periods from 2016 onward,
# and plot the cumulative curves.
portfolio_df_week = construct_portfolio(df, time_period='Week')
portfolio_df_week = portfolio_df_week[portfolio_df_week['Period'].dt.year >= 2016]
plot_portfolio_returns(portfolio_df_week, title_suffix='Weekly')

# FinBERT

## Load Embedding

In [None]:
# Load the pre-trained FinBERT model and its (slow, non-"fast") tokenizer.
# NOTE(review): neither object is referenced by the FinBERT cells visible
# below, which work from the pre-computed embeddings in the pickle -- confirm
# whether this download is still needed.
tokenizer = AutoTokenizer.from_pretrained('yiyanghkust/finbert-tone', use_fast=False)
model = AutoModel.from_pretrained('yiyanghkust/finbert-tone')

# Load the pickled pair (insample_df, outsample_df) that carries the embeddings.
# SECURITY: unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
with open('embedding-FinBERT-AllCompany-NEW.pkl', 'rb') as f:
    insample_df, outsample_df = pd.read_pickle(f)

# Sanity check: show the first 3 out-of-sample rows that have a real headline
# (rows carrying the '[No_Headline]' placeholder are filtered out first).
filtered_df = outsample_df[outsample_df['headline'] != '[No_Headline]']
filtered_df.head(3)

## Accuracy per-companies

In [None]:
# Per-company evaluation of a logistic regression trained on FinBERT headline
# embeddings. For every company the rows are split into five chronological
# train/test folds, the predictions are collected, and accuracy / precision /
# recall / F1 are reported per calendar year (2016-2023) together with one
# confusion matrix per year.

# Combine the in-sample and out-of-sample frames so the folds can span both
df = pd.concat([insample_df, outsample_df])

# Convert 'Date From' to datetime
df['Date From'] = pd.to_datetime(df['Date From'])

# Sort by date: TimeSeriesSplit splits by row position, so rows must be chronological
df = df.sort_values(by='Date From')

# Check available years
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# NOTE: kept because the sibling cells define the same names, but nothing in
# this cell reads them -- the folds come from TimeSeriesSplit below.
window_size = 365 * 10  # 10 years in days
prediction_period = 365  # 1 year in days

# Get unique companies
companies = df['companyname'].unique()

# Dictionary to store DataFrames for each company
company_results_dfs = {}

# Time-series cross-validation for each company
for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Create the prediction column up front and fill it by row position.
    # Writing through positions avoids assigning onto a slice of company_df
    # and stays correct even if pd.concat left duplicate index labels.
    company_df['predicted'] = np.nan
    predicted_col = company_df.columns.get_loc('predicted')

    # Every label this company ever has; passing it to confusion_matrix keeps
    # the matrix shape equal to the number of display labels even in a year
    # where only one class occurs.
    class_labels = np.unique(company_df['Future Return Direction'].values)

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    years = []

    # Setup time series cross-validation
    tscv = TimeSeriesSplit(n_splits=5)  # You can adjust the number of splits as needed

    for train_index, test_index in tscv.split(company_df):
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values

        X_test = np.vstack(test_df['embedding'].values)

        # LogisticRegression cannot be fitted on one class; leave this fold
        # unpredicted (NaN) instead of aborting the run for every company.
        if np.unique(y_train).size < 2:
            print(f'Company: {company} - skipping fold whose training rows contain a single class')
            continue

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)

        # Make predictions
        y_pred = logistic_model.predict(X_test)

        # Ensure the length of predictions and test_df matches
        if len(y_pred) != len(test_df):
            raise ValueError(f"Mismatch between test data length ({len(test_df)}) and predictions ({len(y_pred)})")

        # Store the predictions on the company frame, row-for-row with the test fold
        company_df.iloc[test_index, predicted_col] = y_pred

        print(f'Company: {company}')
        print(f'Training window: {train_df["Date From"].min()} to {train_df["Date From"].max()}')
        print(f'Test window: {test_df["Date From"].min()} to {test_df["Date From"].max()}')
        print(f'Predicted years: {test_df["Date From"].dt.year.unique()}')

    # Evaluate the Model for the company, one calendar year at a time
    for year in range(2016, 2024):
        year_df = company_df[company_df['Date From'].dt.year == year]

        if year_df['predicted'].isnull().all():
            print(f"Missing predictions for year {year} for company {company} due to insufficient data or missing predictions.")
            continue

        y_test = year_df['Future Return Direction'].values
        y_pred = year_df['predicted'].values

        # Rows that never fell into a test fold have no prediction; drop them
        valid_indices = ~np.isnan(y_pred)
        y_test = y_test[valid_indices]
        y_pred = y_pred[valid_indices]

        if len(y_pred) == 0 or len(y_test) == 0:
            print(f"Insufficient valid predictions for year {year} for company {company}.")
            continue

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary', pos_label=1)
        recall = recall_score(y_test, y_pred, average='binary', pos_label=1)
        f1 = f1_score(y_test, y_pred, average='binary', pos_label=1)

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        years.append(year)

        print(f'Company: {company}, Year: {year}')
        print(f'Accuracy: {accuracy:.2f}')
        print(f'Precision: {precision:.2f}')
        print(f'Recall: {recall:.2f}')
        print(f'F1 Score: {f1:.2f}')
        print()

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies,
        'Precision': precisions,
        'Recall': recalls,
        'F1 Score': f1_scores
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

    print(f"Evaluation results for {company}:")
    print(results_df)

    # Nothing to draw for a company without a single evaluable year;
    # plt.subplots cannot create a grid with zero rows.
    if not years:
        print(f"No evaluable years for {company}; skipping plots.")
        continue

    # Plot accuracy over time for the company
    plt.figure(figsize=(10, 6))
    plt.plot(results_df['Year'], results_df['Accuracy'], marker='o', label='Accuracy')
    plt.plot(results_df['Year'], results_df['Precision'], marker='o', label='Precision')
    plt.plot(results_df['Year'], results_df['Recall'], marker='o', label='Recall')
    plt.plot(results_df['Year'], results_df['F1 Score'], marker='o', label='F1 Score')

    plt.title(f'Performance Metrics Over Time for {company}')
    plt.xlabel('Year')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Plot confusion matrices for each year on a grid that is 4 columns wide
    n_plots = len(years)
    n_cols = 4
    n_rows = (n_plots // n_cols) + (n_plots % n_cols > 0)

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, year in zip(axes, years):
        year_df = company_df[company_df['Date From'].dt.year == year]
        y_test = year_df['Future Return Direction'].values
        y_pred = year_df['predicted'].values

        # Remove NaN values in predictions
        valid_indices = ~np.isnan(y_pred)
        y_test = y_test[valid_indices]
        y_pred = y_pred[valid_indices]

        if len(y_pred) > 0 and len(y_test) > 0:
            cm = confusion_matrix(y_test, y_pred, labels=class_labels)
            disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
            disp.plot(cmap='Blues', ax=ax)
            ax.set_title(f'{company} {year}')

    # Remove the grid cells that did not receive a confusion matrix
    for unused_ax in axes[n_plots:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

# Access the stored DataFrames
for company, results_df in company_results_dfs.items():
    print(f"\nResults for {company}:")
    print(results_df)


In [None]:
# Stack the per-company result tables into one frame. The dictionary keys
# become the outer index level, which is then moved into a 'Company' column.
all_results_df = (
    pd.concat(company_results_dfs.values(), keys=company_results_dfs.keys())
    .reset_index(level=0)
    .rename(columns={'level_0': 'Company'})
)

# Write the combined table to CSV without the leftover row index
all_results_df.to_csv('[EVAL] FinBERT_all_company_results.csv', index=False)


## Accuracy all years

In [None]:
# Average yearly performance across all companies.
# Each company is evaluated with five chronological train/test folds, yearly
# metrics are computed per company, and the metrics are then averaged over
# companies for each year and plotted.

# Combine the in-sample and out-of-sample frames so the folds can span both
df = pd.concat([insample_df, outsample_df])

# Convert 'Date From' to datetime
df['Date From'] = pd.to_datetime(df['Date From'])

# Sort by date (TimeSeriesSplit below splits by row position)
df = df.sort_values(by='Date From')

# Check available years
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# NOTE(review): these two values are not referenced anywhere else in this
# cell; the folds are produced by TimeSeriesSplit, not by a fixed-length window.
window_size = 365 * 10  # 10 years in days
prediction_period = 365  # 1 year in days

# Get unique companies
companies = df['companyname'].unique()

# Dictionary to store DataFrames for each company
company_results_dfs = {}

# Time-series cross-validation for each company
for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Per-year metric values for this company; `years` records which year
    # each list position belongs to.
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    years = []

    # Setup time series cross-validation
    tscv = TimeSeriesSplit(n_splits=5)  # You can adjust the number of splits as needed

    for train_index, test_index in tscv.split(company_df):
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        # Stack the per-row embedding vectors into a 2-D feature matrix
        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values

        X_test = np.vstack(test_df['embedding'].values)
        y_test = test_df['Future Return Direction'].values

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)

        # Make predictions
        y_pred = logistic_model.predict(X_test)

        # Ensure the length of predictions and test_df matches
        if len(y_pred) != len(test_df):
            raise ValueError(f"Mismatch between test data length ({len(test_df)}) and predictions ({len(y_pred)})")

        # Store the predictions on the test fold.
        # NOTE(review): test_df is a slice of company_df, so pandas may emit
        # SettingWithCopyWarning here.
        test_df['predicted'] = y_pred

        # Copy the fold's predictions onto the company frame, aligned by index label
        company_df.loc[test_df.index, 'predicted'] = test_df['predicted']

    # Evaluate the Model for the company, one calendar year (2016-2023) at a time
    for year in range(2016, 2024):
        year_df = company_df[company_df['Date From'].dt.year == year]

        # Skip years where no row received a prediction
        if 'predicted' not in year_df.columns or year_df['predicted'].isnull().all():
            continue

        y_test = year_df['Future Return Direction'].values
        y_pred = year_df['predicted'].values

        # Rows that never fell into a test fold have NaN predictions; drop them
        valid_indices = ~np.isnan(y_pred)
        y_test = y_test[valid_indices]
        y_pred = y_pred[valid_indices]

        if len(y_pred) == 0 or len(y_test) == 0:
            continue

        # Calculate evaluation metrics (label 1 is treated as the positive class)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary', pos_label=1)
        recall = recall_score(y_test, y_pred, average='binary', pos_label=1)
        f1 = f1_score(y_test, y_pred, average='binary', pos_label=1)

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        years.append(year)

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies,
        'Precision': precisions,
        'Recall': recalls,
        'F1 Score': f1_scores
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

# Combine results of all companies into a single DataFrame
combined_results = pd.concat(company_results_dfs.values(), ignore_index=True)

# Average every metric per year; each company-year counts once, regardless of
# how many rows that company had in the year
average_metrics_per_year = combined_results.groupby('Year').mean().reset_index()

# Print average metrics for each year
print("Average Metrics Per Year:")
print(average_metrics_per_year)

# Plot the four averaged metrics per year
plt.figure(figsize=(10, 6))
sns.set(style='whitegrid')

plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['Accuracy'], marker='o', label='Accuracy')
plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['Precision'], marker='o', label='Precision')
plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['Recall'], marker='o', label='Recall')
plt.plot(average_metrics_per_year['Year'], average_metrics_per_year['F1 Score'], marker='o', label='F1 Score')

plt.title('Average Performance Metrics Over Time Across All Companies')
plt.xlabel('Year')
plt.ylabel('Score')
plt.legend()
plt.grid(True)
plt.show()


## Standard Deviation all years

In [None]:
# Mean and standard deviation of yearly accuracy across all companies.
# Each company is evaluated with five chronological train/test folds; its
# accuracy is computed per calendar year and then aggregated over companies.

# Combine the in-sample and out-of-sample frames so the folds can span both
df = pd.concat([insample_df, outsample_df])

# Convert 'Date From' to datetime
df['Date From'] = pd.to_datetime(df['Date From'])

# Sort by date (TimeSeriesSplit below splits by row position)
df = df.sort_values(by='Date From')

# Check available years
available_years = df['Date From'].dt.year.unique()
print("Years available in the data:", available_years)

# Get unique companies
companies = df['companyname'].unique()

# Dictionary to store DataFrames for each company
company_results_dfs = {}

# Time-series cross-validation for each company
for company in companies:
    company_df = df[df['companyname'] == company].copy()

    # Per-year accuracy values for this company and the years they belong to
    accuracies = []
    years = []

    # Setup time series cross-validation
    tscv = TimeSeriesSplit(n_splits=5)  # You can adjust the number of splits as needed

    for train_index, test_index in tscv.split(company_df):
        train_df, test_df = company_df.iloc[train_index], company_df.iloc[test_index]

        if len(test_df) == 0 or len(train_df) == 0:
            continue

        # Stack the per-row embedding vectors into a 2-D feature matrix
        X_train = np.vstack(train_df['embedding'].values)
        y_train = train_df['Future Return Direction'].values

        X_test = np.vstack(test_df['embedding'].values)
        y_test = test_df['Future Return Direction'].values

        # Train logistic regression model
        logistic_model = LogisticRegression(max_iter=1000)
        logistic_model.fit(X_train, y_train)

        # Make predictions
        y_pred = logistic_model.predict(X_test)

        # Ensure the length of predictions and test_df matches
        if len(y_pred) != len(test_df):
            raise ValueError(f"Mismatch between test data length ({len(test_df)}) and predictions ({len(y_pred)})")

        # Store the predictions on the test fold.
        # NOTE(review): test_df is a slice of company_df, so pandas may emit
        # SettingWithCopyWarning here.
        test_df['predicted'] = y_pred

        # Copy the fold's predictions onto the company frame, aligned by index label
        company_df.loc[test_df.index, 'predicted'] = test_df['predicted']

    # Evaluate the Model for the company, one calendar year (2016-2023) at a time
    for year in range(2016, 2024):
        year_df = company_df[company_df['Date From'].dt.year == year]

        # Skip years where no row received a prediction
        if 'predicted' not in year_df.columns or year_df['predicted'].isnull().all():
            continue

        y_test = year_df['Future Return Direction'].values
        y_pred = year_df['predicted'].values

        # Rows that never fell into a test fold have NaN predictions; drop them
        valid_indices = ~np.isnan(y_pred)
        y_test = y_test[valid_indices]
        y_pred = y_pred[valid_indices]

        if len(y_pred) == 0 or len(y_test) == 0:
            continue

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)

        accuracies.append(accuracy)
        years.append(year)

    # Store the results for the company in a DataFrame
    results_df = pd.DataFrame({
        'Year': years,
        'Accuracy': accuracies
    })

    # Save the DataFrame in the dictionary
    company_results_dfs[company] = results_df

# Combine results of all companies into a single DataFrame
combined_results = pd.concat(company_results_dfs.values(), ignore_index=True)

# Mean and standard deviation of the per-company accuracies for each year
average_metrics_per_year = combined_results.groupby('Year').agg(
    Accuracy_mean=('Accuracy', 'mean'),
    Accuracy_std=('Accuracy', 'std')
).reset_index()

# Print average metrics for each year
print("Average Metrics Per Year:")
print(average_metrics_per_year)

# Get a color sequence from Plotly's default colors
colors = px.colors.qualitative.Plotly

# Function to make the color more transparent
def get_transparent_color(color, alpha=0.2):
    """Return ``color`` as an ``rgba(...)`` string with the given opacity.

    Args:
        color: Hex colour such as ``'#636EFA'``. Leading ``#`` characters are
            stripped and the first six characters are read as RR, GG, BB.
        alpha: Value written as the fourth ``rgba`` component.

    Returns:
        A string of the form ``'rgba(R, G, B, alpha)'``.

    Raises:
        ValueError: If one of the three two-character slices is not valid hex
            (for example when fewer than six hex digits are supplied).
    """
    digits = color.lstrip('#')
    red, green, blue = [int(digits[start:start + 2], 16) for start in range(0, 6, 2)]
    return f'rgba({red}, {green}, {blue}, {alpha})'

# Build the accuracy figure: the yearly mean as a line, with a shaded band of
# one standard deviation on either side.
fig = go.Figure()

year_axis = average_metrics_per_year['Year']
mean_accuracy = average_metrics_per_year['Accuracy_mean']
std_accuracy = average_metrics_per_year['Accuracy_std']

# Mean line
mean_trace = go.Scatter(
    x=year_axis,
    y=mean_accuracy,
    mode='lines',
    name='Accuracy',
    line={'color': colors[0], 'width': 2},
)
fig.add_trace(mean_trace)

# Shaded band drawn as one closed polygon: the upper edge from first to last
# year, followed by the lower edge from last to first year.
band_x = pd.concat([year_axis, year_axis[::-1]])
band_y = pd.concat([mean_accuracy + std_accuracy,
                    (mean_accuracy - std_accuracy)[::-1]])
band_trace = go.Scatter(
    x=band_x,
    y=band_y,
    fill='toself',
    fillcolor=get_transparent_color(colors[0], alpha=0.2),  # same hue as the line, mostly transparent
    line={'color': 'rgba(255,255,255,0)'},  # invisible outline
    hoverinfo="skip",
    showlegend=False,
    name='Accuracy std dev',
)
fig.add_trace(band_trace)

# Titles, theme and legend
fig.update_layout(
    title='Average Rolling Window Accuracy Over Time Across All Companies',
    xaxis_title='Year',
    yaxis_title='Accuracy',
    template='plotly_white',
    showlegend=True,
)

# Render the figure
fig.show()


## Portfolio

In [None]:
def prepare_data(insample_df, outsample_df):
    """Stack both samples into one frame ordered by ``'Date From'``.

    The two frames are concatenated (index labels are kept as they are), the
    ``'Date From'`` column is parsed to datetime, and the rows are sorted by
    it. The distinct years found are printed as a quick sanity check.

    Args:
        insample_df: In-sample rows; must contain a ``'Date From'`` column.
        outsample_df: Out-of-sample rows with the same columns.

    Returns:
        The combined, date-sorted DataFrame. The inputs are not modified.
    """
    ordered = (
        pd.concat([insample_df, outsample_df])
        .assign(**{'Date From': lambda frame: pd.to_datetime(frame['Date From'])})
        .sort_values(by='Date From')
    )
    print("Years available in the data:", ordered['Date From'].dt.year.unique())
    return ordered

def rolling_window_analysis(df):
    """Add out-of-sample up-move probabilities from a rolling yearly window.

    For each company a logistic regression is fitted on ``window_size``
    consecutive calendar years of embeddings and used to score the
    ``validation_size`` year(s) that follow; the window then advances by one
    year. Every row is scored by at most one window.

    Changes from the previous implementation:
      * The last calendar year of each company is now scored. The old range
        bound stopped one window early, so the final year never received a
        prediction.
      * Predictions are written back by row position instead of being merged
        on ('Date From', 'companyname', 'Weekly Compound Return'). The merge
        multiplied rows whenever those values repeated (or were NaN) within a
        company.
      * Windows whose training rows contain a single class are skipped, since
        a logistic regression cannot be fitted on them.

    Args:
        df: DataFrame with the columns ``'Date From'`` (datetime),
            ``'companyname'``, ``'embedding'`` (one vector per row) and
            ``'Future Return Direction'``.

    Returns:
        A copy of ``df`` in the same row order with a fresh ``RangeIndex`` and
        a ``'predicted_prob'`` column holding the predicted probability of the
        second (larger) class label. Rows that no window scored are NaN. An
        existing ``'predicted_prob'`` column is overwritten.
    """
    window_size = 10      # training years per window
    validation_size = 1   # scored years that follow each training window

    # Unique positional labels let the predictions be written straight back
    # to the rows they belong to, whatever index pd.concat left behind.
    df = df.reset_index(drop=True)
    df['predicted_prob'] = np.nan

    for company in df['companyname'].unique():
        company_df = df[df['companyname'] == company].sort_values(by='Date From')
        company_years = company_df['Date From'].dt.year

        start_year = company_years.min()
        end_year = company_years.max()

        # Last window start whose validation years still end at end_year.
        last_start = end_year - window_size - validation_size + 1

        for train_start in range(start_year, last_start + 1):
            train_end = train_start + window_size
            val_start = train_end
            val_end = train_end + validation_size

            train_df = company_df[(company_years >= train_start) & (company_years < train_end)]
            test_df = company_df[(company_years >= val_start) & (company_years < val_end)]

            if len(test_df) == 0 or len(train_df) == 0:
                continue

            X_train = np.vstack(train_df['embedding'].values)
            y_train = train_df['Future Return Direction'].values
            X_test = np.vstack(test_df['embedding'].values)

            # A single-class training window cannot be fitted; leave its
            # validation rows unscored.
            if np.unique(y_train).size < 2:
                continue

            # Train logistic regression model
            logistic_model = LogisticRegression(max_iter=1000)
            logistic_model.fit(X_train, y_train)

            # Column 1 of predict_proba is the second entry of classes_
            y_prob = logistic_model.predict_proba(X_test)[:, 1]

            if len(y_prob) != len(test_df):
                raise ValueError(f"Mismatch between test data length ({len(test_df)}) and predictions ({len(y_prob)})")

            df.loc[test_df.index, 'predicted_prob'] = y_prob

    return df

def construct_portfolio(df, time_period='Week'):
    """Build weekly long, short and long-short portfolios from predictions.

    Within every week from 2016 onward the companies are ranked by
    ``'predicted_prob'``; the 5 highest form the long leg and the 5 lowest the
    short leg. Equal-weighted and market-cap-weighted mean log returns are
    computed per leg and accumulated over time. A 'Market' column holds the
    cumulative log of the weekly mean return over all rows.

    Changes from the previous implementation:
      * Rows without a prediction are excluded from the ranking. They used to
        sort to the end of the list and were silently picked as the short leg.
      * Sharpe ratios are computed on the per-period returns. They were
        previously computed on the cumulative-sum columns.

    Side effects:
        Adds a ``'Period'`` column to the caller's ``df`` and converts its
        ``'Date From'`` column in place, prints one Sharpe ratio per
        portfolio, and writes ``'FinBERT_portfolio_returns.csv'`` to the
        working directory.

    Args:
        df: DataFrame with the columns ``'Date From'``, ``'predicted_prob'``,
            ``'Weekly Compound Return'`` and ``'market_cap'``.
        time_period: Only ``'Week'`` is supported.

    Returns:
        DataFrame with one row per week containing ``'Period'``, the six
        per-period return columns, their cumulative sums (``'EW L'``,
        ``'EW S'``, ``'EW LS'``, ``'VW L'``, ``'VW S'``, ``'VW LS'``) and
        ``'Market'``.

    Raises:
        ValueError: If ``time_period`` is not ``'Week'``, or if no week from
            2016 onward contains a predicted row.
    """
    df['Date From'] = pd.to_datetime(df['Date From'])
    if time_period == 'Week':
        df['Period'] = df['Date From'].dt.to_period('W').dt.to_timestamp()
    else:
        raise ValueError("Invalid time_period. Use 'Week'.")

    num_top_companies = 5
    num_bottom_companies = 5

    def leg_returns(members):
        """Return (equal-weighted, value-weighted) mean log return of a leg."""
        log_returns = np.log1p(members['Weekly Compound Return'])
        equal_weighted = np.mean(log_returns)
        value_weighted = np.sum(log_returns * members['market_cap']) / np.sum(members['market_cap'])
        return equal_weighted, value_weighted

    # Only rows that actually have a prediction can be ranked. sort_values
    # places NaN last, so unpredicted rows would otherwise land in tail().
    ranked_rows = df.dropna(subset=['predicted_prob'])

    portfolio_returns = []
    for period, group in ranked_rows.groupby('Period'):
        if period.year < 2016:
            continue

        # Highest predicted probability first
        group_sorted = group.sort_values(by='predicted_prob', ascending=False)
        top_companies = group_sorted.head(num_top_companies)
        bottom_companies = group_sorted.tail(num_bottom_companies)

        long_return_eq, long_return_val = leg_returns(top_companies)
        short_return_eq, short_return_val = leg_returns(bottom_companies)

        portfolio_returns.append({
            'Period': period,
            'Long Return (Eq)': long_return_eq,
            'Short Return (Eq)': short_return_eq,
            'Long-Short Return (Eq)': long_return_eq - short_return_eq,
            'Long Return (Val)': long_return_val,
            'Short Return (Val)': short_return_val,
            'Long-Short Return (Val)': long_return_val - short_return_val
        })

    if not portfolio_returns:
        raise ValueError("No weekly period from 2016 onward contains a predicted row.")

    # Cumulative column -> the per-period column it accumulates
    periodic_columns = {
        'EW L': 'Long Return (Eq)',
        'EW S': 'Short Return (Eq)',
        'EW LS': 'Long-Short Return (Eq)',
        'VW L': 'Long Return (Val)',
        'VW S': 'Short Return (Val)',
        'VW LS': 'Long-Short Return (Val)',
    }

    portfolio_df = pd.DataFrame(portfolio_returns)
    for cumulative_name, periodic_name in periodic_columns.items():
        portfolio_df[cumulative_name] = portfolio_df[periodic_name].cumsum()

    # Market benchmark: cumulative log of the weekly mean return over all rows
    actual_returns = df[df['Date From'].dt.year >= 2016].groupby('Period')['Weekly Compound Return'].mean()
    actual_cumulative_returns = np.log1p(actual_returns).cumsum()
    portfolio_df = portfolio_df.merge(actual_cumulative_returns.rename('Market'), on='Period', how='left')

    for portfolio, periodic_name in periodic_columns.items():
        returns = portfolio_df[periodic_name]
        weekly_std = returns.std()

        # Undefined when there is no variation (or fewer than two weeks)
        if returns.isnull().all() or returns.eq(0).all() or not weekly_std > 0:
            sharpe_ratio = np.nan
        else:
            # Annualise with 52 weekly periods per year
            sharpe_ratio = returns.mean() / weekly_std * np.sqrt(52)

        print(f"Metrics for {portfolio}:")
        print(f"Sharpe Ratio: {sharpe_ratio}")
        print()

    portfolio_df.to_csv('FinBERT_portfolio_returns.csv', index=False)
    print("Portfolio returns saved to 'FinBERT_portfolio_returns.csv'")
    return portfolio_df

def plot_portfolio_returns(portfolio_df, title_suffix=''):
    """Draw the cumulative return of each portfolio and of the market.

    Each cumulative-return column becomes one line plotted against the
    ``'Period'`` column; the figure is displayed as soon as it is built.

    Args:
        portfolio_df: DataFrame with a ``'Period'`` column and the columns
            ``'EW L'``, ``'EW S'``, ``'EW LS'``, ``'VW L'``, ``'VW S'``,
            ``'VW LS'`` and ``'Market'``.
        title_suffix: Text inserted after "Cumulative" in the figure title.
    """
    plotted_columns = ['EW L', 'EW S', 'EW LS', 'VW L', 'VW S', 'VW LS', 'Market']

    plt.figure(figsize=(12, 6))

    periods = portfolio_df['Period']
    for column in plotted_columns:
        plt.plot(periods, portfolio_df[column], marker='o', markersize=1, label=column)

    plt.title(f'Cumulative {title_suffix} Portfolio Returns Over Time')
    plt.xlabel('Period')
    plt.ylabel('Cumulative Log Return')
    plt.legend()
    plt.grid(True)

    # Yearly major ticks, shown as four-digit years.
    date_axis = plt.gca().xaxis
    date_axis.set_major_locator(mdates.YearLocator())
    date_axis.set_major_formatter(mdates.DateFormatter('%Y'))
    plt.xticks(rotation=45)
    plt.show()

# Run the weekly FinBERT pipeline end to end: combine both samples into one
# date-sorted frame, then attach the rolling-window 'predicted_prob' column.
df = prepare_data(insample_df, outsample_df)
df = rolling_window_analysis(df)

# Build the weekly long/short portfolios (this also writes
# 'FinBERT_portfolio_returns.csv') and plot their cumulative returns.
# construct_portfolio already skips periods before 2016, so the year filter
# below only acts as a safeguard.
portfolio_df_week = construct_portfolio(df, time_period='Week')
portfolio_df_week = portfolio_df_week[portfolio_df_week['Period'].dt.year >= 2016]
plot_portfolio_returns(portfolio_df_week, title_suffix='Weekly')