This notebook can be used to create rolling averages of the different sentiment sources

In [None]:
#Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Load the input dataset from roberta.parquet into `df`.
# Keep exactly one of the two reads below active, depending on where the notebook runs.

# 1. Running on Google Colab - upload roberta.parquet first, then uncomment this read:
#df = pd.read_parquet('/content/roberta.parquet')

# 2. Running on VS Code - reads roberta.parquet relative to the current working directory
df = pd.read_parquet('roberta.parquet')

In [None]:
# First look at the raw frame: column names, a short preview, and null counts per source
column_names = df.columns.tolist()
print("Columns in the dataset:", column_names)
print("First few rows:")
preview = df.head()
display(preview)

# Count the missing scores of all three sentiment sources in one pass, then report each
null_counts = df[['news_sentiment', 'submissions_sentiment', 'comments_sentiment']].isna().sum()
print("Missing values in news sentiment:", null_counts['news_sentiment'])
print("Missing values in submissions sentiment:", null_counts['submissions_sentiment'])
print("Missing values in comments sentiment:", null_counts['comments_sentiment'])


In [None]:

# Keep only rows whose 'dt' lies in 2014-01-01 .. 2022-01-01 (both endpoints included)
in_study_window = df['dt'].between('2014-01-01', '2022-01-01')
truncated = df[in_study_window]

In [None]:
# Build an independent copy of the date-filtered frame without the price columns
sentiment_df = (
    truncated
    .copy()
    .drop(columns=['open', 'close', 'high', 'low', 'sp500_open', 'sp500_close'])
)
sentiment_df.head()

In [None]:
# Replace missing scores with 0 in all three sentiment columns, on a fresh copy
sentiment_cols = ['news_sentiment', 'submissions_sentiment', 'comments_sentiment']
df_clean = sentiment_df.copy()
df_clean[sentiment_cols] = df_clean[sentiment_cols].fillna(0)

print("Data after filling missing sentiment values with 0:")
display(df_clean.head())

In [None]:

# Re-count nulls after the fill so any remaining gaps are visible per source
remaining_nulls = df_clean[['news_sentiment', 'submissions_sentiment', 'comments_sentiment']].isna().sum()
print("Missing values in news sentiment:", remaining_nulls['news_sentiment'])
print("Missing values in submissions sentiment:", remaining_nulls['submissions_sentiment'])
print("Missing values in comments sentiment:", remaining_nulls['comments_sentiment'])

# Summarise dtypes and non-null counts of the cleaned frame
print("\nData Info:")
df_clean.info()


In [None]:
# Convert 'dt' to datetimes, order the rows chronologically and move 'dt' into the index.
# The guard keeps the cell safe to re-run once 'dt' is no longer a regular column.
if 'dt' in df_clean.columns:
    df_clean = (
        df_clean
        .assign(dt=pd.to_datetime(df_clean['dt']))
        .sort_values('dt')
        .set_index('dt')
    )
display(df_clean.head())


In [None]:

def compute_rolling_30_day_mean(symbol_df, column_name, window=30, min_periods=1):
    """
    Computes a rolling mean for the specified column in the symbol DataFrame.

    The input is not modified: the frame is sorted by its index first, which
    produces a new DataFrame, and the rolling column is added to that result.

    Parameters:
      symbol_df (DataFrame): The DataFrame for a single symbol, with a datetime index.
      column_name (str): The column for which to compute the rolling mean.
      window (int or str): Size of the rolling window. An integer counts ROWS, so the
        default of 30 averages the last 30 observations; that equals 30 calendar days
        only when the frame holds exactly one row per day. Pass a pandas offset such
        as '30D' for a true calendar-day window (requires a datetime-like index).
      min_periods (int): Minimum number of observations required in a window to
        produce a value. The default of 1 yields a value from the very first row.

    Returns:
      tuple: (sorted DataFrame with the added column, rolling mean Series)

    Raises:
      KeyError: If column_name is not a column of symbol_df.
    """
    if column_name not in symbol_df.columns:
        raise KeyError(f"Column '{column_name}' not found in DataFrame")

    # Rolling windows follow row order, so the rows must be in chronological order.
    # sort_index returns a new frame, which keeps the caller's DataFrame untouched.
    symbol_df = symbol_df.sort_index()

    # The output column keeps its historical name for every window size so that
    # existing consumers of "<column>_rolling_30d_mean" continue to work.
    rolling_col = f"{column_name}_rolling_30d_mean"
    symbol_df[rolling_col] = (
        symbol_df[column_name].rolling(window=window, min_periods=min_periods).mean()
    )

    return symbol_df, symbol_df[rolling_col]

In the next cell, set the sentiment source (`comments_sentiment`, `submissions_sentiment`, or `news_sentiment`) to generate the rolling mean for that source. Repeat the run once per source.

In [None]:


# Sentiment source processed in this run. Switch to 'news_sentiment' or
# 'submissions_sentiment' to build the rolling mean for another source.
# The export cell sets its own output column and file names separately.
sentiment_column = 'comments_sentiment'

# Get unique symbols from df_clean
symbols = df_clean['symbol'].unique()

# Rolling-mean Series for each symbol, keyed by symbol (read by the export cell)
symbol_transitions = {}

for symbol in symbols:
    # Filter data for the current symbol and make a copy to ensure independence
    df_symbol = df_clean[df_clean['symbol'] == symbol].copy()

    # Debug: number of data points and a preview for this symbol
    print(f"Symbol: {symbol}, Data points: {len(df_symbol)}")
    display(df_symbol.head())

    # Compute the rolling mean using the symbol-specific DataFrame
    period_df, transitions = compute_rolling_30_day_mean(df_symbol, sentiment_column)

    # Debug previews only: showing the full frame and Series for every symbol
    # floods the notebook output, so display just the first rows.
    print(f"Symbol: {symbol} Period DataFrame:")
    display(period_df.head())
    print(f"Symbol: {symbol} Transitions:")
    display(transitions.dropna().head())

    # Only store symbols that produced at least one rolling-mean value
    if not transitions.empty:
        symbol_transitions[symbol] = transitions

In the next cell, rename the generated parquet file (and the output column name) so they match the sentiment source chosen above.

In [None]:
def resample_transitions(transitions, freq='30D'):
    """
    Averages the values of `transitions` within consecutive bins of width `freq`.

    Parameters:
      transitions: pandas object to resample; its index is assumed to be a datetime index.
      freq (str): Bin width passed to `resample` (default '30D').

    Returns:
      The per-bin mean, as returned by `resample(freq).mean()`.
    """
    binned = transitions.resample(freq)
    return binned.mean()

# Build one tidy frame per symbol: resample to 30-day bins, name the value column
# and the date axis, turn the date index into a column, then tag rows with the symbol
all_transitions = [
    resample_transitions(series)
    .rename('comments_sentiment')
    .rename_axis('date')
    .reset_index()
    .assign(symbol=ticker)
    for ticker, series in symbol_transitions.items()
]

# Stack the per-symbol frames into a single DataFrame with a fresh integer index
transitions_df = pd.concat(all_transitions, ignore_index=True)

# Persist the result as a Parquet file
transitions_df.to_parquet('rollingComments.parquet', index=False)

# Read the file back as a quick check of what was written
loaded_transitions = pd.read_parquet('rollingComments.parquet')
print(loaded_transitions.head())

In [None]:
# Directory that receives the per-symbol plots.
# Google Colab provides /content, so keep the original location there. Elsewhere
# (e.g. VS Code on a local machine) /content normally does not exist and cannot be
# created without elevated rights, so fall back to a folder in the working directory.
output_dir = "/content/rollingComments" if os.path.isdir("/content") else "rollingComments"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Read the exported rolling means back from disk so the plots use the saved data
transitions_df = pd.read_parquet('rollingComments.parquet')

# One figure is produced per distinct symbol in the export
symbols = transitions_df['symbol'].unique()

for symbol in symbols:
    # Rows of the current symbol in chronological order
    df_symbol = transitions_df[transitions_df['symbol'] == symbol].sort_values('date')

    # Draw on an explicit figure/axes pair rather than the implicit pyplot state
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(df_symbol['date'], df_symbol['comments_sentiment'], marker='o')
    ax.set_xlabel("Date (Start of Next Period)")
    ax.set_ylabel("Rolling Mean (Comments Sentiment)")
    ax.set_title(f"Reddit Comments Sentiment Rolling Mean for {symbol}")
    ax.grid(True)

    # Write the figure into output_dir, one PNG per symbol
    output_path = os.path.join(output_dir, f"comments_rolling_{symbol}.png")
    fig.savefig(output_path)

    # Release the figure so memory does not grow with the number of symbols
    plt.close(fig)

print(f"All figures have been saved in {output_dir}")

In [None]:
# Zip the folder
!zip -r /content/rollingComments.zip /content/rollingComments
