Before running this notebook, generate the rollingMean Parquet files by following the SentimentPreprocessing notebook and upload them to this environment. NB: The file paths in this notebook assume a Google Colab environment; change them if you would like to save the output data somewhere else.

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Tickers this analysis is restricted to
popular_symbols = [
    "AAPL", "MSFT", "TSLA", "AMZN", "NVDA", "NFLX",
    "META", "IBM", "MCD", "NKE", "SBUX", "MS", "JPM",
    "EBAY", "COST", "GE", "BA", "PYPL", "GS",
]


def _load_popular(parquet_path):
    """Read one Parquet file and keep only the rows for the popular symbols.

    The 'date' column is converted to datetime before the rows are filtered.
    """
    frame = pd.read_parquet(parquet_path)
    frame['date'] = pd.to_datetime(frame['date'])
    return frame[frame['symbol'].isin(popular_symbols)]


# One frame per sentiment source
news_df = _load_popular('rollingNews.parquet')
submissions_df = _load_popular('rollingSubmissions.parquet')
comments_df = _load_popular('rollingComments.parquet')


Correlation between consecutive periods, per sentiment type and per company

In [None]:
from scipy.stats import pearsonr

In [None]:
# Correlation between consecutive news-sentiment periods, keyed by company symbol
resultsNews = {}

# pearsonr needs at least two (current, next) pairs, which takes three dated rows.
# With only two rows each series has length 1 and pearsonr raises ValueError.
min_periods = 3

# Process each company (symbol) separately
for symbol, group in news_df.groupby('symbol'):
    # Sort by date so that row i+1 is the period that follows row i
    group_sorted = group.sort_values('date').reset_index(drop=True)
    if len(group_sorted) < min_periods:
        print(f"Not enough periods for symbol: {symbol}")
        continue

    sentiment = group_sorted['news_sentiment']
    # Pair every period with its successor:
    #   - current: sentiment for periods 1 to N-1
    #   - next_period: sentiment for periods 2 to N
    current = sentiment.iloc[:-1].reset_index(drop=True)
    next_period = sentiment.iloc[1:].reset_index(drop=True)
    # Pearson correlation between consecutive periods
    corr, p_value = pearsonr(current, next_period)
    resultsNews[symbol] = {'correlation': corr, 'p_value': p_value}
    print(f"Symbol: {symbol} -> Pearson correlation between consecutive periods: {corr:.4f}, p-value: {p_value:.4g}")

# Full results dictionary for all symbols
print("\nAll results:")
print(resultsNews)

In [None]:
# Correlation between consecutive Reddit-submission sentiment periods, keyed by company symbol
resultsSubmissions = {}

# pearsonr needs at least two (current, next) pairs, which takes three dated rows.
# With only two rows each series has length 1 and pearsonr raises ValueError.
min_periods = 3

# Process each company (symbol) separately
for symbol, group in submissions_df.groupby('symbol'):
    # Sort by date so that row i+1 is the period that follows row i
    group_sorted = group.sort_values('date').reset_index(drop=True)
    if len(group_sorted) < min_periods:
        print(f"Not enough periods for symbol: {symbol}")
        continue

    sentiment = group_sorted['submissions_sentiment']
    # Pair every period with its successor:
    #   - current: sentiment for periods 1 to N-1
    #   - next_period: sentiment for periods 2 to N
    current = sentiment.iloc[:-1].reset_index(drop=True)
    next_period = sentiment.iloc[1:].reset_index(drop=True)
    # Pearson correlation between consecutive periods
    corr, p_value = pearsonr(current, next_period)
    resultsSubmissions[symbol] = {'correlation': corr, 'p_value': p_value}
    print(f"Symbol: {symbol} -> Pearson correlation between consecutive periods: {corr:.4f}, p-value: {p_value:.4g}")

# Full results dictionary for all symbols
print("\nAll results:")
print(resultsSubmissions)

In [None]:
# Correlation between consecutive Reddit-comment sentiment periods, keyed by company symbol
resultsComments = {}

# pearsonr needs at least two (current, next) pairs, which takes three dated rows.
# With only two rows each series has length 1 and pearsonr raises ValueError.
min_periods = 3

# Process each company (symbol) separately
for symbol, group in comments_df.groupby('symbol'):
    # Sort by date so that row i+1 is the period that follows row i
    group_sorted = group.sort_values('date').reset_index(drop=True)
    if len(group_sorted) < min_periods:
        print(f"Not enough periods for symbol: {symbol}")
        continue

    sentiment = group_sorted['comments_sentiment']
    # Pair every period with its successor:
    #   - current: sentiment for periods 1 to N-1
    #   - next_period: sentiment for periods 2 to N
    current = sentiment.iloc[:-1].reset_index(drop=True)
    next_period = sentiment.iloc[1:].reset_index(drop=True)
    # Pearson correlation between consecutive periods
    corr, p_value = pearsonr(current, next_period)
    resultsComments[symbol] = {'correlation': corr, 'p_value': p_value}
    print(f"Symbol: {symbol} -> Pearson correlation between consecutive periods: {corr:.4f}, p-value: {p_value:.4g}")

# Full results dictionary for all symbols
print("\nAll results:")
print(resultsComments)

In [None]:
# Symbols and their correlation coefficients, in matching order
symbols = list(resultsNews.keys())
correlations = [resultsNews[sym]['correlation'] for sym in symbols]

# Bar chart with one bar per company
plt.figure(figsize=(12, 6))
bars = plt.bar(symbols, correlations, color='skyblue')

# Annotate each bar with its value; the label goes below the bar tip when the bar is negative
for bar, corr in zip(bars, correlations):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height(), f'{corr:.2f}',
             ha='center', va='bottom' if corr >= 0 else 'top', fontsize=9)

plt.xlabel('Company Symbol')
plt.ylabel('Pearson Correlation')
plt.title('Correlation Between Consecutive 30-Day News Sentiment Periods per Company')
# Pearson r lies in [-1, 1]. A fixed lower limit of 0 would hide negative bars,
# so the axis is widened (and a zero line drawn) whenever any correlation is negative.
has_negative = any(corr < 0 for corr in correlations)
plt.ylim(-1.05 if has_negative else 0, 1.05)  # 0.05 of headroom for the bar labels
if has_negative:
    plt.axhline(0, color='black', linewidth=0.8)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Symbols and their correlation coefficients, in matching order
symbols = list(resultsSubmissions.keys())
correlations = [resultsSubmissions[sym]['correlation'] for sym in symbols]

# Bar chart with one bar per company
plt.figure(figsize=(12, 6))
bars = plt.bar(symbols, correlations, color='skyblue')

# Annotate each bar with its value; the label goes below the bar tip when the bar is negative
for bar, corr in zip(bars, correlations):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height(), f'{corr:.2f}',
             ha='center', va='bottom' if corr >= 0 else 'top', fontsize=9)

plt.xlabel('Company Symbol')
plt.ylabel('Pearson Correlation')
plt.title('Correlation Between Consecutive 30-Day Reddit Submissions Sentiment Periods per Company')
# Pearson r lies in [-1, 1]. A fixed lower limit of 0 would hide negative bars,
# so the axis is widened (and a zero line drawn) whenever any correlation is negative.
has_negative = any(corr < 0 for corr in correlations)
plt.ylim(-1.05 if has_negative else 0, 1.05)  # 0.05 of headroom for the bar labels
if has_negative:
    plt.axhline(0, color='black', linewidth=0.8)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Symbols and their correlation coefficients, in matching order
symbols = list(resultsComments.keys())
correlations = [resultsComments[sym]['correlation'] for sym in symbols]

# Bar chart with one bar per company
plt.figure(figsize=(12, 6))
bars = plt.bar(symbols, correlations, color='skyblue')

# Annotate each bar with its value; the label goes below the bar tip when the bar is negative
for bar, corr in zip(bars, correlations):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height(), f'{corr:.2f}',
             ha='center', va='bottom' if corr >= 0 else 'top', fontsize=9)

plt.xlabel('Company Symbol')
plt.ylabel('Pearson Correlation')
plt.title('Correlation Between Consecutive 30-Day Reddit Comments Sentiment Periods per Company')
# Pearson r lies in [-1, 1]. A fixed lower limit of 0 would hide negative bars,
# so the axis is widened (and a zero line drawn) whenever any correlation is negative.
has_negative = any(corr < 0 for corr in correlations)
plt.ylim(-1.05 if has_negative else 0, 1.05)  # 0.05 of headroom for the bar labels
if has_negative:
    plt.axhline(0, color='black', linewidth=0.8)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Merge the data frames

In [None]:
# Combine the three sources into a single frame keyed by (symbol, date).
# The sentiment columns read from merged_df in later cells are 'news_sentiment',
# 'submissions_sentiment' and 'comments_sentiment'.
merge_keys = ['symbol', 'date']

# Outer joins keep every (symbol, date) present in any source; the result is then
# ordered by symbol and date, and every missing value is replaced with 0.
# NOTE(review): because of fillna(0), the dropna(...) calls in later cells never
# remove a row, and "no data for that date" is analysed as a sentiment of 0 --
# confirm that this is intended.
merged_df = (
    news_df
    .merge(submissions_df, on=merge_keys, how='outer')
    .merge(comments_df, on=merge_keys, how='outer')
    .sort_values(merge_keys)
    .fillna(0)
)

# Show the first rows of the merged frame
print("Merged Data Preview:")
display(merged_df.head())

Correlation Matrix between each sentiment type per company

In [None]:
# Columns whose pairwise correlations are reported for each company
sentiment_cols = ['news_sentiment', 'submissions_sentiment', 'comments_sentiment']

for symbol in popular_symbols:
    # Rows of this company, restricted to the three sentiment columns
    per_symbol = merged_df.loc[merged_df['symbol'] == symbol, sentiment_cols]
    if len(per_symbol) == 0:
        continue  # no rows for this symbol

    print(f"Correlation matrix for {symbol}:")
    print(per_symbol.corr())
    print("\n")

Pairwise Correlation Plots

In [None]:
# Folder under /content/ that receives one pair plot per company
output_dir = "/content/pairplots"
os.makedirs(output_dir, exist_ok=True)

# Columns shown in each pair plot
pairplot_cols = ['news_sentiment', 'submissions_sentiment', 'comments_sentiment']

for symbol in popular_symbols:
    # Rows of this company, restricted to the three sentiment columns
    per_symbol = merged_df.loc[merged_df['symbol'] == symbol, pairplot_cols]
    if len(per_symbol) == 0:
        continue  # nothing to plot for this symbol

    # Pairwise scatter plots of the three sentiment columns
    pair_grid = sns.pairplot(per_symbol)
    pair_grid.fig.suptitle(f"Pairwise Scatter Plots for {symbol}", y=1.02)

    # Write the figure to disk, then release every open figure
    pair_grid.savefig(os.path.join(output_dir, f"pairplot_{symbol}.png"))
    plt.close('all')

In [None]:
# Archive the /content/pairplots folder (recursively) as /content/pairplots.zip
!zip -r /content/pairplots.zip /content/pairplots

Summary Statistics

In [None]:
# Columns summarised for each company
describe_cols = ['news_sentiment', 'submissions_sentiment', 'comments_sentiment']

for symbol in popular_symbols:
    # Rows of this company, restricted to the three sentiment columns
    per_symbol = merged_df.loc[merged_df['symbol'] == symbol, describe_cols]
    if per_symbol.shape[0] == 0:
        continue  # no rows for this symbol

    print(f"Descriptive statistics for {symbol}:")
    print(per_symbol.describe())
    print("\n")

Rolling window analysis

Identify periods when rolling means diverge between news and Reddit channels. Sudden shifts may indicate market events or changes in sentiment drivers.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

rolling_dir = '/content/rolling_plots'
os.makedirs(rolling_dir, exist_ok=True)

rolling_window = 5  # Number of consecutive rows averaged for each plotted point

# (source column, rolling-mean column added to the frame, legend label) per plotted line
series_specs = [
    ('news_sentiment', 'news_roll_mean', 'News Rolling Mean'),
    ('submissions_sentiment', 'submissions_roll_mean', 'Submissions Rolling Mean'),
    ('comments_sentiment', 'comments_roll_mean', 'Comments Rolling Mean'),
]

for symbol in popular_symbols:
    # Rows of this company in chronological order (sort_values returns a new frame)
    symbol_data = merged_df.loc[merged_df['symbol'] == symbol].sort_values('date')
    if len(symbol_data) == 0:
        continue

    fig, ax = plt.subplots(figsize=(12, 6))
    for source_col, roll_col, legend_label in series_specs:
        # Rolling mean over `rolling_window` rows; the first window-1 values are NaN
        symbol_data[roll_col] = symbol_data[source_col].rolling(window=rolling_window).mean()
        ax.plot(symbol_data['date'], symbol_data[roll_col], label=legend_label)

    ax.set_title(f"Rolling Means (window={rolling_window}) for {symbol}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Transition Value (Rolling Mean)")
    ax.legend()
    ax.grid(True)

    # Save as PNG and close the figure so figures do not pile up across the loop
    fig.savefig(os.path.join(rolling_dir, f"rolling_{symbol}.png"))
    plt.close(fig)

In [None]:
# Archive the /content/rolling_plots folder (recursively) as /content/rolling_plots.zip
!zip -r /content/rolling_plots.zip /content/rolling_plots

Regression:

Look at the regression coefficients and their significance. Significant positive (or negative) coefficients for Reddit submissions or comments transitions suggest that changes in Reddit sentiment are statistically associated with changes in news sentiment. Compare R-squared values across symbols to gauge how well the model explains the variation in news sentiment.

Changes in Reddit affect News



*   Dependent: News
*   Predictor: Reddit Comments  & Reddit submissions



In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.tsa.stattools as tsastat
import seaborn as sns
import io
from contextlib import redirect_stdout

regression_dir = '/content/regression_RedditOnNews'
os.makedirs(regression_dir, exist_ok=True)

# Model fitted per symbol:
#   news_sentiment ~ const + submissions_sentiment + comments_sentiment
response_col = 'news_sentiment'
predictor_cols = ['submissions_sentiment', 'comments_sentiment']

for symbol in popular_symbols:
    # Rows of this company in date order, without rows missing any model column
    symbol_data = (
        merged_df.loc[merged_df['symbol'] == symbol]
        .sort_values('date')
        .dropna(subset=[response_col] + predictor_cols)
    )
    if len(symbol_data) == 0:
        continue

    # Ordinary least squares with an intercept column added to the predictors
    design = sm.add_constant(symbol_data[predictor_cols])
    model = sm.OLS(symbol_data[response_col], design).fit()

    # Write the plain-text model summary to one file per symbol
    regression_filename = os.path.join(regression_dir, f"regression_{symbol}.txt")
    with open(regression_filename, 'w') as f:
        f.write(model.summary().as_text())

In [None]:
# Archive the /content/regression_RedditOnNews folder (recursively) as a .zip next to it
!zip -r /content/regression_RedditOnNews.zip /content/regression_RedditOnNews

Changes in News affect Reddit Submissions

*   Dependent: Reddit Submission
*   Predictor: News



In [None]:
import os
import statsmodels.api as sm

regression_dir = '/content/regression_NewsOnRedditSubmissions'
os.makedirs(regression_dir, exist_ok=True)

# Model fitted per symbol: submissions_sentiment ~ const + news_sentiment
predictor_col = 'news_sentiment'
response_col = 'submissions_sentiment'

for symbol in popular_symbols:
    # Rows of this company in date order, without rows missing either model column
    symbol_data = (
        merged_df.loc[merged_df['symbol'] == symbol]
        .sort_values('date')
        .dropna(subset=[predictor_col, response_col])
    )
    if len(symbol_data) == 0:
        continue

    # Ordinary least squares with an intercept added to the single predictor
    design = sm.add_constant(symbol_data[predictor_col])
    model = sm.OLS(symbol_data[response_col], design).fit()

    # Write the plain-text model summary to one file per symbol
    regression_filename = os.path.join(regression_dir, f"regression_{symbol}.txt")
    with open(regression_filename, 'w') as f:
        f.write(model.summary().as_text())

In [None]:
# Archive the /content/regression_NewsOnRedditSubmissions folder (recursively) as a .zip next to it
!zip -r /content/regression_NewsOnRedditSubmissions.zip /content/regression_NewsOnRedditSubmissions

Changes in News affect Reddit Comments

*   Dependent: Reddit Comments
*   Predictor: News

In [None]:
import os
import statsmodels.api as sm

regression_dir = '/content/regression_NewsOnComments'
os.makedirs(regression_dir, exist_ok=True)

# Model fitted per symbol: comments_sentiment ~ const + news_sentiment
predictor_col = 'news_sentiment'
response_col = 'comments_sentiment'

for symbol in popular_symbols:
    # Rows of this company in date order, without rows missing either model column
    symbol_data = (
        merged_df.loc[merged_df['symbol'] == symbol]
        .sort_values('date')
        .dropna(subset=[predictor_col, response_col])
    )
    if len(symbol_data) == 0:
        continue

    # Ordinary least squares with an intercept added to the single predictor
    design = sm.add_constant(symbol_data[predictor_col])
    model = sm.OLS(symbol_data[response_col], design).fit()

    # Write the plain-text model summary to one file per symbol
    regression_filename = os.path.join(regression_dir, f"regression_{symbol}.txt")
    with open(regression_filename, 'w') as f:
        f.write(model.summary().as_text())

In [None]:
# Archive the /content/regression_NewsOnComments folder (recursively) as a .zip next to it
!zip -r /content/regression_NewsOnComments.zip /content/regression_NewsOnComments

Changes in Reddit Submissions affect News

*   Dependent: News
*   Predictor: Reddit Submissions

In [None]:
import os
import statsmodels.api as sm

regression_dir = '/content/regression_SubmissionOnNews'
os.makedirs(regression_dir, exist_ok=True)

# Model fitted per symbol: news_sentiment ~ const + submissions_sentiment
response_col = 'news_sentiment'
predictor_col = 'submissions_sentiment'

for symbol in popular_symbols:
    # Rows of this company in date order, without rows missing either model column
    symbol_data = (
        merged_df.loc[merged_df['symbol'] == symbol]
        .sort_values('date')
        .dropna(subset=[response_col, predictor_col])
    )
    if len(symbol_data) == 0:
        continue

    # Ordinary least squares with an intercept added to the single predictor
    design = sm.add_constant(symbol_data[predictor_col])
    model = sm.OLS(symbol_data[response_col], design).fit()

    # Write the plain-text model summary to one file per symbol
    regression_filename = os.path.join(regression_dir, f"regression_{symbol}.txt")
    with open(regression_filename, 'w') as f:
        f.write(model.summary().as_text())

In [None]:
# Archive the /content/regression_SubmissionOnNews folder (recursively) as a .zip next to it
!zip -r /content/regression_SubmissionOnNews.zip /content/regression_SubmissionOnNews

Changes in Reddit Comments affect News

*   Dependent: News
*   Predictor: Reddit Comments

In [None]:
import os
import statsmodels.api as sm

regression_dir = '/content/regression_CommentsOnNews'
os.makedirs(regression_dir, exist_ok=True)

# Model fitted per symbol: news_sentiment ~ const + comments_sentiment
response_col = 'news_sentiment'
predictor_col = 'comments_sentiment'

for symbol in popular_symbols:
    # Rows of this company in date order, without rows missing either model column
    symbol_data = (
        merged_df.loc[merged_df['symbol'] == symbol]
        .sort_values('date')
        .dropna(subset=[response_col, predictor_col])
    )
    if len(symbol_data) == 0:
        continue

    # Ordinary least squares with an intercept added to the single predictor
    design = sm.add_constant(symbol_data[predictor_col])
    model = sm.OLS(symbol_data[response_col], design).fit()

    # Write the plain-text model summary to one file per symbol
    regression_filename = os.path.join(regression_dir, f"regression_{symbol}.txt")
    with open(regression_filename, 'w') as f:
        f.write(model.summary().as_text())

In [None]:
# Archive the /content/regression_CommentsOnNews folder (recursively) as a .zip next to it
!zip -r /content/regression_CommentsOnNews.zip /content/regression_CommentsOnNews

Causality:

If the Granger causality test suggests that Reddit transitions “cause” (in a forecasting sense) news transitions (e.g., p-values < 0.05), this may imply that Reddit sentiment changes precede changes in news sentiment. Compare the results for submissions versus comments to see which has a stronger predictive relationship.

Reddit Granger Causes News

Submissions Predictive relationship for News
Comments Predictive relationship for News

In [None]:
granger_dir = '/content/granger_RedditOnNews'

os.makedirs(granger_dir, exist_ok=True)

maxlag = 4  # Maximum lag for Granger causality tests

# grangercausalitytests rejects short series with ValueError ("Insufficient
# observations"): it needs more than 3 * maxlag + 1 rows (the +1 is the constant).
# Without this guard one short series would abort the loop for all remaining symbols.
min_obs = 3 * maxlag + 2

for symbol in popular_symbols:
    symbol_data = merged_df[merged_df['symbol'] == symbol].copy().sort_values('date')
    # Use only the necessary columns and drop rows with missing data
    test_data = symbol_data[['news_sentiment', 'submissions_sentiment', 'comments_sentiment']].dropna()
    if test_data.empty:
        continue
    if len(test_data) < min_obs:
        print(f"Skipping {symbol}: {len(test_data)} observations, need at least {min_obs} for maxlag={maxlag}")
        continue

    # Capture everything the tests print; the buffer is closed even if a test raises
    with io.StringIO() as granger_output:
        with redirect_stdout(granger_output):
            print(f"Granger Causality Tests for {symbol}:\n")
            # The test asks whether the SECOND column Granger-causes the FIRST column
            print("Testing if submissions_sentiment Granger-causes news_sentiment:")
            tsastat.grangercausalitytests(test_data[['news_sentiment', 'submissions_sentiment']], maxlag=maxlag, verbose=True)
            print("\nTesting if comments_sentiment Granger-causes news_sentiment:")
            tsastat.grangercausalitytests(test_data[['news_sentiment', 'comments_sentiment']], maxlag=maxlag, verbose=True)
        granger_text = granger_output.getvalue()

    # One text report per symbol
    granger_filename = os.path.join(granger_dir, f"granger_{symbol}.txt")
    with open(granger_filename, 'w') as f:
        f.write(granger_text)

In [None]:
# Archive the /content/granger_RedditOnNews folder (recursively) as a .zip next to it
!zip -r /content/granger_RedditOnNews.zip /content/granger_RedditOnNews



News Granger Causes Reddit

News Predictive relationship for submissions
News Predictive relationship for comments

In [None]:
granger_dir = '/content/granger_NewsOnReddit'

os.makedirs(granger_dir, exist_ok=True)

maxlag = 4  # Maximum lag for Granger causality tests

# grangercausalitytests rejects short series with ValueError ("Insufficient
# observations"): it needs more than 3 * maxlag + 1 rows (the +1 is the constant).
# Without this guard one short series would abort the loop for all remaining symbols.
min_obs = 3 * maxlag + 2

for symbol in popular_symbols:
    symbol_data = merged_df[merged_df['symbol'] == symbol].copy().sort_values('date')
    # Use only the necessary columns and drop rows with missing data
    test_data = symbol_data[['news_sentiment', 'submissions_sentiment', 'comments_sentiment']].dropna()
    if test_data.empty:
        continue
    if len(test_data) < min_obs:
        print(f"Skipping {symbol}: {len(test_data)} observations, need at least {min_obs} for maxlag={maxlag}")
        continue

    # Capture everything the tests print; the buffer is closed even if a test raises
    with io.StringIO() as granger_output:
        with redirect_stdout(granger_output):
            print(f"Granger Causality Tests for {symbol}:\n")
            # The test asks whether the SECOND column Granger-causes the FIRST column
            print("Testing if news_sentiment Granger-causes submissions_sentiment:")
            tsastat.grangercausalitytests(test_data[['submissions_sentiment', 'news_sentiment']], maxlag=maxlag, verbose=True)

            print("\nTesting if news_sentiment Granger-causes comments_sentiment :")
            tsastat.grangercausalitytests(test_data[['comments_sentiment', 'news_sentiment']], maxlag=maxlag, verbose=True)
        granger_text = granger_output.getvalue()

    # One text report per symbol
    granger_filename = os.path.join(granger_dir, f"granger_{symbol}.txt")
    with open(granger_filename, 'w') as f:
        f.write(granger_text)

In [None]:
# Archive the /content/granger_NewsOnReddit folder (recursively) as a .zip next to it
!zip -r /content/granger_NewsOnReddit.zip /content/granger_NewsOnReddit