In [None]:
# Install package for Rank-Biased Overlap
!pip install rbo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rbo
  Downloading rbo-0.1.2-py3-none-any.whl (7.5 kB)
Installing collected packages: rbo
Successfully installed rbo-0.1.2


In [None]:
# Import required packages
import pandas as pd
import numpy as np
import os
import rbo

In [None]:
# Load the Reddit top-trending table and keep its 'Ticker' column, in file order,
# as a plain Python list (the input format used for the RBO comparison).
# NOTE(review): the file is presumably already stored in rank order - confirm.
reddit_df = pd.read_csv('../data/meta/toptrending_100_stocks.txt')
reddit_lst = reddit_df.loc[:, 'Ticker'].to_list()

In [None]:
# Load the Yahoo Finance export
collated_df = pd.read_csv('../data/meta/Yahoo Finance.csv')
# Parse the day/month/year 'Earnings Date' text and store it back as ISO
# 'YYYY-MM-DD' strings (only this one column is converted)
collated_df['Earnings Date'] = (
    pd.to_datetime(collated_df['Earnings Date'], format='%d/%m/%Y')
    .dt.strftime('%Y-%m-%d')
)
# Keep the ticker / earnings-date pairs dated after 2021-12-31; ISO strings
# compare chronologically, so a plain string comparison is sufficient.
# NOTE(review): there is no upper bound, so any date later than 2022 would also
# pass - confirm the export only runs through 2022.
sentiment_df = collated_df.loc[
    collated_df['Earnings Date'] > '2021-12-31',
    ['Ticker', 'Earnings Date'],
].copy(deep=True)
# Nested [ticker, earnings date] lists are cheaper to loop over than DataFrame rows
sentiment_lst = sentiment_df.to_numpy().tolist()

In [None]:
# Change Directory for Twitter Data
# The target path is relative, so running this cell a second time would resolve
# it against the Twitter folder itself and fail. Skip the chdir when the working
# directory already ends in data/datafiles/twitter so the cell is safe to re-run.
if not os.getcwd().endswith(os.sep + os.path.join('data', 'datafiles', 'twitter')):
    os.chdir('../data/datafiles/twitter/')

In [None]:
# (index, [ticker, earnings date]) for every expected tweet file that is missing
fnf = []
# Ticker symbol -> running count of tweets dated after 2021-12-31
twitter_dict = {}

# Every [ticker, earnings date] pair in the Yahoo Finance list has its own tweet file
for idx, ticker in enumerate(sentiment_lst):
    symbol, earnings_date = ticker[0], ticker[1]
    # Files are named TICKER_Tweets_DATE.csv and are read from the current directory
    file = f'{symbol}_Tweets_{earnings_date}.csv'
    print(f'Initiating {idx}, {symbol} {earnings_date} collation...')

    try:
        ticker_df = pd.read_csv(file)
    except FileNotFoundError:
        # Record the miss and move on to the next pair
        print(f'File Not Found: {idx}, {file}')
        fnf.append((idx, ticker))
        continue

    # Parse the tweet dates so they can be compared against the cut-off
    ticker_df['date'] = pd.to_datetime(ticker_df['date'], format='%Y-%m-%d')
    # True for tweets posted after 2021-12-31; summing the flags counts them.
    # NOTE(review): no upper bound is applied - confirm the files stop at 2022.
    posted_after_2021 = ticker_df['date'] > '2021-12-31'
    key = str(symbol)
    # A ticker can appear once per earnings date, so counts accumulate per symbol
    twitter_dict[key] = twitter_dict.get(key, 0) + int(posted_after_2021.sum())

    print(f'Completed {idx}, {symbol} {earnings_date} collation.')

In [None]:
# Show how many expected tweet files were missing, followed by the
# (index, [ticker, earnings date]) entries themselves - a single miss is expected
print(f'{len(fnf)} {fnf}')

In [None]:
# Rank the tickers by tweet count, most mentioned first; the sort is stable, so
# tied tickers keep the order in which they were first counted
twitter_keys = sorted(twitter_dict, key=twitter_dict.get, reverse=True)
# Plain list copy of the ranking, used as the Twitter input for the RBO analysis
twitter_lst = list(twitter_keys)

In [None]:
# Rank-Biased Overlap between the Reddit ranking and the Twitter mention ranking,
# computed with the library's default arguments. The score is left as the last
# expression so the notebook displays it.
similarity = rbo.RankingSimilarity(reddit_lst, twitter_lst)
similarity.rbo()

In [None]:
# Print top 10 rankings for both Reddit and Twitter, side by side

# Header row; \033[4m switches underline on and \033[0m resets it
print("\033[4mReddit\033[0m", "\033[4mTwitter\033[0m")
# Zipping over slices stops at the shorter ranking, so a list with fewer than
# 10 tickers prints the rows that exist instead of raising IndexError
for reddit_ticker, twitter_ticker in zip(reddit_lst[:10], twitter_lst[:10]):
    print(f'{reddit_ticker}\t{twitter_ticker}')