In [None]:
!pip install statsmodels

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
import os

In [None]:
# Move into the folder that holds the collated Twitter sentiment CSV.
# The path is relative, so it is resolved against the current working directory.
sentiment_csv_dir = '../data/alternative_alpha_datasets/'
os.chdir(sentiment_csv_dir)

In [None]:
# Read CSV from Twitter sentiment analysis with alpha = 0.2
collated_df = pd.read_csv('Twitter_Sentiments.csv')

# Convert Earnings Date and Previous Earnings Date to datetime format (day/month/year)
for date_col in ('Earnings Date', 'Previous Earnings Date'):
    collated_df[date_col] = pd.to_datetime(collated_df[date_col], format = '%d/%m/%Y')

# Number of days between each quarter
collated_df['Delta'] = (collated_df['Earnings Date'] - collated_df['Previous Earnings Date']).dt.days

# Row fields consumed positionally by the collation loop further down:
#   [0] Ticker, [1] Earnings Date, [2] Previous Earnings Date, [3] Delta, [4] Earnings Date STR
# The loop builds each tweet file name from index 4, so the fifth column must be part of
# the list; with only four columns the first iteration fails with an IndexError.
# 'Earnings Date STR' is the column the notebook later drops from collated_df as redundant.
# NOTE(review): presumably its values match the date suffix of the tweet file names — confirm.
sentiment_cols = ['Ticker', 'Earnings Date', 'Previous Earnings Date', 'Delta', 'Earnings Date STR']
sentiment_df = collated_df[sentiment_cols].copy(deep = True)
sentiment_lst = sentiment_df.values.tolist()

# Initialise column for Twitter sentiment analysis
collated_df['Twitter Sentiment'] = np.nan

In [None]:
# Move into the folder that holds the per-ticker tweet CSV files.
# NOTE(review): the path is relative, so it is resolved against the directory set by the
# previous os.chdir call, not against the notebook's start directory — confirm this
# lands in the intended folder, and that re-running the cell is safe.
tweets_dir = '../data/datafiles/twitter/'
os.chdir(tweets_dir)

In [None]:
# Smoothing level (alpha) used for the exponential smoothing of the daily scores
SMOOTHING_LEVEL = 0.6

# Initialize a list to collect files not found, as (row index, row fields) tuples
fnf = []

# Loop through each stock per quarter in the Yahoo Finance list.
# Row layout used below: [0] ticker, [1] earnings date, [2] previous earnings date,
# [3] day count between the two, [4] label used in the tweet file name.
# NOTE(review): ticker[4] requires every sentiment_lst row to carry a fifth field —
# confirm the list is built with one, otherwise this raises IndexError.
for idx, ticker in enumerate(sentiment_lst):
    # Format the file name
    file = f'{ticker[0]}_Tweets_{ticker[4]}.csv'
    print(f'Initiating {idx}, {ticker[0]} {ticker[4]} collation...')

    # Only the file read is guarded, so a FileNotFoundError can only mean that this
    # ticker/quarter has no tweet file; record it and move on to the next row.
    try:
        ticker_df = pd.read_csv(file)
    except FileNotFoundError:
        print(f'File Not Found: {idx}, {file}')
        fnf.append((idx, ticker))
        continue

    # Daily series from the previous earnings date to the current earnings date
    # (both inclusive, hence the +1 on the day count)
    index = pd.date_range(start=ticker[2], end=ticker[1], freq='D')
    data = pd.Series(np.zeros(ticker[3] + 1), index)

    # Convert date column to datetime format
    ticker_df['date'] = pd.to_datetime(ticker_df['date'], format = '%Y-%m-%d')
    ticker_df['collated_score'] = np.nan

    # Calculate a collated sentiment score for each date in the range.
    # This score is a weighted average based on retweet count + 1 (to include the Tweet author in the count).
    # Retweet count + 1 serves as a measure of the number of people who agree with the sentiment.
    for date in index:
        mask = ticker_df['date'] == date
        retweets = ticker_df.loc[mask, 'retweets_count']
        # Sum of all weights for the day: total retweets plus one per tweet.
        # mask.sum() counts the matching rows directly; the previous count()[0] relied on
        # deprecated positional Series indexing and counted non-null values of the first
        # column rather than rows.
        total_weight = retweets.sum() + mask.sum()
        ticker_df.loc[mask, 'collated_score'] = (
            ticker_df.loc[mask, 'compound_score'] * ((retweets + 1) / total_weight)
        )
        # A day without tweets yields an empty selection, whose sum is 0.0
        data[date] = ticker_df.loc[mask, 'collated_score'].sum()

    # Perform a linear interpolation on the data such that the distribution is not affected.
    # NOTE(review): days without tweets hold 0.0 rather than NaN at this point, so this
    # call currently has nothing to fill — confirm whether empty days were meant to be
    # interpolated instead of being treated as a neutral score.
    data = data.interpolate()

    # Perform exponential smoothing with alpha = SMOOTHING_LEVEL on the data.
    # Heuristic initialization is only available if the number of observations is more
    # than 10; statsmodels rejects shorter series with a ValueError, in which case the
    # default initialization is used instead.
    try:
        model_fit = SimpleExpSmoothing(data, initialization_method="heuristic").fit(
            smoothing_level=SMOOTHING_LEVEL, optimized=False
        )
    except ValueError:
        model_fit = SimpleExpSmoothing(data).fit(
            smoothing_level=SMOOTHING_LEVEL, optimized=False
        )

    # Obtain the one-step-ahead forecast as the aggregated sentiment score over the whole period
    model_fcast = model_fit.forecast(1)

    # Input the final aggregated sentiment score into the df to be saved.
    # The value is taken by position: the forecast is labelled by date, so an integer
    # key such as [-1] must not be used as a label lookup.
    collated_df.loc[idx, 'Twitter Sentiment'] = np.asarray(model_fcast)[-1]
    print(f'Completed {idx}, {ticker[0]} {ticker[4]} collation.')

In [None]:
# Report how many tweet files were missing, followed by the missing entries themselves.
# Exactly one missing file is expected.
missing_count = len(fnf)
print(missing_count, fnf)

In [None]:
# Count the rows that never received a sentiment score; the only expected gap is the
# single row whose tweet file was not found.
collated_df['Twitter Sentiment'].isnull().sum()

In [None]:
# Remove the string copy of the earnings date, which is no longer needed
collated_df = collated_df.drop(columns='Earnings Date STR')

In [None]:
# Uncomment this line if you wish to save to CSV
# collated_df.to_csv('Twitter_Sentiments_Alpha0.6.csv')