In [322]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import pipeline, AutoModelForSequenceClassification
from sklearn.cluster import KMeans
import torch.nn.functional as F
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Bidirectional, BatchNormalization

In [323]:
# Download stopwords list
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/balubabu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/balubabu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [324]:
stock_tweets = pd.read_csv('stock_tweets.csv')
stock_data = pd.read_csv('stock_yfinance_data.csv')

In [325]:
print(stock_tweets.columns)
print(stock_data.columns)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Date', 'Tweet', 'Stock Name',
       'Company Name', 'Cluster', 'Sentiment_Score'],
      dtype='object')
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Stock Name'],
      dtype='object')


In [326]:
print("Dimensions of the stock_tweets: ", stock_tweets.shape)
print("Dimensions of the stock data: ", stock_data.shape)

Dimensions of the stock_tweets:  (9145, 8)
Dimensions of the stock data:  (3024, 8)


### Q1 (3 marks) BERT and Clustering!

In [327]:
def preprocess_text(text):
    # Convert to lower case
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    text = re.sub(r'\@\w+|\#','', text)
    # Remove punctuations
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = nltk.word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_output = [lemmatizer.lemmatize(word) for word in filtered_text]
    # Re-create document from filtered tokens
    text = ' '.join(lemmatized_output)
    return text
stock_tweets['Tweet'] = [preprocess_text(tweet) for tweet in stock_tweets['Tweet']]

In [328]:
# Load tokenizer and model from Hugging Face Hub
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, output_hidden_states=True)

# Create a sentiment analysis pipeline using the loaded model and tokenizer
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

tweets = stock_tweets['Tweet']

# Analyze the sentiment of each tweet
results = [(tweet, sentiment_pipeline(tweet)) for tweet in tweets]

# Print the results
for tweet, sentiment in results:
    print(f"Tweet: {tweet}\nSentiment: {sentiment}\n")
    
# Function to convert tweets to embeddings
def get_embeddings(tweets, tokenizer, model):
    embeddings = []
    max_len = 0
    for tweet in tweets:
        # Encode the tweets
        encoded_input = tokenizer(tweet, return_tensors='pt', padding=True, truncation=True, max_length=512)
        # Generate embeddings
        with torch.no_grad():
            outputs = model(**encoded_input)
        # Retrieve the full hidden states for the last layer
        tweet_embeddings = outputs.hidden_states[-1]
        max_len = max(max_len, tweet_embeddings.size(1))
        embeddings.append(tweet_embeddings)
        
    # Pad embeddings to the maximum length
    padded_embeddings = [F.pad(embed, (0, 0, 0, max_len - embed.size(1))) for embed in embeddings]
    # Concatenate along the batch dimension
    final_embeddings = torch.cat(padded_embeddings, dim=0)
    
    return final_embeddings.cpu().numpy()
    
# Obtain embeddings
embeddings = get_embeddings(tweets, tokenizer, model)

# Example of how to handle one of the embeddings
# Let's print the dimensions and some part of the embedding of the first tweet to confirm
print("Embedding dimensions for the first tweet:", embeddings[0].shape) # dimension is 768


Tweet: bought first aapl stock since seen increase tell dont invest
Sentiment: [{'label': 'LABEL_1', 'score': 0.6648916006088257}]

Tweet: medium really pushing hard big money client tsla option tomorrow sht make laughable even hitting new low anyone care sell stock year ago buffett sold aapl last year talk
Sentiment: [{'label': 'LABEL_1', 'score': 0.508172333240509}]

Tweet: tim cook ceo aapl earned total compensation warren buffett earned dividend aapl zero hour clocked zero meeting amp zero headache caused invest
Sentiment: [{'label': 'LABEL_1', 'score': 0.5247932076454163}]

Tweet: thread broad overview balance sheet want specific amp helpful example deep dive balance sheet youtube channel using aapls recent number example
Sentiment: [{'label': 'LABEL_1', 'score': 0.7013223171234131}]

Tweet: trendline aapl continuing reject
Sentiment: [{'label': 'LABEL_0', 'score': 0.626981794834137}]

Tweet: return last year bitcoin btc tesla tsla nvidia nvda netflix nflx amazon amzn apple aapl s

Embedding dimensions for the first tweet: (120, 768)


In [329]:
print("Some embeddings for the first tweet: ", embeddings[0][0, :5])  # Print the first 5 tokens of the first tweet
print("Dimensions of the embeddings ", embeddings.shape )

Some embeddings for the first tweet:  [-0.836834   -0.44356164  0.06713825 -0.5251011   0.38492772]
Dimensions of the embeddings  (9145, 120, 768)


In [330]:
# dimensions of the tweet embeddings is (9145, 223, 768) tensor
# k-means requires 2D data, hence averaging the embeddings across tokens (the middle dimension) for each tweet
# Calculate the mean across the token dimension
embeddings_mean = embeddings.mean(axis=1)

print("Dimensions of the embeddings mean: ", embeddings_mean.shape)
# Now `tweet_embeddings_mean` has shape (9145, 768)

# Perform k-means clustering
kmeans = KMeans(n_clusters=2, random_state=0).fit(embeddings_mean)
labels = kmeans.labels_
print("Labels - ", labels)

# Print the cluster each tweet belongs to
for tweet, label in zip(tweets, labels):
    print(f"Tweet: {tweet} - Cluster: {label}")
    print("------------")

Dimensions of the embeddings mean:  (9145, 768)


  super()._check_params_vs_input(X, default_n_init=10)


Labels -  [0 1 0 ... 0 0 1]
Tweet: bought first aapl stock since seen increase tell dont invest - Cluster: 0
------------
Tweet: medium really pushing hard big money client tsla option tomorrow sht make laughable even hitting new low anyone care sell stock year ago buffett sold aapl last year talk - Cluster: 1
------------
Tweet: tim cook ceo aapl earned total compensation warren buffett earned dividend aapl zero hour clocked zero meeting amp zero headache caused invest - Cluster: 0
------------
Tweet: thread broad overview balance sheet want specific amp helpful example deep dive balance sheet youtube channel using aapls recent number example - Cluster: 0
------------
Tweet: trendline aapl continuing reject - Cluster: 0
------------
Tweet: return last year bitcoin btc tesla tsla nvidia nvda netflix nflx amazon amzn apple aapl sampp spy bond agg gold gld literally picked worst performing asset - Cluster: 1
------------
Tweet: need share aapl - Cluster: 0
------------
Tweet: trade plan 

In [331]:
# Create a list of strings with formatted labels
formatted_labels = ['Cluster ' + str(label) for label in labels]
# Convert the list to a pandas Series
cluster_series = pd.Series(formatted_labels)
stock_tweets['Cluster'] = cluster_series

In [332]:
# Group by 'Cluster' and 'Label' and count occurrences
cluster_label_distribution = stock_tweets.groupby(['Stock Name', 'Cluster']).size().unstack(fill_value=0)
# Display the distribution
print(cluster_label_distribution)

Cluster     Cluster 0  Cluster 1
Stock Name                      
AAPL             3386       1670
MSFT             2608       1481


From the above cluster label distribution, we conclude that AAPL is Cluster 0 and MSFT is Cluster 1. The accuracy evaluates to be around 53.23%.

### Q2 (7 marks) Time Series Forecasting!

In [333]:
print(len(results))

9145


In [334]:
def process_tweet_scores(data):
    scores = []
    for item in data:
        tweet, sentiment = item
        label = sentiment[0]['label']
        score = sentiment[0]['score']
        
        # Conditionally adjust the score based on the label
        if label == 'LABEL_0':
            score = -score  # Negative of the score
        elif label == 'LABEL_1':
            score = 0  # Score as zero
        elif label == 'LABEL_2':
            score = score  # Positive of the score
        else:
            score = 0  # Default case, unlikely to be used unless new labels are introduced

        scores.append(score)

    return pd.Series(scores)

score_series = process_tweet_scores(results)
stock_tweets['Sentiment_Score'] = score_series

In [335]:
stock_tweets.to_csv("stock_tweets.csv")

In [336]:
# Convert 'Date' from timestamp to date only
stock_tweets['Date'] = pd.to_datetime(stock_tweets['Date']).dt.date

# Group by 'Date' and 'Stock Name' and calculate the median of the 'Sentiment_Score'
daily_median_sentiment = stock_tweets.groupby(['Date', 'Stock Name'])['Sentiment_Score'].median().reset_index()

# Generate a date range covering all dates in the dataset
date_range = pd.date_range(start=min(stock_tweets['Date']), end=max(stock_tweets['Date']))

# Assume the stocks are AAPL and MSFT; adjust as necessary
symbols = stock_tweets['Stock Name'].unique()

# Create a DataFrame with all combinations of Date and Stock Name
all_dates_symbols = pd.MultiIndex.from_product([date_range.date, symbols], names=['Date', 'Stock Name'])
all_dates_symbols_df = pd.DataFrame(index=all_dates_symbols).reset_index()

# Merge the all-date-symbol DataFrame with the median sentiments
complete_sentiment = all_dates_symbols_df.merge(daily_median_sentiment, on=['Date', 'Stock Name'], how='left')

# Fill NaN values in 'Sentiment_Score' with 0, as these represent days with no tweets
complete_sentiment['Sentiment_Score'].fillna(0, inplace=True)

In [337]:
# Count the number of zero values in the 'Sentiment_Score' column
number_of_zeros = (complete_sentiment['Sentiment_Score'] == 0).sum()

# Print the number of zero values
print(f"Number of zero values in 'Sentiment_Score': {number_of_zeros}")

print(complete_sentiment)

Number of zero values in 'Sentiment_Score': 702
           Date Stock Name  Sentiment_Score
0    2021-09-30       AAPL              0.0
1    2021-09-30       MSFT              0.0
2    2021-10-01       AAPL              0.0
3    2021-10-01       MSFT              0.0
4    2021-10-02       AAPL              0.0
..          ...        ...              ...
725  2022-09-27       MSFT              0.0
726  2022-09-28       AAPL              0.0
727  2022-09-28       MSFT              0.0
728  2022-09-29       AAPL              0.0
729  2022-09-29       MSFT              0.0

[730 rows x 3 columns]


In [338]:
complete_sentiment['Sentiment_Score'].describe()

count    730.000000
mean       0.012579
std        0.084342
min       -0.567745
25%        0.000000
50%        0.000000
75%        0.000000
max        0.821257
Name: Sentiment_Score, dtype: float64

In [339]:
# Computation of returns
stock_data = stock_data.dropna(how='all')
print(stock_data.shape)

(504, 8)


In [340]:
# Returns calculation
stock_data['Date'] = pd.to_datetime(stock_data['Date'])
stock_data.sort_values(by=['Stock Name', 'Date'], inplace=True)
stock_data['Daily Returns'] = stock_data.groupby('Stock Name')['Adj Close'].pct_change()

In [341]:
#Y1 calculation
# Filter out AAPL and MSFT into separate DataFrames
aapl_df = stock_data[stock_data['Stock Name'] == 'AAPL']
msft_df = stock_data[stock_data['Stock Name'] == 'MSFT']
# Ensure that both DataFrames are sorted by date (if not already)
aapl_df = aapl_df.sort_values(by='Date')
msft_df = msft_df.sort_values(by='Date')
# Add a new column 'AAPL_ge_MSFT' where 1 means AAPL's return >= MSFT's return, 0 otherwise
comparison_df = pd.merge(aapl_df, msft_df, on='Date', suffixes=('_AAPL', '_MSFT'))
comparison_df['AAPL_ge_MSFT'] = (comparison_df['Daily Returns_AAPL'] >= comparison_df['Daily Returns_MSFT']).astype(int)
stock_data['Y1'] = comparison_df['AAPL_ge_MSFT'].append(comparison_df['AAPL_ge_MSFT'],  ignore_index=True)

  stock_data['Y1'] = comparison_df['AAPL_ge_MSFT'].append(comparison_df['AAPL_ge_MSFT'],  ignore_index=True)


In [342]:
#Y2 Weekly Forward Volatility data
stock_data['Date'] = pd.to_datetime(stock_data['Date'])  # Ensuring 'Date' is in datetime format
stock_data.sort_values(by=['Stock Name', 'Date'], inplace=True)

# Define a function to apply the forward-looking rolling standard deviation
def calculate_forward_weekly_volatility(group):
    # Calculate rolling std dev of the next 5 days, including the current day
    # Shift the results back by 4 places so the volatility corresponds to the current day
    group['Forward Weekly Volatility'] = group['Daily Returns'].rolling(window=5).std().shift(-4)
    return group

# Apply the function separately for each stock
stock_data = stock_data.groupby('Stock Name').apply(calculate_forward_weekly_volatility)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  stock_data = stock_data.groupby('Stock Name').apply(calculate_forward_weekly_volatility)


In [343]:
# X1 calculating the Backward Volatility data
def calculate_backward_weekly_volatility(group):
    # Calculate the rolling standard deviation for a window of 5 days
    group['Backward Weekly Volatility'] = group['Daily Returns'].rolling(window=5).std()
    return group

stock_data = stock_data.groupby('Stock Name').apply(calculate_backward_weekly_volatility)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  stock_data = stock_data.groupby('Stock Name').apply(calculate_backward_weekly_volatility)


In [344]:
# Adding lagged stock winner data
for i in range(1, 6):
    stock_data[f'Y1_t-{i}'] = stock_data.groupby('Stock Name')['Y1'].shift(i)

In [345]:
# Adding the Sentiment_Score to the stock_data dataframe
# Convert 'Date' to datetime in both DataFrames, if not already done
stock_data['Date'] = pd.to_datetime(stock_data['Date'])
complete_sentiment['Date'] = pd.to_datetime(complete_sentiment['Date'])

stock_data = stock_data.merge(complete_sentiment[['Date', 'Stock Name', 'Sentiment_Score']],
                              on=['Date', 'Stock Name'],
                              how='left')

In [346]:
# Adding lagged Sentiment Score data
for i in range(1, 6):
    stock_data[f'Sentiment_Score_Lag_{i}'] = stock_data.groupby('Stock Name')['Sentiment_Score'].shift(i)

In [347]:
# Adding the Apple_wins in 5 days
stock_data['Apple_Wins_5'] = stock_data[['Y1_t-1', 'Y1_t-2', 'Y1_t-3', 'Y1_t-4', 'Y1_t-5']].fillna(0).sum(axis=1)

In [348]:
stock_data.to_csv("stock_data_my_OP.csv")

### Modeling

In [349]:
# Convert 'Date' to datetime if not already done
stock_data['Date'] = pd.to_datetime(stock_data['Date'])
# Find the maximum date in the dataset
max_date = stock_data['Date'].max()
# Calculate the date two months before the maximum date
split_date = max_date - pd.DateOffset(months=2)

# Split the data into training and validation sets
train_data = stock_data[stock_data['Date'] <= split_date]
validation_data = stock_data[stock_data['Date'] > split_date]

# Encoding the 'Stock Name' column
train_data['Stock_Name_Encoded'] = np.where(train_data['Stock Name'] == 'AAPL', 1, 0)
validation_data['Stock_Name_Encoded'] = np.where(validation_data['Stock Name'] == 'AAPL', 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Stock_Name_Encoded'] = np.where(train_data['Stock Name'] == 'AAPL', 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_data['Stock_Name_Encoded'] = np.where(validation_data['Stock Name'] == 'AAPL', 1, 0)


In [350]:
# Print the details of the splits
print(f"Training data range: {train_data['Date'].min()} to {train_data['Date'].max()}")
print(f"Validation data range: {validation_data['Date'].min()} to {validation_data['Date'].max()}")
print(f"Training set size: {train_data.shape[0]} rows")
print(f"Validation set size: {validation_data.shape[0]} rows")

Training data range: 2021-09-30 00:00:00 to 2022-07-29 00:00:00
Validation data range: 2022-08-01 00:00:00 to 2022-09-29 00:00:00
Training set size: 418 rows
Validation set size: 86 rows


### LSTM - Classification

In [351]:
# Features and target
features = ['Stock_Name_Encoded','Backward Weekly Volatility', 'Y1_t-1', 'Y1_t-2', 'Y1_t-3', 'Y1_t-4', 'Y1_t-5', 'Sentiment_Score_Lag_1', 'Sentiment_Score_Lag_2', 
            'Sentiment_Score_Lag_3', 'Sentiment_Score_Lag_4', 'Sentiment_Score_Lag_5', 'Apple_Wins_5']
target = 'Y1'

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_data[features])
X_val_scaled = scaler.transform(validation_data[features])

# Targets
y_train = train_data[target].values
y_val = validation_data[target].values

# Reshape for LSTM [samples, time steps, features]
X_train_scaled = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_val_scaled = X_val_scaled.reshape((X_val_scaled.shape[0], 1, X_val_scaled.shape[1]))

In [352]:
# Revised model architecture
model = Sequential([
    Bidirectional(LSTM(50, return_sequences=True), input_shape=(X_train_scaled.shape[1], X_train_scaled.shape[2])),
    Dropout(0.3),
    LSTM(100),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(**kwargs)


In [353]:
# Evaluate the model on the validation set
performance = model.evaluate(X_val_scaled, y_val, verbose=1)
print(f'Validation Accuracy: {performance[1]}')

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5585 - loss: 0.6931  
Validation Accuracy: 0.5232558250427246


### LGBM - Classification

In [354]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_data[features])
X_val_scaled = scaler.transform(validation_data[features])

# Prepare the target values
y_train = train_data[target].values
y_val = validation_data[target].values

# Create LightGBM datasets
train_set = lgb.Dataset(X_train_scaled, label=y_train)
valid_set = lgb.Dataset(X_val_scaled, label=y_val, reference=train_set)

In [355]:
# Set the parameters for the model
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

num_boost_round = 1000
lgbm_model = lgb.train(
    params,
    train_set,
    num_boost_round=num_boost_round,
    valid_sets=[valid_set],
    callbacks=[lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation(period=10)]
)

# Make predictions
y_pred = lgbm_model.predict(X_val_scaled, num_iteration=lgbm_model.best_iteration)
# Convert probabilities to binary output
y_pred_binary = np.where(y_pred > 0.5, 1, 0)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred_binary)
print("------------------------------")
print(f'Validation Accuracy: {accuracy}')

Training until validation scores don't improve for 10 rounds
[10]	valid_0's binary_logloss: 0.691371
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.677172
------------------------------
Validation Accuracy: 0.627906976744186


### LightGBM for volatility

In [356]:
# Standardizing the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_data[features])
X_val_scaled = scaler.transform(validation_data[features])

y_train = train_data[target]
y_val = validation_data[target]

# Setting up LightGBM dataset for regression
train_set = lgb.Dataset(X_train_scaled, label=y_train)
val_set = lgb.Dataset(X_val_scaled, label=y_val, reference=train_set)

# Setting parameters for regression
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',  # Use Mean Squared Error as the loss ('l2' in LightGBM)
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [357]:
# Train the regression model
num_boost_round = 1000
model = lgb.train(
    params,
    train_set,
    num_boost_round=num_boost_round,
    valid_sets=[val_set],
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=10)]
)

Training until validation scores don't improve for 50 rounds
[10]	valid_0's l2: 0.249695
[20]	valid_0's l2: 0.258531
[30]	valid_0's l2: 0.266395
[40]	valid_0's l2: 0.26903
[50]	valid_0's l2: 0.278189
Early stopping, best iteration is:
[1]	valid_0's l2: 0.242025


In [358]:
# Predictions on the validation set
y_pred = model.predict(X_val_scaled, num_iteration=model.best_iteration)

# Evaluate the model using R-squared metric
r_squared = r2_score(y_val, y_pred)
print(f'R-squared for the validation set: {r_squared}')

R-squared for the validation set: -0.03589090316559451
