In [None]:
import nltk
from datetime import datetime as dt
import pytz
from nltk.corpus import stopwords
nltk.download('stopwords')

# Credit to / help from https://saturncloud.io/blog/how-to-remove-stop-words-from-a-pandas-dataframe-using-python/
def remove_stopwords(words_tokenized, stop_words=None):
    """Return the tokens of ``words_tokenized`` that are not stopwords.

    Args:
        words_tokenized: Iterable of tokens (for example the list produced by
            ``str.split``).
        stop_words: Optional collection of tokens to drop. When omitted, NLTK's
            English stopword list is used.

    Returns:
        A new list with the kept tokens in their original order.
    """
    if stop_words is None:
        # This function is applied once per headline, so build the English set on
        # first use and keep it on the function object instead of rebuilding it
        # for every call.
        stop_words = getattr(remove_stopwords, '_english_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            remove_stopwords._english_stop_words = stop_words
    return [word for word in words_tokenized if word not in stop_words]

def convert_to_datetime(date_string):
    """Parse an article timestamp into a naive ``datetime``.

    Accepts ``YYYY-MM-DD HH:MM:SS`` on its own or followed by a UTC offset such as
    ``-04:00`` or ``+00:00``. The offset is discarded, so the result keeps the
    wall-clock time exactly as written in the string.

    Args:
        date_string: Timestamp text, with or without a trailing UTC offset.

    Returns:
        A ``datetime`` without tzinfo.

    Raises:
        ValueError: If no ``%Y-%m-%d %H:%M:%S`` timestamp can be read.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    text = date_string.strip()

    # Splitting on the last '-' alone only works when a negative offset is present;
    # without one the split lands inside the date itself. Try the text unchanged
    # first, then with a trailing '-…' or '+…' suffix removed.
    candidates = (text, text.rsplit('-', 1)[0], text.rsplit('+', 1)[0])
    for candidate in candidates:
        try:
            return dt.strptime(candidate, timestamp_format)
        except ValueError:
            continue
    raise ValueError(f"Unrecognized article timestamp: {date_string!r}")

def preprocess(df, sample_size=None):
    """Clean the raw articles frame and tokenize its headlines.

    Args:
        df: Articles frame with 'title', 'date' and 'stock' columns; its first
            column is dropped as an unneeded index column.
        sample_size: If truthy, number of rows to sample (after rows with missing
            values are removed). Sampling is unseeded.

    Returns:
        A new frame in which 'stock' is renamed to 'ticker', 'title' holds a list
        of lowercase tokens without stopwords, and 'date' holds the values
        returned by ``convert_to_datetime``.
    """
    cleaned = df.dropna()
    if sample_size:
        cleaned = cleaned.sample(sample_size)

    # Drop the unnecessary leading index column and use 'ticker' as the symbol name.
    cleaned = cleaned.drop(cleaned.columns[0], axis=1).rename(columns={'stock': 'ticker'})

    # Lowercase, strip everything except letters, digits, whitespace and '$',
    # then split on whitespace to get tokens.
    tokens = (
        cleaned['title']
        .str.lower()
        .str.replace(r'[^a-zA-Z\s$0-9]', '', regex=True)
        .str.split()
    )
    cleaned['title'] = tokens.apply(remove_stopwords)

    # Turn the timestamp strings into datetime objects.
    cleaned['date'] = cleaned['date'].apply(convert_to_datetime)
    return cleaned

Grab Stock Returns

Based on the time of the article published, we will retrieve two adjusted close prices of the stock and compute the corresponding return.

If the article is published before 4:00 P.M. (exclusive), then:
1. The 'before' price will be the adjusted close price of the most recent trading day before the publication date
2. The 'after' price will be the adjusted close price of the upcoming trading day

If the article is published at or after 4:00 P.M., then:
1. The 'before' price will be the same day's adjusted close price
2. The 'after' price will be the next trading day's adjusted close price

In [None]:
import pandas_market_calendars as mcal
from datetime import timedelta
def getValidTradingCloseDate(date, forward=True):
    """Pick a NYSE trading date from a 15-calendar-day window around ``date``.

    Args:
        date: Anchor date of the window.
        forward: If True the window is ``[date, date + 15 days]`` and the third
            valid trading day in it (index 2) is returned. If False the window is
            ``[date - 15 days, date]`` and the second-to-last valid trading day
            (index -2) is returned.

    Returns:
        The selected entry of the calendar's valid days, taken from their ``.date``
        values.
    """
    window = timedelta(days=15)
    if forward:
        window_start, window_end = date, date + window
    else:
        window_start, window_end = date - window, date

    # NOTE(review): the window bounds are presumably inclusive — confirm against
    # the pandas_market_calendars documentation.
    sessions = mcal.get_calendar('NYSE').valid_days(start_date=window_start, end_date=window_end)
    session_dates = sessions.date
    return session_dates[2] if forward else session_dates[-2]


In [None]:
#Get all the yfinance data we need based on date.
import yfinance as yf

def retrieve_yfinance_data(row):
    """Compute the adjusted-close return around one article's publication time.

    Args:
        row: A record with a ``'date'`` entry (the publication ``datetime``) and a
            ``'ticker'`` entry (the symbol passed to ``yf.download``).

    Returns:
        ``(last - first) / first`` over the downloaded ``'Adj Close'`` values, or
        ``None`` when fewer than two prices come back.
    """
    curr_date = row['date']

    # Market close; decides which pair of closes is used for the article.
    eod = dt.strptime('16:00:00', '%H:%M:%S').time()

    # An article stamped exactly 16:00:00 cannot affect that day's close, so it is
    # handled with the after-close articles (the "before 4:00 P.M." case is
    # described as non-inclusive).
    if curr_date.time() >= eod:
        start_date = curr_date.date()
        end_date = getValidTradingCloseDate(start_date, forward=True)
    else:
        end_date = curr_date.date()
        start_date = getValidTradingCloseDate(end_date, forward=False)
        # Push the end one day out so the publication day itself is part of the
        # request (presumably `end` is exclusive in yf.download — confirm).
        end_date = end_date + timedelta(days=1)

    # NOTE(review): when the publication date is not a trading session, the window
    # is whatever getValidTradingCloseDate selects; confirm that this matches the
    # intended 'before'/'after' prices for weekends and holidays.
    data = yf.download(row['ticker'], start=start_date, end=end_date, progress=False, show_errors=False)

    # A return needs two distinct closes; a single row would otherwise be reported
    # as a 0.0 return instead of missing data.
    if len(data) < 2:
        return None

    adj_close = data['Adj Close']
    # Use .iloc: plain [] with an integer on a date-indexed Series is positional
    # only by a deprecated fallback.
    first_close = adj_close.iloc[0]
    last_close = adj_close.iloc[-1]
    return (last_close - first_close) / first_close

In [None]:
from tqdm import tqdm

def get_returns(df):
    """Fill a 'returns' column by looking up the return for every article row.

    The frame is modified in place, one row at a time, and the same object is
    returned. A progress bar tracks the number of rows processed.

    Args:
        df: Frame whose rows are accepted by ``retrieve_yfinance_data``.

    Returns:
        ``df`` itself, with 'returns' set for each row label.
    """
    progress = tqdm(df.iterrows(), total=df.shape[0])
    for label, article in progress:
        df.loc[label, 'returns'] = retrieve_yfinance_data(article)

    return df

Uncomment this code to 
1. sample two datasets of 50k samples from the 800k+ samples from the dataset.
2. save each dataset to its own pkl file 

*Note: We saved 2 datasets of 50k samples each because we had two people download them simultaneously to save time.


In [None]:
# fifty_thousand_articles_1 = preprocess(articles, sample_size=50000)
# fifty_thousand_articles_1 = get_returns(fifty_thousand_articles_1)
# pd.to_pickle(fifty_thousand_articles_1, '50k_processed_data_1.pkl')

# fifty_thousand_articles_2 = preprocess(articles, sample_size=50000)
# fifty_thousand_articles_2 = get_returns(fifty_thousand_articles_2)
# pd.to_pickle(fifty_thousand_articles_2, '50k_processed_data_2.pkl')

Merge Two Datasets into 1 Dataset

In [None]:
# Load the two separately downloaded samples.
# NOTE: read_pickle can execute code stored in the file; only load pickles you produced.
dataset_1, dataset_2 = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('50k_processed_data_1.pkl', '50k_processed_data_2.pkl')
)

# Stack the samples, then keep only the first row for each index label so an
# article drawn into both samples appears once.
dataset = pd.concat([dataset_1, dataset_2], axis=0).loc[
    lambda frame: ~frame.index.duplicated(keep='first')
]

len(dataset)

Split Dataset into Training, Validation, and Testing

In [None]:
from sklearn.model_selection import train_test_split

def split(df):
    """Split headlines and returns into train (80%), validation (10%) and test (10%).

    Both splits use ``random_state=42``. The 20% hold-out from the first split is
    halved by the second one; its first half becomes the test set and its second
    half the validation set.

    Args:
        df: Frame with a 'title' column (features) and a 'returns' column (targets).

    Returns:
        ``(x_train, y_train, x_val, y_val, x_test, y_test)``.
    """
    titles, returns = df['title'], df['returns']

    # 80% train, 20% held out.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        titles, returns, test_size=0.2, random_state=42
    )
    # Halve the hold-out: 10% test, 10% validation.
    x_test, x_val, y_test, y_val = train_test_split(
        x_holdout, y_holdout, test_size=0.5, random_state=42
    )
    return x_train, y_train, x_val, y_val, x_test, y_test

x_train, y_train, x_val, y_val, x_test, y_test = split(dataset) # split the data


In [None]:
# Save `dataset` to Data/dataset.pkl so it can be reused without rebuilding it.
pd.to_pickle(dataset, 'Data/dataset.pkl')

Return Distribution Analysis

In [None]:
# Analyze the articles dataset distribution
import matplotlib.pyplot as plt
import numpy as np

def plot_returns(ax, datasetType, returns, bins=None):
    """Draw a histogram of ``returns`` on ``ax`` and annotate it with mean and std.

    Args:
        ax: Axes-like object to draw on.
        datasetType: Label used in the x-axis label and the title (e.g. 'Train').
        returns: Return values to plot.
        bins: Forwarded to ``ax.hist``; ``None`` leaves the choice to ``ax.hist``.
    """
    hist_style = {'alpha': 0.7, 'color': 'b', 'edgecolor': 'black'}
    ax.hist(returns, bins=bins, **hist_style)

    # Labels, title and a fixed x-range of [-0.4, 0.4].
    ax.set_xlabel(f'{datasetType} Returns')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Distribution of {datasetType} Returns')
    ax.set_xlim(-0.4, 0.4)

    # Summary statistics, written near the top-left corner in axes coordinates.
    captions = (
        (0.9, f'Mean: {np.mean(returns):.4f}'),
        (0.85, f'Standard Deviation: {np.std(returns):.4f}'),
    )
    for height, caption in captions:
        ax.text(0.05, height, caption, transform=ax.transAxes)

def display_return_plots(y_train, y_val, y_test):
    """Show the train, validation and test return histograms side by side.

    Args:
        y_train: Returns of the training split (plotted with 2000 bins).
        y_val: Returns of the validation split (plotted with 3000 bins).
        y_test: Returns of the test split (plotted with 3000 bins).
    """
    # One row of three panels.
    _, axs = plt.subplots(1, 3, figsize=(15, 5))

    panels = (
        ('Train', y_train, 2000),
        ('Validation', y_val, 3000),
        ('Test', y_test, 3000),
    )
    for panel_ax, (label, split_returns, n_bins) in zip(axs, panels):
        plot_returns(panel_ax, label, split_returns, bins=n_bins)

    plt.tight_layout()
    plt.show()

display_return_plots(y_train, y_val, y_test)

Loading Data

In [None]:
# NOTE(review): this cell sits under the "Loading Data" heading but it writes
# `dataset` to disk (the same call also appears in an earlier cell). If loading
# the saved dataset was intended, this presumably should be
# `dataset = pd.read_pickle('Data/dataset.pkl')` — confirm.
pd.to_pickle(dataset, 'Data/dataset.pkl')

Dataset Analysis

In [None]:
def analyze(x, y):
    """Summarise one split: sample count, return-sign counts and token statistics.

    Args:
        x: Series whose elements are token lists.
        y: Series of returns aligned with ``x``. Values that are neither positive,
            negative nor zero (e.g. NaN) are not counted in any sign bucket.

    Returns:
        A dict of the summary figures, keyed by human-readable labels.
    """
    # Count rows by the sign of the return.
    positive, negative, flat = (len(y[mask]) for mask in (y > 0, y < 0, y == 0))

    # Number of tokens in each headline.
    token_counts = x.apply(len)

    return {
        "Number of Samples": len(x),
        "Number of Samples with Positive Returns": positive,
        "Number of Samples with No Returns": flat,
        "Number of Samples with Negative Returns": negative,
        "Minimum Number of Tokens": min(token_counts),
        "Maximum Number of Tokens": max(token_counts),
        "Mean Number of Tokens": token_counts.mean(),
    }

def df_for_analysis(train_analysis, test_analysis, validation_analysis):
    """Stack three per-split summaries into one table.

    Args:
        train_analysis: Summary dict for the training split.
        test_analysis: Summary dict for the test split.
        validation_analysis: Summary dict for the validation split.

    Returns:
        A DataFrame with rows 'Train', 'Test', 'Validation' (in that order) and one
        column per summary key.
    """
    labelled = (
        ('Train', train_analysis),
        ('Test', test_analysis),
        ('Validation', validation_analysis),
    )
    row_labels = [label for label, _ in labelled]
    records = [summary for _, summary in labelled]
    return pd.DataFrame(records, index=row_labels)

# Summarise each split; the argument order (train, test, validation) matches the
# parameter order of df_for_analysis.
analysis_df = df_for_analysis(analyze(x_train, y_train), analyze(x_test,y_test), analyze(x_val, y_val))

# export dataset analysis dataframe as png
# NOTE(review): the 'Data Analysis' folder presumably has to exist already — confirm.
import dataframe_image as dfi
dfi.export(analysis_df, 'Data Analysis/Dataset Analysis.png')

# Show the summary table as the cell output.
analysis_df


In [None]:
# NOTE(review): cells earlier in this file already use `pd` and `articles`; make
# sure this cell is executed before them on a fresh kernel.
import pandas as pd
import warnings
# Silences every warning for the rest of the session, including deprecation
# notices from the libraries used here.
warnings.filterwarnings('ignore')
# Raw articles table read from articles.csv in the working directory.
articles = pd.read_csv('articles.csv')

Preprocessing Data

In [None]:
articles.head()