Loading Data

In [53]:
import pandas as pd

articles = pd.read_csv('articles.csv')

Preprocessing Data

In [54]:
import nltk
from datetime import datetime as dt
import pytz
from nltk.corpus import stopwords
nltk.download('stopwords')

# Credit to / help from https://saturncloud.io/blog/how-to-remove-stop-words-from-a-pandas-dataframe-using-python/
def remove_stopwords(words_tokenized):
    stop_words = set(stopwords.words('english'))  # List of english stopwords
    return [word for word in words_tokenized if word not in stop_words] # Using list comprehension, only choose the words that aren't stopwords

def convert_to_datetime(date_string):
    converted_date = dt.strptime(date_string, '%Y-%m-%d %H:%M:%S%z')
    converted_date = converted_date.replace(tzinfo=None)
    return converted_date

def preprocess(df, sample_size=None):
    df = df.dropna()
    # sample if specified
    if sample_size:
        df = df.sample(sample_size)
    # remove uncessary index column
    df = df.drop(df.columns[0], axis=1)
    # change stock column name to ticker
    df.rename(columns={'stock': 'ticker'}, inplace=True)
    # convert headlines to lowercase
    df['title'] = df['title'].str.lower()
    # remove punctuation
    df['title'] = df['title'].str.replace(r'[^a-zA-Z\s$0-9]', '', regex=True)
    # tokenize
    df['title'] = df['title'].str.split() 
    # remove stopwords
    df['title'] = df['title'].apply(remove_stopwords)
    # convert to datetime object
    df['date'] = df['date'].apply(convert_to_datetime)
    return df

articles = preprocess(articles, sample_size=1000)

articles

[nltk_data] Downloading package stopwords to C:\Users\Yashvardhan
[nltk_data]     Sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,title,date,ticker
988839,"[stocks, set, new, 52week, highs, friday, augu...",2018-08-06 13:02:00,PFE
309834,"[bofa, lowers, target, cognizant, $77, legisla...",2013-06-19 12:07:00,CTSH
130454,"[modulus, provider, highperformance, trading, ...",2019-10-18 15:45:00,BAC
1310901,"[vermillion, inc, reports, q2, eps, $015, 15, ...",2014-08-14 16:22:00,VRML
1194268,"[synacor, reports, extension, search, ad, rela...",2018-05-31 08:38:00,SYNC
...,...,...,...
1343909,"[jung, hak, son, joins, wilshire, bancorp, chi...",2013-09-09 08:01:00,WIBC
549584,"[einhorn, says, green, mountain, thesis, playi...",2013-11-21 12:38:00,GMCR
735343,"[macquarie, analyst, downgrades, union, pacifi...",2015-08-20 08:05:00,KSU
1125060,"[earnings, scheduled, june, 6, 2019]",2019-06-06 04:02:00,SIG


Grab Stock Returns

Based on the time of the article published, we will retrieve two adjusted close prices of the stock and compute the corresponding return.

If the time of the article is published before 4:00 P.M. (non-inclusive), then:
1. The 'before' price will be the most recent (before the date) trading day's adjusted close price
2. The 'after' price will be the most upcoming trading day's adjusted close price

If the time of the article is published after 4:00 P.M., then:
1. The 'before' price will be the same day's adjusted close price
2. The 'after' priec will be the next day's adjusted close price

In [55]:
import pandas_market_calendars as mcal
from datetime import timedelta
# The paramater forward is a boolean representing whether we are looking for the next valid trading day or the most recent trading day
def getValidTradingCloseDate(date, forward=True):
        nyse = mcal.get_calendar('NYSE')
        if forward:
            start_date = date
            end_date = date+timedelta(days=15)
        else:
            start_date = date-timedelta(days=15)
            nextTradingDay = nyse.valid_days(start_date=date , end_date=date + timedelta(days=15))
            end_date = nextTradingDay[0]

        validTradingDays = nyse.valid_days(start_date=start_date , end_date=end_date)

        return validTradingDays.date[1] if forward else validTradingDays.date[-1]

In [56]:
#Get all the yfinance data we need based on date.
import yfinance as yf

def retrieve_yfinance_data(row):
    curr_date = row['date']
    
    eod = dt.strptime('16:00:00', '%H:%M:%S').time()
    
    if curr_date.time() > eod:
        start_date = curr_date
        end_date = getValidTradingCloseDate(start_date, forward=True)
    else:
        end_date = curr_date
        start_date = getValidTradingCloseDate(end_date, forward=False)
    
        
    start_date = start_date.strftime('%Y-%m-%d')
    end_date = end_date.strftime('%Y-%m-%d')
    
    print(start_date, end_date)
    
    data = yf.download(row['ticker'], start=start_date, end=end_date)
    if len(data) > 0:
        returns = (data['Adj Close'][-1] - data['Adj Close'][0]) / data['Adj Close'][0]
        
    return data

In [57]:
print(articles.iloc[0])
new_data = retrieve_yfinance_data(articles.iloc[0])

# yf.download("AVAV", start='2012-12-05', period='1d')

title     [stocks, set, new, 52week, highs, friday, augu...
date                                    2018-08-06 13:02:00
ticker                                                  PFE
Name: 988839, dtype: object


TypeError: Start and end cannot both be tz-aware with different timezones