In [1]:
import gensim
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as sps
import pickle
import yfinance as yf

from gensim.models import KeyedVectors
from datetime import datetime

sns.set_style("whitegrid")

# Part 0
## Load data and remove dublicates

In [None]:
with open('/content/drive/MyDrive/ML_EPFL_Project/motley-fool-data.pkl', 'rb') as f:
    df = pickle.load(f)

In [None]:
df.shape, df[df.duplicated(["transcript","ticker","q"])].shape

((18755, 5), (1162, 5))

In [None]:
df = df[~df.duplicated(["transcript","ticker","q"])].copy(deep=True)
df.reset_index(inplace=True, drop=True)
df.shape

(17593, 5)

# Part 1
## Convert date to datetime

In [None]:
 df.date.iloc[0]

'Aug 27, 2020, 9:00 p.m. ET'

In [None]:
df[df.date == '']

Unnamed: 0,date,exchange,q,ticker,transcript
412,,NYSE: XPO,2021-Q3,XPO,"Operator\nHello, and welcome to the XPO Logist..."


412 - missed

9718 and 9919 - wrong format

In [None]:
df.iloc[412].date = 'Nov 03, 2021, 12:30 p.m. ET'
df.iloc[6305].date = 'Aug 21, 2018, 10:00 a.m. ET'
df.iloc[7408].date = 'Jul 25, 2019, 9:00 a.m. ET'
df.iloc[7629].date = 'Aug 16, 2018, 9:00 a.m. ET'
df.iloc[9046].date = 'Aug 9, 2018, 9:00 a.m. ET'
df.iloc[9718].date = 'Apr 30, 2021, 10:00 a.m. ET'
df.iloc[9919].date = 'Oct 22, 2020, 10:00 a.m. ET'

In [None]:
msk = df.date.apply(lambda x: x[-2:] != 'ET')
dates = df[msk].date.apply(lambda x: x[-1])
df.loc[msk, 'date'] = dates

In [None]:
def preprocess(obs_date):
    while obs_date[0] == ' ':
        obs_date = obs_date[1:]

    dt_splt = obs_date.split(' ')[:-1]

    if dt_splt[0][-1] == '.':
        dt_splt[0] = dt_splt[0][:-1]

    change_month = {
        "March": "Mar",
        "December": "Dec",
        "February": "Feb",
        "April": "Apr",
        "November": "Nov",
        "July": "Jul",
        "June": "Jun",
        "October": "Oct",
        "August": "Aug",
        "Sept": "Sep",
        "TranscriptMarch": "Mar",
    }

    if len(dt_splt[0]) != 3:
        dt_splt[0] = change_month[dt_splt[0]]

    change_time = {
        'p.m.': "PM", 'a.m.': "AM", 'AM.': "AM",
    }
    dt_splt[-1] = change_time[dt_splt[-1]]

    if dt_splt[2][-1] != ',':
        dt_splt[2] += ','

    joined_date = ' '.join(dt_splt[:-2])
    return datetime.strptime(joined_date, "%b %d, %Y,")#, %I:%M %p")


df.loc[:, 'date'] = df.date.apply(preprocess)
df.head()

Unnamed: 0,date,exchange,q,ticker,transcript
0,2020-08-27 00:00:00,NASDAQ: BILI,2020-Q2,BILI,"Prepared Remarks:\nOperator\nGood day, and wel..."
1,2020-07-30 00:00:00,NYSE: GFF,2020-Q3,GFF,Prepared Remarks:\nOperator\nThank you for sta...
2,2019-10-23 00:00:00,NASDAQ: LRCX,2020-Q1,LRCX,Prepared Remarks:\nOperator\nGood day and welc...
3,2019-11-06 00:00:00,NASDAQ: BBSI,2019-Q3,BBSI,"Prepared Remarks:\nOperator\nGood day, everyon..."
4,2019-08-07 00:00:00,NASDAQ: CSTE,2019-Q2,CSTE,Prepared Remarks:\nOperator\nGreetings and wel...


In [None]:
df.loc[df.ticker == 'BIO.B', "ticker"] = "BIO-B"
df.loc[df.ticker == 'BF.B', "ticker"] = "BF-B"
df.loc[df.ticker == 'COUP', "ticker"] = "COUP.MX"

In [None]:
start_date = '2017-11-01'
end_date = '2023-11-01'
tickers = df.ticker.unique().tolist()
stocks = yf.download(tickers, start=start_date, end=end_date)['Close']
stocks.head()

[*********************100%%**********************]  2876 of 2876 completed
ERROR:yfinance:
368 Failed downloads:
ERROR:yfinance:['MR', 'HZN', 'PTR', 'MGI', 'HHC', 'ISEE', 'AFYA)', 'WETF', 'EVOP', 'TMX', 'RDS.A', 'ANH', 'BKI', 'DRNA', 'FOCS', 'NATI', 'AVLR', 'ARGO', 'CTXS', 'DYNC)', 'AKYA)', 'SRC', 'DEN', 'HTA', 'ITCB', 'VTOL)', 'FOE', 'ZEAL', 'HNGR', 'STOR', 'LCI', 'DISCA', 'GPP', 'CRY', 'HCRS.Q', 'BCEI', 'UMPQ', 'STAR', 'XLRN', 'WEBR', 'BCOR', 'NTCO', 'JT', 'NLTX', 'PRTK', 'PPD', 'HMLP', 'AX.DL', 'DISH', 'KL', 'OSH', 'SI', 'VIAB', 'VALN)', 'STON', 'SGFY)', 'ARNA', 'UIHC', 'CERN', 'OPB', 'ZNGA', 'ATHX', 'ACC', 'CIB)', 'WBT', 'MNRL', 'FTCH', 'CVIA', 'ESTA)', 'TEN', 'ROLL', 'GLOP', 'WBK', 'KRT)', 'RAIN)', 'CUB', 'IEA', 'AAIC', 'AEGN', 'GWGH)', 'AINV', 'RE', 'CTK', 'BHG', 'RAIN', 'NCR', 'TGP', 'ECHO', 'ADV)', 'CHNG', 'ATC)', 'DSPG', 'RADA', 'FRC', 'OFC', 'SUMO', 'PING', 'TA', 'UNVR', 'MANT', 'CSII', 'SGEN', 'NVTA', 'GMLP', 'STL', 'TRTN', 'MXIM', 'RRD', 'ESMT', 'TRQ', 'WKME)', 'WDR', 'ROCC

Ticker,A,AA,AAIC,AAL,AAN,AAOI,AAON,AAP,AAPL,AAT,...,ZLND.Y,ZM,ZNGA,ZS,ZTS,ZUMZ,ZUO,ZVIA,ZY,ZYXI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-11-01,68.199997,48.91,,48.0,,40.41,23.5,82.330002,42.467499,37.25,...,,,,,64.199997,17.75,,,,2.627273
2017-11-02,68.080002,47.68,,47.790001,,39.080002,22.866667,81.019997,41.650002,37.810001,...,,,,,64.300003,17.5,,,,2.445455
2017-11-03,68.089996,47.25,,47.360001,,39.299999,22.866667,81.389999,43.5,38.799999,...,,,,,67.769997,17.75,,,,2.527273
2017-11-06,68.269997,47.200001,,47.459999,,37.580002,23.0,81.529999,43.092499,38.349998,...,,,,,68.919998,17.950001,,,,2.5
2017-11-07,68.32,47.110001,,47.509998,,37.98,23.1,80.660004,43.477501,38.639999,...,,,,,69.730003,17.6,,,,2.481818


In [None]:
no_data_stocks = stocks.columns[stocks.isna().sum(axis=0) == stocks.shape[0]]

In [None]:
no_data_stocks

Index(['AAIC', 'AAWW', 'ABB', 'ABC', 'ABTX', 'ACC', 'ACRX', 'ADMS', 'ADV)',
       'AEGN',
       ...
       'XLNX', 'XLRN', 'XM', 'YELL', 'ZEAL', 'ZEN', 'ZIXI', 'ZLND.Y', 'ZNGA',
       'ZY'],
      dtype='object', name='Ticker', length=368)

In [None]:
missed_msk = df.ticker.apply(lambda x: x in no_data_stocks)
df = df[~missed_msk].copy(deep=True)

In [None]:
def calc_returns(elem):
    cur_ticker = elem['ticker']
    cur_date = elem['date']

    res = {}
    msk = (stocks.index >= cur_date)
    cur_pr = stocks.loc[msk, cur_ticker].iloc[0]

    for end_date, title in zip(
        [1, 7, 30],
        ['daily_r', 'weekly_r', 'monthly_r']
        ):

        nxt_pr = stocks.loc[msk, cur_ticker].iloc[end_date]

        x = 1
        while nxt_pr != nxt_pr:
            nxt_pr = stocks.loc[msk, cur_ticker].iloc[end_date + x]
            x += 1

        res[title] = (nxt_pr - cur_pr) / cur_pr

    return pd.Series(res)


ret_stat = df.apply(calc_returns, axis=1)
df = pd.concat([df, ret_stat], axis=1)

In [None]:
df = df.dropna()
df.isna().sum()

date          0
exchange      0
q             0
ticker        0
transcript    0
daily_r       0
weekly_r      0
monthly_r     0
dtype: int64

In [None]:
df.to_pickle('/content/drive/MyDrive/ML_EPFL_Project/returns.pkl')
stocks.to_csv('/content/drive/MyDrive/ML_EPFL_Project/stocks.csv')

In [None]:
df.shape

(15801, 8)

# Load market

In [2]:
start_date = '2017-11-01'
end_date = '2023-11-01'
snp500 = yf.download(['SPY', 'AAPL'], start=start_date, end=end_date)['Close']
snp500.drop(columns=['AAPL'], inplace=True)
snp500.head()

[*********************100%%**********************]  2 of 2 completed


Ticker,SPY
Date,Unnamed: 1_level_1
2017-11-01,257.48999
2017-11-02,257.589996
2017-11-03,258.450012
2017-11-06,258.850006
2017-11-07,258.670013


In [3]:
snp500['ret_mar'] = (snp500 - snp500.shift(1)) / snp500.shift(1)
snp500.fillna(0, inplace=True)

In [5]:
snp500.to_csv('/content/drive/MyDrive/ML_EPFL_Project/market.csv')

# Text preprocessing

In [None]:
import re

from nltk.tokenize import TweetTokenizer
from nltk import WordNetLemmatizer
from nltk.corpus import  stopwords
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

def remove_stopwords(text, stop_words):
    word_tokens = text.split()
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)


tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


def preprocess(text):
    text = ' '.join(tokenizer.tokenize(text.lower()))
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'https?:\/\/\S+', '', text)

    text = remove_stopwords(text, stop_words)
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
from sklearn.model_selection import train_test_split
texts_train, texts_test = train_test_split(df, test_size=0.33, random_state=228)

texts_train.shape, texts_test.shape

((10586, 8), (5215, 8))

In [None]:
texts_train.loc[:, 'transcript'] = texts_train.transcript.apply(preprocess)

In [None]:
texts_test.loc[:, 'transcript'] = texts_test.transcript.apply(preprocess)

In [None]:
texts_train.to_pickle('/content/drive/MyDrive/ML_EPFL_Project/train.pkl')
texts_test.to_pickle('/content/drive/MyDrive/ML_EPFL_Project/test.pkl')