### Imports

In [2]:
import numpy as np
import pandas as pd

import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist

import os
from tqdm import tqdm

import pickle

### Aggregate News by Date

In [None]:
# Create the output folder for the per-ticker aggregated news on first run.
if not os.path.exists('reuters_news_aggregated'):
    os.makedirs('reuters_news_aggregated')

# Aggregate as list: one row per (ticker, date), with the remaining columns
# collected into Python lists.
for ticker in tqdm(os.listdir('reuters_news')):
    news = pd.read_pickle('reuters_news/' + ticker, 'bz2')

    # Fuse headline and body into a single text column; the headline is
    # stringified first, the body is used as stored.
    headline = news['head'].map(str)
    news['news'] = headline + '\n' + news['body']
    news = news.drop(columns=['head', 'body', 'top'])

    # sort=False keeps the groups in order of first appearance.
    grouped = news.groupby(['ticker', 'date'], as_index=False, sort=False)
    news = grouped.agg(list)

    pd.to_pickle(news, 'reuters_news_aggregated/' + ticker, 'bz2')


# Further aggregate as a single string: join each row's list of news texts
# with spaces and flatten the embedded newlines to spaces.
for ticker in tqdm(os.listdir('reuters_news_aggregated')):
    path = 'reuters_news_aggregated/' + ticker
    try:
        news = pd.read_pickle(path, 'bz2')
        # .str.replace substitutes inside every text. A plain Series.replace
        # would only touch cells whose whole value is exactly '\n'.
        news['news'] = (news['news'].str.join(' ')
                                    .str.replace('\n', ' ', regex=False))
    except Exception as err:
        # Best effort: report and skip. Writing here would overwrite this
        # ticker's file with the frame left over from the previous iteration.
        tqdm.write('Skipping {}: {!r}'.format(ticker, err))
        continue

    pd.to_pickle(news, path, 'bz2')

### Generate Reuters Corpus

In [None]:
# Collect one text chunk per ticker and join them once at the end; growing
# the corpus with `corpus = corpus + ...` re-copies the whole string on every
# iteration.
corpus_parts = []
for ticker in tqdm(os.listdir('reuters_news_aggregated')):
    try:
        news = pd.read_pickle('reuters_news_aggregated/' + ticker, 'bz2')
        # Every chunk ends with a space so adjacent tickers stay separated.
        corpus_parts.append(' '.join(news['news']).replace('\n', ' ') + ' ')
    except Exception as err:
        # Best effort: a ticker that cannot be read or joined is reported and
        # left out of the corpus instead of being dropped silently.
        tqdm.write('Skipping {}: {!r}'.format(ticker, err))

corpus = ''.join(corpus_parts)

# The context manager closes the file handle once the dump is complete.
with open('reuters_news_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file, protocol=pickle.HIGHEST_PROTOCOL)

### Get Top Financial Words

In [2]:
# Reload the pickled corpus; the context manager closes the file handle
# instead of leaving it open until garbage collection.
with open('reuters_news_corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

In [9]:
# Get stop words: NLTK's English list plus single punctuation characters.
stop = stopwords.words('english') + list(string.punctuation)
# Set copy for O(1) membership tests; `stop` itself stays a list.
stop_set = set(stop)
# Tokenize corpus and remove stop words
tcorpus = [w for w in word_tokenize(corpus.lower()) if w not in stop_set]
# Optional step, currently disabled: treat the 65 most frequent tokens as
# "financial stop words" and filter them out as well.
# stop_fin = [x[0] for x in FreqDist(tcorpus).most_common(65)]
# tcorpus = [w for w in tcorpus if w not in stop_fin]
# Get top financial words: the 20000 most frequent remaining tokens,
# most frequent first.
top_fin_words = [x[0] for x in FreqDist(tcorpus).most_common(20000)]
with open('reuters_top_fin_words.pkl', 'wb') as words_file:
    pickle.dump(top_fin_words, words_file, protocol=pickle.HIGHEST_PROTOCOL)
# Read the list back from disk so the following cells can also be run from
# the saved file.
with open('reuters_top_fin_words.pkl', 'rb') as words_file:
    top_fin_words = pickle.load(words_file)


### Keep only Top Financial Words in News

In [10]:
os.makedirs('reuters_news_filtered', exist_ok=True)

# Set copy of the vocabulary: the per-token membership test becomes O(1)
# instead of a scan through the whole word list.
top_fin_set = set(top_fin_words)


def _keep_top_words(text):
    """Tokenize the lower-cased *text* and keep only tokens in the vocabulary."""
    return [w for w in word_tokenize(text.lower()) if w in top_fin_set]


for ticker in tqdm(os.listdir('reuters_news_aggregated')):
    try:
        news = pd.read_pickle('reuters_news_aggregated/' + ticker, 'bz2')
        news['news'] = news['news'].apply(_keep_top_words)
    except Exception as err:
        # Best effort: report and skip. Writing here would store the previous
        # ticker's frame under this ticker's file name.
        tqdm.write('Skipping {}: {!r}'.format(ticker, err))
        continue

    pd.to_pickle(news, 'reuters_news_filtered/' + ticker, 'bz2')

100%|██████████████████████████████████████████████████████████████████████████████| 2529/2529 [06:13<00:00,  6.78it/s]


### Label News

In [3]:
os.makedirs('reuters_news_labeled', exist_ok=True)

skipped = 0
for ticker in tqdm(os.listdir('reuters_news_filtered')):
    try:
        news = pd.read_pickle('reuters_news_filtered/' + ticker, 'bz2')

        # Keep only the date and the `rel_short` column, renamed to the
        # label column `Y`.
        labels = pd.read_csv('return_labels/' + ticker + '.csv')[['Date', 'rel_short']]
        labels.columns = ['date', 'Y']

        # Reduce both keys to plain calendar dates so the merge below matches.
        news['date'] = pd.to_datetime(news['date']).dt.date
        labels['date'] = pd.to_datetime(labels['date']).dt.date
    except Exception:
        # Tickers without a usable news or label file are skipped on purpose.
        # `except Exception` (not a bare `except`) lets Ctrl-C still stop the loop.
        skipped += 1
        continue

    # Inner merge: only dates present in both frames are kept.
    pd.to_pickle(news.merge(labels, on='date'), 'reuters_news_labeled/' + ticker, 'bz2')

tqdm.write('Skipped {} ticker(s) that could not be labeled'.format(skipped))
        

100%|██████████████████████████████████████████████████████████████████████████████| 2529/2529 [01:43<00:00, 24.45it/s]


### Make a Single DataFrame out of all the News

In [4]:
# Collect every labeled frame and concatenate once; calling pd.concat inside
# the loop re-copies all previously accumulated rows on each iteration.
# The empty frame pins the leading column order and covers an empty folder.
frames = [pd.DataFrame(columns=['ticker', 'news', 'Y'])]
for ticker in tqdm(os.listdir('reuters_news_labeled')):
    news = pd.read_pickle('reuters_news_labeled/' + ticker, 'bz2')
    frames.append(news.drop(columns=['date']))

reuters = pd.concat(frames, sort=False)
# Drop rows with any missing value, then renumber the index from 0.
reuters = reuters.dropna().reset_index(drop=True)
pd.to_pickle(reuters, 'reuters_news_concatenated.pkl', 'bz2')

100%|███████████████████████████████████████████████████████████████████████████████| 918/918 [00:04<00:00, 218.95it/s]
