In [183]:
%matplotlib inline
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from datetime import datetime, timedelta
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import Word2Vec

In [186]:
def process_text(path,method="tfidf",period=1,max_words=1000):
    """ Load pickled news headlines and turn them into a word representation.

    Parameters: 
    path - The path to the data. Should be a pandas df saved as a .pkl-file that
           contains a 'date' column (used as the index) and a 'title' column.
    method - The method for tokenizing. Only 'tfidf' is currently implemented by
             tokenize(); 'word2vec' is recognised but not yet implemented.
    period - The period to concatenate titles in, expressed in days, at least 1 day. 
    max_words - Vocabulary size limit handed to tokenize().

    Returns: 
    A tuple (tokens, index): the token array produced by tokenize() and the index of
    the concatenated samples (the start date of every period).

    Raises:
    ValueError - If tokenize() produces no result for the requested method.
    """
    # NOTE(security): pd.read_pickle can execute arbitrary code while unpickling;
    # only load files from a trusted source.
    news_data = pd.DataFrame(pd.read_pickle(path)).set_index('date')
    print("Found {} days with {} news titles.".format(len(set(news_data.index)), len(news_data.index)))
    result = tokenize(news_data,max_words,method,period)
    if result is None:
        # tokenize() signals unsupported / unimplemented methods by returning None;
        # fail with a clear message instead of an opaque tuple-unpacking TypeError.
        raise ValueError("Tokenizing method {!r} produced no tokens.".format(method))
    tokens, data = result
    return tokens, data.index

def concat_news(news,period):
    """Merges news headlines over a period of days as one single list of words.

    Every headline is tokenized with word_tokenize() from nltk; the words of all
    headlines in a period are then joined and normalised with
    gensim.utils.simple_preprocess(), so every row is processed the same way.

    Periods are consecutive, non-overlapping, half-open windows
    [start, start + period days), beginning at the earliest calendar day in the
    index. Each headline therefore ends up in exactly one period. Periods without
    any headline are kept as rows holding an empty list.

    Parameters: 
    news (DataFrame): News data with a 'title' column (str) indexed by datetime.   
    period (int): Number of days in each period, at least 1. 

    Returns: 
    DataFrame with a single 'titles' column of word lists, indexed by 'date'
    (the first day of each period).

    Raises:
    ValueError: If period is smaller than 1 (a zero-length window would never advance).
    """
    if period < 1:
        raise ValueError("period must be at least 1 day.")

    columns = ['titles', 'date']
    if len(news.index) == 0:
        return pd.DataFrame([], columns=columns).set_index('date')

    delta = timedelta(days=period)
    # Tokenize every headline once up front instead of once per period.
    all_titles = news['title'].apply(word_tokenize)
    # Compare on calendar days so a time-of-day component cannot split or duplicate a day.
    days = pd.to_datetime(news.index).normalize()
    start = days.min()
    last_day = days.max()

    # Collect plain records and build the frame once; growing a DataFrame row by
    # row is quadratic and DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    while start <= last_day:
        stop = start + delta
        # Half-open window: the boundary day belongs to the next period only.
        in_period = (days >= start) & (days < stop)
        period_words = [word for title in all_titles[in_period] for word in title]
        records.append({
            'date': start,
            'titles': gensim.utils.simple_preprocess(' '.join(period_words)),
        })
        start = stop

    return pd.DataFrame(records, columns=columns).set_index('date')

def tokenize(news_data, max_words,method='tfidf',period=1):
    """Concatenates headlines per period and converts them to a word representation.

    Parameters: 
    news_data (DataFrame): News data with a 'title' column, indexed by datetime.
    max_words (int): Passed to the Keras Tokenizer as num_words (vocabulary size limit).
    method (str): 'tfidf' is implemented; 'word2vec' is recognised but not implemented.
    period (int): Number of days merged into each sample, forwarded to concat_news().

    Returns: 
    For 'tfidf': a tuple (tokens, data) with the matrix from
    Tokenizer.texts_to_matrix(mode='tfidf') and the concatenated DataFrame.
    For 'word2vec' and unknown methods: None, after printing a message.
    """
    data = concat_news(news_data, period) 
    n_samples = len(data.index)
    # Guard the average so reporting on an empty result cannot raise ZeroDivisionError.
    avg_titles = round(len(news_data.index) / n_samples,4) if n_samples else 0
    print("Concatinated to {} samples with an average of {} titles per sample.".format(n_samples,avg_titles))

    if method == 'tfidf':    
        tokenizer = Tokenizer(num_words = max_words)
        tokenizer.fit_on_texts(data['titles'])
        tokens = tokenizer.texts_to_matrix(data['titles'],mode='tfidf')
        return tokens, data

    if method == 'word2vec':
        # TODO: implement. The earlier sketch tokenized every headline with
        # gensim.utils.tokenize(lowercase=True, deacc=True) and trained a Word2Vec
        # model (150 dimensions, 10 iterations), but discarded the model and returned
        # nothing. Training is skipped until the result is actually used.
        print("Not yet implemented.")
        return None

    print("Unknown tokenizing method.")
    return None

In [182]:
#p = r"./Datasets/data/financial_headlines_20061020-20131119.pkl"
#tokens = process_text(p)