In [18]:
import itertools
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
def load_data(path):
    """Read the tab-separated file at ``path`` and return it as a DataFrame."""
    return pd.read_csv(path, sep="\t")
    
# Tab-separated input file, given relative to the notebook's working directory.
path = '../datasets/newyork-stock-exchange/combined-news-stock.txt'
raw_data = load_data(path)
# Preview the first five rows to confirm the columns were parsed.
print(raw_data.iloc[:5])

         Date  Label                                               News
0  2008-08-08   True  b"Georgia 'downs two Russian warplanes' as cou...
1  2008-08-11   True  b'Why wont America and Nato help us? If they w...
2  2008-08-12  False  b'Remember that adorable 9-year-old who sang a...
3  2008-08-13  False  b' U.S. refuses Israel weapons to attack Iran:...
4  2008-08-14   True  b'All the experts admit that we should legalis...


In [28]:
def _strip_bytes_literal(headline):
    """Remove the ``b'...'`` / ``b"..."`` wrapper left by a bytes repr.

    Only the literal prefix and its matching closing quote are removed.
    ``str.strip('"b')`` is not suitable here: it strips a *set of characters*,
    so it also eats genuine leading/trailing ``b`` letters ("bomb" -> "bom")
    and never removes single quotes.
    """
    headline = headline.strip()
    for quote in ("'", '"'):
        prefix = "b" + quote
        if headline.startswith(prefix):
            headline = headline[len(prefix):]
            if headline.endswith(quote):
                headline = headline[:-1]
            break
    return headline


def clean_data(raw_data, tokenizer=None):
    """Split each row's news into headlines and reduce them to lowercase words.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must provide a ``'Date'`` and a ``'News'`` column. ``'News'`` holds the
        headlines of one day joined by the ``"<.>"`` separator.
    tokenizer : callable, optional
        Function mapping a headline string to a list of token strings.
        Defaults to ``nltk.tokenize.word_tokenize``.

    Returns
    -------
    dict
        Maps each date to a list of headlines, each headline being a list of
        lowercase, purely alphabetic tokens. Stop words are not removed. If a
        date occurs more than once, the last row wins. Rows whose ``'News'``
        value is not a string (e.g. NaN) map to an empty list.
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    all_news = {}
    # Only two columns are needed, so iterate over them directly instead of
    # materialising a Series per row with iterrows().
    for date, news in zip(raw_data['Date'], raw_data['News']):
        if not isinstance(news, str):
            # Missing news for this day: record it as having no headlines.
            all_news[date] = []
            continue

        daily_headlines = []
        for headline in news.split("<.>"):
            # Convert the sentence into tokens.
            tokens = tokenizer(_strip_bytes_literal(headline))
            # Convert to lowercase.
            lowered = [token.lower() for token in tokens]
            # Drop every token that is not purely alphabetic.
            words = [word for word in lowered if word.isalpha()]
            daily_headlines.append(words)
        all_news[date] = daily_headlines
    return all_news


cleaned_data = clean_data(raw_data)
# Number of distinct dates that were cleaned.
print(len(cleaned_data))
# Show the headlines of the first date only (nothing is printed when empty).
for date in itertools.islice(cleaned_data, 1):
    print(cleaned_data[date])

1989
[['georgia', 'two', 'russian', 'warplanes', 'as', 'countries', 'move', 'to', 'brink', 'of', 'war'], ['musharraf', 'to', 'be', 'impeached'], ['today', 'columns', 'of', 'troops', 'roll', 'into', 'south', 'ossetia', 'footage', 'from', 'fighting', 'youtube'], ['tanks', 'are', 'moving', 'towards', 'the', 'capital', 'of', 'south', 'ossetia', 'which', 'has', 'reportedly', 'been', 'completely', 'destroyed', 'by', 'georgian', 'artillery', 'fire'], ['afghan', 'children', 'raped', 'with', 'official', 'says', 'this', 'is', 'sick', 'a', 'three', 'year', 'old', 'was', 'raped', 'and', 'they', 'do', 'nothing'], ['russian', 'tanks', 'have', 'entered', 'south', 'ossetia', 'whilst', 'georgia', 'shoots', 'down', 'two', 'russian', 'jets'], ['breaking', 'georgia', 'invades', 'south', 'ossetia', 'russia', 'warned', 'it', 'would', 'intervene', 'on', 'so', 'side'], ['the', 'combatent', 'trials', 'are', 'nothing', 'but', 'a', 'sham', 'salim', 'haman', 'has', 'been', 'sentenced', 'to', 'years', 'but', 'will

In [14]:
def dictionary(cleaned_data, threshold):
    """Build the vocabulary from the cleaned headlines.

    Parameters
    ----------
    cleaned_data : dict
        Maps a date to a list of headlines, each headline a list of words.
    threshold : int
        Minimum number of occurrences a word needs to enter the vocabulary.

    Returns
    -------
    id_to_word : list of str
        ``'<pad>'`` at index 0, then every word whose count is at least
        ``threshold`` in order of first appearance, then ``'<unk>'`` last.
    word_to_id : dict
        Inverse mapping from each entry of ``id_to_word`` to its index.
    """
    # Flatten date -> headlines -> words lazily; from_iterable avoids
    # unpacking every headline as a separate positional argument.
    headlines = itertools.chain.from_iterable(cleaned_data.values())
    word_freq = Counter(itertools.chain.from_iterable(headlines))

    id_to_word = ['<pad>']
    id_to_word.extend(word for word, cnt in word_freq.items() if cnt >= threshold)
    id_to_word.append('<unk>')
    word_to_id = {word: idx for idx, word in enumerate(id_to_word)}

    return id_to_word, word_to_id

# Words occurring fewer times than this are left out of the vocabulary.
MIN_WORD_COUNT = 5
id_to_word, word_to_id = dictionary(cleaned_data, MIN_WORD_COUNT)
# Report the vocabulary size and a short sample instead of dumping the
# whole mapping into the cell output.
print(len(id_to_word))
print(id_to_word[:10])



In [41]:
def tokenization(cleaned_data, word_to_id):
    """Convert every cleaned headline into a list of vocabulary ids.

    Words missing from ``word_to_id`` are replaced by the id stored under
    ``'<unk>'``.

    Returns
    -------
    tokens : list
        One entry per date; each entry is a list of id lists, one per headline.
    dates : numpy.ndarray
        The keys of ``cleaned_data`` in iteration order.
    lengths : list of int
        Token count of every headline, flattened across all dates.
    """
    def encode(headline):
        # Fall back to the '<unk>' id for out-of-vocabulary words.
        return [
            word_to_id[word] if word in word_to_id else word_to_id['<unk>']
            for word in headline
        ]

    tokens = []
    seen_dates = []
    lengths = []
    for day in cleaned_data:
        encoded_day = [encode(headline) for headline in cleaned_data[day]]
        lengths.extend(len(ids) for ids in encoded_day)
        tokens.append(encoded_day)
        seen_dates.append(day)

    return tokens, np.array(seen_dates), lengths

tokens, dates, lengths = tokenization(cleaned_data, word_to_id)
# Number of dates that were tokenized.
print(len(tokens))
# Headline count and token ids for the first date.
first_day_tokens = tokens[0]
print(len(first_day_tokens))
print(first_day_tokens)

1989
25
[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [12, 8, 13, 11167], [14, 11167, 10, 15, 16, 17, 18, 19, 20, 21, 22, 23], [24, 25, 26, 27, 28, 29, 10, 18, 19, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39], [40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 42, 54, 55, 56, 57], [3, 24, 58, 59, 18, 19, 60, 1, 61, 62, 2, 3, 63], [64, 1, 65, 18, 19, 66, 67, 68, 69, 70, 71, 72, 73], [28, 11167, 74, 25, 57, 75, 49, 76, 11167, 11167, 31, 33, 77, 8, 78, 75, 79, 13, 80, 81, 82, 83, 84, 55, 85, 86, 68], [15, 87, 21, 11167, 29, 11167, 88, 89, 90, 91, 92, 93], [28, 11167, 1, 94, 11, 43, 66], [95, 96, 97, 94, 98, 8, 99, 100, 45, 31, 101, 102, 103, 104, 105, 106], [107, 108, 109, 71, 110, 10, 111, 112, 113, 28, 114], [72, 54, 1, 25, 115, 11, 54, 28, 116, 117, 118, 47, 119, 120, 10, 28, 121, 122, 49, 123, 124, 54, 125, 126, 127, 10, 28, 128, 10, 129], [130, 131, 132, 8, 133, 134, 10, 135, 6, 136], [137, 11, 138, 139, 14], [65, 18, 19, 140, 66, 141, 142, 79, 143, 11167, 1, 54, 144, 49, 145, 146, 11], [14