In [60]:
import os
import datetime
import numpy as np
import re
import warnings
import pandas as pd

# Silence all warnings for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides pandas deprecation/FutureWarning messages - consider narrowing the filter.
warnings.filterwarnings("ignore")

In [140]:
def load_data(data_path):
    """Load the Reuters news table and the stock price table from a directory.

    Parameters
    ----------
    data_path : str or os.PathLike
        Directory that contains ``news_reuters.csv`` and ``stockprices.json``.
        A trailing path separator is optional.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(stock_news_df, stock_price_df)``. The news frame is read without a
        header row and is given the columns ``tickers``, ``company``, ``date``,
        ``headline``, ``first_sent`` and ``priority``. The price frame is
        whatever ``pd.read_json`` produces for ``stockprices.json``.
    """
    # Join path components instead of concatenating strings, so that both
    # "inputs" and "inputs/" resolve to the same files.
    news_file = os.path.join(data_path, 'news_reuters.csv')
    price_file = os.path.join(data_path, 'stockprices.json')

    stock_news_df = pd.read_csv(news_file, header=None,
                                names=['tickers', 'company', 'date', 'headline', 'first_sent', 'priority'])
    stock_price_df = pd.read_json(price_file)

    return stock_news_df, stock_price_df

In [151]:
def transform_stock_price(price_df, duration, threshold=1):
    """Reshape one duration column of the price table into long format.

    Each cell of ``price_df[duration]`` is expanded with ``pd.Series`` into one
    column per key, and the result is stacked into one row per
    (row label, key) pair. The row label becomes ``tickers`` and the key
    becomes ``date`` (cast to ``int64``).
    # NOTE(review): assumes each cell is a mapping of date -> price change;
    # confirm against the layout of stockprices.json.

    Parameters
    ----------
    price_df : pandas.DataFrame
        Price table with one column per duration.
    duration : str
        Name of the column to transform, e.g. ``'short'``.
    threshold : float, optional
        Magnitude at which a change stops being labelled ``"stay"``.
        Only used when ``duration == 'short'``. Defaults to ``1``.

    Returns
    -------
    pandas.DataFrame
        Columns ``tickers``, ``date`` and ``price_change_<duration>``; for
        ``duration == 'short'`` an extra ``signal`` column holding ``"up"``
        (change >= threshold), ``"down"`` (change <= -threshold) or ``"stay"``.
    """
    change_col = 'price_change' + '_' + duration

    wide_df = price_df[duration].apply(pd.Series)
    transform_df = (
        wide_df.stack()
        # Explicit dropna keeps absent (ticker, date) pairs out of the result
        # even on pandas versions whose stack() no longer drops NaN itself.
        .dropna()
        .rename(change_col)
        .reset_index()
        .rename(columns={'level_0': 'tickers', 'level_1': 'date'})
    )
    transform_df['date'] = transform_df['date'].astype('int64')

    if duration == 'short':
        def _signal(change):
            # Inclusive, symmetric bounds: a change of exactly +threshold is
            # "up" and exactly -threshold is "down".
            if change >= threshold:
                return "up"
            if change <= -threshold:
                return "down"
            return "stay"

        transform_df['signal'] = transform_df[change_col].map(_signal)
    return transform_df

def combine_stock_news(news_df, price_df):
    """Attach the price changes of every duration to the news rows.

    For each column of ``price_df`` the long-format frame produced by
    ``transform_stock_price`` is inner-joined onto the running result using
    the ``date`` and ``tickers`` columns as keys. ``news_df`` itself is not
    modified; the joins start from a copy of it.

    Returns the joined DataFrame.
    """
    join_keys = ['date', 'tickers']
    result = news_df.copy()

    for column_name in price_df.columns:
        long_prices = transform_stock_price(price_df, column_name)
        # Inner join: only rows matching on both keys are kept.
        result = result.merge(long_prices, on=join_keys, how='inner')

    return result
    

In [152]:
# Directory in which load_data looks for news_reuters.csv and stockprices.json.
data_path = "inputs/"
# Read the news table and the price table.
news_df, price_df = load_data(data_path)

# Join the news rows with the per-duration price changes on (date, tickers).
combined_df = combine_stock_news(news_df, price_df)
