In [1]:
import torch
import os
import pandas as pd
import numpy as np

In [2]:
tensor_dir = '../../Desktop/FullNewsBatchedTensors'
file_paths = os.listdir(tensor_dir)

loaded_tensors = []
# Load tensors from files
for path in file_paths:
    loaded_tensor = torch.load(f"{tensor_dir}/{path}")
    loaded_tensors.append(loaded_tensor)

In [3]:
combined_tensor = torch.cat(loaded_tensors, dim = 0)
combined_tensor

tensor([[0.0338, 0.3513, 0.6149],
        [0.0201, 0.9283, 0.0516],
        [0.7294, 0.0246, 0.2459],
        ...,
        [0.4446, 0.4888, 0.0666],
        [0.1363, 0.0203, 0.8434],
        [0.1363, 0.0203, 0.8434]], device='mps:0')

In [4]:
partner_headlines = pd.read_csv('./archive/raw_partner_headlines.csv')
benzinga_headlines = pd.read_csv('./archive/raw_analyst_ratings.csv')
headlines = pd.concat([partner_headlines, benzinga_headlines]).drop('Unnamed: 0', axis = 1)
headlines

Unnamed: 0,headline,url,publisher,date,stock
0,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01 00:00:00,A
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18 00:00:00,A
2,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15 00:00:00,A
3,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15 00:00:00,A
4,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12 00:00:00,A
...,...,...,...,...,...
1407323,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29 00:00:00,ZX
1407324,Recap: Wednesday's Top Percentage Gainers and ...,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22 00:00:00,ZX
1407325,UPDATE: Oppenheimer Color on China Zenix Auto ...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21 00:00:00,ZX
1407326,Oppenheimer Initiates China Zenix At Outperfor...,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21 00:00:00,ZX


In [5]:
five_yr_hls = headlines.loc[(headlines['date'] >= "2010-01-01") & (headlines['date'] < "2015-01-01")]

In [6]:
positive = combined_tensor[:, 0].tolist()
negative = combined_tensor[:, 1].tolist()
neutral = combined_tensor[:, 2].tolist()


table = {'Headline':five_yr_hls['headline'].to_list(),
         'Ticker': five_yr_hls['stock'].to_list(),
         'Date': five_yr_hls['date'].to_list(),
         "Positive":positive,
         "Negative":negative, 
         "Neutral":neutral}
      
df = pd.DataFrame(table, columns = ["Headline", "Ticker", "Date", "Positive", "Negative", "Neutral"])

In [7]:
df['sentiment_embedding'] = np.tanh((np.log(df['Positive'] / df['Negative'])/ df['Neutral']))

In [9]:
means_df = df.groupby(['Ticker', 'Date']).agg({'sentiment_embedding': 'mean'}).reset_index()
means_df

Unnamed: 0,Ticker,Date,sentiment_embedding
0,A,2010-02-12 00:00:00,-0.866453
1,A,2010-02-17 00:00:00,-1.000000
2,A,2010-03-10 00:00:00,0.972619
3,A,2010-03-11 00:00:00,-0.626725
4,A,2010-03-23 00:00:00,-1.000000
...,...,...,...
593026,ZX,2014-10-22 00:00:00,-0.737813
593027,ZX,2014-11-11 00:00:00,0.998910
593028,ZX,2014-11-13 00:00:00,0.998910
593029,ZX,2014-12-19 00:00:00,0.677465


In [10]:
means_df.to_csv('./news_sentiment_data_5.csv')

In [13]:
with open('news_cmpy_list.txt', 'w') as f:
    for ticker in means_df['Ticker'].unique().tolist():
        f.write(f"{ticker}\n")