In [138]:
import pandas as pd, numpy as np, datetime, math
from textblob import TextBlob as tb
from sklearn.preprocessing import StandardScaler
from dateutil.relativedelta import *

In [116]:
filename = 'us_equities_news_dataset.csv' #not uploaded on github as of now – download dataset from https://www.kaggle.com/gennadiyr/us-equities-news-data?select=us_equities_news_dataset.csv
df = pd.read_csv('./news_data/' + filename, index_col=0) #load in data from news_data subdirectory

In [117]:
def date_parse(s): #date strings are in form '2015-01-01'
    d = s.split('-')
    return datetime.date(int(d[0]), int(d[1]), int(d[2]))

df['release_date'] = df['release_date'].map(date_parse)
df = df.query('release_date > datetime.date(2014, 12, 25)')

In [126]:
sents = [tb(x).sentiment for x in df['title'].values]
sents

[Sentiment(polarity=0.08, subjectivity=0.26999999999999996),
 Sentiment(polarity=-0.1, subjectivity=0.6),
 Sentiment(polarity=-0.2, subjectivity=0.2),
 Sentiment(polarity=0.0, subjectivity=0.0),
 Sentiment(polarity=0.0, subjectivity=0.0),
 Sentiment(polarity=0.0, subjectivity=1.0),
 Sentiment(polarity=-0.2, subjectivity=0.2),
 Sentiment(polarity=-0.2, subjectivity=0.2),
 Sentiment(polarity=0.0, subjectivity=0.0),
 Sentiment(polarity=0.0, subjectivity=0.0),
 Sentiment(polarity=0.0, subjectivity=0.0),
 Sentiment(polarity=0.0, subjectivity=0.0),
 Sentiment(polarity=0.4, subjectivity=0.8),
 Sentiment(polarity=0.0, subjectivity=0.0),
 Sentiment(polarity=-0.3333333333333333, subjectivity=0.6666666666666666),
 Sentiment(polarity=0.0, subjectivity=0.0),
 Sentiment(polarity=0.0, subjectivity=0.0),
 Sentiment(polarity=0.5, subjectivity=0.5),
 Sentiment(polarity=0.4166666666666667, subjectivity=0.4166666666666667),
 Sentiment(polarity=0.05000000000000002, subjectivity=0.5),
 Sentiment(polarity=0.

In [127]:
long_sents = [tb(str(x)).sentiment for x in df['content']]
long_sents

[Sentiment(polarity=0.139005439005439, subjectivity=0.45647824397824394),
 Sentiment(polarity=-0.2, subjectivity=0.2),
 Sentiment(polarity=-0.015909090909090914, subjectivity=0.16363636363636364),
 Sentiment(polarity=0.11272727272727275, subjectivity=0.34),
 Sentiment(polarity=0.044949494949494934, subjectivity=0.26464646464646463),
 Sentiment(polarity=0.09999999999999999, subjectivity=0.26666666666666666),
 Sentiment(polarity=-0.2, subjectivity=0.2),
 Sentiment(polarity=0.024999999999999994, subjectivity=0.275),
 Sentiment(polarity=0.4166666666666667, subjectivity=0.5666666666666667),
 Sentiment(polarity=0.2567965367965368, subjectivity=0.5661471861471862),
 Sentiment(polarity=0.07231812169312168, subjectivity=0.32320105820105816),
 Sentiment(polarity=0.06598660070088641, subjectivity=0.4362361368789939),
 Sentiment(polarity=0.06927577360910693, subjectivity=0.429748757415424),
 Sentiment(polarity=0.041767676767676766, subjectivity=0.5190404040404041),
 Sentiment(polarity=0.0227763096

In [128]:
title_df = pd.DataFrame(sents)
content_df = pd.DataFrame(long_sents)
print(title_df.mean())
print(title_df.std())
print(content_df.mean())
print(content_df.std())

polarity        0.058163
subjectivity    0.198191
dtype: float64
polarity        0.207581
subjectivity    0.270462
dtype: float64
polarity        0.062686
subjectivity    0.409219
dtype: float64
polarity        0.074419
subjectivity    0.092631
dtype: float64


In [129]:
scaled_titles = pd.DataFrame(StandardScaler().fit_transform(title_df), columns=title_df.columns)
scaled_content = pd.DataFrame(StandardScaler().fit_transform(content_df), columns=content_df.columns)

In [130]:
scaled_titles.describe()

Unnamed: 0,polarity,subjectivity
count,192888.0,192888.0
mean,7.367413e-17,-1.250987e-16
std,1.000003,1.000003
min,-5.097603,-0.7327918
25%,-0.2801946,-0.7327918
50%,-0.2801946,-0.7327918
75%,-0.01523719,0.7461655
max,4.537213,2.964601


In [131]:
scaled_content.describe()

Unnamed: 0,polarity,subjectivity
count,192888.0,192888.0
mean,-2.068769e-16,-1.021418e-15
std,1.000003,1.000003
min,-14.27981,-4.417753
25%,-0.559814,-0.51048
50%,0.005082119,0.03882381
75%,0.5608268,0.5732228
max,12.59514,6.377811


In [93]:
#positive, neutral, negative outlook is the feature column for sentiments
#how do we weight fact and opinion? equally?
#develop sentiment measurement that has average sentiments from articles on a certain day decay over time
#multiply by general market sentiment (average all articles)

In [136]:
df['title_polarities'] = scaled_titles['polarity'].values
df['content_polarities'] = scaled_content['polarity'].values
df['sentiment'] = df['title_polarities'] * df['content_polarities'] #keep both title and content polarities in case we want to change the weights later

In [167]:
df.to_csv('news_sentiments.csv', index='id', columns=['release_date', 'ticker', 'sentiment']) #export sentiment data to csv file

In [165]:
test = pd.read_csv('news_sentiments.csv', index_col=0)

In [166]:
test

Unnamed: 0_level_0,release_date,ticker,sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
221515,2020-01-15,NIO,0.107885
221516,2020-01-18,NIO,2.689504
221517,2020-01-15,NIO,1.313464
221518,2020-01-15,NIO,-0.188412
221519,2020-01-06,NIO,0.066778
...,...,...,...
443001,2016-02-18,T,-0.288163
443002,2015-10-22,T,-0.176073
443003,2015-07-26,T,0.189848
443004,2015-08-13,T,-0.049919
