In [1]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import swifter
from transformers import pipeline
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
df_news = pd.read_parquet('../data_collection/data_warehouse/news_articles.parquet')
df_news = df_news[['title', 'summary', 'time_published', 'authors', 'source']]

print(df_news.shape)
df_news.head()

(1904666, 5)


Unnamed: 0_level_0,title,summary,time_published,authors,source
newsID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Oil prices could determine how markets react t...,The heavy new round of sanctions on Russia by ...,2022-03-01 00:02:02,[Patti Domm],CNBC
2,Zoom provides disappointing revenue forecast f...,Zoom's revenue growth is continuing to slow af...,2022-03-01 00:15:56,[Jordan Novet],CNBC
3,Wall Street rallies as West hits Russia with n...,"The SP 500 rose more than 1%, ending a four-da...",2022-03-01 00:46:51,[],Money Control
4,"Weak manufacturing drags down Q3 GDP growth, o...",India's economy grew 5.4% in the three months ...,2022-03-01 02:23:00,[www.ETCFO.com],Economic Times
5,Singapore banks halt lending for Russian goods...,Singapore's biggest banks are restricting trad...,2022-03-01 02:30:56,[Bloomberg],South China Morning Post


In [3]:
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment_vader(text):
  sentiment = analyzer.polarity_scores(text)
  return sentiment['compound']

df_news['title_sentiment_vader'] = df_news['title'].swifter.apply(analyze_sentiment_vader)
df_news['summary_sentiment_vader'] = df_news['summary'].swifter.apply(analyze_sentiment_vader)

Pandas Apply: 100%|██████████| 1904666/1904666 [01:02<00:00, 30369.02it/s]
Pandas Apply: 100%|██████████| 1904666/1904666 [02:15<00:00, 14038.37it/s]


In [4]:
df_news.head()

Unnamed: 0_level_0,title,summary,time_published,authors,source,title_sentiment_vader,summary_sentiment_vader
newsID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Oil prices could determine how markets react t...,The heavy new round of sanctions on Russia by ...,2022-03-01 00:02:02,[Patti Domm],CNBC,0.0,0.0
2,Zoom provides disappointing revenue forecast f...,Zoom's revenue growth is continuing to slow af...,2022-03-01 00:15:56,[Jordan Novet],CNBC,-0.4939,0.3818
3,Wall Street rallies as West hits Russia with n...,"The SP 500 rose more than 1%, ending a four-da...",2022-03-01 00:46:51,[],Money Control,0.0,-0.5106
4,"Weak manufacturing drags down Q3 GDP growth, o...",India's economy grew 5.4% in the three months ...,2022-03-01 02:23:00,[www.ETCFO.com],Economic Times,0.1531,-0.296
5,Singapore banks halt lending for Russian goods...,Singapore's biggest banks are restricting trad...,2022-03-01 02:30:56,[Bloomberg],South China Morning Post,-0.1027,-0.4215


In [5]:
dataset = Dataset.from_pandas(df_news)
print(dataset)

Dataset({
    features: ['title', 'summary', 'time_published', 'authors', 'source', 'title_sentiment_vader', 'summary_sentiment_vader', 'newsID'],
    num_rows: 1904666
})


In [6]:
pipe = pipeline("text-classification", model="yiyanghkust/finbert-tone", truncation=True, max_length=512)

def classify_batch(batch):
  return {
    'title_sentiment_finbert': pipe(batch['title']),
    'summary_sentiment_finbert': pipe(batch['summary'])
  }

dataset = dataset.map(classify_batch, batched=True, batch_size=5)

Device set to use cuda:0
Map:   0%|          | 25/1904666 [00:00<10:57:53, 48.25 examples/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Map: 100%|██████████| 1904666/1904666 [9:54:28<00:00, 53.40 examples/s]  


In [7]:
df_sentiment = dataset.to_pandas()
df_sentiment.head()

Unnamed: 0,title,summary,time_published,authors,source,title_sentiment_vader,summary_sentiment_vader,newsID,title_sentiment_finbert,summary_sentiment_finbert
0,Oil prices could determine how markets react t...,The heavy new round of sanctions on Russia by ...,2022-03-01 00:02:02,[Patti Domm],CNBC,0.0,0.0,1,"{'label': 'Negative', 'score': 0.63124018907547}","{'label': 'Negative', 'score': 0.6197944283485..."
1,Zoom provides disappointing revenue forecast f...,Zoom's revenue growth is continuing to slow af...,2022-03-01 00:15:56,[Jordan Novet],CNBC,-0.4939,0.3818,2,"{'label': 'Negative', 'score': 0.9999985694885...","{'label': 'Negative', 'score': 0.9999996423721..."
2,Wall Street rallies as West hits Russia with n...,"The SP 500 rose more than 1%, ending a four-da...",2022-03-01 00:46:51,[],Money Control,0.0,-0.5106,3,"{'label': 'Neutral', 'score': 0.9479693174362183}","{'label': 'Positive', 'score': 0.9999978542327..."
3,"Weak manufacturing drags down Q3 GDP growth, o...",India's economy grew 5.4% in the three months ...,2022-03-01 02:23:00,[www.ETCFO.com],Economic Times,0.1531,-0.296,4,"{'label': 'Negative', 'score': 0.9999984502792...","{'label': 'Negative', 'score': 0.9954470992088..."
4,Singapore banks halt lending for Russian goods...,Singapore's biggest banks are restricting trad...,2022-03-01 02:30:56,[Bloomberg],South China Morning Post,-0.1027,-0.4215,5,"{'label': 'Negative', 'score': 0.9858503937721...","{'label': 'Negative', 'score': 0.9719559550285..."


In [8]:
df_sentiment = df_sentiment[['newsID', 'title_sentiment_finbert', 'summary_sentiment_finbert', 'title_sentiment_vader', 'summary_sentiment_vader']]
df_sentiment.set_index('newsID', inplace=True)
df_sentiment.head()

Unnamed: 0_level_0,title_sentiment_finbert,summary_sentiment_finbert,title_sentiment_vader,summary_sentiment_vader
newsID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"{'label': 'Negative', 'score': 0.63124018907547}","{'label': 'Negative', 'score': 0.6197944283485...",0.0,0.0
2,"{'label': 'Negative', 'score': 0.9999985694885...","{'label': 'Negative', 'score': 0.9999996423721...",-0.4939,0.3818
3,"{'label': 'Neutral', 'score': 0.9479693174362183}","{'label': 'Positive', 'score': 0.9999978542327...",0.0,-0.5106
4,"{'label': 'Negative', 'score': 0.9999984502792...","{'label': 'Negative', 'score': 0.9954470992088...",0.1531,-0.296
5,"{'label': 'Negative', 'score': 0.9858503937721...","{'label': 'Negative', 'score': 0.9719559550285...",-0.1027,-0.4215


In [9]:
df_sentiment.to_parquet('data/news_sentiment.parquet')