#### Install and Import Dependencies

In [33]:
# install dependencies
!pip install transformers requests beautifulsoup4 pandas numpy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [6]:
# import transformers necessary for sentiment analysis/webscraping
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re

#### Instantiate the Model

In [7]:
#load the tokenizer from a BERT model, which has been pretrained on multiple languages
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
#load model for sequence classification tasks 
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


#### Encode and Calculate Sentiment

In [9]:
# take tokenizer and convert into a sequence of integers, where each integer represents a specific token that is part of in the model's vocab
tokens = tokenizer.encode('It was good but couldve been better. Great', return_tensors='pt')
result = model(tokens)

In [11]:
# get scores of each class
result.logits

tensor([[-2.7768, -1.2353,  1.4419,  1.9804,  0.4584]],
       grad_fn=<AddmmBackward0>)

In [12]:
# find class with higher classification score
int(torch.argmax(result.logits))+1

4

#### Collect New Titles

Here, we would perform webscraping, which is done on the web server. For the purposes of testing, the data in the following cells are loaded in manually by creating csv files beforehand. 

#### Load News Titles into DataFrame and Score (Iter 1)

In [14]:
import numpy as np
import pandas as pd

In [20]:
# read in news titles
news = pd.read_csv('news_titles.csv')
news.head()

Unnamed: 0,Titles
0,The broad S&P 500 index has continued to climb...
1,Mike Khouw discusses how stocks don't typicall...
2,We'll review an options trade for investors wh...
3,"If history is any indication, stocks could be ..."
4,Asset managers were already cutting their fees...


In [21]:
news['Titles'].iloc[0]

"The broad S&P 500 index has continued to climb to new highs in recent days. Here's what to know about investing in funds that track the index."

In [23]:
# function that encodes each title and returns the classified sentiment score
def sentiment_score(title):
    tokens = tokenizer.encode(title, return_tensors='pt')
    result = model(tokens)
    return int(torch.argmax(result.logits))+1

In [25]:
sentiment_score(news['Titles'].iloc[1])

3

In [27]:
# add a new column to the dataframe that shows all title sentiments
news['sentiment'] = news['Titles'].apply(lambda x: sentiment_score(x[:22]))

In [28]:
news

Unnamed: 0,Titles,sentiment
0,The broad S&P 500 index has continued to climb...,5
1,Mike Khouw discusses how stocks don't typicall...,4
2,We'll review an options trade for investors wh...,3
3,"If history is any indication, stocks could be ...",3
4,Asset managers were already cutting their fees...,3
5,There's a current macroeconomic debate among n...,3
6,Jeff Kilburg's bullish view is that the recent...,1
7,Market volume may be dormant this time of the ...,3
8,Anger about higher prices appears to outweigh ...,2
9,Funds like the JPMorgan Premium Income ETF (JE...,5


#### Load News Titles into DataFrame and Score (Iter 2)

In [30]:
# read in news titles from japan
news_japan = pd.read_csv('news_titles - Sheet2.csv')
news_japan.head()

Unnamed: 0,Titles
0,"The U.S. dollar held losses on Friday, after d..."
1,Asian markets digested inflation numbers from ...
2,The U.S. dollar held firm on Friday after risi...
3,"Hong Kong, China stocks lead gains in Asia as ..."
4,"The U.S. dollar dropped on Wednesday, taking a..."


In [31]:
# add a new column to the dataframe that shows all title sentiments
news_japan['sentiment'] = news_japan['Titles'].apply(lambda x: sentiment_score(x[:]))

In [32]:
news_japan

Unnamed: 0,Titles,sentiment
0,"The U.S. dollar held losses on Friday, after d...",1
1,Asian markets digested inflation numbers from ...,1
2,The U.S. dollar held firm on Friday after risi...,1
3,"Hong Kong, China stocks lead gains in Asia as ...",2
4,"The U.S. dollar dropped on Wednesday, taking a...",1
5,Asian markets will look at trade numbers from ...,4
6,The Bank of Japan is expected to hold its nega...,3
7,The dollar steadied as looming central bank de...,2
8,Asia markets climbed after the S&P broke past ...,1
9,Kathy Lien of BK Asset Management expects 'sig...,3
