## Sentiment Analysis with BERT

In [37]:
from bs4 import BeautifulSoup
from torch import argmax
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import requests

### Load model

In [3]:
# Checkpoint identifier shared by both `from_pretrained` calls below, so the
# tokenizer and the model are loaded from the same checkpoint.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
# Tokenizer used by `sentiment_score` to encode review text into tensors.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sequence-classification model; `sentiment_score` reads its `.logits`.
# NOTE(review): presumably 5 output classes, since the notebook maps class
# indices 0-4 to labels — confirm against the model card.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

### Web scraping

#### Define scraper

In [4]:
def getReviews(link, timeout=10):
    """Download a page and return the text of its review paragraphs.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before `requests` gives up
            and raises. Without a timeout a stalled connection would block
            the notebook indefinitely.

    Returns:
        A list with the text of every `<p>` element whose class attribute
        matches 'comment__09f24__gu0rG css-qgunke'; an empty list when no
        element matches.
    """
    webpage = requests.get(link, timeout=timeout).text
    soup = BeautifulSoup(webpage, 'html.parser')
    # NOTE(review): these class names look auto-generated; if the site's
    # markup changes this selector will quietly match nothing — confirm.
    paragraphs = soup.find_all('p', 'comment__09f24__gu0rG css-qgunke')
    return [paragraph.text for paragraph in paragraphs]
    

### Initial Request

In [5]:
# Seed the review collection from the listing URL that carries no `start`
# offset; later pages are added to this same list.
intial_link = "https://www.yelp.com/biz/ippudo-ny-new-york-7"
reviews = list(getReviews(intial_link))

#### Crawling through remaining pages

In [6]:
# Walk the `start` query offsets 10, 20, ..., 90 and append every page's
# reviews to the running list; tqdm shows progress over the requests.
page_offsets = range(10, 100, 10)
for offset in tqdm(page_offsets):
    later_link = f"https://www.yelp.com/biz/ippudo-ny-new-york-7?start={offset}"
    reviews += getReviews(later_link)

### Sentiment Score

In [41]:
def sentiment_score(review):
    """Return the index of the highest-scoring sentiment class for a review.

    Args:
        review: Review text to classify.

    Returns:
        The winning class index as a plain `int`. A tensor is hashed by
        identity, so returning one would make lookups in a dict keyed by
        integers fail with `KeyError`.
    """
    # Local import: the notebook's import cell only pulls `argmax` from torch.
    from torch import no_grad

    # Truncate in tokens at encode time so an over-long review is cut to
    # the 512-token limit instead of failing inside the model.
    tokens = tokenizer.encode(
        review, return_tensors='pt', truncation=True, max_length=512
    )
    # Inference only: skip autograd bookkeeping for the forward pass.
    with no_grad():
        logits = model(tokens).logits[0]
    return int(argmax(logits))

In [8]:
# Human-readable name for each class index produced by `sentiment_score`.
labels = {
    0: 'Bad',
    1: 'Poor',
    2: 'Average',
    3: 'Great',
    4: 'Excellent',
}
# One label per review, in the same order as `reviews`.
# - `review[:512]` keeps only the first 512 *characters*; it is a cheap cap
#   on input length, not an exact limit in model tokens.
# - `int(...)` accepts either a plain int or a 0-dim tensor. A tensor cannot
#   be used as the dict key directly: it is hashed by identity and would
#   never match the integer keys above.
sentiment = [labels[int(sentiment_score(review[:512]))] for review in reviews]

### Results

In [15]:
# Display the predicted label for each review, keyed by the review's
# position in `reviews`.
{position: label for position, label in zip(range(len(reviews)), sentiment)}

{0: 'Excellent',
 1: 'Excellent',
 2: 'Great',
 3: 'Bad',
 4: 'Great',
 5: 'Great',
 6: 'Excellent',
 7: 'Great',
 8: 'Excellent',
 9: 'Great',
 10: 'Great',
 11: 'Great',
 12: 'Average',
 13: 'Great',
 14: 'Excellent',
 15: 'Great',
 16: 'Excellent',
 17: 'Excellent',
 18: 'Excellent',
 19: 'Excellent',
 20: 'Excellent',
 21: 'Excellent',
 22: 'Excellent',
 23: 'Poor',
 24: 'Great',
 25: 'Average',
 26: 'Excellent',
 27: 'Excellent',
 28: 'Poor',
 29: 'Excellent',
 30: 'Great',
 31: 'Excellent',
 32: 'Excellent',
 33: 'Great',
 34: 'Excellent',
 35: 'Excellent',
 36: 'Average',
 37: 'Excellent',
 38: 'Excellent',
 39: 'Great',
 40: 'Excellent',
 41: 'Great',
 42: 'Great',
 43: 'Great',
 44: 'Great',
 45: 'Excellent',
 46: 'Excellent',
 47: 'Great',
 48: 'Great',
 49: 'Great',
 50: 'Excellent',
 51: 'Great',
 52: 'Poor',
 53: 'Excellent',
 54: 'Poor',
 55: 'Excellent',
 56: 'Excellent',
 57: 'Great',
 58: 'Excellent',
 59: 'Great',
 60: 'Great',
 61: 'Great',
 62: 'Great',
 63: 'Excell