1. Install and Import Dependencies  

In [1]:
!pip install torch torchvision torchaudio



In [2]:
!pip install transformers requests beautifulsoup4 pandas numpy torch



In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


2. Instantiate Model    

In [4]:
# Tokenizer and classifier must come from the same checkpoint, so the name is
# written once and shared by both loaders.
MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

3. Encode & Calculate Sentiment

In [5]:
# Load the 130k-row review export and preview it.
df2 = pd.read_csv(
    filepath_or_buffer='../raw_data/vino_verdict_raw_data/winemag-data-130k-v2.csv'
)
# NOTE(review): df_sorted is reassigned by the next cell (from df1) before
# anything visible here reads it.
df_sorted = df2.sort_values(by='points')  # ascending order is the default
df2.head(n=3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm


In [6]:
# Load the 150k-row review export and preview it.
df1 = pd.read_csv(
    filepath_or_buffer='../raw_data/vino_verdict_raw_data/winemag-data_first150k.csv'
)
# Rows ordered from the lowest to the highest 'points' value.
df_sorted = df1.sort_values(by='points')  # ascending order is the default
df1.head(n=3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley


In [7]:
# Pull the review text stored under row label 0 of each dataset as sample inputs.
description_01 = df1.at[0, 'description']
description_02 = df2.at[0, 'description']

In [8]:
# Encode both sample descriptions into PyTorch tensors of token ids.
tokens1, tokens2 = (
    tokenizer.encode(sample_text, return_tensors='pt')
    for sample_text in (description_01, description_02)
)

In [9]:
# Inspect the encoded ids for the first description (one row of token ids).
tokens1

tensor([[  101, 10372, 11555, 11719, 23097, 10107, 10445,   110, 65896, 10159,
         28312, 91810, 10107, 10195, 26195, 12378, 10110, 10140, 29213, 10323,
         10916, 10868, 10104, 26195,   119, 12955, 38469, 10452,   118, 35862,
         25527, 10110,   143, 85065, 61318, 18885, 10123, 10108, 15281, 20229,
         50565, 10337, 10103, 55543, 10218,   117, 27177, 10163, 10151, 80768,
           117, 12922, 12022, 11978, 10107, 10110,   143, 13241, 21601, 14581,
         10158, 27433, 10104, 10103, 23158,   119, 31117, 10163, 10110, 92306,
         10285, 10195, 13982, 10114, 24919,   117, 10197, 10438, 10868, 29293,
         10108, 10197, 10114, 26514, 14194, 22537, 12897,   119, 61530, 73786,
           100, 73688,   119,   102]])

In [10]:
# Inspect the encoded ids for the second description (one row of token ids).
tokens2

tensor([[  101, 10133, 47517, 12622, 10754, 25527,   117, 26895, 10785,   117,
         18710, 73460, 19298, 10110, 16008, 10163, 33482,   119, 10103, 55543,
         10218, 65148,   112,   162, 10323, 10563, 16061, 12899,   117, 39646,
         10119, 56523, 28415, 17006,   117, 72356, 10110, 16008, 10163, 34939,
         24485, 18710, 14623, 20557, 12705,   119,   102]])

In [11]:
# Turn the ids back into text to see what the tokenizer kept of the first review.
tokenizer.decode(tokens1[0])

'[CLS] this tremendous 100 % varietal wine hails from oakville and was aged over three years in oak. juicy red - cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. enjoy 2022 [UNK] 2030. [SEP]'

In [12]:
# Turn the ids back into text to see what the tokenizer kept of the second review.
tokenizer.decode(tokens2[0])

"[CLS] aromas include tropical fruit, broom, brimstone and dried herb. the palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity. [SEP]"

In [13]:
# Forward pass on the first description; the cell displays the raw classifier output.
result1 = model(input_ids=tokens1)
result1

SequenceClassifierOutput(loss=None, logits=tensor([[-2.8863, -2.8505, -1.5543,  1.7217,  4.4777]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [14]:
# Forward pass on the second description; the cell displays the raw classifier output.
result2 = model(input_ids=tokens2)
result2

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1272,  2.1047,  2.0085, -0.4567, -2.9758]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [15]:
# argmax gives the 0-based position of the largest logit; +1 turns it into a
# 1-based score. The second message now names the second wine (it previously
# repeated "first").
print(f"The resulting sentiment for the first wine is: {int(torch.argmax(result1.logits)) + 1}")
print(f"The resulting sentiment for the second wine is: {int(torch.argmax(result2.logits)) + 1}")

# Can add some logic to add +1 to scores of 0 or less
# NOTE(review): argmax + 1 is always >= 1, so a score of 0 or less cannot occur here.

The resulting sentiment for the first wine is:  5
The resulting sentiment for the first wine is:  2


4. Load Reviews DF and Score DF

In [16]:
# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Scoring inputs are moved to this same device before each forward pass.
model = model.to(device)

In [21]:
def sentiment_score(row):
    """Return the predicted sentiment class for one review row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose the review text under the ``'description'`` key.

    Returns
    -------
    int
        1-based position of the largest logit returned by ``model``
        (1-5 for a five-class classifier).

    Notes
    -----
    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.
    """
    # Truncate long reviews to 512 tokens. A single sequence needs no padding:
    # padding every review to 512 tokens only adds masked positions the model
    # still has to compute over.
    tokens = tokenizer(row['description'],
                       max_length=512,
                       truncation=True,
                       return_tensors='pt')
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)
    # Inference only: skip building the autograd graph to save time and memory.
    with torch.no_grad():
        result = model(input_ids, attention_mask=attention_mask)
    return int(torch.argmax(result.logits)) + 1

In [25]:
# Work on an independent copy of the leading 10 rows so scoring stays cheap.
df_first_10 = df1.iloc[:10].copy()

# Score each copied row; the function receives one row at a time.
df_first_10['sentiment_score'] = df_first_10.apply(sentiment_score, axis='columns')

# Stitch the scored rows back on top of the rows that were not scored.
df1 = pd.concat([df_first_10, df1.iloc[10:]], axis=0)

In [26]:
# Check that the leading rows now carry the sentiment_score column.
df1.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,sentiment_score
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,5
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,5
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,5
