In [None]:
pip install -r requirements.txt


In [None]:
import torch

from dataset_class import TermIdentificationDataset

from model_class   import TermIdentificationBaseModule, TermIdentificationModel
from model_class   import PreTrainedEmbeddingLayer

TRAIN_FILE_R = 'data/restaurants_train.json'
DEVEL_FILE_R = '../../data/restaurants_dev.json'
TRAIN_FILE_L = 'data/laptops_train.json'
DEVEL_FILE_L = 'data/laptops_dev.json'


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# dataset loader
train_dataset = TermIdentificationDataset(DEVEL_FILE_R, size=3)

# hyperparameters
hparams = { 'vocab'           : train_dataset.vocab, 
            'vocab_size'      : len(train_dataset.vocab),
            'embedding_dim'   : 100, 
            'lstm_hidden_dim' : 128,
            'lstm_bidirect'   : False, 
            'lstm_layers'     : 1, 
            'num_classes'     : 3, 
            'dropout'         : 0.0, 
            'device'          : device}


# retrieve embeddings
emb = PreTrainedEmbeddingLayer(hparams)   # loads glove pre trained embeddings

In [None]:
from model_class import HParams
train_dataset = TermIdentificationDataset(DEVEL_FILE_R, size=3)
hparams = HParams(train_dataset.vocab)
basemodel = TermIdentificationBaseModule(hparams, emb.get_embeddings()).cuda()


In [None]:
elem = train_dataset[0]
x = elem['idxs_vector']
batched_x = x.unsqueeze(1)
print(batched_x)
batched_logits = basemodel.forward(batched_x)
logits = batched_logits.squeeze()
print(logits, '\n')

In [None]:
torch.argmax(logits, -1)

# BERT TESTS


In [1]:

import torch
from transformers import AutoTokenizer, RobertaTokenizer
tokenizer_bert    = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer_roberta = RobertaTokenizer.from_pretrained("roberta-base")


In [2]:
sentence = "Hello I'm Luca and I have played a lot"
tokenize_out_bert    = tokenizer_bert.tokenize(sentence)   
tokenize_out_roberta = tokenizer_roberta.tokenize(sentence)

encode_out_bert      = tokenizer_bert.encode(sentence)
encode_out_roberta   = tokenizer_roberta.encode(sentence)

tokenizer_out_bert    = tokenizer_bert(sentence, return_tensors='pt')
tokenizer_out_roberta = tokenizer_roberta(sentence, return_tensors='pt')

print("BERT   ", tokenize_out_bert)
print("ROBERTA", tokenize_out_roberta)
print("BERT   ", encode_out_bert)
print("ROBERTA", encode_out_roberta)
print("BERT   ", tokenizer_out_bert)
print("ROBERTA", tokenizer_out_roberta)

BERT    ['Hello', 'I', "'", 'm', 'Luca', 'and', 'I', 'have', 'played', 'a', 'lot']
ROBERTA ['Hello', 'ĠI', "'m", 'ĠLuc', 'a', 'Ġand', 'ĠI', 'Ġhave', 'Ġplayed', 'Ġa', 'Ġlot']
BERT    [101, 8667, 146, 112, 182, 16730, 1105, 146, 1138, 1307, 170, 1974, 102]
ROBERTA [0, 31414, 38, 437, 7483, 102, 8, 38, 33, 702, 10, 319, 2]
BERT    {'input_ids': tensor([[  101,  8667,   146,   112,   182, 16730,  1105,   146,  1138,  1307,
           170,  1974,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
ROBERTA {'input_ids': tensor([[    0, 31414,    38,   437,  7483,   102,     8,    38,    33,   702,
            10,   319,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


# Tensor Tests


In [None]:
import torch
NEUTRAL  = 'neutral'
POSITIVE = 'positive'
NEGATIVE = 'negative'
CONFLICT = 'conflict'
ABSENT   = 'absent'
l_vocab = { NEUTRAL  : 0,
            POSITIVE : 1,
            NEGATIVE : 2,
            CONFLICT : 3,
            ABSENT   : 4}

In [None]:
a = l_vocab[ABSENT]
print(a)
b = torch.LongTensor([a])
print(b)

In [None]:
þ = {'targets' : []}
print(len(þ['targets']))
þ['targets'].append(5)
print(þ)

In [None]:
þ = [{'targets': [['Appetizers', 'negative'], [['main dishes', 'negative']]]},
     {'targets': [['view', 'negative']]},
     {'targets': [['reservation', 'negative']]}]

for elem in þ:
    for term_pred in elem['targets']:
        print(term_pred)
        print(term_pred[1])

In [None]:
import torch
from collections import OrderedDict

path = "/media/nemo/DATA/uni/nlp-hw2/model/model_b/BERT-2-SeqCls-model_TASK_B_both-to-both_term_epoch=0_step=2360_train_loss=1.74_macro_f1=17.54.ckpt"
new_path = "/media/nemo/DATA/uni/nlp-hw2/model/model_b/BERT-2-SeqCls-model_TASK_B_both-to-both_term_epoch=0_step=2360_train_loss=1.74_macro_f1=17.54_CORRECT.ckpt"

checkpoint = torch.load(path)
new_checkpoint = OrderedDict()

for k, v in checkpoint.items():
    if k == 'state_dict':
        new_checkpoint[k] = OrderedDict()
        for key, value in checkpoint['state_dict'].items():
            new_key = key.replace('encoder', 'model.bert', 1)
            new_checkpoint['state_dict'][new_key] = value
            print(key, new_key)
    else:
        new_checkpoint[k] = v

torch.save(new_checkpoint, new_path)

# ROBERTA tests

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer  = AutoTokenizer.from_pretrained("roberta-base")   # is cased
model      = AutoModelWithLMHead.from_pretrained("roberta-base")
base_model = model.roberta 

In [None]:
text = "Hello my <mask> is Luca"
enc  = tokenizer.encode_plus(text)
enc

In [None]:
out = base_model(torch.tensor(enc["input_ids"]).unsqueeze(0), torch.tensor(enc["attention_mask"]).unsqueeze(0))
out#.last_hidden_state.size()

In [None]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-cased')
model.classifier

In [None]:
progress = ["/", "-", "\\", "|", "/", "-", "\\", "|"]
for i in range(100):
    print(progress[i % len(progress)], end="\r")

In [None]:
a = [1,1,1,1,1,1]
for i in range(len(a)-1):
    print(i)

In [None]:
import json
raw_data = []
with open('/media/nemo/DATA/uni/nlp-hw2/data/restaurants_train.json', 'r') as f:
    raw_data.extend(json.load(f))

In [None]:
from collections import Counter
from pprint import pprint
c1 = Counter()
c2 = Counter()
c3 = Counter()
c4 = Counter()
zerouno = 0
zerodue = 0
zerotre = 0
zeroqua = 0
zerocin = 0

unouno = 0
unodue = 0
unotre = 0
unoqua = 0
unocin = 0

dueuno = 0
duedue = 0
duetre = 0
duequa = 0
duecin = 0

treuno = 0
tredue = 0
tretre = 0
trequa = 0
trecin = 0

quauno = 0
quadue = 0
quatre = 0
quaqua = 0
quacin = 0

cinuno = 0
cindue = 0
cintre = 0
cinqua = 0
cincin = 0

seiuno = 0
seidue = 0
seitre = 0
seiqua = 0
seicin = 0

setuno = 0
setdue = 0
settre = 0
setqua = 0
setcin = 0

ottuno = 0
ottdue = 0
otttre = 0
ottqua = 0
ottcin = 0

novuno = 0
novdue = 0
novtre = 0
novqua = 0
novcin = 0

for elem in raw_data:
    categories = elem['categories']
    c3[len(elem['categories'])] += 1
    c4[len(elem['targets'])] += 1
    
    if len(elem['categories']) == 1    and len(elem['targets']) == 0:
        zerouno += 1                  # no. sentences with zero targets and 1 category
    elif len(elem['categories']) == 2  and len(elem['targets']) == 0:
        zerodue += 1                  # no. sentences with zero targets and 1 category
    elif len(elem['categories']) == 3  and len(elem['targets']) == 0:
        zerotre += 1                  # no. sentences with zero targets and 1 category
    elif len(elem['categories']) == 4  and len(elem['targets']) == 0:
        zeroqua += 1                  # no. sentences with zero targets and 1 category
    elif len(elem['categories']) == 5  and len(elem['targets']) == 0:
        zerocin += 1 

    elif len(elem['categories']) == 1 and len(elem['targets']) == 1:
        unouno += 1
    elif len(elem['categories']) == 2 and len(elem['targets']) == 1:
        unodue += 1
    elif len(elem['categories']) == 3 and len(elem['targets']) == 1:
        unotre += 1
    elif len(elem['categories']) == 4 and len(elem['targets']) == 1:
        unoqua += 1
    elif len(elem['categories']) == 5 and len(elem['targets']) == 1:
        unocin += 1

    elif len(elem['categories']) == 1 and len(elem['targets']) == 2:
        dueuno += 1
    elif len(elem['categories']) == 2 and len(elem['targets']) == 2:
        duedue += 1
    elif len(elem['categories']) == 3 and len(elem['targets']) == 2:
        duetre += 1
    elif len(elem['categories']) == 4 and len(elem['targets']) == 2:
        duequa += 1
    elif len(elem['categories']) == 5 and len(elem['targets']) == 2:
        duecin += 1

    elif len(elem['categories']) == 1 and len(elem['targets']) == 3:
        treuno += 1
    elif len(elem['categories']) == 2 and len(elem['targets']) == 3:
        tredue += 1
    elif len(elem['categories']) == 3 and len(elem['targets']) == 3:
        tretre += 1
    elif len(elem['categories']) == 4 and len(elem['targets']) == 3:
        trequa += 1
    elif len(elem['categories']) == 5 and len(elem['targets']) == 3:
        trecin += 1

    elif len(elem['categories']) == 1 and len(elem['targets']) == 4:
        quauno += 1
    elif len(elem['categories']) == 2 and len(elem['targets']) == 4:
        quadue += 1
    elif len(elem['categories']) == 3 and len(elem['targets']) == 4:
        quatre += 1
    elif len(elem['categories']) == 4 and len(elem['targets']) == 4:
        quaqua += 1
    elif len(elem['categories']) == 5 and len(elem['targets']) == 4:
        quacin += 1
    
    elif len(elem['categories']) == 1 and len(elem['targets']) == 5:
        cinuno += 1
    elif len(elem['categories']) == 2 and len(elem['targets']) == 5:
        cindue += 1
    elif len(elem['categories']) == 3 and len(elem['targets']) == 5:
        cintre += 1
    elif len(elem['categories']) == 4 and len(elem['targets']) == 5:
        cinqua += 1
    elif len(elem['categories']) == 5 and len(elem['targets']) == 5:
        cincin += 1
    
    elif len(elem['categories']) == 1 and len(elem['targets']) == 6:
        seiuno += 1
    elif len(elem['categories']) == 2 and len(elem['targets']) == 6:
        seidue += 1
    elif len(elem['categories']) == 3 and len(elem['targets']) == 6:
        seitre += 1
    elif len(elem['categories']) == 4 and len(elem['targets']) == 6:
        seiqua += 1
    elif len(elem['categories']) == 5 and len(elem['targets']) == 6:
        seicin += 1

    elif len(elem['categories']) == 1 and len(elem['targets']) == 7:
        setuno += 1
    elif len(elem['categories']) == 2 and len(elem['targets']) == 7:
        setdue += 1
    elif len(elem['categories']) == 3 and len(elem['targets']) == 7:
        settre += 1
    elif len(elem['categories']) == 4 and len(elem['targets']) == 7:
        setqua += 1
    elif len(elem['categories']) == 5 and len(elem['targets']) == 7:
        setcin += 1

    elif len(elem['categories']) == 1 and len(elem['targets']) == 8:
        ottuno += 1
    elif len(elem['categories']) == 2 and len(elem['targets']) == 8:
        ottdue += 1
    elif len(elem['categories']) == 3 and len(elem['targets']) == 8:
        otttre += 1
    elif len(elem['categories']) == 4 and len(elem['targets']) == 8:
        ottqua += 1
    elif len(elem['categories']) == 5 and len(elem['targets']) == 8:
        ottcin += 1
    
    elif len(elem['categories']) == 1 and len(elem['targets']) == 9:
        novuno += 1
    elif len(elem['categories']) == 2 and len(elem['targets']) == 9:
        novdue += 1
    elif len(elem['categories']) == 3 and len(elem['targets']) == 9:
        novtre += 1
    elif len(elem['categories']) == 4 and len(elem['targets']) == 9:
        novqua += 1
    elif len(elem['categories']) == 5 and len(elem['targets']) == 9:
        novcin += 1

    for cat in categories:
        c1[cat[0]] += 1
        c2[cat[1]] += 1

print("category terms")
pprint(c1)
print("\ncategory polarity")
pprint(c2)
print("\nnumero di frasi con x categorie")
pprint(c3)
print("\nnumero di frasi con x termini")
pprint(c4)
print("\n")


In [None]:
print("numero di frasi con 0 termini e 1 categoria:", zerouno)
print("numero di frasi con 0 termini e 2 categorie:", zerodue)
print("numero di frasi con 0 termini e 3 categorie:", zerotre)
print("numero di frasi con 0 termini e 4 categorie:", zeroqua)
print("numero di frasi con 0 termini e 5 categorie:", zerocin)
print("\n")
print("numero di frasi con 1 termini e 1 categoria:", unouno)
print("numero di frasi con 1 termini e 2 categorie:", unodue)
print("numero di frasi con 1 termini e 3 categorie:", unotre)
print("numero di frasi con 1 termini e 4 categorie:", unoqua)
print("numero di frasi con 1 termini e 5 categorie:", unocin)

In [None]:
print("numero di frasi con 2 termini e 1 categoria:", dueuno)
print("numero di frasi con 2 termini e 2 categorie:", duedue)
print("numero di frasi con 2 termini e 3 categorie:", duetre)
print("numero di frasi con 2 termini e 4 categorie:", duequa)
print("numero di frasi con 2 termini e 5 categorie:", duecin)
print("\n")
print("numero di frasi con 3 termini e 1 categoria:", treuno)
print("numero di frasi con 3 termini e 2 categorie:", tredue)
print("numero di frasi con 3 termini e 3 categorie:", tretre)
print("numero di frasi con 3 termini e 4 categorie:", trequa)
print("numero di frasi con 3 termini e 5 categorie:", trecin)

In [None]:
print("numero di frasi con 4 termini e 1 categoria:", quauno)
print("numero di frasi con 4 termini e 2 categorie:", quadue)
print("numero di frasi con 4 termini e 3 categorie:", quatre)
print("numero di frasi con 4 termini e 4 categorie:", quaqua)
print("numero di frasi con 4 termini e 5 categorie:", quacin)
print("\n")
print("numero di frasi con 5 termini e 1 categoria:", cinuno)
print("numero di frasi con 5 termini e 2 categorie:", cindue)
print("numero di frasi con 5 termini e 3 categorie:", cintre)
print("numero di frasi con 5 termini e 4 categorie:", cinqua)
print("numero di frasi con 5 termini e 5 categorie:", cincin)

In [None]:
print("numero di frasi con 6 termini e 1 categoria:", seiuno)
print("numero di frasi con 6 termini e 2 categorie:", seidue)
print("numero di frasi con 6 termini e 3 categorie:", seitre)
print("numero di frasi con 6 termini e 4 categorie:", seiqua)
print("numero di frasi con 6 termini e 5 categorie:", seicin)
print("\n")
print("numero di frasi con 7 termini e 1 categoria:", setuno)
print("numero di frasi con 7 termini e 2 categorie:", setdue)
print("numero di frasi con 7 termini e 3 categorie:", settre)
print("numero di frasi con 7 termini e 4 categorie:", setqua)
print("numero di frasi con 7 termini e 5 categorie:", setcin)

In [None]:
print("numero di frasi con 8 termini e 1 categoria:", ottuno)
print("numero di frasi con 8 termini e 2 categorie:", ottdue)
print("numero di frasi con 8 termini e 3 categorie:", otttre)
print("numero di frasi con 8 termini e 4 categorie:", ottqua)
print("numero di frasi con 8 termini e 5 categorie:", ottcin)
print("\n")
print("numero di frasi con 9 termini e 1 categoria:", novuno)
print("numero di frasi con 9 termini e 2 categorie:", novdue)
print("numero di frasi con 9 termini e 3 categorie:", novtre)
print("numero di frasi con 9 termini e 4 categorie:", novqua)
print("numero di frasi con 9 termini e 5 categorie:", novcin)

In [None]:
t = [[0.7371, 0.7371]]
import torch
i = torch.argmax(torch.Tensor(t)).item()
pred = [0, 0]
pred[i] = 1
pred

In [None]:
out = [0, 0, 0, 0, 0]
print(1 in out)
import torch
out[torch.argmax(torch.Tensor([0,0,10,0,0])).item()] = 1
print(1 in out)
out

# Test uncased A&B


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text_sample = "Appetizers are somewhere around $7 each and the main dishes are between $11 and $16."
net_outputs = {"targets": [["appetizers", "neutral"], ["main dishes", "neutral"]]}

text = "It seemed to be a very nice laptop except I was not able to load my Garmin GPS software or Microsoft Office 2003."
text = "With the macbook pro it comes with freesecuritysoftware to protect it from viruses and other intrusive things from downloads and internet surfing or emails."
print(tokenizer.tokenize(text))

In [None]:
text_sample = "There are several programs for school or office use (Pages, Numbers, Keynote, etc.), music (Garageband), photo management (Photo Booth, iPhoto), video-editing or movie-making (iMovie), etc."
net_outputs = {'targets': [['iphoto'], ['garageband'], ['music'], ['imovie'], ['photo booth'], ['programs'], ['photo management']]}

listed_text = tokenizer.tokenize(text_sample, add_special_tokens=False)
print(net_outputs)
print(listed_text)
print()

for pred in net_outputs['targets']:
    term = pred[0]
    print(term, "in text:", term in text_sample)
    
    if term not in text_sample:
        listed_term = tokenizer.tokenize(term, add_special_tokens=False)
        print(listed_term)

        for i in range(len(listed_text)):
            word = listed_text[i]
            curr = listed_term[0]

            if curr == word.lower():
                if len(listed_text) >= i + len(listed_term):
                    aux1 = ''
                    aux2 = ''

                    for j in range(len(listed_term)):
                        tmp1 = listed_text[i+j]
                        if len(tmp1) > 2 and tmp1[0] == tmp1[1] == '#':
                            tmp1 = tmp1[2:]               # ignore the starting `##`
                            aux1 = aux1.strip(' ')      # remove the space before (if any)
                        aux1 += tmp1 + ' '

                        tmp2 = listed_term[j]
                        if len(tmp2) > 2 and tmp2[0] == tmp2[1] == '#':
                            tmp2 = tmp2[2:]               # ignore the starting `##`
                            aux2 = aux2.strip(' ')      # remove the space before (if any)
                        aux2 += tmp2  + ' '
                    
                    print("g", aux1)
                    print("p", aux2)
                    print()
                    
                    if aux1.strip().lower() == aux2.strip():
                        pred[0] = aux1.strip()
                        print(">> Changed to:", pred[0])    



print(net_outputs)
                


In [None]:
import re
def tokenize_line(line, pattern='\s'):
    line = re.sub('[\.,:;!@#$\(\)\-&\\<>]', '-', line)
    return [word.strip('-') for word in re.split(pattern, line) if word]

def tokenize_line_2(line,pattern='\s'):
    line = re.sub(' ', '-', line)
    return [word.strip('-') for word in re.split(pattern, line) if word]

text_sample = "There are several programs for school or office use (Pages, Numbers, Keynote, etc.), music (Garageband), photo management (Photo Booth, iPhoto), video-editing or movie-making (iMovie), etc."
net_outputs = {'targets': [['iphoto'], ['garageband'], ['music'], ['imovie'], ['photo booth'], ['programs'], ['photo management']]}

text_sample = "But when I received my replacement, I made BOTH recovery DVDs (4), and a driver/application DVD."
net_outputs = {'targets': [["driver/application dvd"], ["recovery dvds"]]}

#text_sample = "/ awesome cooling system/ much better grafics card (ATI 5870) / 8GB RAM/ LED backlit screen..."
#net_outputs = {'targets': [["8gb ram"]]}

#text_sample = "I love the multi-touch trackpad."
#net_outputs = {'targets': [["multi touch"]]}

text_sample = "the headphone and mic jack are in front of touch-pad making the touch-pad hard to use when using headphones/mic, not to mention the laptop was designed for right handed person."
net_outputs = {'targets': [["touch - pad"], ['mic jack'], ['headphone'], ['headphones'], ['mic']]}

listed_text = tokenize_line(text_sample)
print(net_outputs)
print(listed_text)

for pred in net_outputs['targets']:
    term = pred[0]
    print(term, "in text:", term in text_sample)
    
    if term not in text_sample:
        if ' - ' in term:
            term = term.replace(' - ', '-')
        listed_term = tokenize_line(term)
        print("tokenized term:", listed_term)

        for i in range(len(listed_text)):
            word = listed_text[i]
            curr = listed_term[0]
    
            if curr == word.lower():
                if len(listed_text) >= i + len(listed_term):
                    aux1 = ''
                    aux2 = ''
                    for j in range(len(listed_term)):
                        #if listed_text[i+j].lower() == listed_term[j]:
                        aux1 += listed_text[i+j] + ' '
                        aux2 += listed_term[j]   + ' '   
                        #elif listed_text[i+j] == '-' and listed_term == '':
                        #    aux1 = aux1.strip() + '-'
                        #    aux2 = aux2.strip() + '-'
                    
                    if aux1.strip().lower() == aux2.strip():
                        pred[0] = aux1.strip()
                        print("1 >> Changed to:", pred[0])
                    
                    aux2 = aux2.strip(' ') + '/'    # one case I can't write in regex: '8GB RAM/'
                    if aux1.strip().lower() == aux2.strip():
                        pred[0] = aux1.strip().strip('/')
                        print("2 >> Changed to:", pred[0])
        '''
        if pred[0] not in text_sample:
            if ' - ' in pred[0]:
                term = term.replace(' - ', '-')
                print(term)
            listed_term = tokenize_line_2(term)
            print("tokenized term 2", listed_term)

            for i in range(len(listed_text)):
                word = listed_text[i]
                curr = listed_term[0]
        
                if curr == word.lower():
                    if len(listed_text) >= i + len(listed_term):
                        aux1 = ''
                        aux2 = ''
                        for j in range(len(listed_term)):
                            aux1 += listed_text[i+j] + ' '
                            aux2 += listed_term[j]   + ' '   

                        if aux1.strip().lower() == aux2.strip():
                            pred[0] = aux1.strip()
                            print("3 >> Changed to:", pred[0])
                        
                        aux2 = aux2.strip(' ') + '/'    # one case I can't write in regex: '8GB RAM/'
                        if aux1.strip().lower() == aux2.strip():
                            pred[0] = aux1.strip().strip('/')
                            print("4 >> Changed to:", pred[0])
        '''
 
print(net_outputs)

# Stopwords tests

In [3]:
!pip install nltk
import nltk
nltk.download('stopwords', download_dir='./nltk_data')
nltk.data.path.append('./nltk_data')
stopset = set(nltk.corpus.stopwords.words('english'))
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")





[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
sent = 'From the moment you enter till the moment you walk out the friendly and helpful staff was was just Fantastic.'
for word in stopset:
    if word in sent.split():
        sent = sent.replace(word, '')
print(sent)
tokenizer.tokenize(sent)

From  moment  enter till  moment  walk   friendly  helpful staff    Fantastic.


['from',
 'moment',
 'enter',
 'till',
 'moment',
 'walk',
 'friendly',
 'helpful',
 'staff',
 'fantastic',
 '.']

In [1]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 3.1 MB/s 
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [11]:
# import SentimentIntensityAnalyzer class
# from vaderSentiment.vaderSentiment module.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 
# function to print sentiments
# of the sentence.
def sentiment_scores(sentence):
 
    # Create a SentimentIntensityAnalyzer object.
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    sentiment_dict = sid_obj.polarity_scores(sentence)
    print(f"Negative: {sentiment_dict['neg']*100:.2f}%")
    print(f"Positive: {sentiment_dict['neu']*100:.2f}%")
    print(f"Neutral : {sentiment_dict['pos']*100:.2f}%")
    print(f"Compound: {sentiment_dict['compound']*100:.2f}%")
 
    print(">>", end = " ")
 
    # decide sentiment as positive, negative and neutral
    if sentiment_dict['compound'] >= 0.05 :
        print("Positive")
    elif sentiment_dict['compound'] <= - 0.05 :
        print("Negative")
    else :
        print("Neutral")
 
 
   
# Driver code
if __name__ == "__main__" :
 
    sentence = "I love you"
 
    # function calling
    sentiment_scores(sentence)
 
    sentence = "study is going on as usual"
    sentiment_scores(sentence)
 
    sentence = "I am vey sad today."
    sentiment_scores(sentence)

    sentence = "My wife and I always enjoy the young, not always well trained but nevertheless friendly, staff, all of whom have a story."
    sentiment_scores(sentence)

Negative: 0.00%
Positive: 32.30%
Neutral : 67.70%
Compound: 63.69%
>> Positive
Negative: 0.00%
Positive: 100.00%
Neutral : 0.00%
Compound: 0.00%
>> Neutral
Negative: 43.70%
Positive: 56.30%
Neutral : 0.00%
Compound: -47.67%
>> Negative
Negative: 5.20%
Positive: 70.90%
Neutral : 23.90%
Compound: 71.78%
>> Positive


In [1]:
a = [int(elem) for elem in '-1'.split(':')]

'concat' in 'concat-max'

usage: ipykernel_launcher.py [-h]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/nemo/.local/share/jupyter/runtime/kernel-c425b673-e956-48a4-be62-84d559b0a21c.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [1]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [33]:
sent = "I love this PIZZA!"
shit = "shit"
encoded   = tokenizer.encode(sent, add_special_tokens=True)
decoded   = tokenizer.decode(encoded[1:-1])
aux = tokenizer.tokenize(sent)
aux.insert(0, '<s>')
aux.insert(len(aux), '</s>')
#print(encoded, len(encoded))
#print(decoded)
print(sent)
print(aux, len(aux))

þ = ''
for i in range(len(aux)):
    if aux[i] == '<s>' or aux[i] == '</s>':
        continue
    if aux[i] == '<pad>':
        break
    if aux[i] == '<unk>':
        þ += ' ' + '<unk>'

    if aux[i][0] == 'Ġ':
        þ += ' ' + aux[i][1:]
    else:
        þ += aux[i]

print(þ)

I love this PIZZA!
['<s>', 'I', 'Ġlove', 'Ġthis', 'ĠP', 'IZ', 'ZA', '!', '</s>'] 9
I love this PIZZA!


In [48]:
sent = 'I Love this PIZZA!'
inputs = tokenizer( sent,
                    truncation=True,
                    padding='longest',
                    return_tensors='pt')

# build the labels
import numpy as np
terms_vectors = np.full_like(inputs['input_ids'], dtype=int,
                            fill_value=0)
current = inputs['input_ids'][0]
encoded_targets = tokenizer.encode(' PIZZA', add_special_tokens=False)

print("text    ", tokenizer.tokenize(sent))
print("text ids", current.numpy())
print("targets ", encoded_targets)
print("label   ", terms_vectors)

for i in range(len(inputs['input_ids'][0])):
    if current[i] == encoded_targets[0]:
        term_len = len(encoded_targets)
        if len(current) - i >= term_len:    # prevents IndexOutOfError for prefix matches
            aux1 = []
            aux2 = []
            for j in range(term_len):
                aux1.append(current[i+j].item())
                aux2.append(encoded_targets[j])
            if aux1 == aux2:                            # I check all the sequence long 'term_len'
                for j in range(term_len):
                    terms_vectors[0][i+j] = 2
                terms_vectors[0][i]       = 1
                continue
print("label   ", terms_vectors)

for i in range(len(inputs['input_ids'][0])):
    if current[i] == 1:
        break
    elif current[i] in [0, 2]:
        continue
    else:
        if terms_vectors[0][i] == 0:
            terms_vectors[0][i] = 3
print("label   ", terms_vectors)

text     ['I', 'ĠLove', 'Ġthis', 'ĠP', 'IZ', 'ZA', '!']
text ids [    0   100  3437    42   221 17045 22447   328     2]
targets  [221, 17045, 22447]
label    [[0 0 0 0 0 0 0 0 0]]
label    [[0 3 3 3 1 2 2 3 0]]
