# Sentiment analysis

## Импорт библиотек

In [None]:
import torch   
from torchtext.legacy import data 
import pandas as pd
from torch.utils.data import DataLoader
from tqdm import tqdm

## Просмотр датасета

In [None]:
df = pd.read_csv('dataset_service.tsv', sep='\t', comment='#', header=None)
df.head()

Unnamed: 0,0,1,2,3,4
0,http://twitter.com/reneesa\_devin/statuses/628...,"@DB_Bahn ja, weil in Wuppertal Bauarbeiten sin...",True,neutral,Allgemein
1,http://twitter.com/MrElch/statuses/68737328126...,@nordschaf theoretisch kannste dir überall im ...,True,positive,Zugfahrt
2,http://twitter.com/wolfi\_wiese/statuses/64924...,Bahn verspätet sich..gleich kommt noch jemand ...,True,negative,Zugfahrt
3,http://wirtschaftsnachrichten-online.de/2015/1...,Ihre Anfragen brachten uns zu neuen Leistungen...,False,neutral,
4,http://community.bahn.de/questions/1034649-db-...,Kann ich mit dem DB Geschenk Ticket den ICE Sp...,True,neutral,Allgemein


In [None]:
SEED = 2022

torch.manual_seed(SEED)

torch.backends.cudnn.deterministic = True  

# Предобработка датасета

In [None]:
!python -m spacy download de

import re
import spacy  

spacy_ger = spacy.load("de")

def cleanup_text(texts):
    cleaned_text = []
    for text in texts:
        # remove punctuation
        text = re.sub('[^a-zA-Z0-9]', ' ', text)
        # remove multiple spaces
        text = re.sub(r' +', ' ', text)
        # remove newline
        text = re.sub(r'\n', ' ', text)

        cleaned_text.append(text)
    return cleaned_text

def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

TEXT = data.Field(tokenize=tokenize_ger, 
                  preprocessing=cleanup_text,
                  batch_first=True, 
                  include_lengths=True, 
                  lower=True)
LABEL = data.LabelField()

Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 23.2 MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-py3-none-any.whl size=14907055 sha256=dcdc5655285fa26947ca7512f4bbc8144cc89821cc85bd1b2229c1b6f5eff224
  Stored in directory: /tmp/pip-ephem-wheel-cache-5__3lbtz/wheels/00/66/69/cb6c921610087d2cab339062345098e30a5ceb665360e7b32a
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib

## Определение колонок необходимых для модели

In [None]:
fields = [(None, None), ('text', TEXT), (None, None), ('label', LABEL)]

In [None]:
training_data = data.TabularDataset(path = 'dataset_service.tsv', fields = fields, format = 'tsv', skip_header = True)

#print preprocessed text
print(vars(training_data.examples[0]))

{'text': [' nordschaf', 'theoretisch', 'kannste', 'dir', ' berall', 'im', 'k lner', 'stadtbereich', 'was', 'suchen', ' ', 'mit', 'der', 'kvb', ' ', 's bahn', 'kommt', 'man', ' berall', 'fix', 'hin', ' '], 'label': 'positive'}


## Разделение данных

In [None]:
import random
train_data, valid_data = training_data.split(split_ratio=0.7, 
                                             random_state = random.seed(SEED))
                                             

## Загрузка germany embeddings 

In [None]:
# Download model
!wget https://int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com/vectors.txt

--2022-01-24 17:37:31--  https://int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com/vectors.txt
Resolving int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com (int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com)... 52.219.169.134
Connecting to int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com (int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com)|52.219.169.134|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3744610526 (3.5G) [text/plain]
Saving to: ‘vectors.txt’


2022-01-24 17:39:23 (32.0 MB/s) - ‘vectors.txt’ saved [3744610526/3744610526]



In [None]:
import torchtext.vocab as vocab

custom_embeddings = vocab.Vectors(name = 'vectors.txt')

100%|█████████▉| 1309280/1309281 [02:31<00:00, 8670.42it/s]


In [None]:
TEXT.build_vocab(train_data, min_freq=5, vectors=custom_embeddings)  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
print(TEXT.vocab.stoi)   

#Label dictionary
print(LABEL.vocab.stoi)

Size of TEXT vocabulary: 15281
Size of LABEL vocabulary: 3
[(' ', 200355), ('die', 28722), ('der', 26658), ('und', 23880), ('in', 15502), ('bahn', 11314), ('das', 10589), ('ich', 9968), ('mit', 9662), ('ist', 9024)]
defaultdict(<bound method Vocab._default_unk_index of <torchtext.legacy.vocab.Vocab object at 0x7fe4c7dbc410>>, {'<unk>': 0, '<pad>': 1, ' ': 2, 'die': 3, 'der': 4, 'und': 5, 'in': 6, 'bahn': 7, 'das': 8, 'ich': 9, 'mit': 10, 'ist': 11, 'von': 12, 'zu': 13, 'den': 14, 'f r': 15, 'auf': 16, 'nicht': 17, 'es': 18, 'im': 19, 'ein': 20, 'auch': 21, 'eine': 22, 'sich': 23, 'dem': 24, 'sie': 25, 'nach': 26, 'des': 27, 'an': 28, 'bei': 29, 'am': 30, 'wir': 31, 'so': 32, 'man': 33, 'oder': 34, 'aber': 35, 'wie': 36, 'noch': 37, 'als': 38, 'dass': 39, 'hat': 40, 'aus': 41, 'sind': 42, 're': 43, 'nur': 44, 'um': 45, 'wird': 46, 'wenn': 47, 'werden': 48, 'dann': 49, 'einen': 50, 'mehr': 51, 'zum': 52, 'kann': 53, 'da': 54, 'war': 55, 'was': 56, 'mal': 57, 'haben': 58, 'einer': 59, ' b

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

BATCH_SIZE = 64

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

## Классификатор

In [None]:
import torch.nn as nn

class classifier(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):

        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs

## Гиперпараметры

In [None]:
size_of_vocab = len(TEXT.vocab)
embedding_dim = 300
num_hidden_nodes = 32
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = True, dropout = dropout)

## Информация о модели

In [None]:
print(model)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

classifier(
  (embedding): Embedding(15281, 300)
  (lstm): LSTM(300, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)
The model has 4,694,957 trainable parameters
torch.Size([15281, 300])


## Вспомогательные функции

In [None]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss()

 
def binary_accuracy(preds, y):
    rounded_preds = torch.round(preds)
    
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
     
    model.train()  
    i = 0
    for batch in iterator:
        i += 1
        print(f'batch #{i}')
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        
        #convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()  
        
        #compute the loss
        loss = criterion(predictions, batch.label)        
        
        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        #backpropage the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

## Функция оценки

In [None]:
def evaluate(model, iterator, criterion):
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            #retrieve text and no. of words
            text, text_lengths = batch.text
            
            #convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()
            
            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

## Обучение модели

In [None]:
N_EPOCHS = 3
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    print(f'epoch {epoch}')
    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

epoch 0
batch #1
batch #2
batch #3
batch #4
batch #5
batch #6
batch #7
batch #8
batch #9
batch #10
batch #11
batch #12
batch #13
batch #14
batch #15
batch #16
batch #17
batch #18
batch #19
batch #20
batch #21
batch #22
batch #23
batch #24
batch #25
batch #26
batch #27
batch #28
batch #29
batch #30
batch #31
batch #32
batch #33
batch #34
batch #35
batch #36
batch #37
batch #38
batch #39
batch #40
batch #41
batch #42
batch #43
batch #44
batch #45
batch #46
batch #47
batch #48
batch #49
batch #50
batch #51
batch #52
batch #53
batch #54
batch #55
batch #56
batch #57
batch #58
batch #59
batch #60
batch #61
batch #62
batch #63
batch #64
batch #65
batch #66
batch #67
batch #68
batch #69
batch #70
batch #71
batch #72
batch #73
batch #74
batch #75
batch #76
batch #77
batch #78
batch #79
batch #80
batch #81
batch #82
batch #83
batch #84
batch #85
batch #86
batch #87
batch #88
batch #89
batch #90
batch #91
batch #92
batch #93
batch #94
batch #95
batch #96
batch #97
batch #98
batch #99
batch #100


## Загрузка модели

In [None]:
#load weights
model.load_state_dict(torch.load('model.pt'));
model.eval();                   

In [None]:
from sklearn.metrics import classification_report


def cr(model, test_loader, version='title', threshold=0.5):
    y_pred = []
    y_true = []

    model.eval()
    with torch.no_grad():
        for batch in test_loader:
      
            text, text_lengths = batch.text
            
            labels = batch.label
            labels = labels.to(device)
            text = text.to(device)
            text_lengths = text_lengths.to(device)
            output = model(text, text_lengths)

            output = (output > threshold).int()
            y_pred.extend(output.tolist())
            y_true.extend(labels.tolist())
    
    print('Classification Report:')
    print(classification_report(y_true, y_pred, labels=[0, 1], digits=4))
            

In [None]:
valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

	 Val. Loss: -0.080 |  Val. Acc: 71.56%


In [None]:
cr(model, valid_iterator)

Classification Report:
              precision    recall  f1-score   support

           0     0.8325    0.8005    0.8162      4351
           1     0.4752    0.6255    0.5401      1594

   micro avg     0.7131    0.7536    0.7328      5945
   macro avg     0.6538    0.7130    0.6781      5945
weighted avg     0.7367    0.7536    0.7421      5945



## Предсказание

In [None]:
def predict(model, sentences):
    predicts = []
    for sentence in sentences:
        tokenized = [tok.text for tok in spacy_ger.tokenizer(sentence)]    
        indexed = [TEXT.vocab.stoi[t] for t in tokenized]           
        length = [len(indexed)]                                     
        tensor = torch.LongTensor(indexed).to(device)              
        tensor = tensor.unsqueeze(1).T                              
        length_tensor = torch.LongTensor(length)                   
        prediction = model(tensor, length_tensor)
        if 0 <= prediction.item() <= 0.33:
            predicts.append('neutral')
        elif 0.33 < prediction.item() <= 0.66:
            predicts.append('positive')
        elif 0.66 < prediction.item() <= 1:
            predicts.append('negative')              
    return predicts

In [None]:
print('result: ', predict(model, ['ich hasse diese Welt', 'Ich habe gute Laune' ]))

result:  ['negative', 'positive']
