<a href="https://colab.research.google.com/github/YaroNemec/NLP-Sentimental-Analysis/blob/master/NLP_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.19.0-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-

In [4]:
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from textwrap import wrap

In [5]:
# Initialization: global hyperparameters and dataset location.
RANDOM_SEED = 37   # seed shared by NumPy and PyTorch
MAX_LEN = 200      # max token length passed to the tokenizer (pad/truncate)
BATCH_SIZE = 8
NCLASSES = 2       # number of output classes of the classifier
DATASET_PATH = '/content/drive/My Drive/Artificial_Intelligence/TweetsModified.csv'

# Seed both random number generators.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Use the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cpu


In [6]:
# Mount Google Drive so the CSV under DATASET_PATH is reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')

# Load the dataset and keep only its first 10000 rows.
df = pd.read_csv(DATASET_PATH).iloc[:10000]

Mounted at /content/drive


In [7]:
# Quick look at the data: first rows, dimensions, and one tweet wrapped for display.
print(df.head())
print(df.shape)
print(*wrap(df.loc[200, 'text']), sep="\n")

                                                text airline_sentiment
0                @VirginAmerica What @dhepburn said.           neutral
1  @VirginAmerica plus you've added commercials t...          positive
2  @VirginAmerica I didn't today... Must mean I n...           neutral
3  @VirginAmerica it's really aggressive to blast...          negative
4  @VirginAmerica and it's a really big bad thing...          negative
(10000, 2)
@VirginAmerica Your website is down and I'm trying to check in!


In [8]:
# Reshape the dataset: derive a binary label (1 = positive, 0 = anything else)
# and drop the original sentiment column.
# Built as a new frame with assign/drop rather than mutating `df` in place:
# `df` is a row slice of the loaded CSV, and assigning a column onto a slice
# triggers pandas' SettingWithCopyWarning.
df = (
    df
    .assign(label=(df['airline_sentiment'] == 'positive').astype(int))
    .drop(columns='airline_sentiment')
)
df.head()

Unnamed: 0,text,label
0,@VirginAmerica What @dhepburn said.,0
1,@VirginAmerica plus you've added commercials t...,1
2,@VirginAmerica I didn't today... Must mean I n...,0
3,@VirginAmerica it's really aggressive to blast...,0
4,@VirginAmerica and it's a really big bad thing...,0


In [9]:
# Tokenization: load the pretrained tokenizer for the checkpoint named below.
# PRE_TRAINED_MODEL_NAME is also used later to load the BERT weights.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
# Tokenize a sample sentence and map each token to its vocabulary id.
sample_text = 'I really loved that city!'
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# Show the sentence, its tokens, and the numeric ids.
print('Frase: ', sample_text)
print('Tokens: ', tokens)
print('Tokens numericos: ', token_ids)

Frase:  I really loved that city!
Tokens:  ['I', 'really', 'loved', 'that', 'city', '!']
Tokens numericos:  [146, 1541, 3097, 1115, 1331, 106]


In [11]:
# Encode the sample sentence in the format fed to BERT.
encoding = tokenizer.encode_plus(
    sample_text,
    max_length = 10,
    truncation = True,
    add_special_tokens = True,      # add the [CLS] / [SEP] markers
    return_token_type_ids = False,
    padding = 'max_length',         # replaces the deprecated pad_to_max_length=True
    return_attention_mask = True,
    return_tensors = 'pt'           # return PyTorch tensors
)



In [12]:
# List the fields the tokenizer returned for the sample sentence.
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [13]:
# Inspect the encoded sample: tokens recovered from the ids, the raw ids,
# and the attention mask.
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]))
print(encoding['input_ids'][0])
print(encoding['attention_mask'][0])

['[CLS]', 'I', 'really', 'loved', 'that', 'city', '!', '[SEP]', '[PAD]', '[PAD]']
tensor([ 101,  146, 1541, 3097, 1115, 1331,  106,  102,    0,    0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0])


In [14]:
# CREACIÓN DATASET

class IMDBDataset(Dataset):
  """Dataset that tokenizes one text per item and pairs it with its label.

  Args:
    texts: indexable collection of raw texts.
    labels: indexable collection of integer labels, aligned with `texts`.
    tokenizer: object exposing `encode_plus` (e.g. a BertTokenizer).
    max_len: length every encoded text is padded or truncated to.
  """

  def __init__(self,texts,labels,tokenizer,max_len):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of texts in the dataset."""
    return len(self.texts)

  def __getitem__(self, item):
    """Return the raw text, its flattened encoding tensors, and its label tensor."""
    text = str(self.texts[item])
    label = self.labels[item]
    # Use the tokenizer given to the constructor, not a module-level global,
    # so the dataset works with whichever tokenizer it was built with.
    encoding = self.tokenizer.encode_plus(
        text,
        max_length = self.max_len,
        truncation = True,
        add_special_tokens = True,
        return_token_type_ids = False,
        padding = 'max_length',  # replaces the deprecated pad_to_max_length=True
        return_attention_mask = True,
        return_tensors = 'pt'
        )

    # flatten() removes the leading dimension added by return_tensors='pt'.
    return {
          'text': text,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'label': torch.tensor(label, dtype=torch.long)
      }

In [15]:
# Data loader:

def data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
  """Build a DataLoader over the `text` and `label` columns of `df`.

  Args:
    df: DataFrame with `text` and `label` columns.
    tokenizer: tokenizer handed to the dataset.
    max_len: length every encoded text is padded or truncated to.
    batch_size: number of examples per batch.
    shuffle: whether the loader reshuffles examples (default False).
    num_workers: number of loader worker processes (default 2).

  Returns:
    A torch DataLoader yielding the dicts produced by IMDBDataset.
  """
  # The dataset constructor names its first argument `texts`; passing it as
  # `reviews` raises a TypeError.
  dataset = IMDBDataset(
      texts = df.text.to_numpy(),
      labels = df.label.to_numpy(),
      tokenizer = tokenizer,
      max_len = max_len        # honor the argument instead of the global MAX_LEN
  )

  return DataLoader(
      dataset,
      batch_size = batch_size,  # honor the argument instead of the global BATCH_SIZE
      shuffle = shuffle,
      num_workers = num_workers
  )

In [16]:
# Hold out 20% of the rows for evaluation; the split is seeded for repeatability.
df_train, df_test = train_test_split(df, test_size = 0.2, random_state = RANDOM_SEED)

# One loader per split, built with identical settings.
train_data_loader, test_data_loader = [
    data_loader(split_df, tokenizer, MAX_LEN, BATCH_SIZE)
    for split_df in (df_train, df_test)
]

In [17]:
#El Modelo

class BERTSentimentClassifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear classification layer."""

  def __init__(self,n_classes):
    super().__init__()
    # return_dict=False makes the encoder return a plain tuple.
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME,return_dict=False)
    # Dropout with p=0.3 (only active while the module is in training mode).
    self.drop = nn.Dropout(p=0.3)
    # Maps BERT's hidden size to one output per class.
    self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return one row of `n_classes` scores per input sequence."""
    # Only the second element of the encoder's output tuple is used
    # (presumably the pooled [CLS] representation - confirm against the BertModel docs).
    _, pooled = self.bert(
        input_ids = input_ids,
        attention_mask = attention_mask
    )
    return self.linear(self.drop(pooled))

In [18]:
# Build the classifier and move it to the selected device in one step.
model = BERTSentimentClassifier(NCLASSES).to(device)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# TRAINING SETUP
EPOCHS = 5

# Loss to be minimized during training.
loss_fn = nn.CrossEntropyLoss().to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# Total number of training iterations: batches per epoch times epochs.
total_steps = EPOCHS * len(train_data_loader)

# Learning-rate schedule with no warmup steps (original note: taken from the paper).
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps = 0, num_training_steps = total_steps
)



In [20]:
# Iteración entrenamiento
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Run one training pass over `data_loader`.

  For every batch: forward pass, loss, backward pass, gradient-norm clipping
  at 1.0, optimizer step, scheduler step, then gradient reset.

  Returns:
    (accuracy, mean_loss): correct predictions divided by `n_examples`
    (a double tensor) and the mean of the per-batch losses.
  """
  model = model.train()  # put the model in training mode
  batch_losses = []
  n_correct = 0
  for batch in data_loader:
    # Move the batch to the target device.
    ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)
    targets = batch['label'].to(device)

    logits = model(input_ids = ids, attention_mask = mask)
    predicted = torch.max(logits, dim=1).indices  # highest-scoring class per row
    batch_loss = loss_fn(logits, targets)

    n_correct += torch.sum(predicted == targets)
    batch_losses.append(batch_loss.item())

    batch_loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  return n_correct.double()/n_examples, np.mean(batch_losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate `model` over `data_loader` without computing gradients.

  Returns:
    (accuracy, mean_loss): correct predictions divided by `n_examples`
    (a double tensor) and the mean of the per-batch losses.
  """
  model = model.eval()  # put the model in evaluation mode
  batch_losses = []
  n_correct = 0
  with torch.no_grad():
    for batch in data_loader:
      # Move the batch to the target device.
      ids, mask, targets = (
          batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
      )
      logits = model(input_ids = ids, attention_mask = mask)
      predicted = torch.max(logits, dim=1).indices  # highest-scoring class per row
      n_correct += torch.sum(predicted == targets)
      batch_losses.append(loss_fn(logits, targets).item())
  return n_correct.double()/n_examples, np.mean(batch_losses)

In [73]:
# Entrenamiento!!!

# One training pass and one evaluation pass per epoch, reporting loss and accuracy for both.
for epoch in range(EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  train_acc, train_loss = train_model(
      model=model,
      data_loader=train_data_loader,
      loss_fn=loss_fn,
      optimizer=optimizer,
      device=device,
      scheduler=scheduler,
      n_examples=len(df_train),
  )
  test_acc, test_loss = eval_model(
      model=model,
      data_loader=test_data_loader,
      loss_fn=loss_fn,
      device=device,
      n_examples=len(df_test),
  )
  print('Entrenamiento: Loss: {}, accuracy: {}'.format(train_loss, train_acc))
  print('Validación: Loss: {}, accuracy: {}'.format(test_loss, test_acc))
  print()

Epoch 1 de 5
------------------




Entrenamiento: Loss: 0.3296662205515895, accuracy: 0.890625
Validación: Loss: 0.32407435586024075, accuracy: 0.9045

Epoch 2 de 5
------------------




Entrenamiento: Loss: 0.1992932712439215, accuracy: 0.944375
Validación: Loss: 0.4143703719693003, accuracy: 0.9045

Epoch 3 de 5
------------------




Entrenamiento: Loss: 0.11518477769414312, accuracy: 0.97375
Validación: Loss: 0.5247552155798185, accuracy: 0.9

Epoch 4 de 5
------------------




Entrenamiento: Loss: 0.07121821544451813, accuracy: 0.9848750000000001
Validación: Loss: 0.5623956057822798, accuracy: 0.9005

Epoch 5 de 5
------------------




Entrenamiento: Loss: 0.047131522076408146, accuracy: 0.990375
Validación: Loss: 0.571707188378321, accuracy: 0.9055



In [21]:
def classifySentiment(review_text):
  """Print `review_text` (wrapped) followed by the sentiment the model predicts.

  Uses the notebook-level `tokenizer`, `model`, `device` and `MAX_LEN`.

  Args:
    review_text: the text to classify.

  Returns:
    None; the result is printed.
  """
  encoding_review = tokenizer.encode_plus(
      review_text,
      max_length = MAX_LEN,
      truncation = True,
      add_special_tokens = True,
      return_token_type_ids = False,
      padding = 'max_length',  # replaces the deprecated pad_to_max_length=True
      return_attention_mask = True,
      return_tensors = 'pt'
      )

  input_ids = encoding_review['input_ids'].to(device)
  attention_mask = encoding_review['attention_mask'].to(device)

  # Inference must run in eval mode: otherwise the classifier's dropout layer
  # stays active and the same text can get different predictions between calls.
  # no_grad avoids building the autograd graph for a prediction-only pass.
  model.eval()
  with torch.no_grad():
    output = model(input_ids, attention_mask)
  _, prediction = torch.max(output, dim=1)

  print("\n".join(wrap(review_text)))
  # Label 1 is the positive class; every other sentiment was mapped to 0.
  if prediction.item() == 1:
    print('Sentimiento predicho: Positivo')
  else:
    print('Sentimiento predicho: Indiferente/Negativo')


In [36]:
# Try the classifier on a hand-written example sentence.
review_text = "Today I had an accident, my arm got brokenp"

classifySentiment(review_text)



Today I had an accident, my arm got brokenp
Sentimiento predicho: Indiferente/Negativo
