<a href="https://colab.research.google.com/github/btasde42/BertSentimentAnalysis/blob/master/Tasdelen_Analyse_de_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DEEP LEARNING MODEL FOR SENTIMENT ANALYSIS**

## **Preprocessing data and train/valid set split**

---



In [None]:
import torch
import pandas as pd
import numpy as np
import nltk #NLP library.
nltk.download("stopwords")
from nltk.corpus import stopwords
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Select the compute device: CUDA when a GPU is visible to torch, the CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [None]:
#upload file from files section in left
datafile='dataset.txt' #define main dataset

_STOPWORD_PATTERN = None #compiled lazily by the first preprocess() call, then reused


def _get_stopword_pattern():
  """Build once, then return, the regex matching an English stopword plus trailing whitespace."""
  global _STOPWORD_PATTERN
  if _STOPWORD_PATTERN is None:
    # re.escape keeps any regex metacharacter inside a stopword from altering the pattern
    alternatives = r'|'.join(re.escape(word) for word in stopwords.words('english'))
    _STOPWORD_PATTERN = re.compile(r'\b(' + alternatives + r')\b\s*')
  return _STOPWORD_PATTERN


def preprocess(text):
  """Function to preprocess text input: lowercase, special char removing, stopword removing
  Args:
    text: str
  Returns:
    text: modified str, lowercased, without quotes/newlines/stopwords and with
      single spaces between the remaining words
  """
  text=text.lower() #put strings on lower

  # removed in this order: literal backslash-n, single quote, real newline, double quote
  # NOTE(review): apostrophes are stripped before stopword removal, so a stopword that
  # contains an apostrophe can no longer match - confirm this is intended
  for unwanted in ("\\n", "'", "\n", '"'):
    text=text.replace(unwanted,'')

  text=re.sub(r'\s+', ' ', text).strip() #unify space between words

  #the pattern is cached: rebuilding it per review re-reads the stopword list every time
  text=_get_stopword_pattern().sub('', text) #remove english stopwords

  return text.strip() #a stopword at the very end would otherwise leave a trailing space
  



def read_data_file(datafile,labelled=True):
  """File reader function: skips the header line and preprocesses every review.

  Args:
    datafile: path of a labelled or non labelled text file; labelled lines hold
      the label and the review text as the first two tab-separated fields
    labelled: boolean flag for labelled text
  Returns:
    if labelled, a list of [label, text] pairs where label 1 becomes 0 and any
    other label becomes 1; otherwise a list of preprocessed strings
  """

  list_data=[]
  with open(datafile, 'r', encoding='utf-8') as f: #get reviews from datafile
    next(f, None) #skip headers; the default avoids StopIteration on an empty file
    for line in f:
      if not labelled:
        list_data.append(preprocess(line))
        continue

      if not line.strip():
        continue #a blank line has no label/text fields to split

      fields=line.split('\t') #split once, then take the first two fields
      label, text = fields[0], fields[1]
      preprocessed_text=preprocess(text) #preprocess text
      label = 0 if int(label)==1 else 1 #transforming classes to [0;1]
      list_data.append([label, preprocessed_text])
  return list_data

In [None]:
#Creating balanced train/valid/test sets (70% / 15% / 15%)
SPLIT_SEED=42 #fixed seed so every run produces the same split

df_data=pd.DataFrame(read_data_file(datafile),columns=['label','text'])

#70% train, 30% held out, stratified so both parts keep the class balance
x_train,x_test,y_train,y_test=train_test_split(df_data['text'],
                                               df_data['label'],
                                               test_size=0.30,
                                               shuffle=True,
                                               stratify=df_data['label'],
                                               random_state=SPLIT_SEED)

#split the held-out 30% in half: one half stays the test set, the other is the validation set
x_test,x_valid,y_test,y_valid=train_test_split(x_test,
                                               y_test,test_size=0.50,
                                               shuffle=True,
                                               stratify=y_test,
                                               random_state=SPLIT_SEED)
print("Train label counts:")
print(y_train.value_counts())
print()
print("Test label counts:")
print(y_test.value_counts())
print()
print("Validation label counts:")
print(y_valid.value_counts())

#create for each dataset text and label values in separated lists 
train_x=x_train.astype(str).tolist()
train_y=y_train.astype(int).tolist()

valid_x=x_valid.astype(str).tolist()
valid_y=y_valid.astype(int).tolist()

test_x=x_test.astype(str).tolist()
test_y=y_test.astype(int).tolist()


Train label counts:
0    18782
1    16218
Name: label, dtype: int64

Test label counts:
0    4025
1    3475
Name: label, dtype: int64

Validation label counts:
0    4024
1    3476
Name: label, dtype: int64


## **Set Bert tokenizer**

---



In [None]:
#installing tokenizer transformers package
print("Installing BertModel, BertConfig and BertTokenizer!")
try: #if transformers model already installed
  from transformers import BertModel, BertConfig, BertTokenizer
except ImportError: #else
  !pip install transformers #install transformers
  from transformers import BertModel, BertConfig, BertTokenizer


Installing BertModel, BertConfig and BertTokenizer!


In [None]:
# load the pretrained tokenizer published under the name bert-base-uncased, with lowercasing requested
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True) 

In [None]:
def tokenize_Bert(dataset,MAX_LEN):
  """Function to implement bert tokenization/truncation (no padding)
  Args:
    dataset: list of reviews (list(str))
    MAX_LEN: max_length passed to the tokenizer; longer reviews are truncated
  Returns:
    list with one list of token ids per review, in the order of dataset
  """
  return [
      tokenizer.encode(review,
                       add_special_tokens=True, #add all special tokens
                       truncation=True, #truncate longer reviews than MAX_LEN
                       max_length=MAX_LEN,
                       padding=False) #padding is done later, batch by batch
      for review in dataset
  ]



## **Smart batch creation for datasets**

---



In [None]:
def pad_batches(batch_reviews,batch_labels,pad_token_id=None):
  """Pad every batch to the length of its own longest sequence and convert it to tensors
  Args:
    batch_reviews: list of batches, each batch a list of token-id lists
    batch_labels: list of batches, each batch a list of int labels
    pad_token_id: id appended as padding; when None, tokenizer.pad_token_id
      of the module-level tokenizer is used
  Returns:
    [tensor_inputs, tensor_attention_masks, tensor_labels]: three parallel lists
    holding one tensor per batch
  """
  if pad_token_id is None:
    pad_token_id=tokenizer.pad_token_id

  #one tensor per batch is collected in each of these lists
  tensor_inputs = []
  tensor_attention_masks = []
  tensor_labels = []

  #distinct loop names: the original loop variable reused the name of the batch_labels parameter
  for sentences, labels in zip(batch_reviews,batch_labels):

    #within a batch every sentence is padded up to the longest one
    longest=max(len(sent) for sent in sentences)

    padded_sentences=[sent + [pad_token_id]*(longest-len(sent)) for sent in sentences]

    #attention mask: 1 for real tokens, 0 for padding positions
    attention_masks=[[1]*len(sent) + [0]*(longest-len(sent)) for sent in sentences]

    tensor_inputs.append(torch.tensor(padded_sentences))
    tensor_attention_masks.append(torch.tensor(attention_masks))
    tensor_labels.append(torch.tensor(labels))

  return [tensor_inputs,tensor_attention_masks,tensor_labels]

In [None]:
import random

def select_batches(review_list,label_list,batch_size,max_len=500):
  """Smart batching technique for bert to reduce training time: reviews are sorted
  by token length and each batch is a random run of neighbours in that order, so
  the sequences of one batch have similar lengths and need little padding.

  Args:
    review_list: list of str for extracted reviews
    label_list: list of int for extracted labels
    batch_size: number of examples in each batch (the last drawn batch may be smaller)
    max_len: max_length given to tokenize_Bert for truncation
  Returns:
    the result of pad_batches: [tensor_inputs, tensor_attention_masks, tensor_labels]
  Raises:
    ValueError: if batch_size is smaller than 1
  """
  if batch_size < 1:
    #a batch of size 0 would never shrink the example list and the loop would not end
    raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))

  input_ids=tokenize_Bert(review_list,max_len) #generate tokenization on text dataset
  examples=sorted(zip(input_ids, label_list), key=lambda pair: len(pair[0])) #sort by token count

  # List of batches that we'll construct.
  batch_ordered_sentences = []
  batch_ordered_labels = []

  while examples: #iterate until no example remains

    batch_s=min(batch_size,len(examples)) #fewer than batch_size examples may be left

    #random start index such that a full run of batch_s examples fits
    random_ind=random.randint(0, len(examples) - batch_s)
    batch=examples[random_ind:random_ind + batch_s]

    batch_ordered_sentences.append([pair[0] for pair in batch])
    batch_ordered_labels.append([pair[1] for pair in batch])

    # Remove these examples from the list.
    del examples[random_ind:random_ind + batch_s]

  return pad_batches(batch_ordered_sentences,batch_ordered_labels)

In [None]:
import time
start_time = time.time()

#Create train batches
train_data=select_batches(train_x,train_y,batch_size=30) #[train_inputs,train_att_mask,train_labels]

#create valid batches
valid_data=select_batches(valid_x,valid_y,batch_size=30) #[valid_inputs,valid_att_mask,valid_labels]

#train_data used to be rebuilt a second time here, which only repeated the tokenization work
print("--- %s minutes ---" % ((time.time() - start_time)/60))

--- 2.143338477611542 minutes ---


## **Bert for Model AnalyseSentiment**

---



In [None]:
#load the pretrained bert-base-uncased configuration and model
print("Defining model and config parameters")
config = BertConfig.from_pretrained('bert-base-uncased',num_labels=2) #two classes; read_data_file encodes them as 0 and 1
bert = BertModel.from_pretrained('bert-base-uncased',config=config)


Defining model and config parameters


In [None]:
class AnalyseSentimentBert(torch.nn.Module):
  """
  Text classification model with BERT and
  feedforward fully connected neural network with dropout.

  forward() returns raw class scores (logits), the input that
  torch.nn.CrossEntropyLoss expects; predict_proba() returns the softmax
  probabilities. The arg-max class is the same for both.
  """
  def __init__(self, bert,freeze_bert=False):
    """
    Args:
      bert: encoder module whose first output is indexed as [batch, token, hidden]
      freeze_bert: if True, the encoder parameters are excluded from gradient updates
    """
    super().__init__()
    #hidden size of BERT (read from its config when available), hidden size of classifier, number of labels
    D_in = getattr(getattr(bert, 'config', None), 'hidden_size', 768)
    H, D_out = 50, 2
    self.bert=bert
    self.dropout=torch.nn.Dropout(0.1) #dropout layer
    self.activation_relu=torch.nn.ReLU() #relu activation
    self.linear1=torch.nn.Linear(D_in,H) #linear layer
    self.linear2=torch.nn.Linear(H,D_out) #output layer
    self.activation_softmax=torch.nn.Softmax(dim=1) #used by predict_proba only

    # Freeze the BERT model
    if freeze_bert:
      for param in self.bert.parameters():
        param.requires_grad = False

  def forward(self, input_ids,attention_masks):
    """Return the logits, one row per sequence and one column per class."""
    bert_out=self.bert(input_ids,attention_mask=attention_masks)
    last_hidden_state_cls=bert_out[0][:, 0, :] #vector at the first token position of every sequence
    x=self.linear1(last_hidden_state_cls)
    x=self.activation_relu(x)
    x=self.dropout(x)
    #no softmax here: CrossEntropyLoss applies log-softmax itself, so feeding it
    #probabilities would normalise twice and flatten the gradients
    return self.linear2(x)

  def predict_proba(self, input_ids,attention_masks):
    """Return class probabilities (softmax over the logits of forward())."""
    return self.activation_softmax(self.forward(input_ids,attention_masks))


In [None]:
#build the classifier around the loaded BERT encoder and move it to the selected device
model = AnalyseSentimentBert(bert).to(device)

In [None]:
#model(train_data[0][2],train_data[1][2])

## **Fine-tuning BERT model**

In [None]:
#setting optimizer
#torch.optim.AdamW is used instead of transformers.AdamW, which transformers deprecated in
#favour of the PyTorch implementation; eps and weight_decay are set to the values the
#transformers class used by default so the update rule stays the same
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# define the loss function
cross_entropy  = torch.nn.CrossEntropyLoss() #applies log-softmax internally, so it must receive raw logits

def train(model, traindata, validdata, epochs=1, evaluate=False, device=device):
  """Training for bert sentiment classifier.

  Uses the module-level `optimizer` and `cross_entropy` objects.

  Args:
    model: classifier called as model(input_ids, attention_mask)
    traindata: [inputs, attention_masks, labels], three parallel lists with one tensor per batch
    validdata: same layout as traindata; only read when evaluate is True
    epochs: number of passes over traindata
    evaluate: if True, after every epoch compute the validation loss, save the
      weights to 'saved_model.pt' when that loss improves, and print both losses
    device: device every batch is moved to before the forward pass
  """

  min_loss = float('inf') #best validation loss seen so far

  for epoch_i in range(epochs): #loop on epoch number

    total_train_loss = 0.0
    model.train() #training mode on

    #traindata is [inputs, masks, labels]: its own length is always 3, the number
    #of batches is the length of the inner lists
    n_train_batches = len(traindata[0])

    for batch in range(n_train_batches): #loop on every batch in traindata

      #add batch features to GPU
      batch_input_id=traindata[0][batch].to(device)
      batch_att_mask=traindata[1][batch].to(device)
      batch_labels=traindata[2][batch].to(device)

      model.zero_grad() #clear gradients left from the previous batch

      predictions = model(batch_input_id, batch_att_mask) #make predictions
      loss=cross_entropy(predictions,batch_labels) #calculate loss

      total_train_loss+=loss.item()

      loss.backward() #compute gradients; without this call optimizer.step() has nothing to apply

      #norm of the gradients==1.0 to prevent "exploding gradients"
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      optimizer.step() #update parameters

    #train loss for epoch (max() guards against an empty batch list)
    average_train_loss=total_train_loss/max(n_train_batches, 1)

    if evaluate: #if evaluation mode

      model.eval() #deactivate dropout layer

      total_eval_loss=0.0
      n_valid_batches=len(validdata[0])

      for batch in range(n_valid_batches):

        #add batch features to GPU
        batch_input_id=validdata[0][batch].to(device)
        batch_att_mask=validdata[1][batch].to(device)
        batch_labels=validdata[2][batch].to(device)

        with torch.no_grad(): #no gradients are needed for validation
          predictions=model(batch_input_id, batch_att_mask)
          loss=cross_entropy(predictions,batch_labels) #calculate loss
          total_eval_loss+=loss.item()

      #compute valid loss of the epoch
      average_valid_loss=total_eval_loss/max(n_valid_batches, 1)

      if average_valid_loss < min_loss: #if valid loss is better
        min_loss=average_valid_loss
        torch.save(model.state_dict(), 'saved_model.pt') #save model parameters

      print("Train loss of epoch:"+str(epoch_i)+" ::: "+str(average_train_loss))
      print("Valid loss of epoch:"+str(epoch_i)+" ::: "+str(average_valid_loss))
      print()

In [None]:
train(model,train_data,valid_data,evaluate=True)

Train loss of epoch:0 ::: 0.6975581844647726
Valid loss of epoch:0 ::: 0.682419498761495



## **Make Predictions**

In [None]:
#call model
path = 'saved_model.pt'
#map_location lets weights saved on a GPU be loaded when only the CPU is available
model.load_state_dict(torch.load(path, map_location=device)) #load best model

<All keys matched successfully>

In [None]:
#Tokenize for test data or never seen data
def prepare(data,is_Test=True,max_length=100):
  """Tokenize reviews into fixed-length id and attention-mask tensors
  Args:
    data: list of review strings, or the path of an unlabelled data file when is_Test is False
    is_Test: bool flag; False means data is a file to read with read_data_file(labelled=False)
    max_length: every review is truncated/padded to this many token ids
  Returns:
    list [input_ids, attention_masks] of two tensors with one row per review
  """
  if not is_Test: #if new data/string
    data=read_data_file(data,labelled=False)

  input_ids=[]
  attention_masks=[]
  for sent in data:
    #NOTE(review): select_batches truncates training reviews at 500 tokens while the
    #default here is 100 - confirm the mismatch is intended
    encoded = tokenizer(sent,
                        add_special_tokens=True,
                        truncation=True,
                        max_length=max_length,
                        padding='max_length', #replaces the deprecated pad_to_max_length=True
                        return_attention_mask=True)
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])

  input_ids = torch.tensor(input_ids, dtype=torch.long)
  attention_masks = torch.tensor(attention_masks, dtype=torch.long)
  return [input_ids, attention_masks]


In [None]:
#try on test data

test_data=prepare(test_x[:200])
test_y=torch.as_tensor(test_y[:200]) #as_tensor keeps this cell re-runnable once test_y is already a tensor

model.eval() #make sure dropout is disabled for inference
with torch.no_grad():
  preds = model(test_data[0].to(device), test_data[1].to(device)) #make prediction
  preds = preds.detach().cpu().numpy()




In [None]:

#predicted class of each review = column index of its highest score
final_preds = preds.argmax(axis=1)

print(classification_report(test_y, final_preds))

              precision    recall  f1-score   support

           0       0.63      1.00      0.77       126
           1       0.00      0.00      0.00        74

    accuracy                           0.63       200
   macro avg       0.32      0.50      0.39       200
weighted avg       0.40      0.63      0.49       200



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
def pred_sequence(seq):
  """Predict the sentiment label of one raw review string
  Args:
    seq: raw review text (str); it is cleaned with preprocess() before tokenization
  Returns:
    1 if the model predicts class 0, 2 if it predicts class 1
  """
  cleaned=preprocess(seq)
  encoded=tokenizer(cleaned,
                    add_special_tokens=True,
                    truncation=True,
                    max_length=100,
                    padding='max_length', #replaces the deprecated pad_to_max_length=True
                    return_attention_mask=True,
                    return_tensors='pt')

  model.eval() #make sure dropout is disabled for inference
  with torch.no_grad():
    pred = model(encoded['input_ids'].to(device), encoded['attention_mask'].to(device))

  #argmax on the tensor itself also works on CUDA tensors, which np.argmax cannot convert
  predicted_class=int(pred.argmax(dim=1).item())
  return predicted_class + 1 #class 0 -> label 1, class 1 -> label 2

In [None]:
#sample review; the quotes around "this time" are escaped (a bare backslash before the t was read as a tab)
text="I got 'new' tires from them and within two weeks got a flat. I took my car to a local mechanic to see if i could get the hole patched, but they said the reason I had a flat was because the previous patch had blown - WAIT, WHAT? I just got the tire and never needed to have it patched? This was supposed to be a new tire. \nI took the tire over to Flynn's and they told me that someone punctured my tire, then tried to patch it. So there are resentful tire slashers? I find that very unlikely. After arguing with the guy and telling him that his logic was far fetched he said he'd give me a new tire \"this time\". \nI will never go back to Flynn's b/c of the way this guy treated me and the simple fact that they gave me a used tire!"
print(pred_sequence(text))



1
