<a href="https://colab.research.google.com/github/akhilkapil/NLP-Transfer_Learning_Models/blob/main/XLNeT_Pytorch_Sentimental_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [2]:
import pandas as pd 
import numpy as np 
import torch
import re
import transformers
from gensim.parsing.preprocessing import remove_stopwords
from transformers import AdamW, XLNetTokenizer, XLNetModel, XLNetLMHeadModel, XLNetConfig, XLNetForSequenceClassification
from sklearn.model_selection import train_test_split

In [3]:
from tqdm import tqdm, trange 
import io

In [None]:
# Location of the TripAdvisor hotel-review CSV inside the Colab runtime.
REVIEWS_CSV = '/content/tripadvisor_hotel_reviews.csv'

# Read the whole file into a DataFrame and preview the first rows.
data = pd.read_csv(REVIEWS_CSV)
data.head()

In [5]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report what was selected so the run log shows which hardware was used.
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [7]:
# Characters that clean_text() turns into a single space.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']


def clean_text(text):
    """Return ``str(text)`` with every entry of ``puncts`` replaced by one space.

    The input is coerced with ``str()`` first, so non-string values (numbers,
    NaN, ...) are accepted and cleaned via their string form.
    """
    cleaned = str(text)
    for symbol in puncts:
        # str.replace is a no-op when the symbol is absent, so no membership test is needed.
        cleaned = cleaned.replace(symbol, ' ')
    return cleaned

# Compiled once at definition time instead of on every call.
# Each range is deliberately narrow: a single span from U+24C2 to U+1F251 would
# also swallow CJK, Hangul, full-width punctuation, box-drawing characters and
# most other non-Latin text, none of which are emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002702-\U000027B0"  # dingbats
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U000024C2"             # circled latin capital letter M
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Strip emoji and pictographic symbols from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every run of characters in the emoji ranges above
        removed. All other characters, including non-Latin scripts, are
        left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)
    
# Review-cleaning pipeline. Order matters:
#   1. emoji removal
#   2. URL and @mention removal -- must run BEFORE punctuation stripping, because
#      clean_text() replaces ':', '/', '.' and '@' with spaces, after which the
#      'http\S+' and '@\w*' patterns can no longer match a whole URL or mention
#   3. punctuation -> space
#   4. whitespace collapsing
#   5. stop-word removal
data['Review'] = data['Review'].apply(remove_emoji)
data['Review'] = data['Review'].apply(lambda x: re.sub(r'http\S+', '', x))
data['Review'] = data['Review'].apply(lambda x: re.sub(r'@\w*', '', x))
data['Review'] = data['Review'].apply(clean_text)
data['Review'] = data['Review'].apply(lambda x: ' '.join(x.split()))
data['Review'] = data['Review'].apply(remove_stopwords)

In [8]:
# Map the 1-5 star ratings onto the zero-based class ids 0-4.
encode = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form raises a warning on pandas 2.x and
# does not modify `data` when Copy-on-Write is enabled.
# NOTE: run this cell once per fresh load -- re-running it on already-encoded
# labels would shift them down again.
data['Rating'] = data['Rating'].replace(encode)

In [9]:
# Sanity check: list the distinct Rating values after the remapping in the previous cell.
data['Rating'].unique()

array([3, 1, 2, 4, 0])

In [10]:
# Model inputs and targets: the cleaned review text (as a Series) and the
# encoded ratings (as a NumPy array).
reviews, rating = data['Review'], data['Rating'].values

In [None]:
# Append XLNet's separator and classification markers to the end of every review.
# XLNet spells its special tokens '<sep>' and '<cls>'; the BERT-style literals
# '[SEP]' / '[CLS]' are not in its vocabulary and would be split into ordinary
# sub-word pieces. The leading space keeps '<sep>' from fusing with the last word.
# Built from data['Review'] rather than from `reviews`, so re-running the cell
# does not append the markers a second time.
reviews = [review + ' <sep> <cls>' for review in data['Review']]

# Preview a few rows only; echoing the full list floods the notebook output.
reviews[:3]

In [12]:
# Tokenize every review with the XLNet SentencePiece tokenizer and turn the
# tokens into fixed-length rows of vocabulary ids.
from keras.preprocessing.sequence import pad_sequences

# NOTE(review): do_lower_case=True lower-cases the text fed to a *cased*
# checkpoint -- confirm this is intended.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(review) for review in reviews]

# Every row is padded / truncated to exactly this many token ids.
MAX_LEN = 512

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Pad on the LEFT: XLNet's sequence-classification head summarises the hidden
# state of the last position, so the real tokens must sit at the end of the row
# (right padding would leave a padding slot there). Padding still uses id 0,
# which is what the `> 0` attention-mask computation in the next cell relies on.
# NOTE(review): truncating='post' still drops the tail of reviews longer than
# MAX_LEN tokens, including any trailing markers -- confirm this is acceptable.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype='long', padding='pre', truncating='post')


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




In [13]:
# Build one attention mask per sequence: 1.0 where the token id is positive,
# 0.0 where it is 0 (the value used for padding).
# NOTE(review): any genuine token whose id is 0 would be masked as well --
# confirm which vocabulary entry id 0 corresponds to.
attention_masks = [[float(token_id > 0) for token_id in row] for row in input_ids]

In [14]:
# Use train_test_split to carve out 10% of the data for validation.
# Ids, labels and masks go through a single call, so all three are partitioned
# with the same shuffled indices (same seed and test size as before).
(train_inputs, val_inputs,
 train_labels, val_labels,
 train_masks, val_masks) = train_test_split(input_ids, rating, attention_masks,
                                            random_state=2018, test_size=0.1)

In [15]:
# Convert each train/validation split to a torch tensor, pairing the two halves
# of every split on one line.
train_inputs, val_inputs = torch.tensor(train_inputs), torch.tensor(val_inputs)
train_labels, val_labels = torch.tensor(train_labels), torch.tensor(val_labels)
train_masks, val_masks = torch.tensor(train_masks), torch.tensor(val_masks)

In [16]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

# Number of examples per mini-batch for both loaders.
batch_size = 3

# Every dataset item is the triple (input ids, label, attention mask).
train_data = TensorDataset(train_inputs, train_labels, train_masks)
val_data = TensorDataset(val_inputs, val_labels, val_masks)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [17]:
# Re-display the distinct Rating values; their count is what the next cell passes as num_labels.
data['Rating'].unique()

array([3, 1, 2, 4, 0])

In [None]:
# Load the pretrained 'xlnet-base-cased' weights into a sequence-classification model,
# with num_labels set to the number of distinct values in the Rating column.
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels = len(data['Rating'].unique()))

In [None]:
# Move the model onto the device selected earlier (GPU when available, else CPU).
# As the cell's last expression, the returned module is also echoed in the output.
model.to(device)

In [20]:
# Split the model parameters into two optimizer groups: one that receives
# weight decay and one that does not.
param_optimizer = list(model.named_parameters())

# Name fragments of parameters that must NOT be decayed: biases and LayerNorm
# weights. 'gamma' / 'beta' are kept for checkpoints that still use the older
# LayerNorm naming; the '*.weight' fragments cover the current naming.
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight', 'layer_norm.weight']

# The per-group option AdamW reads is 'weight_decay'. A group keyed
# 'weight_decay_rate' is silently ignored, so the optimizer's default decay
# would be applied to every parameter instead.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [21]:
# Build the AdamW optimizer over the two parameter groups defined in the previous cell,
# with learning rate 3e-5 and epsilon 1e-10.
# NOTE(review): AdamW is imported from `transformers` here; newer releases may expect
# torch.optim.AdamW instead -- confirm against the installed version.
optimizer = AdamW(optimizer_grouped_parameters, lr = 3e-5, eps=1e-10)

In [22]:
# Accuracy helper used during evaluation.
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: NumPy array of integer class ids; it is flattened before comparison.

    Returns:
        Number of correct predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

In [None]:
# Fine-tune the model for a fixed number of epochs.
# train_loss_set collects the loss of every batch across all epochs; the mean
# batch loss of each epoch is printed when that epoch finishes.
train_loss_set = []
epochs = 4

for _ in trange(epochs, desc='Epochs'):
    model.train()

    # Running totals, reset at the start of every epoch.
    tr_loss = 0
    nb_tr_examples = 0
    nb_tr_steps = 0

    for step, batch in enumerate(train_dataloader):
        # Dataset items are (input ids, label, attention mask); move them to the device.
        batch = tuple(t.to(device) for t in batch)
        b_inputs, b_labels, b_attention_mask = batch

        optimizer.zero_grad()
        outputs = model(b_inputs,
                        attention_mask=b_attention_mask,
                        token_type_ids=None,
                        labels=b_labels)

        # With labels supplied, the first two outputs are read as loss and logits.
        loss, logits = outputs[0], outputs[1]
        batch_loss = loss.item()
        train_loss_set.append(batch_loss)

        # Backward pass, then one optimizer step on the computed gradients.
        loss.backward()
        optimizer.step()

        # Update the per-epoch totals.
        tr_loss += batch_loss
        nb_tr_examples += b_inputs.size(0)
        nb_tr_steps += 1

    print('Training Loss : {}'.format(tr_loss/nb_tr_steps))

# VALIDATION
# Runs once, after all training epochs, over the held-out validation loader.

# Put the model in evaluation mode (one call is enough for the whole pass).
model.eval()

# Accuracy is accumulated per EXAMPLE rather than by averaging per-batch
# accuracies: a smaller final batch would otherwise carry the same weight as a
# full one and skew the reported figure.
eval_correct, nb_eval_examples = 0, 0
nb_eval_steps = 0

for batch in val_dataloader:
    # Dataset items are (input ids, label, attention mask); move them to the device.
    batch = tuple(t.to(device) for t in batch)
    b_inputs = batch[0]
    b_labels = batch[1]
    b_attention_mask = batch[2]

    # No labels are passed, so the first output holds the logits; gradients are
    # not needed for evaluation.
    with torch.no_grad():
        outputs = model(b_inputs, attention_mask=b_attention_mask, token_type_ids=None)

    logits = outputs[0].detach().cpu().numpy()
    labels = b_labels.to('cpu').numpy()

    # flat_accuracy returns the batch's correct fraction; scale by the batch size
    # to recover the number of correct predictions.
    eval_correct += flat_accuracy(logits, labels) * len(labels)
    nb_eval_examples += len(labels)
    nb_eval_steps += 1

eval_accuracy = eval_correct / nb_eval_examples
print("Validation Accuracy: {}".format(eval_accuracy))