In [13]:
!pip install transformers



In [14]:
import pandas as pd
import numpy as np
import random
import time
import pickle

import matplotlib.pyplot as plt
import seaborn as sbn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.decomposition import PCA

import re

from tqdm import tqdm
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F

from transformers import DistilBertTokenizer, DistilBertModel,AdamW, get_linear_schedule_with_warmup

In [15]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


In [16]:
def set_seed(seed_value=42):
    """Seed every random number generator this notebook touches.

    Applies ``seed_value`` to Python's ``random`` module, NumPy's global
    generator, PyTorch's default generator and all CUDA generators.

    Args:
        seed_value: Seed handed unchanged to each library (default 42).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_value)


In [17]:
model_name = "distilbert-base-uncased"

# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# The encoder is asked to also return all hidden states and attention maps.
mBERT = DistilBertModel.from_pretrained(
    model_name,
    output_hidden_states=True,
    output_attentions=True,
)

In [18]:
# Location of the reviews CSV, given relative to the notebook's working directory.
amazon_reviews_path = "../input/amazon-reviews-unlocked-mobile-phones/Amazon_Unlocked_Mobile.csv"

In [19]:
# Read the reviews and shuffle the rows once with a fixed seed; the index is
# rebuilt so row labels are 0..n-1 in the shuffled order.
df = (
    pd.read_csv(amazon_reviews_path)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,Apple iPhone 6 Plus 128GB Factory Unlocked GSM...,,699.95,1,Defective phone. Works only for two days. Phon...,5.0
1,HTC Desire 816 Dual Sim Unlocked Smartphone (W...,HTC,221.0,3,Cool,0.0
2,"BLU Studio 5.0 C HD Unlocked Cellphone, Black",BLU,173.44,5,Beautiful and excellent quality,1.0
3,Apple iPhone 5c 16GB (Pink) - AT&T,Apple,519.0,2,Gave 3 stars because it did not come with a ch...,0.0
4,BLU PURE XL Smartphone - 4G LTE GSM Unlocked -...,BLU,129.99,1,High resolution camera so you can zoom in afte...,1.0


In [20]:
# Number of rows in the shuffled frame.
print(df.shape[0])

413840


In [21]:
# Every row of the shuffled frame from position 20000 onward.
# NOTE(review): in the cells visible here, df_v1 is only referenced by
# commented-out code — confirm whether it is still needed.
df_v1 = df.iloc[20000:]
df_v1.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
20000,Samsung Galaxy Note 3 N900 32GB Unlocked GSM 4...,Samsung,249.99,4,Where is a set of instructions for us 'older u...,1.0
20001,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Great phone,0.0
20002,Apple iPhone 5C Green 8GB Unlocked GSM Smartph...,,159.99,3,Wouldnt know my daughter lost it 3days after p...,0.0
20003,Apple iPhone 5c 32GB - Unlocked - (Blue),Apple,224.77,5,"The phone arrived on time,I'm happy with the p...",2.0
20004,"UHAPPY Touch Screen 5"" QHD MTK6582 Dual SIM Qu...",Wmicro,69.99,4,"Its a great phone, so far no problems. Fast to...",7.0


In [22]:
# neutral = df_v1[df["sentiment"] == 1]
# positive = df_v1[df["sentiment"] == 2][:700]
# negative = df_v1[df["sentiment"] == 0][:1800]

In [23]:
# df_major_n = positive.copy()
# df_major_n = df_major_n.append(neutral,ignore_index = True)
# df_major_n = df_major_n.append(negative,ignore_index = True)
# df_major_n = df_major_n.sample(frac = 1.0).reset_index(drop = True)
# df_major_n.head(15)

In [24]:
# Model input: the review text, coerced to str so every row is a string.
# NOTE(review): a missing review would become the literal "nan" — confirm
# whether such rows exist and should be dropped.
X = df["Reviews"].astype(str)
# Raw star rating; the sentiment classes are derived from it with project().
y = df["Rating"]

In [25]:
# Summary of the review texts: count, number of unique values, most frequent value.
X.describe()

count     413840
unique    162492
top         Good
freq        2879
Name: Reviews, dtype: object

In [26]:
def project(targets):
    """Map numeric ratings onto three sentiment classes.

    A rating strictly greater than 4 becomes ``2``, a rating strictly less
    than 2 becomes ``0`` and every other value (including NaN, for which both
    comparisons are false) becomes ``1``.

    The mapping is computed with vectorised NumPy masks instead of a Python
    loop, and the result is always ``int64`` — also for empty input.

    Note: the function is not idempotent. Feeding already-projected labels
    (0/1/2) back in yields 0/0/1, so always call it on raw ratings.

    Args:
        targets: 1-D array-like of numeric ratings (list, ndarray, Series).

    Returns:
        pd.Series of int64 class labels with a fresh 0..n-1 index, in the
        same order as ``targets``.
    """
    ratings = np.asarray(targets)

    # Start from the middle class and overwrite the two extremes.
    labels = np.ones(ratings.shape, dtype=np.int64)
    labels[ratings > 4] = 2
    labels[ratings < 2] = 0
    return pd.Series(labels)

In [27]:
# Derive the labels from the raw ratings column rather than from `y` itself:
# project() is not idempotent (already-projected labels 0/1/2 map to 0/0/1),
# so `y = project(y)` would silently corrupt the labels if this cell is re-run.
y = project(df["Rating"])

In [28]:
# 70% of the rows go to training; the remaining 30% is split 60/40 into
# validation and test. A fixed random_state makes the partition reproducible
# across kernel restarts (same seed value as the shuffle of `df`).
X_tr, X_holdout, y_tr, y_holdout = train_test_split(
    X, y, test_size=0.30, random_state=42
)
X_val, X_te, y_val, y_te = train_test_split(
    X_holdout, y_holdout, test_size=0.40, random_state=42
)

# Texts and labels must stay paired in every split, and no row may be lost.
assert X_tr.shape[0] == y_tr.shape[0]
assert X_val.shape[0] == y_val.shape[0]
assert X_te.shape[0] == y_te.shape[0]
assert X_tr.shape[0] + X_val.shape[0] + X_te.shape[0] == X.shape[0]

In [29]:
MAX_LEN = 200
def preprocessing_for_bert(data):
    """Tokenize texts one at a time with the notebook-level ``tokenizer``.

    Every item of ``data`` is encoded separately, padded or truncated to
    ``MAX_LEN`` tokens, and returned as PyTorch tensors; the per-text tensors
    are then concatenated along dimension 0.

    Args:
        data: Iterable of texts (a progress bar is shown while iterating).

    Returns:
        Tuple ``(input_ids, attention_masks)`` of concatenated tensors.
    """
    per_text_ids, per_text_masks = [], []

    for text in tqdm(data):
        encoding = tokenizer(
            text=text,
            truncation=True,
            padding='max_length',
            max_length=MAX_LEN,
            return_attention_mask=True,
            return_tensors='pt',
        )
        per_text_ids.append(encoding.get('input_ids'))
        per_text_masks.append(encoding.get('attention_mask'))

    # One row per text after concatenation.
    return torch.cat(per_text_ids, dim=0), torch.cat(per_text_masks, dim=0)

In [30]:
# Tokenize the training and validation texts. preprocessing_for_bert() encodes
# one text per loop iteration, so each call walks the whole split.
train_inputs, train_masks = preprocessing_for_bert(X_tr)
val_inputs, val_masks = preprocessing_for_bert(X_val)

  6%|▋         | 18156/289688 [00:29<07:23, 611.81it/s]


KeyboardInterrupt: 

In [31]:

batch_size = 32

# Training split: batches are drawn in random order.
train_labels = torch.tensor(y_tr.values)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation split: batches are visited in dataset order.
val_labels = torch.tensor(y_val.values)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

NameError: name 'train_inputs' is not defined

In [None]:
# Create the BertClassifier class

class BertClassifier(nn.Module):
    """Encoder backbone plus a small feed-forward head producing 3 logits.

    Each instance works on its own deep copy of the notebook-level ``mBERT``
    backbone. Previously every instance aliased the same module, so
    ``freeze_bert=True`` permanently froze the backbone for all later
    instances, and re-creating the classifier resumed from already fine-tuned
    weights instead of the pretrained ones.

    Args:
        freeze_bert: When true, the backbone parameters of this instance are
            excluded from gradient updates; only the head is trainable.
    """

    def __init__(self, freeze_bert=False):
        # Local import keeps this cell self-contained (copy is stdlib).
        import copy

        super().__init__()

        # Hidden size of the backbone, hidden size of the head, number of labels.
        D_in, H, D_out = 768, 20, 3

        # Private copy: training or freezing must not leak into `mBERT`.
        self.bert = copy.deepcopy(mBERT)
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
        )
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token id tensor forwarded to the backbone.
            attention_mask: Mask tensor forwarded to the backbone.

        Returns:
            Tensor of logits with one row per input sequence and 3 columns.
        """
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # First element of the backbone output, sliced at sequence position 0
        # (the leading token of every sequence), is the sentence representation.
        last_hidden_state_cls = outputs[0][:, 0, :]

        logits = self.classifier(last_hidden_state_cls)
        return logits

In [None]:
def initialize_model(epochs=1):
    """Build the classifier together with its optimizer and LR scheduler.

    Reads the notebook-level ``device`` and ``train_dataloader``.

    Args:
        epochs: Number of epochs the schedule should span; the learning rate
            decays linearly over ``len(train_dataloader) * epochs`` steps.

    Returns:
        Tuple ``(bert_classifier, optimizer, scheduler)``.
    """
    bert_classifier = BertClassifier(freeze_bert=False)
    bert_classifier.to(device)

    # PyTorch's AdamW replaces the deprecated `transformers.AdamW`.
    # weight_decay is set to 0.0 explicitly because torch's default (0.01)
    # differs from the 0.0 default of the transformers implementation that
    # was used here before.
    optimizer = torch.optim.AdamW(bert_classifier.parameters(),
                                  lr=1e-5,           # fine-tuning learning rate
                                  eps=1e-8,          # numerical-stability term
                                  weight_decay=0.0)

    # Total number of optimizer steps over the whole run.
    total_steps = len(train_dataloader) * epochs

    # Linear decay to zero with no warm-up phase.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [None]:


# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False,
          optimizer=None, scheduler=None):
    """Fine-tune ``model`` and print a progress table.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning logits.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        val_dataloader: Validation batches; required when ``evaluation`` is true.
        epochs: Number of passes over ``train_dataloader``.
        evaluation: When true, run ``evaluate`` on ``val_dataloader`` after
            every epoch and print validation loss and accuracy.
        optimizer: Optimizer to step. When omitted, the notebook-level
            ``optimizer`` variable is used (the previous implicit behaviour).
        scheduler: LR scheduler to step after each batch. When omitted, the
            notebook-level ``scheduler`` variable is used.

    Raises:
        ValueError: If ``evaluation`` is true but no ``val_dataloader`` is given.
        KeyError: If ``optimizer``/``scheduler`` is omitted and no
            notebook-level variable of that name exists.
    """
    # Backward compatibility: existing cells call train() without these two
    # arguments and rely on the variables returned by initialize_model().
    if optimizer is None:
        optimizer = globals()["optimizer"]
    if scheduler is None:
        scheduler = globals()["scheduler"]

    # Fail before training rather than after a full epoch.
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            # Move the batch to the selected device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Forward pass: returns logits
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Report every 20 batches and on the final batch of the epoch
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                time_elapsed = time.time() - t0_batch

                # The "Val Loss" placeholder is padded to 10 so the row lines
                # up with the header (it was 9, shifting the last columns).
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Average loss over all batches of the epoch
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # Measure performance on the validation set after each epoch
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):
    """Compute mean loss and accuracy of ``model`` over a dataloader.

    Both metrics are averaged over examples, not over batches: a short final
    batch therefore no longer carries the same weight as a full one (the
    previous mean-of-batch-means was biased whenever the dataset size is not
    a multiple of the batch size).

    Reads the notebook-level ``device`` and ``loss_fn``; the per-batch loss
    is assumed to be a mean over the batch, as ``nn.CrossEntropyLoss()``
    produces by default.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        val_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        Tuple ``(loss, accuracy)`` of Python floats; accuracy is a percentage.
        Both are NaN when the dataloader yields no examples.
    """
    # Evaluation mode disables dropout.
    model.eval()

    loss_sum, n_correct, n_examples = 0.0, 0, 0

    for batch in val_dataloader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)

        batch_size_here = b_labels.size(0)
        # Undo the per-batch mean so batches are weighted by their size.
        loss_sum += loss.item() * batch_size_here
        n_correct += (torch.argmax(logits, dim=1) == b_labels).sum().item()
        n_examples += batch_size_here

    if n_examples == 0:
        return float("nan"), float("nan")

    val_loss = loss_sum / n_examples
    val_accuracy = 100.0 * n_correct / n_examples
    return val_loss, val_accuracy

In [None]:
# Shapes of the tokenized training ids and their attention masks.
print(train_inputs.shape, train_masks.shape)

In [None]:
# Seed the RNGs, build a fresh model/optimizer/scheduler, then train for one
# epoch with validation after the epoch.
set_seed()
bert_classifier, optimizer, scheduler = initialize_model(epochs=1)
train(
    model=bert_classifier,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    epochs=1,
    evaluation=True,
)

In [None]:
# Final metrics on both splits. The validation line previously formatted
# train_loss/train_acc, so it repeated the training numbers.
train_loss, train_acc = evaluate(bert_classifier,train_dataloader)
print("Train Loss : {0:.5f}\nTrain Accuracy : {1:.2f}".format(train_loss,train_acc))
val_loss, val_acc = evaluate(bert_classifier,val_dataloader)
print("Validation Loss : {0:.5f}\nValidation Accuracy : {1:.2f}".format(val_loss,val_acc))

In [None]:
PATH = "distilbert_sentiment.pt"

# Everything needed to resume: weights, optimizer and scheduler state, plus
# the validation metrics measured for these weights.
checkpoint = dict(
    model=bert_classifier.state_dict(),
    optimizer=optimizer.state_dict(),
    scheduler=scheduler.state_dict(),
    val_loss=val_loss,
    val_acc=val_acc,
)

In [None]:
# Write the checkpoint dictionary to PATH.
torch.save(obj=checkpoint, f=PATH)

In [None]:
def bert_predict(model, test_dataloader):
    """Return class probabilities for every example in ``test_dataloader``.

    Only the first two tensors of each batch (input ids and attention mask)
    are used; any further tensors, such as labels, are ignored.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        test_dataloader: Yields batches whose first two items are the input
            ids and the attention mask.

    Returns:
        CPU tensor of softmax probabilities, one row per example.
    """
    model.eval()

    batch_logits = []
    with torch.no_grad():
        for batch in test_dataloader:
            ids, mask = (t.to(device) for t in batch[:2])
            batch_logits.append(model(ids, mask))

    # Stack the per-batch logits, then normalise each row to probabilities.
    stacked = torch.cat(batch_logits, dim=0)
    return F.softmax(stacked, dim=1).cpu()

In [None]:
def evaluate_thru_pipeline(model,X,y):
    """Run raw texts and ratings through the whole inference pipeline.

    ``y`` is passed through ``project`` and ``X`` through
    ``preprocessing_for_bert`` before prediction, so callers supply
    untokenized texts and unprojected ratings.

    Args:
        model: Classifier handed to ``bert_predict``.
        X: Iterable of texts.
        y: Ratings, one per text.

    Returns:
        Tuple ``(preds, probs, accuracy)``: predicted class per text, class
        probabilities per text, and the accuracy as a percentage.
    """
    model.eval()

    labels = torch.tensor(project(y))
    input_ids, attention_masks = preprocessing_for_bert(X)

    loader = DataLoader(
        TensorDataset(input_ids, attention_masks, labels),
        batch_size=16,
    )

    probs = bert_predict(model, loader)
    preds = torch.argmax(probs, dim=1).flatten()

    # Share of predictions equal to the projected labels, in percent.
    accuracy = (preds == labels).cpu().numpy().mean() * 100
    return preds, probs, accuracy

In [None]:
# Smoke-test the full inference path on one hand-written review.
sample_x = ["The phone offers less features"]
# evaluate_thru_pipeline() passes `y` through project(), so this value is
# treated as a raw rating (1 -> class 0), not as an already-projected label.
sample_y = [1]
preds,probs,accuracy = evaluate_thru_pipeline(bert_classifier,sample_x,sample_y)
print(probs)
print("sentiment is: ",preds)

In [None]:
def evaluate_roc(probs, y_true, positive_class=1):
    """Print AUC and accuracy and plot the ROC curve for one class vs. the rest.

    The classifier in this notebook has three output classes, while
    ``roc_curve`` only accepts binary targets. The labels are therefore
    binarised as "is ``positive_class``" before scoring. With binary 0/1
    labels and the default ``positive_class=1`` the result is the same as
    scoring the labels directly.

    Args:
        probs: 2-D array-like (or CPU tensor) of class probabilities, one row
            per example and one column per class.
        y_true: 1-D array-like of integer class labels.
        positive_class: Column of ``probs`` / label value treated as the
            positive class (default 1).
    """
    scores = np.asarray(probs)[:, positive_class]
    is_positive = (np.asarray(y_true) == positive_class).astype(int)

    fpr, tpr, _ = roc_curve(is_positive, scores)
    roc_auc = auc(fpr, tpr)
    print(f'AUC: {roc_auc:.4f}')

    # One-vs-rest accuracy at a 0.5 probability threshold
    y_pred = np.where(scores >= 0.5, 1, 0)
    accuracy = accuracy_score(is_positive, y_pred)
    print(f'Accuracy: {accuracy*100:.2f}%')

    # Plot ROC AUC
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [None]:
# from sklearn.metrics import classification_report,confusion_matrix
# preds = probs[:, 1]
# y_pred = np.where(preds >= 0.5, 1, 0)
# print(classification_report(y_val, y_pred))

In [None]:
"""
marked_text = "[CLS] " + text + " [SEP]"
tokenized_text = tokenizer.tokenize(marked_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)  ## here ids are the indices in vocabulary of Bert
segment_ids = [1] * len(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensor = torch.tensor([segment_ids])
"""

In [None]:
# BERT.eval()
# with torch.no_grad():
#     outputs = BERT(tokens_tensor,segments_tensor)
#     hidden_states = outputs.hidden_states
# token_embeddings = torch.stack(hidden_states,dim = 0)
# token_embeddings = torch.squeeze(token_embeddings,dim = 1)
# token_embeddings = token_embeddings.permute(1,0,2)
# token_embeddings.size()

In [None]:
# # Stores the token vectors, with shape [22 x 768]
# token_vecs_sum = []

# # `token_embeddings` is a [22 x 12 x 768] tensor.

# # For each token in the sentence...
# for token in token_embeddings:

#     # `token` is a [12 x 768] tensor

#     # Sum the vectors from the last four layers.
#     sum_vec = torch.sum(token[-4:], dim=0)
    
#     # Use `sum_vec` to represent `token`.
#     token_vecs_sum.append(sum_vec)

In [None]:
# idx1 = tokenized_text.index("apple")
# idx2 = tokenized_text.index("samsung")
# print(idx1,idx2)

In [None]:
# Display the words with their indices.
# for tup in zip(tokenized_text, indexed_tokens):
#     print('{:<12} {:>6,}'.format(tup[0], tup[1]))

In [None]:
# context_vec1 = token_vecs_sum[idx1]
# context_vec2 = token_vecs_sum[idx2]
# print(context_vec1.size(),context_vec2.size())

In [None]:
# cos = nn.CosineSimilarity(dim=0)
# print(cos(context_vec1,context_vec2))

In [None]:
# attention_text = outputs.attentions
# attn = torch.stack(attention_text,dim = 0)
# attn = torch.squeeze(attn,dim = 1)
# print(attn.size())

In [None]:
# plt.figure(figsize = (25,25))
# sbn.heatmap(data = attn[0][0],
#             xticklabels = tokenized_text,
#             yticklabels = tokenized_text)

# plt.show()

In [None]:
# plt.plot(attn[3][11][idx1])
# plt.show()
# print(tokenized_text)

In [None]:
# plt.plot(attn[5][11][idx2])
# plt.show()
# print(tokenized_text)

In [None]:
# tot = torch.mean(attn,dim = 0)
# tot = torch.mean(tot,dim = 0)
# print(tot.size())

In [None]:
# plt.plot(tot[idx1])
# plt.show()
# print(tokenized_text)

In [None]:
# logits = outputs.pooler_output
# probs = F.softmax(logits,dim = 1)
# print(probs.size())