In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/14-million-cell-phone-reviews/phone_user_review_file_1.csv
/kaggle/input/14-million-cell-phone-reviews/phone_user_review_file_5.csv
/kaggle/input/14-million-cell-phone-reviews/phone_user_review_file_2.csv
/kaggle/input/14-million-cell-phone-reviews/phone_user_review_file_4.csv
/kaggle/input/14-million-cell-phone-reviews/phone_user_review_file_3.csv
/kaggle/input/14-million-cell-phone-reviews/phone_user_review_file_6.csv


In [3]:
# Paths of the six review CSV shards (files 1 through 6), in numeric order.
train_path = [
    f'../input/14-million-cell-phone-reviews/phone_user_review_file_{shard}.csv'
    for shard in range(1, 7)
]

In [87]:
from sklearn.model_selection import train_test_split

In [33]:
def extract_data(path):
    """Load one review CSV and keep only the English rows.

    Args:
        path: Location of a CSV file that has at least the columns
            ``lang``, ``score``, ``extract`` and ``product``.

    Returns:
        A DataFrame with the columns ``score``, ``extract`` and ``product``
        (in that order) for the rows whose ``lang`` equals ``'en'``,
        re-indexed from 0.
    """
    reviews = pd.read_csv(path, engine='python')
    # Keep English rows only, then renumber the rows from zero.
    english = reviews[reviews['lang'] == 'en'].reset_index()
    return english[['score', 'extract', 'product']]
# Load the English reviews from shard 3 only; this call does not use `train_path`.
data = extract_data('../input/14-million-cell-phone-reviews/phone_user_review_file_3.csv')

In [37]:
# Remove every row that has a missing value in any column, modifying `data` in place.
data.dropna(inplace=True)

In [38]:
# Show the sorted distinct values present in the score column.
np.unique(data.score)

array([ 1. ,  2. ,  2.4,  2.8,  3. ,  3.2,  3.6,  4. ,  4.4,  4.8,  5. ,
        5.2,  5.6,  6. ,  6.4,  6.8,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,
        7.6,  7.7,  7.8,  7.9,  8. ,  8.3,  8.4,  8.6,  8.8,  9. ,  9.2,
        9.3,  9.6, 10. ])

In [80]:
def apply_trans(p):
    """Bucket a numeric score into one of three integer classes.

    Args:
        p: Score to classify.

    Returns:
        0 when ``p`` is below 4, 1 when ``p`` lies in the closed range
        [4, 6], and 2 in every other case (which includes NaN, because all
        comparisons against NaN are false).
    """
    if p < 4.:
        return 0
    if p <= 6.:
        return 1
    return 2
        
# Replace each raw score with its class label from apply_trans.
# Caution: this overwrites the column, so running the cell a second time feeds the
# labels 0/1/2 back through apply_trans, and all of them are < 4 and become 0.
data['score'] = data['score'].apply(apply_trans)

In [83]:
# Inputs are the values of the `extract` column; targets are the values of the `score` column.
X = data.extract.values
Y = data.score.values

In [98]:
import torch

# Use CUDA when PyTorch can see a GPU; otherwise run everything on the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [84]:
# Sequence length passed as `max_length` to the tokenizer in preprocessing_for_bert.
MAX_LEN = 300

In [85]:
from transformers import BertTokenizer
# Load the pretrained `bert-base-uncased` tokenizer with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
def preprocessing_for_bert(data):
    """Tokenize texts for BERT and return fixed-length id and mask tensors.

    Every text is encoded with the module-level ``tokenizer``, with special
    tokens added, then truncated or padded to exactly ``MAX_LEN`` tokens.
    Padding and truncation are requested explicitly via ``padding`` and
    ``truncation`` instead of the deprecated ``pad_to_max_length`` flag, so
    the fixed length no longer depends on the tokenizer's implicit fallback.

    Args:
        data: Iterable of strings to encode (for example a NumPy array or a
            pandas Series of texts).

    Returns:
        A tuple ``(input_ids, attention_masks)`` of integer tensors with one
        row per input text.
    """
    input_ids = []
    attention_masks = []

    # Encode one text at a time; each result is a plain Python list of length MAX_LEN.
    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert the lists to tensors; the explicit dtype keeps them integer even when empty.
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long)

    return input_ids, attention_masks

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [88]:
# Hold out 10% of the samples for validation; the fixed random_state makes the split repeatable.
# NOTE(review): no `stratify=` is passed, so class proportions may differ between the splits.
X_train, X_val, y_train, y_val =\
    train_test_split(X, Y, test_size=0.1, random_state=2021)

In [90]:
import torch
import torch.nn as nn

In [91]:
# Tokenize both splits up front; each call returns an (input_ids, attention_masks) pair of tensors.
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)



In [92]:
# Display the dimensions of the encoded training tensor.
train_inputs.shape

torch.Size([127808, 300])

In [93]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# Convert the label arrays to tensors so they can be bundled with the inputs in a TensorDataset.
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)
# Number of samples per batch, shared by the training and validation loaders.
batch_size = 32

# Create the DataLoader for our training set (RandomSampler draws the samples in random order)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set (SequentialSampler keeps dataset order,
# so predictions made from this loader line up row-for-row with y_val)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [94]:
%time
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassifier class
class BertClassifier(nn.Module):
    """Pretrained BERT encoder with a small feed-forward head giving 3-way logits.

    The head is ``Linear(768 -> 80) -> ReLU -> Linear(80 -> 3)`` and is fed the
    final-layer hidden state found at sequence position 0 (the ``[CLS]`` token).
    """

    def __init__(self, freeze_bert=False):
        """Build the encoder and the classification head.

        Args:
            freeze_bert: When True, gradient tracking is switched off for every
                BERT parameter, so only the head receives updates.
        """
        super().__init__()

        # Width of BERT's hidden states, width of the head's hidden layer, label count.
        bert_width = 768
        head_width = 80
        num_labels = 3

        # The encoder is created before the head, so module construction order is unchanged.
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.classifier = nn.Sequential(
            nn.Linear(bert_width, head_width),
            nn.ReLU(),
            nn.Linear(head_width, num_labels),
        )

        if freeze_bert:
            for weight in self.bert.parameters():
                weight.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token id tensor forwarded to BERT as ``input_ids``.
            attention_mask: Mask tensor forwarded to BERT as ``attention_mask``.

        Returns:
            The output of the classifier head applied to the position-0
            hidden state of each sequence.
        """
        encoder_outputs = self.bert(input_ids=input_ids,
                                    attention_mask=attention_mask)

        # First element of the encoder output holds the per-token hidden states;
        # slice out position 0 of every sequence.
        token_states = encoder_outputs[0]
        cls_state = token_states[:, 0, :]

        return self.classifier(cls_state)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.72 µs


In [95]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Create the classifier together with its optimizer and LR scheduler.

    Reads the module-level ``device`` (where the model is placed) and
    ``train_dataloader`` (whose length fixes the number of scheduler steps).

    Args:
        epochs: Number of training epochs the schedule should span.

    Returns:
        A tuple ``(bert_classifier, optimizer, scheduler)``.
    """
    # Fine-tune the whole network: BERT is not frozen.
    model = BertClassifier(freeze_bert=False)
    model.to(device)

    # AdamW with a learning rate of 1e-4 and an epsilon of 1e-8.
    adamw = AdamW(model.parameters(), lr=1e-4, eps=1e-8)

    # The schedule covers every batch of every epoch and uses no warm-up phase.
    num_steps = len(train_dataloader) * epochs
    lr_schedule = get_linear_schedule_with_warmup(
        adamw,
        num_warmup_steps=0,
        num_training_steps=num_steps,
    )

    return model, adamw, lr_schedule

In [96]:
import random
import time

# Specify loss function: cross-entropy, called by train() and evaluate() as loss_fn(logits, labels)
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Args:
        seed_value: Integer seed handed to each generator.
    """
    # Order kept as: stdlib random, NumPy, torch CPU, then all CUDA devices.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_generator in seeders:
        seed_generator(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False,
          optimizer=None, scheduler=None):
    """Train the BertClassifier model.

    Uses the module-level ``device`` and ``loss_fn``. The optimizer and the
    scheduler can now be passed explicitly; when they are omitted the function
    falls back to the module-level ``optimizer`` / ``scheduler`` names, which
    is what the function previously always used.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits.
        train_dataloader: Sized iterable of ``(input_ids, attention_mask,
            labels)`` batches.
        val_dataloader: Batches handed to ``evaluate`` after each epoch when
            ``evaluation`` is truthy.
        epochs: Number of passes over ``train_dataloader``.
        evaluation: Whether to run ``evaluate`` at the end of every epoch.
        optimizer: Optimizer stepped once per batch (optional, see above).
        scheduler: Learning-rate scheduler stepped once per batch (optional,
            see above).

    Raises:
        ValueError: If ``evaluation`` is requested without a ``val_dataloader``.
        NameError: If ``optimizer`` or ``scheduler`` is neither passed in nor
            defined at module level.
    """
    # Fail before any training work is done rather than after the first full epoch.
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    # Backward compatibility: fall back to the module-level objects when not passed in.
    module_scope = globals()
    if optimizer is None:
        if 'optimizer' not in module_scope:
            raise NameError("name 'optimizer' is not defined; pass optimizer=... to train()")
        optimizer = module_scope['optimizer']
    if scheduler is None:
        if 'scheduler' not in module_scope:
            raise NameError("name 'scheduler' is not defined; pass scheduler=... to train()")
        scheduler = module_scope['scheduler']

    num_batches = len(train_dataloader)

    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            # Move the batch tensors to the configured device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed every 20 batches and on the last batch
            if (step % 20 == 0 and step != 0) or (step == num_batches - 1):
                # Time elapsed since the previous progress line
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Average per-batch loss over the epoch (NaN when there were no batches)
        avg_train_loss = total_loss / num_batches if num_batches else float('nan')

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Time taken by the whole epoch, including evaluation
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):
    """Measure the model's loss and accuracy on a validation set.

    Both metrics are averaged per sample. The previous implementation took the
    mean of per-batch means, which gives a smaller final batch the same weight
    as a full one and so skews both numbers.

    Uses the module-level ``device`` and ``loss_fn``; ``loss_fn`` is assumed to
    return the mean loss over the batch (the default reduction of
    ``nn.CrossEntropyLoss``).

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits with one row per sample.
        val_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        A tuple ``(val_loss, val_accuracy)``: the mean loss per sample and the
        accuracy in percent. Both are NaN when the loader yields no samples.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Running totals over all samples seen so far
    loss_sum = 0.0
    correct = 0
    seen = 0

    for batch in val_dataloader:
        # Move the batch tensors to the configured device
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # No gradients are needed for either the forward pass or the loss
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)

        batch_len = b_labels.size(0)

        # Undo the per-batch mean so that every sample carries equal weight
        loss_sum += loss.item() * batch_len

        # Predicted class is the index of the largest logit
        preds = torch.argmax(logits, dim=1).flatten()
        correct += (preds == b_labels).sum().item()
        seen += batch_len

    if seen == 0:
        return float('nan'), float('nan')

    val_loss = loss_sum / seen
    val_accuracy = correct / seen * 100

    return val_loss, val_accuracy

In [99]:
# Seed the Python, NumPy and PyTorch generators before the model is built.
set_seed(42)   
# `epochs` is given to both calls and must match: the scheduler's total step count is derived from it.
# The names `optimizer` and `scheduler` bound here are the ones train() picks up at module level.
bert_classifier, optimizer, scheduler = initialize_model(epochs=1)
train(bert_classifier, train_dataloader, val_dataloader, epochs=1, evaluation=True)

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.742062   |     -      |     -     |   20.61  
   1    |   40    |   0.569347   |     -      |     -     |   19.01  
   1    |   60    |   0.532064   |     -      |     -     |   18.98  
   1    |   80    |   0.522825   |     -      |     -     |   19.00  
   1    |   100   |   0.528502   |     -      |     -     |   18.99  
   1    |   120   |   0.532949   |     -      |     -     |   19.00  
   1    |   140   |   0.453823   |     -      |     -     |   19.02  
   1    |   160   |   0.523114   |     -      |     -     |   18.99  
   1    |   180   |   0.489745   |     -      |     -     |   19.02  
   1    |   200   |   0.564769   |     -      |     -     |   18.98  
   1    |   220   |   0.532470   |     -      |     -     |   19.01  
   1    |   240   |   0.528589   |     -      |     -     |   19.04  


   1    |  2320   |   0.450012   |     -      |     -     |   19.00  
   1    |  2340   |   0.405264   |     -      |     -     |   18.98  
   1    |  2360   |   0.445745   |     -      |     -     |   19.00  
   1    |  2380   |   0.474855   |     -      |     -     |   19.02  
   1    |  2400   |   0.376611   |     -      |     -     |   19.00  
   1    |  2420   |   0.443840   |     -      |     -     |   19.02  
   1    |  2440   |   0.435626   |     -      |     -     |   19.05  
   1    |  2460   |   0.446441   |     -      |     -     |   18.99  
   1    |  2480   |   0.401807   |     -      |     -     |   19.02  
   1    |  2500   |   0.411928   |     -      |     -     |   19.02  
   1    |  2520   |   0.422835   |     -      |     -     |   18.97  
   1    |  2540   |   0.395509   |     -      |     -     |   19.03  
   1    |  2560   |   0.405557   |     -      |     -     |   18.99  
   1    |  2580   |   0.432078   |     -      |     -     |   19.01  
   1    |  2600   | 

In [100]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    """Run the model over a dataloader and return softmax class probabilities.

    Uses the module-level ``device``. Only the first two tensors of each batch
    (input ids and attention mask) are fed to the model; any further tensors
    in the batch are ignored.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits.
        test_dataloader: Iterable of batches holding at least the input ids
            and the attention mask, in that order.

    Returns:
        A NumPy array of probabilities with one row per sample, in the order
        the dataloader produced them.
    """
    model.eval()

    batch_logits = []
    with torch.no_grad():
        for batch in test_dataloader:
            # Move every tensor of the batch to the device, then keep ids and mask.
            on_device = tuple(t.to(device) for t in batch)
            ids, mask = on_device[:2]
            batch_logits.append(model(ids, mask))

    # Stack the per-batch logits along the sample axis and normalise each row.
    stacked = torch.cat(batch_logits, dim=0)
    return F.softmax(stacked, dim=1).cpu().numpy()

In [101]:
# Class probabilities for the validation set, produced by the trained classifier.
probs = bert_predict(bert_classifier, val_dataloader)

In [102]:
# Predicted class for each row = index of its largest probability.
pred = np.argmax(probs,axis=1)

In [103]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report,confusion_matrix
# Per-class precision, recall and F1 of the predictions against the held-out labels.
print(classification_report(y_val,pred))

              precision    recall  f1-score   support

           0       0.73      0.76      0.74      2346
           1       0.58      0.43      0.49      2264
           2       0.91      0.96      0.93      9591

    accuracy                           0.84     14201
   macro avg       0.74      0.71      0.72     14201
weighted avg       0.83      0.84      0.83     14201



In [104]:
# F1 score averaged over the classes with average='weighted'.
print(f1_score(y_val,pred,average='weighted'))

0.8302402272033141


In [105]:
# Confusion matrix of y_val (first argument) against pred (second argument).
print(confusion_matrix(y_val,pred))

[[1784  402  160]
 [ 550  965  749]
 [ 122  308 9161]]


In [106]:
# Load the texts to be labelled.
# NOTE(review): unpickling can execute arbitrary code — only load this file if its source is trusted.
test_csv = pd.read_pickle('../input/sentimenttrans/deta.pkl')
test_csv

Unnamed: 0,Text
0,OPPO K7X launched with 5G support and quad-rea...
1,"February 7, 2021 | 1: 46 | IST"
2,Motorola can launch two new smartphones - Moto...
3,Nokia 5. 4 features a 6. 39-inch HD+ IPS LCD d...
4,Buy Amazon Cell 2020 within 7000 RS These attr...
...,...
749,"Vivo S9 could launch on March 6, as per a tips..."
750,Samsung Your Next Flagship i.e. Galaxy S30 Ser...
751,Poco launched the Poco M3 smartphone in India ...
752,Great Indian Festival Sale is getting started ...


In [107]:
# Tokenize the `Text` column with the same preprocessing used for the training data.
test_inputs, test_masks = preprocessing_for_bert(test_csv.Text)

# Create the DataLoader for our test set (no labels; no sampler or shuffle is given,
# so batches follow the dataset order)
test_dataset = TensorDataset(test_inputs, test_masks)
test_dataloader = DataLoader(test_dataset,batch_size=32)



In [108]:
# NOTE: `probs` and `pred` are rebound here, replacing the validation-set results computed earlier.
probs = bert_predict(bert_classifier, test_dataloader)
pred = np.argmax(probs,axis=1)
# Store the predicted class index for every row.
test_csv.loc[:,'label'] = pred
# Class index -> sentiment name; the indices follow apply_trans (0: score < 4, 1: 4..6, 2: otherwise).
revdic = {1:'neutral' , 0:'negative',2:'positive'}
test_csv.loc[:,'sentiment'] = test_csv.label.map(revdic)

In [109]:
# Display the frame with the new `label` and `sentiment` columns.
test_csv

Unnamed: 0,Text,label,sentiment
0,OPPO K7X launched with 5G support and quad-rea...,2,positive
1,"February 7, 2021 | 1: 46 | IST",2,positive
2,Motorola can launch two new smartphones - Moto...,2,positive
3,Nokia 5. 4 features a 6. 39-inch HD+ IPS LCD d...,2,positive
4,Buy Amazon Cell 2020 within 7000 RS These attr...,2,positive
...,...,...,...
749,"Vivo S9 could launch on March 6, as per a tips...",2,positive
750,Samsung Your Next Flagship i.e. Galaxy S30 Ser...,2,positive
751,Poco launched the Poco M3 smartphone in India ...,2,positive
752,Great Indian Festival Sale is getting started ...,2,positive


In [110]:
# Show which label values occur among the predictions.
np.unique(test_csv.label)

array([0, 1, 2])

In [None]:
# Write the labelled frame to trans.csv in the current directory (the index is written too, as index=False is not passed).
test_csv.to_csv('trans.csv')