# Imports

In [None]:
# Install the Hugging Face `transformers` package into the notebook runtime.
# The install is unpinned; the recorded output of this cell shows it resolving
# to transformers 4.14.1.
!pip install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 8.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 43.8 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 41.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 548 kB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [None]:
import pandas as pd
import numpy as np
import re
from transformers import BertTokenizer
import torch
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, BertConfig
from transformers import logging

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): force_remount=True presumably refreshes an already-attached
# mount — confirm against the google.colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Step into the project folder one level at a time; the dataset paths used
# later in the notebook ('data/...') are relative to this directory.
%cd drive
%cd MyDrive
%cd Colab Notebooks
%cd Innopolis DS
%cd Opinion mining

Mounted at /content/drive
/content/drive
/content/drive/MyDrive
/content/drive/MyDrive/Colab Notebooks
/content/drive/MyDrive/Colab Notebooks/Innopolis DS
/content/drive/MyDrive/Colab Notebooks/Innopolis DS/Opinion mining


# Main

In [None]:
# Annotated review files, grouped by the corpus folder they live in.
# The resulting paths are relative to the current working directory.
_corpus_files = {
    '5 products 2004': [
        'Canon G3.txt',
        'Creative Labs Nomad Jukebox Zen Xtra 40GB.txt',
        'Nikon coolpix 4300.txt',
        'Nokia 6610.txt',
    ],
    '9 products 2008': [
        'Canon PowerShot SD500.txt',
        'Canon S100.txt',
        'Diaper Champ.txt',
        'Hitachi router.txt',
        'ipod.txt',
        'Linksys Router.txt',
        'MicroMP3.txt',
        'Nokia 6600.txt',
        'norton.txt',
    ],
}

# Flatten to 'data/<folder>/<file>' while keeping folder and file order.
datasets = [
    f'data/{folder}/{filename}'
    for folder, filenames in _corpus_files.items()
    for filename in filenames
]

In [None]:
# Read every dataset file and pool the raw lines into one flat list.
# The encoding is passed explicitly so decoding does not depend on the
# locale default of whatever machine runs the notebook.
reviews = []
for dataset in datasets:
    with open(dataset, encoding='utf-8') as f:
        reviews.extend(f.readlines())

In [None]:
# Total number of raw lines read from all dataset files.
len(reviews)

8004

In [None]:
# Lines carrying any of these annotation tags are dropped entirely.
tags_to_remove = ['[t]', '[u]', '[p]', '[s]', '[cc]', '[cs]']


def parse_review_line(line, skip_tags=tags_to_remove):
    """Turn one annotated dataset line into a list of training records.

    A usable line has the shape ``aspect[+2],other aspect[-1]##sentence``.

    Args:
        line: One raw line from a dataset file.
        skip_tags: Substrings whose presence causes the whole line to be ignored.

    Returns:
        A list of dicts with keys ``review`` (the sentence), ``aspect`` and
        ``sentiment`` (1 when the first polarity marker is ``[+``, 0 when it
        is ``[-``), one dict per annotated aspect. The list is empty when the
        line is skipped.
    """
    if any(tag in line for tag in skip_tags):
        return []

    # partition() never raises on a line without '##' and keeps the complete
    # sentence even if it contains '##' itself.
    annotations, separator, sentence = line.partition('##')
    keys = annotations.split(',')
    if not separator or keys[0] == '':
        return []

    text = sentence.replace("\n", "").strip()
    records = []
    for key in keys:
        # Drop single-character bracket groups, then locate the first
        # polarity marker directly. This avoids splitting on '_', which would
        # corrupt aspects that contain an underscore.
        cleaned = re.sub(r"\[.]", "", key)
        polarity = re.search(r"\[([+-])", cleaned)
        if polarity is None:
            # No '[+' / '[-' marker: nothing to label for this key.
            continue
        records.append({
            'review': text,
            'aspect': cleaned[:polarity.start()].strip(),
            'sentiment': 1 if polarity.group(1) == '+' else 0,
        })
    return records


data = []
for line in reviews:
    data.extend(parse_review_line(line))

# Explicit columns keep the frame well-formed even when nothing was parsed.
df = pd.DataFrame(data, columns=['review', 'aspect', 'sentiment'])
df.sentiment = df.sentiment.astype(int)

In [None]:
# Preview the first five parsed rows.
df.head(5)

Unnamed: 0,review,aspect,sentiment
0,i recently purchased the canon powershot g3 an...,canon powershot g3,1
1,"the camera is very easy to use , in fact on a ...",use,1
2,they fired away and the picture turned out qui...,picture,1
3,a few of my work constituants owned the g2 and...,picture quality,1
4,i 'm easily enlarging pictures to 8 1/2 x 11 w...,picture quality,1


In [None]:
# Number of parsed (review, aspect, sentiment) rows.
len(df)

3377

In [None]:
# Number of rows per sentiment label.
df.sentiment.value_counts()

1    2294
0    1083
Name: sentiment, dtype: int64

In [None]:
# 60% train, 20% val, 20% test
# Shuffle once with a fixed seed, then cut by position. Plain iloc slices are
# used instead of np.split, which goes through the deprecated
# DataFrame.swapaxes when it is handed a DataFrame.
df_shuffled = df.sample(frac=1, random_state=42)
train_end = int(.6 * len(df))
val_end = int(.8 * len(df))

df_train = df_shuffled.iloc[:train_end]
df_val = df_shuffled.iloc[train_end:val_end]
df_test = df_shuffled.iloc[val_end:]

def split_df(df):
    """Pull the model inputs and targets out of a parsed-review frame.

    Args:
        df: DataFrame with ``review``, ``aspect`` and ``sentiment`` columns.

    Returns:
        A ``(reviews, aspects, labels)`` tuple of plain Python lists, each in
        the frame's row order.
    """
    columns = ("review", "aspect", "sentiment")
    reviews, aspects, labels = (df[column].tolist() for column in columns)
    return reviews, aspects, labels

# Unpack each partition into parallel lists of sentences, aspect terms and labels.
(
    (train_reviews, train_aspects, train_labels),
    (val_reviews, val_aspects, val_labels),
    (test_reviews, test_aspects, test_labels),
) = map(split_df, (df_train, df_val, df_test))

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _encode_pairs(sentences, aspect_terms):
    """Tokenize (sentence, aspect) pairs with truncation and padding enabled."""
    return tokenizer(sentences, aspect_terms, truncation=True, padding=True)


# Each split is encoded separately, with the review as the first sequence of
# the pair and the aspect as the second.
train_encodings = _encode_pairs(train_reviews, train_aspects)
val_encodings = _encode_pairs(val_reviews, val_aspects)
test_encodings = _encode_pairs(test_reviews, test_aspects)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings and optional labels.

    Args:
        encodings: Mapping from feature name (must include ``"input_ids"``) to
            a per-example sequence of values.
        labels: Optional per-example labels; leave as ``None`` for unlabeled data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        A ``"labels"`` entry is included only when labels were supplied.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of bare truthiness: `if labels:`
        # raises for numpy arrays and multi-element tensors. Lists behave
        # exactly as before (None or empty -> no "labels" entry).
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

In [None]:
# Pair each split's encodings with its labels.
train_dataset = Dataset(encodings=train_encodings, labels=train_labels)
val_dataset = Dataset(encodings=val_encodings, labels=val_labels)
test_dataset = Dataset(encodings=test_encodings, labels=test_labels)

In [None]:
def compute_metrics(p):
    """Score predictions with accuracy, precision, recall and F1.

    Args:
        p: A pair ``(predictions, labels)``; the predicted class is the argmax
            of ``predictions`` along axis 1.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        name: scorer(y_true=labels, y_pred=predicted)
        for name, scorer in scorers.items()
    }

In [None]:
# Set to True to reload the weights saved under 'model/last_step' instead of
# fine-tuning from 'bert-base-uncased'.
load_finetuned_model = False

logging.set_verbosity_debug()

epochs = 4
batch_size = 32 # or 24
# One optimizer step per batch, and the final partial batch of each epoch
# still counts, so round the per-epoch batch count up before multiplying.
# NOTE(review): assumes a single device and no gradient accumulation — confirm.
steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
num_steps = steps_per_epoch * epochs
# Warm the learning rate up over the first 10% of the steps.
warmup_steps = num_steps // 10
num_classes = 2


training_args = TrainingArguments(
    output_dir = 'model',
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    warmup_steps = warmup_steps,
    weight_decay = 0.01,
    logging_dir = 'logs',
    logging_steps = 10,
    evaluation_strategy = 'epoch',
    learning_rate = 2e-5,
)

config = BertConfig.from_pretrained(
    'bert-base-uncased',
    architectures = ['BertForSequenceClassification'],
    hidden_size = 768,
    num_hidden_layers = 12,
    num_attention_heads = 12,
    hidden_dropout_prob = 0.1,
    num_labels = num_classes
)


# Pick the starting weights; everything after that is shared by both paths.
if load_finetuned_model:
    model = BertForSequenceClassification.from_pretrained('model/last_step')
else:
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Only a freshly initialised model is trained and saved.
if not load_finetuned_model:
    trainer.train()

    model.save_pretrained('model/last_step')


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_h

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4769,0.352714,0.86963,0.915433,0.900208,0.907757
2,0.2341,0.249085,0.906667,0.960352,0.906445,0.93262
3,0.0996,0.254763,0.911111,0.950749,0.923077,0.936709
4,0.0509,0.261605,0.914074,0.949045,0.929314,0.939076


***** Running Evaluation *****
  Num examples = 675
  Batch size = 32
***** Running Evaluation *****
  Num examples = 675
  Batch size = 32
***** Running Evaluation *****
  Num examples = 675
  Batch size = 32
***** Running Evaluation *****
  Num examples = 675
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in model/last_step/config.json
Model weights saved in model/last_step/pytorch_model.bin


In [None]:
# Score the model on the held-out test split and print each reported entry.
evaluation_result = trainer.evaluate(test_dataset)

for metric_name in evaluation_result:
    print(metric_name, evaluation_result[metric_name])

***** Running Evaluation *****
  Num examples = 676
  Batch size = 32


eval_loss 0.24553631246089935
eval_accuracy 0.9215976331360947
eval_precision 0.9372197309417041
eval_recall 0.9435665914221218
eval_f1 0.9403824521934758
eval_runtime 2.1936
eval_samples_per_second 308.168
eval_steps_per_second 10.029
epoch 4.0
