# Imports and functions

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 8.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 33.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 454 kB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.8 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 38.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [None]:
import pandas as pd
import numpy as np
import re
from transformers import BertTokenizer
import torch
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, BertConfig

In [None]:
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Walk into the project folder one directory per step. The relative paths used
# further down (./data, ./model, output/) are resolved against this folder.
%cd drive
%cd MyDrive
%cd Colab Notebooks
%cd Innopolis DS
%cd Opinion mining
%cd Opinion mining

Mounted at /content/drive
/content/drive
/content/drive/MyDrive
/content/drive/MyDrive/Colab Notebooks
/content/drive/MyDrive/Colab Notebooks/Innopolis DS
/content/drive/MyDrive/Colab Notebooks/Innopolis DS/Opinion mining


In [None]:
# Module-level tokenizer for 'bert-base-uncased'; prepare_dataset() reads this global.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to an indexable sequence of
            per-example values. It must contain the key "input_ids", which
            determines the dataset length.
        labels: Optional indexable sequence of per-example labels (list,
            NumPy array, tensor, ...). ``None`` or an empty sequence means
            the dataset is unlabeled.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus "labels" if available."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for None/emptiness explicitly: the bare truthiness check
        # `if self.labels:` raises for NumPy arrays and tensors holding more
        # than one element ("truth value ... is ambiguous").
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" feature."""
        return len(self.encodings["input_ids"])


def compute_metrics(p):
    """Score class predictions against the reference labels.

    Args:
        p: Pair that unpacks into ``(raw_scores, labels)``. The predicted
            class of each example is the argmax of ``raw_scores`` along axis 1.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1", each
        computed by the matching scikit-learn scorer with its default settings.
    """
    raw_scores, labels = p
    predicted = np.argmax(raw_scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        name: scorer(y_true=labels, y_pred=predicted)
        for name, scorer in scorers.items()
    }


def prepare_dataset(product, df):
    """Build an unlabeled Dataset from the review/aspect pairs of one product.

    Args:
        product: Value to match in the 'product' column of ``df``.
        df: DataFrame with the columns 'product', 'review' and 'aspect'.

    Returns:
        A ``Dataset`` (without labels) wrapping the output of the module-level
        ``tokenizer`` applied to each (review, aspect) pair with truncation
        and padding enabled.
    """
    product_rows = df[df['product'] == product]

    # Reviews and aspects are passed as two parallel lists, i.e. as text pairs.
    encodings = tokenizer(
        product_rows.review.tolist(),
        product_rows.aspect.tolist(),
        truncation=True,
        padding=True,
    )
    return Dataset(encodings)


def label_predictions(df_predict):
    """Aggregate per-review sentiment predictions into one row per key aspect.

    Args:
        df_predict: DataFrame with the columns 'key_aspect', 'sentiment' and
            'count'.

    Returns:
        DataFrame with the columns 'aspect', 'score', 'sentiment' and 'count',
        holding one row per distinct non-null key aspect in order of first
        appearance. 'score' is the mean of the aspect's 'sentiment' values,
        'sentiment' is 'positive' when the score is strictly above 0.5 and
        'negative' otherwise, and 'count' is the 'count' value of the aspect's
        first row. Empty input yields an empty frame that still carries these
        columns, so downstream sorting by column name keeps working.
    """
    columns = ['aspect', 'score', 'sentiment', 'count']
    if df_predict.empty:
        return pd.DataFrame(columns=columns)

    # A single grouped pass replaces re-filtering the whole frame once per
    # aspect; sort=False keeps the aspects in order of first appearance.
    scores = df_predict.groupby('key_aspect', sort=False)['sentiment'].mean()

    # 'count' comes from the first row of each aspect, aligned to `scores`.
    first_rows = df_predict.drop_duplicates(subset='key_aspect')
    counts = first_rows.set_index('key_aspect')['count'].reindex(scores.index)

    result_df = pd.DataFrame({
        'aspect': scores.index.to_numpy(),
        'score': scores.to_numpy(),
        'count': counts.to_numpy(),
    })
    is_positive = result_df['score'] > 0.5
    result_df['sentiment'] = is_positive.map({True: 'positive', False: 'negative'})

    return result_df[columns]


def print_aspects(product, result_df, top_num):
    """Print a +/- summary of the most frequently mentioned aspects.

    Args:
        product: Product name printed in the header.
        result_df: DataFrame with the columns 'aspect', 'sentiment' and
            'count' (the shape produced by ``label_predictions``).
        top_num: Number of aspects to include.

    The ``top_num`` aspects with the highest 'count' are selected, then
    listed with the 'positive' ones first and, within each sentiment, by
    descending 'count'. Positive aspects are prefixed with '+', all others
    with '-'.
    """
    # Select by count explicitly so the result does not depend on the caller
    # having pre-sorted the frame. mergesort is stable, so input that is
    # already in descending count order keeps its row order unchanged.
    top_aspects = result_df.sort_values(
        by='count', ascending=False, kind='mergesort'
    ).head(top_num)
    ordered = top_aspects.sort_values(by=['sentiment', 'count'], ascending=False)

    print('Reviews summary for')
    print(product)
    print(f'(top {top_num} aspects)')
    print()

    for _, row in ordered.iterrows():
        marker = '+' if row['sentiment'] == 'positive' else '-'
        print(f"{marker} {row['aspect']}")

# Main

In [None]:
# Load the preprocessed review/aspect table (path is relative to the project folder).
df = pd.read_csv('./data/reviews_preprocessed.csv')

In [None]:
# Show the loaded table as the cell output.
df

Unnamed: 0,review,aspect,key_aspect,count,product
0,i was impressed by this very inexpensive playe...,inexpensive player,best player,16,Apex AD2600 Progressive-scan DVD player
1,i looked into buying an inexpensive dvd player...,inexpensive player,best player,16,Apex AD2600 Progressive-scan DVD player
2,this is the best dvd player i have purchased,best player,best player,16,Apex AD2600 Progressive-scan DVD player
3,i think apex is the best dvd player you can ge...,best player,best player,16,Apex AD2600 Progressive-scan DVD player
4,for the first few weeks this player was everyt...,affordable player,best player,16,Apex AD2600 Progressive-scan DVD player
...,...,...,...,...,...
3354,i give it 4stars only because the installation...,average user,average user,1,norton
3355,this program would not let me send outlook ema...,audible books,audible books,1,norton
3356,i am an advanced computer user so i cannot eve...,advanced user,advanced user,1,norton
3357,i do not know what is happened to norton but 2...,absolute garbage,absolute garbage,1,norton


In [None]:
# Product to summarise; it is compared for equality against the 'product' column of df.
product = 'Apex AD2600 Progressive-scan DVD player'

In [None]:
# Header: product name followed by a blank line.
print(product, end='\n\n')

# Rows belonging to the selected product only.
product_df = df.loc[df['product'] == product]

# Row count and number of distinct key aspects for this product.
print(f'Total number of unique pairs aspects+sentences: {len(product_df.index)}')
print(f'Number of unique aspects: {len(product_df["key_aspect"].unique())}')

Apex AD2600 Progressive-scan DVD player

Total number of unique pairs aspects+sentences: 206
Number of unique aspects: 142


In [None]:
# Tokenize this product's (review, aspect) pairs; prepare_dataset attaches no labels.
df_predict = prepare_dataset(product, df)

In [None]:
# Hyper-parameters handed to TrainingArguments below.
epochs = 4
batch_size = 32
num_classes = 2  # NOTE(review): not referenced by any other visible cell

training_args = TrainingArguments(
    output_dir='model',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    logging_dir='logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    learning_rate=2e-5,
)

# Load the sequence classifier stored under ./model/last_step.
model = BertForSequenceClassification.from_pretrained('./model/last_step')

# The Trainer is only used for prediction in the cells that follow.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
)

In [None]:
# trainer.predict returns three values; only the first (raw model scores) is kept.
raw_pred, _, _ = trainer.predict(df_predict)
# Predicted class index per example = position of the highest score along axis 1.
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 206
  Batch size = 32


In [None]:
# Attach the predicted class to this product's rows. .assign() returns a new
# frame, so df itself is left untouched and pandas has no ambiguous slice to
# warn about (the earlier column assignment raised SettingWithCopyWarning).
result_df = df.loc[df['product'] == product].assign(sentiment=y_pred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Show the per-review predictions as the cell output.
result_df

Unnamed: 0,review,aspect,key_aspect,count,product,sentiment
0,i was impressed by this very inexpensive playe...,inexpensive player,best player,16,Apex AD2600 Progressive-scan DVD player,1
1,i looked into buying an inexpensive dvd player...,inexpensive player,best player,16,Apex AD2600 Progressive-scan DVD player,1
2,this is the best dvd player i have purchased,best player,best player,16,Apex AD2600 Progressive-scan DVD player,1
3,i think apex is the best dvd player you can ge...,best player,best player,16,Apex AD2600 Progressive-scan DVD player,1
4,for the first few weeks this player was everyt...,affordable player,best player,16,Apex AD2600 Progressive-scan DVD player,1
...,...,...,...,...,...,...
201,i use this with a home theater system and its ...,amazing system,amazing system,1,Apex AD2600 Progressive-scan DVD player,1
202,it plays alternate video formats vcds svcds cv...,alternate formats,alternate formats,1,Apex AD2600 Progressive-scan DVD player,1
203,imagine my shock and absolute instant disdain ...,absolute disdain,absolute disdain,1,Apex AD2600 Progressive-scan DVD player,0
204,the 2nd dvd player had a faulty power supply w...,2nd player,2nd player,1,Apex AD2600 Progressive-scan DVD player,0


In [None]:
# Save the per-review predictions, one CSV per product name, without the index column.
# NOTE(review): assumes the 'output' directory already exists — confirm.
result_df.to_csv(f'output/{product}.csv', index=False)

In [None]:
# Collapse the per-review predictions into one score/sentiment row per key aspect.
mapped_result_df = label_predictions(result_df)

In [None]:
# Show the per-aspect table as the cell output.
mapped_result_df

Unnamed: 0,aspect,score,sentiment,count
0,best player,0.8125,positive,16
1,remote control,0.2500,negative,8
2,extra features,0.6000,positive,5
3,apex player,1.0000,positive,5
4,several times,0.0000,negative,4
...,...,...,...,...
137,amazing system,1.0000,positive,1
138,alternate formats,1.0000,positive,1
139,absolute disdain,0.0000,negative,1
140,2nd player,0.0000,negative,1


In [None]:
# Print the +/- summary for 20 aspects of the selected product.
print_aspects(product, mapped_result_df, 20)

Reviews summary for
Apex AD2600 Progressive-scan DVD player
(top 20 aspects)

+ best player
+ extra features
+ apex player
+ right price
+ progressive player
+ nice machines
+ great picture
+ cheap brand
+ apex product
+ small buttons
+ sleek design
+ sharp jpeg pictures
+ scan players
+ regular disk
+ new smell
- remote control
- several times
- bad quality
- poor video
- normal size
