In [1]:
from google.colab import drive

# Attach Google Drive to the Colab VM; the dataset is read from under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!pip install datasets transformers==4.28.0

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/486.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from datasets import Dataset
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, pipeline
import torch
from transformers import TrainingArguments, Trainer

In [4]:
# Release any cached GPU memory held by PyTorch before the model is loaded further down.
torch.cuda.empty_cache()

In [5]:
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load this pickle if it comes from a trusted source.
YELP_PICKLE_PATH = '/content/drive/My Drive/dataset/yelp_2013_balanced.pkl'
df = pd.read_pickle(YELP_PICKLE_PATH)

In [6]:
# Keep only the review text and its star rating, and renumber the rows from 0.
df = df[['text', 'stars']].reset_index(drop=True)

In [7]:
# Preview the frame: review text plus the original star rating.
df

Unnamed: 0,text,stars
0,It's a shame that my family has been frequenti...,1
1,My general opinion of writing Yelp reviews is ...,1
2,made reservations on thursday night for saturd...,1
3,"It was a night I hope to forget, but I'll stil...",1
4,Trainwreck is appropriately named based on tod...,1
...,...,...
12495,Had a wonderful brunch with girlfriends today!...,5
12496,"Newer is always better, but not in this case. ...",5
12497,I love this place! The prices are reasonable a...,5
12498,"I ordered a whole tomato pie, a whole chicken ...",5


In [8]:
# NOTE(review): the index was already reset right after the column selection above,
# so this second reset leaves the frame unchanged.
df = df.reset_index(drop=True)

In [9]:
# Convert star ratings 1..5 into zero-based class ids 0..4 for the classifier.
# The guard makes the cell idempotent: an unconditional `stars - 1` would shift the
# labels again (to -1..3) every time the cell is re-run.
if df['stars'].max() > 4:
    df['stars'] = df['stars'] - 1

# Fail loudly if the labels are not the five expected class ids.
assert df['stars'].between(0, 4).all(), "stars must be zero-based class ids in 0..4"

In [10]:
# Check the result of the label shift: `stars` should now hold zero-based class ids.
df

Unnamed: 0,text,stars
0,It's a shame that my family has been frequenti...,0
1,My general opinion of writing Yelp reviews is ...,0
2,made reservations on thursday night for saturd...,0
3,"It was a night I hope to forget, but I'll stil...",0
4,Trainwreck is appropriately named based on tod...,0
...,...,...
12495,Had a wonderful brunch with girlfriends today!...,4
12496,"Newer is always better, but not in this case. ...",4
12497,I love this place! The prices are reasonable a...,4
12498,"I ordered a whole tomato pie, a whole chicken ...",4


In [11]:
# Model input (review text) and target (zero-based star class) as separate Series.
dataX, dataY = df['text'], df['stars']

# Show the first five rows of each.
dataX.head(), dataY.head()

(0    It's a shame that my family has been frequenti...
 1    My general opinion of writing Yelp reviews is ...
 2    made reservations on thursday night for saturd...
 3    It was a night I hope to forget, but I'll stil...
 4    Trainwreck is appropriately named based on tod...
 Name: text, dtype: object,
 0    0
 1    0
 2    0
 3    0
 4    0
 Name: stars, dtype: int64)

In [12]:
# Target proportions of the full data set: 80% train / 10% validation / 10% test.
train_ratio = 0.80
validation_ratio = 0.10
test_ratio = 0.10

# Fixed seed so the same rows land in the same split on every run.
SPLIT_SEED = 42

# Step 1: carve off the 20% hold-out; train keeps 80% of the entire data set.
# Stratifying on the label keeps the class proportions the same in every split.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    dataX,
    dataY,
    test_size=1 - train_ratio,
    random_state=SPLIT_SEED,
    stratify=dataY,
)

# Step 2: halve the hold-out, so validation and test are each 10% of the initial data set.
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout,
    y_holdout,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=SPLIT_SEED,
    stratify=y_holdout,
)

In [13]:
# Re-join each split's text and label Series into a two-column frame.
raw_train_df, raw_test_df, raw_val_df = (
    pd.concat([features, labels], axis=1)
    for features, labels in ((x_train, y_train), (x_test, y_test), (x_val, y_val))
)

In [14]:
# Wrap each pandas frame in a Hugging Face Dataset.
raw_train_ds, raw_test_ds, raw_val_ds = (
    Dataset.from_pandas(frame)
    for frame in (raw_train_df, raw_test_df, raw_val_df)
)

In [15]:
# Sanity-check the columns and row counts of the train / validation / test datasets.
print(raw_train_ds, raw_val_ds, raw_test_ds)

Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 10000
}) Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 1250
}) Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 1250
})


In [16]:
# Fine-tuning configuration.
BASE_MODEL = "bert-base-uncased"  # pretrained checkpoint to fine-tune
LEARNING_RATE = 2e-5
MAX_LENGTH = 512                  # token length used when tokenizing the reviews
BATCH_SIZE = 16
EPOCHS = 5

# Five classes (ids 0..4); each id maps to itself in both directions.
NUM_CLASSES = 5
id2label = {class_id: class_id for class_id in range(NUM_CLASSES)}
label2id = dict(id2label)

# Load the tokenizer and a sequence-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    id2label=id2label,
    label2id=label2id,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [17]:
ds = {"train": raw_train_ds, "validation": raw_val_ds, "test": raw_test_ds}

def preprocess_function(examples):
    """Tokenize review text and attach the class id(s) under the ``label`` key.

    Works on a single example or on a batch: ``examples["text"]`` may be one
    string or a list of strings, and ``examples["stars"]`` the matching label
    or list of labels.

    Returns the tokenizer output (every sequence truncated/padded to
    MAX_LENGTH tokens) with an extra ``label`` entry copied from ``stars``.
    """
    encoded = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)
    encoded["label"] = examples["stars"]
    return encoded

# batched=True hands the tokenizer many rows per call instead of one row at a time.
# Building a new dict avoids mutating `ds` while iterating over it.
ds = {
    split: dataset.map(preprocess_function, batched=True, remove_columns=["text", "stars"])
    for split, dataset in ds.items()
}

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1250 [00:00<?, ? examples/s]

Map:   0%|          | 0/1250 [00:00<?, ? examples/s]

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    Args:
        eval_pred: a pair ``(logits, labels)``; ``logits`` has one score per
            class along its last axis and ``labels`` holds the true class ids.

    Returns:
        ``{"accuracy": float}``: the fraction of rows whose highest-scoring
        class equals the label.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Computed with numpy so this does not rely on `datasets.load_metric`,
    # which is deprecated and absent from newer `datasets` releases.
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="../models/yelp-2014-bert",
    # Optimisation settings, taken from the constants defined with the model.
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best validation accuracy when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [None]:
from transformers import Trainer

# NOTE(review): the model (including its freshly initialised classification head) was
# already instantiated in an earlier cell, so this seed cannot make that initialisation
# reproducible. Confirm whether seeding should move before `from_pretrained`.
torch.manual_seed(42)

# Fine-tune on the train split; the validation split is scored with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_metrics
)

# Run the training loop configured by `training_args`.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.991532,0.571
2,No log,0.931623,0.662
3,No log,0.941632,0.702
4,0.854800,0.926234,0.722
5,0.854800,0.971227,0.729


TrainOutput(global_step=625, training_loss=0.7829299499511718, metrics={'train_runtime': 260.3295, 'train_samples_per_second': 38.413, 'train_steps_per_second': 2.401, 'total_flos': 2631181424640000.0, 'train_loss': 0.7829299499511718, 'epoch': 5.0})

In [None]:
# Score the held-out test split once. Passing the dataset explicitly leaves
# `trainer.eval_dataset` pointing at the validation split, so any later
# `trainer.evaluate()` or further training does not silently use the test data.
trainer.evaluate(eval_dataset=ds["test"])

{'eval_loss': 0.86152324835654561,
 'eval_accuracy': 0.691,
 'eval_runtime': 2.2322,
 'eval_samples_per_second': 121.13,
 'eval_steps_per_second': 7.752,
 'epoch': 5.0}

In [None]:
input_texts = ["This restaurant is amazing and has the best soup",
               "This restaurant is just okay, can be better. But the drinks are good",
               "Best sushi I've ever had in Tokyo",
               "The drinks are decent. I really like the dumplings",
               "The price is too expensive and the food quality is bad. The waiters were really nice to me, though."
              ]

# Encode the text: pad only to the longest sentence in this batch (still capped at
# MAX_LENGTH) and place the tensors on whatever device the model lives on,
# instead of hard-coding "cuda".
encoded = tokenizer(
    input_texts,
    truncation=True,
    padding=True,
    max_length=MAX_LENGTH,
    return_tensors="pt",
).to(model.device)

# Inference only: switch off dropout and skip gradient tracking.
model.eval()
with torch.no_grad():
    # Call the model to predict under the format of logits of 5 classes
    logits = model(**encoded).logits

# Get the class (zero-based id, i.e. star rating minus one)
torch.argmax(logits, dim=1)

tensor([4, 2, 4, 3, 1], device='cuda:0')

In [None]:
#trainer.save_model('./drive/MyDrive/yelp-longformer-500')

In [None]:
#tokenizer.save_pretrained("./drive/MyDrive/yelp-longformer-tokenizer-500")