In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets transformers==4.28.0



In [3]:
from datasets import Dataset
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, pipeline
import torch
from transformers import TrainingArguments, Trainer

In [4]:
torch.cuda.empty_cache()

In [73]:
df = pd.read_pickle('/content/drive/My Drive/dataset/yelp_all_balanced.pkl')

In [74]:
df = df[['text','stars']]
df = df.reset_index(drop=True)

In [75]:
df

Unnamed: 0,text,stars
0,Steaks taste better at Texas Roadhouse than th...,1
1,Food was two star. Zero star if I could for ki...,1
2,Disappointing that this location's milk shake ...,1
3,there is very bad service. don't come here. th...,1
4,So after 5 months later from writing this revi...,1
...,...,...
24995,We happened upon Palm and Pine by accident. Ca...,5
24996,Margaritas were great and nacho pizza was deli...,5
24997,Great tenderloins and yummy fries. Come and su...,5
24998,So happy to finally have a Thai place within 1...,5


In [76]:
df = df.reset_index(drop=True)

In [77]:
df['stars'] = df['stars'] - 1

In [78]:
df

Unnamed: 0,text,stars
0,Steaks taste better at Texas Roadhouse than th...,0
1,Food was two star. Zero star if I could for ki...,0
2,Disappointing that this location's milk shake ...,0
3,there is very bad service. don't come here. th...,0
4,So after 5 months later from writing this revi...,0
...,...,...
24995,We happened upon Palm and Pine by accident. Ca...,4
24996,Margaritas were great and nacho pizza was deli...,4
24997,Great tenderloins and yummy fries. Come and su...,4
24998,So happy to finally have a Thai place within 1...,4


In [79]:
dataX = df['text']
dataY = df['stars']
dataX.head(5), dataY.head(5)

(0    Steaks taste better at Texas Roadhouse than th...
 1    Food was two star. Zero star if I could for ki...
 2    Disappointing that this location's milk shake ...
 3    there is very bad service. don't come here. th...
 4    So after 5 months later from writing this revi...
 Name: text, dtype: object,
 0    0
 1    0
 2    0
 3    0
 4    0
 Name: stars, dtype: int64)

In [80]:
train_ratio = 0.80
validation_ratio = 0.10
test_ratio = 0.10

# train is now 75% of the entire data set
x_train, x_test, y_train, y_test = train_test_split(dataX, dataY, test_size=1 - train_ratio)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio))

In [81]:
raw_train_df = pd.concat([x_train, y_train], axis=1)
raw_test_df = pd.concat([x_test, y_test], axis=1)
raw_val_df = pd.concat([x_val, y_val], axis=1)

In [82]:
raw_train_ds = Dataset.from_pandas(raw_train_df)
raw_test_ds = Dataset.from_pandas(raw_test_df)
raw_val_ds = Dataset.from_pandas(raw_val_df)

In [83]:
print(raw_train_ds, raw_val_ds, raw_test_ds)

Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 20000
}) Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 2500
}) Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 2500
})


In [84]:
BASE_MODEL = "bert-base-uncased"
LEARNING_RATE = 2e-5
MAX_LENGTH = 512
BATCH_SIZE = 16
EPOCHS = 5

id2label = {k:k for k in range(5)}
label2id = {k:k for k in range(5)}

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [86]:
ds = {"train": raw_train_ds, "validation": raw_val_ds, "test": raw_test_ds}

def preprocess_function(examples):
    label = examples["stars"]
    examples = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)
    examples["label"] = label
    return examples

for split in ds:
    ds[split] = ds[split].map(preprocess_function, remove_columns=["text", "stars"])

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [66]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [67]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="../models/yelp-all-bert",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    weight_decay=0.01,
)

In [68]:
from transformers import Trainer

torch.manual_seed(42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_metrics
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.971603,0.581
2,No log,0.921122,0.658
3,No log,0.920092,0.719
4,0.874800,0.923832,0.701
5,0.874800,0.911774,0.715


TrainOutput(global_step=625, training_loss=0.7829299499511718, metrics={'train_runtime': 260.3295, 'train_samples_per_second': 38.413, 'train_steps_per_second': 2.401, 'total_flos': 2631181424640000.0, 'train_loss': 0.7829299499511718, 'epoch': 5.0})

In [69]:
trainer.eval_dataset=ds["test"]
trainer.evaluate()

{'eval_loss': 0.084828384177343,
 'eval_accuracy': 0.693,
 'eval_runtime': 1.9924,
 'eval_samples_per_second': 121.13,
 'eval_steps_per_second': 7.752,
 'epoch': 5.0}

In [70]:
input_texts = ["This restaurant is amazing and has the best soup",
               "This restaurant is just okay, can be better. But the drinks are good",
               "Best sushi I've ever had in Tokyo",
               "The drinks are decent. I really like the dumplings",
               "The price is too expensive and the food quality is bad. The waiters were really nice to me, though."
              ]

# Encode the text
encoded = tokenizer(input_texts, truncation=True, padding="max_length", max_length=512, return_tensors="pt").to("cuda")

# Call the model to predict under the format of logits of 5 classes
logits = model(**encoded).logits

# Get the class
torch.argmax(logits, axis=1)

tensor([4, 2, 4, 3, 1], device='cuda:0')

In [71]:
#trainer.save_model('./drive/MyDrive/yelp-longformer-500')

In [72]:
#tokenizer.save_pretrained("./drive/MyDrive/yelp-longformer-tokenizer-500")