In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [3]:
from datasets import Dataset
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, pipeline
import torch
from transformers import TrainingArguments, Trainer

In [4]:
torch.cuda.empty_cache()

In [5]:
df = pd.read_pickle('/content/drive/My Drive/dataset/yelp_dataset_undersampled.pkl')

In [6]:
df.sample(10)

Unnamed: 0,text,stars,length
3735876,Maybe it was just an off night in the middle o...,2,425
1289853,After several years of wanting to try Pete and...,2,201
3704544,My boyfriend and I took a trip to p.a just to ...,4,293
1084734,"Looks like this newbie replaced Urban Element,...",3,313
1193074,I came to Zahav this past weekend and ate the ...,5,262
1119147,I must have showed up on a bad day. After seei...,1,228
2802005,"Let me start off by saying, service at a resta...",2,320
1238522,Abandon all hope ye who enter here. For if it'...,1,697
2969379,Everyone's 3 star reviews pretty much accurate...,3,526
2681672,I hate writing reviews that are negative but i...,1,364


In [8]:
res = df.groupby('stars').apply(lambda x: x.sample(n=min(500, len(x))))
pd.set_option('display.max_rows', 250)
df = res
df = df.reset_index(drop=True)
df['stars'] = df['stars'] - 1
df

1    500
2    500
3    500
4    500
5    500
Name: stars, dtype: int64


In [12]:
df.to_pickle('./drive/MyDrive/yelp_dataset_2500.pkl')

In [13]:
dataX = df['text']
dataY = df['stars']
dataX.head(5), dataY.head(5)

(0    Went to Bar Louie and ordered a quesadilla, wi...
 1    At the Fresco Café, I always felt there was a ...
 2    It was truly a very bad experience.  I walked ...
 3    On our way home from DC, my savvy driving comp...
 4    This is my third time to give this place a cha...
 Name: text, dtype: object,
 0    0
 1    0
 2    0
 3    0
 4    0
 Name: stars, dtype: int64)

In [14]:
train_ratio = 0.80
validation_ratio = 0.10
test_ratio = 0.10

# train is now 75% of the entire data set
x_train, x_test, y_train, y_test = train_test_split(dataX, dataY, test_size=1 - train_ratio)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio))

In [15]:
raw_train_df = pd.concat([x_train, y_train], axis=1)
raw_test_df = pd.concat([x_test, y_test], axis=1)
raw_val_df = pd.concat([x_val, y_val], axis=1)

In [16]:
raw_train_ds = Dataset.from_pandas(raw_train_df)
raw_test_ds = Dataset.from_pandas(raw_test_df)
raw_val_ds = Dataset.from_pandas(raw_val_df)

In [17]:
print(raw_train_ds, raw_val_ds, raw_test_ds)

Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 2000
}) Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 250
}) Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 250
})


In [18]:
BASE_MODEL = "bert-base-uncased"
LEARNING_RATE = 2e-5
MAX_LENGTH = 256
BATCH_SIZE = 16
EPOCHS = 5

id2label = {k:k for k in range(5)}
label2id = {k:k for k in range(5)}

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, id2label=id2label, label2id=label2id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight'

In [19]:
ds = {"train": raw_train_ds, "validation": raw_val_ds, "test": raw_test_ds}

def preprocess_function(examples):
    label = examples["stars"]
    examples = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)
    examples["label"] = label
    return examples

for split in ds:
    ds[split] = ds[split].map(preprocess_function, remove_columns=["text", "stars"])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [20]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="../models/yelp-longformer-classification",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    weight_decay=0.01,
)

In [23]:
from transformers import Trainer

torch.manual_seed(42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_metrics
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.8366,0.918931,0.612
2,0.6406,0.962342,0.648
3,0.461813,1.216211,0.68
4,0.304,1.669542,0.69
5,0.1888,1.868149,0.632


TrainOutput(global_step=2500, training_loss=0.5010922790527343, metrics={'train_runtime': 1205.598, 'train_samples_per_second': 8.295, 'train_steps_per_second': 2.074, 'total_flos': 6568657858560000.0, 'train_loss': 0.5010922790527343, 'epoch': 5.0})

In [37]:
trainer.eval_dataset=ds["test"]
trainer.evaluate()


{'eval_loss': 0.716800,
'eval_accuracy': 0.682,
'eval_runtime': 7.91783,
'eval_samples_per_second': 31.121,
'eval_steps_per_second': 7.911,
'epoch': 5.0}

    


In [None]:
input_texts = ["This restaurant is amazing and has the best soup",
               "This restaurant is just okay, can be better. But the drinks are good",
               "Best sushi I've ever had in Tokyo",
               "The drinks are decent. I really like the dumplings",
               "The price is too expensive and the food quality is bad. The waiters were really nice to me, though."
              ]

# Encode the text
encoded = tokenizer(input_texts, truncation=True, padding="max_length", max_length=1024, return_tensors="pt").to("cuda")

# Call the model to predict under the format of logits of 5 classes
logits = model(**encoded).logits

# Get the class
torch.argmax(logits, axis=1)

tensor([4, 1, 4, 3, 2], device='cuda:0')