In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets transformers==4.28.0

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m101.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from data

In [3]:
from datasets import Dataset
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, pipeline
import torch
from transformers import TrainingArguments, Trainer

In [4]:
torch.cuda.empty_cache()

In [5]:
df = pd.read_pickle('/content/drive/My Drive/dataset/yelp_2013_balanced.pkl')

In [6]:
df = df[['text','stars']]
df = df.reset_index(drop=True)

In [7]:
df

Unnamed: 0,text,stars
0,It's a shame that my family has been frequenti...,1
1,My general opinion of writing Yelp reviews is ...,1
2,made reservations on thursday night for saturd...,1
3,"It was a night I hope to forget, but I'll stil...",1
4,Trainwreck is appropriately named based on tod...,1
...,...,...
12495,Had a wonderful brunch with girlfriends today!...,5
12496,"Newer is always better, but not in this case. ...",5
12497,I love this place! The prices are reasonable a...,5
12498,"I ordered a whole tomato pie, a whole chicken ...",5


In [8]:
nMax = 2500

res = df.groupby('stars').apply(lambda x: x.sample(n=min(nMax, len(x))))
pd.set_option('display.max_rows', 250)
print(res['stars'].value_counts())

1    2500
2    2500
3    2500
4    2500
5    2500
Name: stars, dtype: int64


In [9]:
df = res
df = df.reset_index(drop=True)

In [10]:
df['stars'] = df['stars'] - 1

In [11]:
df

Unnamed: 0,text,stars
0,Wow- the family went here a few past 6 to get ...,0
1,It's quite interesting to see a Vietnamese res...,0
2,"Last time I was here was 30 years ago or so, I...",0
3,The customer service is horrible. Paid $5 for ...,0
4,My husband and I were told to try Monks. When...,0
...,...,...
12495,Holy Brunch! Bloody Mary bar and scones on th...,4
12496,Brian the owner is AWESOME!! What is a Forbes ...,4
12497,One of the best meals I have had. Everything ...,4
12498,I already have a review posted but had to add ...,4


In [12]:
dataX = df['text']
dataY = df['stars']
dataX.head(5), dataY.head(5)

(0    Wow- the family went here a few past 6 to get ...
 1    It's quite interesting to see a Vietnamese res...
 2    Last time I was here was 30 years ago or so, I...
 3    The customer service is horrible. Paid $5 for ...
 4    My husband and I were told to try Monks.  When...
 Name: text, dtype: object,
 0    0
 1    0
 2    0
 3    0
 4    0
 Name: stars, dtype: int64)

In [13]:
train_ratio = 0.80
validation_ratio = 0.10
test_ratio = 0.10

# train is now 75% of the entire data set
x_train, x_test, y_train, y_test = train_test_split(dataX, dataY, test_size=1 - train_ratio)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio))

In [14]:
raw_train_df = pd.concat([x_train, y_train], axis=1)
raw_test_df = pd.concat([x_test, y_test], axis=1)
raw_val_df = pd.concat([x_val, y_val], axis=1)

In [15]:
raw_train_ds = Dataset.from_pandas(raw_train_df)
raw_test_ds = Dataset.from_pandas(raw_test_df)
raw_val_ds = Dataset.from_pandas(raw_val_df)

In [16]:
print(raw_train_ds, raw_val_ds, raw_test_ds)

Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 10000
}) Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 1250
}) Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 1250
})


In [None]:
BASE_MODEL = "allenai/longformer-base-4096"
LEARNING_RATE = 2e-5
MAX_LENGTH = 2048
BATCH_SIZE = 4
EPOCHS = 5

id2label = {k:k for k in range(5)}
label2id = {k:k for k in range(5)}

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', '

In [None]:
ds = {"train": raw_train_ds, "validation": raw_val_ds, "test": raw_test_ds}

def preprocess_function(examples):
    label = examples["stars"]
    examples = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)
    examples["label"] = label
    return examples

for split in ds:
    ds[split] = ds[split].map(preprocess_function, remove_columns=["text", "stars"])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="../models/yelp-2013-longformer",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    weight_decay=0.01,
)

In [None]:
from transformers import Trainer

torch.manual_seed(42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_metrics
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,1.1496,0.850331,0.616
2,0.8162,0.88871,0.676
3,0.6172,0.686352,0.728
4,0.4216,0.563521,0.751
5,0.3067,0.473149,0.711


TrainOutput(global_step=2500, training_loss=0.6622587768554687, metrics={'train_runtime': 2215.8638, 'train_samples_per_second': 4.513, 'train_steps_per_second': 1.128, 'total_flos': 1.313731571712e+16, 'train_loss': 0.6622587768554687, 'epoch': 5.0})

In [None]:
trainer.eval_dataset=ds["test"]
trainer.evaluate()

{'eval_loss': 0.4892248964309692,
 'eval_accuracy': 0.71,
 'eval_runtime': 14.1841,
 'eval_samples_per_second': 17.849,
 'eval_steps_per_second': 4.498,
 'epoch': 5.0}

In [None]:
#trainer.save_model('./drive/MyDrive/yelp-longformer-500')

In [None]:
#tokenizer.save_pretrained("./drive/MyDrive/yelp-longformer-tokenizer-500")