In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets transformers==4.28.0



In [3]:
from datasets import Dataset
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, pipeline
import torch
from transformers import TrainingArguments, Trainer

In [4]:
torch.cuda.empty_cache()

In [35]:
df = pd.read_pickle('/content/drive/My Drive/dataset/yelp_all_balanced.pkl')

In [36]:
df = df[['text','stars']]
df = df.reset_index(drop=True)

In [37]:
df

Unnamed: 0,text,stars
0,Steaks taste better at Texas Roadhouse than th...,1
1,Food was two star. Zero star if I could for ki...,1
2,Disappointing that this location's milk shake ...,1
3,there is very bad service. don't come here. th...,1
4,So after 5 months later from writing this revi...,1
...,...,...
24995,We happened upon Palm and Pine by accident. Ca...,5
24996,Margaritas were great and nacho pizza was deli...,5
24997,Great tenderloins and yummy fries. Come and su...,5
24998,So happy to finally have a Thai place within 1...,5


In [38]:
nMax = 5000

res = df.groupby('stars').apply(lambda x: x.sample(n=min(nMax, len(x))))
pd.set_option('display.max_rows', 250)
print(res['stars'].value_counts())

1    5000
2    5000
3    5000
4    5000
5    5000
Name: stars, dtype: int64


In [39]:
df = res
df = df.reset_index(drop=True)

In [40]:
df['stars'] = df['stars'] - 1

In [41]:
df

Unnamed: 0,text,stars
0,POOR CUSTOMER SERVICE & HORRIBLE COFFEE!! Owne...,0
1,I went here to pick up a quick breakfast befor...,0
2,Sunday October 6th About 11:30 am\nwe stopped ...,0
3,Wasn't great..I was so hopeful because I liked...,0
4,Kitchen should be ashamed and those pics of fo...,0
...,...,...
24995,Sushi burritos. What more do I even need to sa...,4
24996,Great! I make sure to visit here whenever I go...,4
24997,After a long night out in the neighborhood of ...,4
24998,Old Northeast Pizza is a cash only place but t...,4


In [42]:
dataX = df['text']
dataY = df['stars']
dataX.head(5), dataY.head(5)

(0    POOR CUSTOMER SERVICE & HORRIBLE COFFEE!! Owne...
 1    I went here to pick up a quick breakfast befor...
 2    Sunday October 6th About 11:30 am\nwe stopped ...
 3    Wasn't great..I was so hopeful because I liked...
 4    Kitchen should be ashamed and those pics of fo...
 Name: text, dtype: object,
 0    0
 1    0
 2    0
 3    0
 4    0
 Name: stars, dtype: int64)

In [43]:
train_ratio = 0.80
validation_ratio = 0.10
test_ratio = 0.10

# train is now 75% of the entire data set
x_train, x_test, y_train, y_test = train_test_split(dataX, dataY, test_size=1 - train_ratio)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio))

In [44]:
raw_train_df = pd.concat([x_train, y_train], axis=1)
raw_test_df = pd.concat([x_test, y_test], axis=1)
raw_val_df = pd.concat([x_val, y_val], axis=1)

In [45]:
raw_train_ds = Dataset.from_pandas(raw_train_df)
raw_test_ds = Dataset.from_pandas(raw_test_df)
raw_val_ds = Dataset.from_pandas(raw_val_df)

In [46]:
print(raw_train_ds, raw_val_ds, raw_test_ds)

Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 20000
}) Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 2500
}) Dataset({
    features: ['text', 'stars', '__index_level_0__'],
    num_rows: 2500
})


In [17]:
BASE_MODEL = "allenai/longformer-base-4096"
LEARNING_RATE = 2e-5
MAX_LENGTH = 2048
BATCH_SIZE = 4
EPOCHS = 5

id2label = {k:k for k in range(5)}
label2id = {k:k for k in range(5)}

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', '

In [18]:
ds = {"train": raw_train_ds, "validation": raw_val_ds, "test": raw_test_ds}

def preprocess_function(examples):
    label = examples["stars"]
    examples = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)
    examples["label"] = label
    return examples

for split in ds:
    ds[split] = ds[split].map(preprocess_function, remove_columns=["text", "stars"])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [19]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


In [20]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="../models/yelp-all-longformer",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    weight_decay=0.01,
)

In [21]:
from transformers import Trainer

torch.manual_seed(42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_metrics
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.9353,0.850331,0.644
2,0.8261,0.91262,0.719
3,0.6051,0.824477,0.746
4,0.4216,0.525217,0.72
5,0.4167,0.533121,0.728


TrainOutput(global_step=2500, training_loss=0.6622587768554687, metrics={'train_runtime': 2215.8638, 'train_samples_per_second': 4.513, 'train_steps_per_second': 1.128, 'total_flos': 1.313731571712e+16, 'train_loss': 0.6622587768554687, 'epoch': 5.0})

In [22]:
trainer.eval_dataset=ds["test"]
trainer.evaluate()

{'eval_loss': 0.4712528354399112,
 'eval_accuracy': 0.728,
 'eval_runtime': 26.1811,
 'eval_samples_per_second': 17.4249,
 'eval_steps_per_second': 7.448,
 'epoch': 5.0}

In [23]:
input_texts = ["This restaurant is amazing and has the best soup",
               "This restaurant is just okay, can be better. But the drinks are good",
               "Best sushi I've ever had in Tokyo",
               "The drinks are decent. I really like the dumplings",
               "The price is too expensive and the food quality is bad. The waiters were really nice to me, though."
              ]

# Encode the text
encoded = tokenizer(input_texts, truncation=True, padding="max_length", max_length=1024, return_tensors="pt").to("cuda")

# Call the model to predict under the format of logits of 5 classes
logits = model(**encoded).logits

# Get the class
torch.argmax(logits, axis=1)

tensor([4, 2, 4, 2, 2], device='cuda:0')

In [24]:
#trainer.save_model('./drive/MyDrive/yelp-longformer-500')

In [25]:
#tokenizer.save_pretrained("./drive/MyDrive/yelp-longformer-tokenizer-500")