In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset


model_name = "mideind/IceBERT"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)
RANDOM_SEED = 42


def tokenize_data(data, tokenizer, max_len=512):
    return tokenizer(
        data.tolist(), padding="max_length", truncation=True, max_length=max_len)

df = pd.read_csv("../Google-without-lem.csv")

df = df.sample(n=10000, random_state=RANDOM_SEED)
df.drop(["Unnamed: 0"], axis=1, inplace=True)


def convert(sentiment):
    return 1 if sentiment == "positive" else 0


df["sentiment"] = df.sentiment.apply(convert)

# show how many positive and negative reviews we have
# print(df.sentiment.value_counts())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# df["review"] = df.review.apply(lambda x: x.replace("_NEG", ""))
torch.manual_seed(RANDOM_SEED)

X_train, X_temp, y_train, y_temp = train_test_split(
    df["review"], df["sentiment"], test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Tokenize the training data
train_data = tokenize_data(X_train, tokenizer)

# Tokenize the validation data
val_data = tokenize_data(X_val, tokenizer)

# Tokenize the test data
test_data = tokenize_data(X_test, tokenizer)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at mideind/IceBERT and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:

import numpy as np

class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])  # ensure labels are included
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = SentimentDataset(train_data, y_train)
val_dataset = SentimentDataset(val_data, y_val)
test_dataset = SentimentDataset(test_data, y_test)

# Set GPU


# class CustomTrainer(Trainer):
#     def log(self, logs: dict):
#         # Get the last evaluation loss.
#         eval_loss = self.evaluate()["eval_loss"]
#         logs["eval_loss"] = eval_loss

#         # Call the parent class log method to handle the rest.
#         super().log(logs)

# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="steps",  # "steps" to evaluate each `logging_steps` or "epoch" to evaluate each epoch
#     eval_steps=1000,  # Evaluation and Save happens every 500 steps
#     num_train_epochs=10,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir="./logs",
# )


def compute_metrics(p):
    return {"acc": (np.argmax(p.predictions, axis=1) == p.label_ids).mean()}

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,  # This will ensure that the best model is loaded at the end of training
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)],
)


In [3]:
trainer.train()


results = trainer.evaluate()
print(results)

model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

  6%|▌         | 500/8750 [02:11<36:16,  3.79it/s]

{'loss': 0.5666, 'learning_rate': 5e-05, 'epoch': 0.57}


                                                  
  6%|▌         | 500/8750 [02:27<36:16,  3.79it/s]

{'eval_loss': 0.4655722379684448, 'eval_acc': 0.84, 'eval_runtime': 16.371, 'eval_samples_per_second': 91.626, 'eval_steps_per_second': 11.484, 'epoch': 0.57}


 11%|█▏        | 1000/8750 [04:39<34:14,  3.77it/s]  

{'loss': 0.4836, 'learning_rate': 4.696969696969697e-05, 'epoch': 1.14}


                                                   
 11%|█▏        | 1000/8750 [04:55<34:14,  3.77it/s]

{'eval_loss': 0.4484061598777771, 'eval_acc': 0.842, 'eval_runtime': 16.4518, 'eval_samples_per_second': 91.175, 'eval_steps_per_second': 11.427, 'epoch': 1.14}


 17%|█▋        | 1500/8750 [07:15<31:31,  3.83it/s]   

{'loss': 0.3884, 'learning_rate': 4.3939393939393944e-05, 'epoch': 1.71}


                                                   
 17%|█▋        | 1500/8750 [07:32<31:31,  3.83it/s]

{'eval_loss': 0.5198267698287964, 'eval_acc': 0.8626666666666667, 'eval_runtime': 16.4529, 'eval_samples_per_second': 91.169, 'eval_steps_per_second': 11.427, 'epoch': 1.71}


 23%|██▎       | 2000/8750 [09:45<29:55,  3.76it/s]   

{'loss': 0.3517, 'learning_rate': 4.0909090909090915e-05, 'epoch': 2.29}


                                                   
 23%|██▎       | 2000/8750 [10:02<29:55,  3.76it/s]

{'eval_loss': 0.5042148232460022, 'eval_acc': 0.8613333333333333, 'eval_runtime': 16.9607, 'eval_samples_per_second': 88.44, 'eval_steps_per_second': 11.084, 'epoch': 2.29}


 29%|██▊       | 2500/8750 [12:22<26:50,  3.88it/s]   

{'loss': 0.2978, 'learning_rate': 3.787878787878788e-05, 'epoch': 2.86}


                                                   
 29%|██▊       | 2500/8750 [12:38<26:50,  3.88it/s]

{'eval_loss': 0.43802404403686523, 'eval_acc': 0.8846666666666667, 'eval_runtime': 15.9922, 'eval_samples_per_second': 93.796, 'eval_steps_per_second': 11.756, 'epoch': 2.86}


 34%|███▍      | 3000/8750 [14:48<23:35,  4.06it/s]  

{'loss': 0.2428, 'learning_rate': 3.484848484848485e-05, 'epoch': 3.43}


                                                   
 34%|███▍      | 3000/8750 [15:04<23:35,  4.06it/s]

{'eval_loss': 0.4011671841144562, 'eval_acc': 0.8853333333333333, 'eval_runtime': 15.7889, 'eval_samples_per_second': 95.003, 'eval_steps_per_second': 11.907, 'epoch': 3.43}


 40%|████      | 3500/8750 [17:21<22:26,  3.90it/s]   

{'loss': 0.2149, 'learning_rate': 3.181818181818182e-05, 'epoch': 4.0}


                                                   
 40%|████      | 3500/8750 [17:37<22:26,  3.90it/s]

{'eval_loss': 0.4456855356693268, 'eval_acc': 0.9033333333333333, 'eval_runtime': 16.3114, 'eval_samples_per_second': 91.96, 'eval_steps_per_second': 11.526, 'epoch': 4.0}


 46%|████▌     | 4000/8750 [19:48<21:24,  3.70it/s]  

{'loss': 0.1469, 'learning_rate': 2.878787878787879e-05, 'epoch': 4.57}


                                                   
 46%|████▌     | 4000/8750 [20:04<21:24,  3.70it/s]

{'eval_loss': 0.583526611328125, 'eval_acc': 0.8726666666666667, 'eval_runtime': 16.3477, 'eval_samples_per_second': 91.756, 'eval_steps_per_second': 11.5, 'epoch': 4.57}


 51%|█████▏    | 4500/8750 [22:22<18:14,  3.88it/s]   

{'loss': 0.1216, 'learning_rate': 2.575757575757576e-05, 'epoch': 5.14}


                                                   
 51%|█████▏    | 4500/8750 [22:38<18:14,  3.88it/s]

{'eval_loss': 0.5595088601112366, 'eval_acc': 0.888, 'eval_runtime': 16.3139, 'eval_samples_per_second': 91.946, 'eval_steps_per_second': 11.524, 'epoch': 5.14}


 51%|█████▏    | 4500/8750 [22:39<21:23,  3.31it/s]


{'train_runtime': 1359.3772, 'train_samples_per_second': 51.494, 'train_steps_per_second': 6.437, 'train_loss': 0.31269213019477, 'epoch': 5.14}


100%|██████████| 188/188 [00:16<00:00, 11.49it/s]


{'eval_loss': 0.4011671841144562, 'eval_acc': 0.8853333333333333, 'eval_runtime': 16.3446, 'eval_samples_per_second': 91.773, 'eval_steps_per_second': 11.502, 'epoch': 5.14}


('./sentiment_model\\tokenizer_config.json',
 './sentiment_model\\special_tokens_map.json',
 './sentiment_model\\vocab.json',
 './sentiment_model\\merges.txt',
 './sentiment_model\\added_tokens.json',
 './sentiment_model\\tokenizer.json')