In [1]:
# prompt: mount drive
import os
from google.colab import drive
drive.mount('/content/drive')
project_path = os.path.join('/content/drive', "MyDrive/Project")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers torch



In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from transformers import BertTokenizer, BertModel
import pandas as pd

# Define the DataFrame
df_comments = pd.DataFrame({
    'Game Title': ['The Legend of Zelda', 'Super Mario Bros', 'Pac-Man', 'Tetris', 'Minecraft'],
    'Comment': [
        'A timeless classic that never gets old.',
        'Fun and challenging at every level!',
        'Simple yet addictive gameplay.',
        'Great for brain exercise.',
        'Endless possibilities and creativity.'
    ],
    'Score': [95, 90, 85, 88, 92]
})

# Load Pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Tokenizing the comments
def tokenize_comments(comments):
    return tokenizer(comments, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

df_comments['Tokenized'] = df_comments['Comment'].apply(lambda x: tokenize_comments(x))

# Creating a PyTorch dataset
class CommentDataset(Dataset):
    def __init__(self, encodings, scores):
        self.encodings = encodings
        self.scores = scores

    def __len__(self):
        return len(self.scores)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.scores[idx], dtype=torch.float)
        return item

# Prepare dataset
encodings = tokenizer(df_comments['Comment'].tolist(), padding=True, truncation=True, max_length=128, return_tensors="pt")
scores = df_comments['Score'].tolist()
dataset = CommentDataset(encodings, scores)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Define a model
class BERTRegressor(nn.Module):
    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        self.regressor = nn.Linear(768, 1)  # 768 is the dimensionality of BERT's output features

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.regressor(pooled_output).squeeze(1)

model = BERTRegressor(bert_model)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = nn.MSELoss()

# Training loop
num_epochs = 0  # Keep epochs low to avoid overfitting on such a small dataset
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
df = pd.read_csv(os.path.join(project_path, "dataset_with_sentiment.csv"))

In [5]:
df = df.head(1000)

In [6]:
data = df[["score", "review_tokenised"]]

In [7]:
data

Unnamed: 0,score,review_tokenised
0,0.966495,"['Ruined', 'my', 'life']"
1,0.966495,"['This', 'will', 'be', 'more', 'of', 'a', 'my'..."
2,0.966495,"['This', 'game', 'saved', 'my', 'virginity']"
3,0.966495,"['Do', 'you', 'like', 'original', 'games', 'Do..."
4,0.966495,"['Easy', 'to', 'learn', 'hard', 'to', 'master']"
...,...,...
995,0.966495,"['best', 'joguinho', 'since', '1857', 'bj']"
996,0.966495,"['old', 'but', 'legendary']"
997,0.966495,"['Old', 'but', 'gold']"
998,0.966495,"['What', 'can', 'I', 'say', 'My', 'teenage', '..."


In [17]:
!pip install accelerate -U
!pip install transformers[torch] -U

Collecting transformers[torch]
  Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.38.2
    Uninstalling transformers-4.38.2:
      Successfully uninstalled transformers-4.38.2
Successfully installed transformers-4.39.3


In [8]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Assuming the 'transformers' and 'torch' libraries are installed
# Assuming data is loaded here, example:
# data = pd.read_csv('your_dataset.csv')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to safely evaluate strings to lists
def safe_eval(x):
    if isinstance(x, str):
        try:
            return eval(x)
        except:
            return []  # or some other default value like ['error'] if eval fails
    return x

# Apply safe_eval to the 'review_tokenised' column
data.loc[:, 'review_tokenised'] = data['review_tokenised'].apply(safe_eval)

def tokenize_function(examples):
    text = [' '.join(tokens) if isinstance(tokens, list) else "" for tokens in examples['review_tokenised']]
    return tokenizer(text, padding='max_length', truncation=True, max_length=512)

# Tokenization
tokenized_inputs = tokenize_function(data)

# Dataset class
class ReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

# Split data
train_data, test_data = train_test_split(data, test_size=0.2)

# Prepare datasets
train_dataset = ReviewDataset(tokenize_function(train_data), train_data['score'].tolist())
test_dataset = ReviewDataset(tokenize_function(test_data), test_data['score'].tolist())

# Load BERT for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train and evaluate
trainer.train()
evaluation_results = trainer.evaluate()
print(evaluation_results)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
10,0.6404
20,0.5924
30,0.4042
40,0.2644
50,0.1286
60,0.0512
70,0.0298
80,0.0271
90,0.0251
100,0.0202


{'eval_loss': 0.00013637289521284401, 'eval_runtime': 1.926, 'eval_samples_per_second': 103.842, 'eval_steps_per_second': 6.75, 'epoch': 3.0}


In [9]:


model_path = os.path.join(project_path, "bert")
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('/content/drive/MyDrive/Project/bert/tokenizer_config.json',
 '/content/drive/MyDrive/Project/bert/special_tokens_map.json',
 '/content/drive/MyDrive/Project/bert/vocab.txt',
 '/content/drive/MyDrive/Project/bert/added_tokens.json')