In [None]:
!tar xvzf /content/aclImdb_v1.tar.gz

In [2]:
import os

import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, mean_squared_error
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertModel, AdamW
from transformers import get_linear_schedule_with_warmup

In [3]:
def load_data_from_dir(data_dir):
    """Read the labelled reviews stored under ``data_dir``.

    ``data_dir`` must contain a ``pos`` and a ``neg`` sub-folder. Every
    ``.txt`` file inside them contributes one sample: the file content is the
    review text and the rating is the integer found between the first ``_``
    and the following ``.`` of the file name (e.g. ``123_8.txt`` -> ``8``).
    Files with any other extension are ignored.

    Files are visited in sorted name order (``pos`` first, then ``neg``), so
    the returned lists are identical on every run; ``os.listdir`` alone gives
    no ordering guarantee.

    Args:
        data_dir: Path of the directory holding the ``pos``/``neg`` folders.

    Returns:
        A ``(texts, ratings)`` tuple of two equally long lists, where
        ``ratings[i]`` is the integer rating belonging to ``texts[i]``.

    Raises:
        OSError: If one of the sub-folders or files cannot be read.
        IndexError, ValueError: If a ``.txt`` file name does not follow the
            ``<id>_<rating>.txt`` pattern.
    """
    texts = []
    ratings = []

    for sentiment in ('pos', 'neg'):
        sentiment_path = os.path.join(data_dir, sentiment)

        # Sort to make the sample order deterministic across runs/platforms.
        for filename in sorted(os.listdir(sentiment_path)):
            if not filename.endswith(".txt"):
                continue

            rating = int(filename.split('_')[1].split('.')[0])
            file_path = os.path.join(sentiment_path, filename)

            with open(file_path, 'r', encoding='utf-8') as file:
                texts.append(file.read())
            ratings.append(rating)

    return texts, ratings

In [4]:
# Root folders of the extracted train / test splits.
train_dir, test_dir = '/content/aclImdb/train', '/content/aclImdb/test'

# Each call yields the review texts together with their integer ratings.
X_train, y_train = load_data_from_dir(train_dir)
X_test, y_test = load_data_from_dir(test_dir)

In [5]:
# Pretrained tokenizer matching the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_data(texts, tokenizer, max_len=128):
    """Run ``tokenizer`` over ``texts`` and return its output unchanged.

    The tokenizer is asked to truncate every text to at most ``max_len``
    tokens, to pad the batch, and to return PyTorch tensors.

    Args:
        texts: The texts to encode; passed to the tokenizer as-is.
        tokenizer: Callable tokenizer accepting the keyword options below.
        max_len: Upper bound on the sequence length after truncation.

    Returns:
        Whatever ``tokenizer`` returns for the given options.
    """
    options = {
        "max_length": max_len,
        "padding": True,
        "truncation": True,
        "return_tensors": "pt",
    }
    encoded = tokenizer(texts, **options)
    return encoded

# Encode both splits up front with the default max_len of tokenize_data;
# the results are requested as PyTorch tensors (return_tensors="pt").
train_encodings = tokenize_data(X_train, tokenizer)
test_encodings = tokenize_data(X_test, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



In [6]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with numeric labels.

    ``encodings`` is a mapping from field name to an indexable container
    (one entry per sample); ``labels`` is an indexable sequence of the same
    length holding the target value of each sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of its encoding fields plus 'labels'.

        The label is converted to a float tensor so it can be used directly
        as a regression target.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Wrap the encoded splits and their ratings so a DataLoader can batch them.
train_dataset = SentimentDataset(train_encodings, y_train)
test_dataset = SentimentDataset(test_encodings, y_test)

In [7]:
class SentimentRegressor(nn.Module):
    """Scalar regression head on top of a pretrained transformer encoder.

    The hidden state of the first token of the encoder output is passed
    through dropout, a 512-unit ReLU layer and a final linear layer that
    produces one value per sample.

    Args:
        pretrained_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            exposing ``last_hidden_state``. If it carries a
            ``config.hidden_size`` attribute, that value sizes the first
            linear layer; otherwise 768 is assumed.
        dropout: Dropout probability applied to the pooled representation.
    """

    def __init__(self, pretrained_model, dropout=0.2):
        super().__init__()
        self.bert = pretrained_model
        self.dropout = nn.Dropout(dropout)

        # Take the encoder width from its config instead of hard-coding it,
        # so encoders of other sizes work too; 768 stays the fallback.
        encoder_config = getattr(pretrained_model, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', 768)

        self.fc1 = nn.Linear(hidden_size, 512)
        self.fc2 = nn.Linear(512, 1)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        """Return one prediction per sample with a trailing dimension of 1."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the pooled summary.
        pooled_output = outputs.last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)

        x = self.fc1(pooled_output)
        x = self.relu(x)

        x = self.fc2(x)

        return x


# Training hyper-parameters, named once so the scheduler length and the epoch
# loop cannot drift apart.
NUM_EPOCHS = 4
BATCH_SIZE = 16
LEARNING_RATE = 3e-5
SEED = 42

# Seed torch before the regression head is initialised and before the
# shuffling DataLoader draws its permutation, so runs are repeatable as far as
# the backend allows.
torch.manual_seed(SEED)

pretrained_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model = SentimentRegressor(pretrained_model=pretrained_model)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are set explicitly to the values the transformers
# optimizer used by default, because the torch defaults differ.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)
loss_fn = nn.MSELoss()

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Linear decay of the learning rate over every optimizer step, no warm-up.
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

model.train()
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels arrive as shape (batch,); add a trailing axis so they line up
        # with the model output for the MSE loss.
        labels = batch['labels'].unsqueeze(1).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()
        # The schedule is defined per optimizer step, not per epoch.
        scheduler.step()

        total_loss += loss.item()

    # Report the mean batch loss of the epoch.
    print(f"Эпоха {epoch+1}, Потери: {total_loss / len(train_loader)}")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]



Эпоха 1, Потери: 5.675733844477324
Эпоха 2, Потери: 2.974773317785196
Эпоха 3, Потери: 1.8234710773633065
Эпоха 4, Потери: 1.312247976956273


In [8]:
# Evaluation mode: disables dropout so the predictions are deterministic.
model.eval()
predictions, actuals = [], []

# No gradients are needed for inference.
with torch.no_grad():
    for batch in test_loader:
        labels = batch['labels'].unsqueeze(1).to(device)
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )

        # Drop the trailing axis and collect plain Python floats.
        predictions += outputs.squeeze(1).tolist()
        actuals += labels.squeeze(1).tolist()

mse = mean_squared_error(actuals, predictions)
print(f"Среднеквадратическая ошибка (MSE): {mse}")

Среднеквадратическая ошибка (MSE): 4.129538862247403


In [9]:
# Ratings at or above this value are treated as the positive class.
POSITIVE_RATING_THRESHOLD = 7

# The per-review `predictions` and `actuals` were already collected by the
# evaluation cell above; reuse them instead of running inference over the whole
# test set a second time.
# NOTE(review): the same cut-off is applied to the continuous predictions, so
# a prediction just below 7 counts as negative — confirm this is the intended
# decision boundary.
binary_predictions = [
    1 if pred >= POSITIVE_RATING_THRESHOLD else 0 for pred in predictions
]
binary_actuals = [
    1 if label >= POSITIVE_RATING_THRESHOLD else 0 for label in actuals
]

mse = mean_squared_error(actuals, predictions)
print(f"Среднеквадратическая ошибка (MSE): {mse}")

accuracy = accuracy_score(binary_actuals, binary_predictions)
print(f"Точность классификации (Accuracy): {accuracy * 100:.2f}%")

Среднеквадратическая ошибка (MSE): 4.129538862247403
Точность классификации (Accuracy): 86.89%


In [10]:
# torch.save(model.state_dict(), "/content/model/sentiment_regressor.pth")

# tokenizer.save_pretrained("/content/model")

In [11]:
# !zip -r model.zip /content/model

In [12]:
# from google.colab import files
# files.download("model.zip")