In [1]:
# !tar xvzf /content/aclImdb_v1.tar.gz

In [5]:
import os
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import mean_squared_error

In [6]:
def load_data_from_dir(data_dir):
    texts = []
    ratings = []

    for sentiment in ['pos', 'neg']:
        sentiment_path = os.path.join(data_dir, sentiment)

        for filename in os.listdir(sentiment_path):
            if filename.endswith(".txt"):
                rating = int(filename.split('_')[1].split('.')[0])
                file_path = os.path.join(sentiment_path, filename)

                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()

                texts.append(text)
                ratings.append(rating)

    return texts, ratings

In [7]:
train_dir = '/content/aclImdb/train'
test_dir = '/content/aclImdb/test'

X_train, y_train = load_data_from_dir(train_dir)
X_test, y_test = load_data_from_dir(test_dir)

In [8]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_data(texts, tokenizer, max_len=128):
    return tokenizer(
        texts,
        max_length=max_len,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

train_encodings = tokenize_data(X_train, tokenizer)
test_encodings = tokenize_data(X_test, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



In [9]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, y_train)
test_dataset = SentimentDataset(test_encodings, y_test)

In [10]:
class SentimentRegressor(nn.Module):
    def __init__(self, pretrained_model, dropout=0.3):
        super(SentimentRegressor, self).__init__()
        self.bert = pretrained_model
        self.dropout = nn.Dropout(dropout)
        self.regressor = nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        return self.regressor(pooled_output)

pretrained_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model = SentimentRegressor(pretrained_model=pretrained_model)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = nn.MSELoss()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

model.train()
for epoch in range(3):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].unsqueeze(1).to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)
        
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Эпоха {epoch+1}, Потери: {total_loss / len(train_loader)}")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]



Эпоха 1, Потери: 5.773727413827002
Эпоха 2, Потери: 3.6081599968217994
Эпоха 3, Потери: 2.693708076205531


In [11]:
model.eval()
predictions = []
actuals = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].unsqueeze(1).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions.extend(outputs.squeeze(1).tolist())
        actuals.extend(labels.squeeze(1).tolist())

mse = mean_squared_error(actuals, predictions)
print(f"Среднеквадратическая ошибка (MSE): {mse}")

Среднеквадратическая ошибка (MSE): 4.2157452065263294


In [12]:
from sklearn.metrics import mean_squared_error, accuracy_score

model.eval()
predictions = []
actuals = []
binary_predictions = []
binary_actuals = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].unsqueeze(1).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        predictions.extend(outputs.squeeze(1).tolist())
        actuals.extend(labels.squeeze(1).tolist())

        binary_predictions.extend([1 if pred >= 7 else 0 for pred in outputs.squeeze(1).tolist()])
        binary_actuals.extend([1 if label >= 7 else 0 for label in labels.squeeze(1).tolist()])

mse = mean_squared_error(actuals, predictions)
print(f"Среднеквадратическая ошибка (MSE): {mse}")

accuracy = accuracy_score(binary_actuals, binary_predictions)
print(f"Точность классификации (Accuracy): {accuracy * 100:.2f}%")


Среднеквадратическая ошибка (MSE): 4.2157452065263294
Точность классификации (Accuracy): 85.81%


In [35]:
from transformers import DistilBertTokenizer
import torch

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

text = "It is disappointing to see as talented an actor as Amitabh Bachchan in such a weak role, especially when he was beyond sensational in BLACK (which I highly recommend). One line in the film states: "Sakar is not a mere man, he is a thought and a philosophy." Director Ram Gopal Varma credits THE GODFATHER as an inspiration for this movie, and perhaps that is the problem. It seems like a badly mangled American movie set in India. The Left Elbow Index considers seven elements of film-making--acting, continuity, plot, character development, dialogue, artistry, and production sets--on a scale from a high of 10 to a low of 1, with 5 given as a average score. The film continuity seems high, an 8, by maintaining a violent tone infused with drama in places, and using justice outside the legal system as motivation. However, there seems to be a lack of emotion connected with the evil of organized crime. The acting rates a 4, it appears too weak, even when someone is being beaten or murdered, it seems hoohum. For example, when one character is shot in the forehead, I found myself wondering if, or when, he was going to fall. He does not, and ala Ronald Reagan he is placed in an automobile, with his bleeding face cradled ala John F. Kennedy. The plot rates a 5 as an example of American-style gangsterism, with a family oriented Robinhood at its head. Character development appears static, and the characters seem like chess pieces on an abandoned chess board, thereby earning a rank of 3. The dialogue seems stilted, and appears to be forced to fit some Bowery pattern of speech--a 4 for dialogue. Production sets look to be below average--a 4. And, artistry is puzzling, with far too many close-ups, too rapid panning, and too many group scenes where the actors seem over rehearsed--a 3. To me, too much camera movement is disruptive. The average of the Left Elbow Index is 4.4, and with a slight deduction based on poor derivatism it moves down to a 4. Two questions continually arise in the film: one, why are so many people eating so often: and, two, does not India have its own brand of organized crime? Do films like this have to be so dependent on Western cultural examples? As much as I like Amitabh Bachchan, I cannot recommend this film."
text = text.replace('"', '')
text = text.replace("'", '')


inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

inputs = {key: value.to(device) for key, value in inputs.items()}






SyntaxError: invalid syntax (<ipython-input-35-81b125176c2e>, line 10)

In [31]:
model.eval()
with torch.no_grad():
    outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

    predicted_rating = outputs.item()
    print(f"Предсказанный рейтинг: {predicted_rating:.2f}")


Предсказанный рейтинг: 1.69


In [37]:
torch.save(model.state_dict(), "/content/model/sentiment_regressor.pth")

tokenizer.save_pretrained("/content/model")


('/content/model/tokenizer_config.json',
 '/content/model/special_tokens_map.json',
 '/content/model/vocab.txt',
 '/content/model/added_tokens.json')

In [38]:
!zip -r model.zip /content/model


  adding: content/model/ (stored 0%)
  adding: content/model/tokenizer_config.json (deflated 75%)
  adding: content/model/vocab.txt (deflated 53%)
  adding: content/model/sentiment_regressor.pth (deflated 8%)
  adding: content/model/special_tokens_map.json (deflated 42%)


In [39]:
from google.colab import files
files.download("model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>