In [47]:
import pandas as pd
import numpy as np
import torch

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(42)

<torch._C.Generator at 0x7ecd75122e50>

In [50]:
from datasets import load_dataset, Audio, DatasetDict, Dataset

In [142]:
# Dataset import
#############
films_ds_personal = pd.read_csv('./films.csv')
#############

ds = load_dataset("jquigl/imdb-genres", split="train")

In [143]:
ds

Dataset({
    features: ['movie title - year', 'genre', 'expanded-genres', 'rating', 'description'],
    num_rows: 238256
})

In [145]:
ds = ds.rename_columns({'movie title - year': 'title'})

In [165]:
ds = ds.remove_columns(['genre', 'expanded-genres', 'rating'])

In [None]:
# <pad>The Matrix is a movie about a man who is a sex addict -> google/flan-t5-small
# <pad>The Matrix is a movie about a man who is a savage -> google/flan-t5-base
# <pad>The Matrix is a movie about a man who is sent to a planet called -> google/flan-t5-large

In [85]:
# First model for filling dataset
############################
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")
###########################

In [76]:
# config = T5Config(
#     vocab_size=32128, # Dictionary size (remains the same)
#     d_model=384, # hidden layers (main size of the model)
#     d_kv=32, # key/value in attention memory, resize by two
#     d_ff=1024, # feed forward, was 2048
#     num_layers=6, # was 6, encoders, the depth of understanding (input)
#     num_decoder_layers=6, # was 6, decoders, the depth of reasoning (output)
#     num_heads=6, # the size of heads
#     ################### tokenizer
#     decoder_start_token_id=0,
#     eos_token_id=1,
#     pad_token_id=0,
#     bos_token_id=0,
#     ####################
# )

In [86]:
model.eval()
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [87]:
# model = T5ForConditionalGeneration(config)
# model.to(device)

In [88]:
model_new_size = sum(p.numel() for p in model.parameters())
model_new_size

76961152

In [129]:
input_text = "What can you tell me about Matrix movie"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

<pad>The Matrix is a horror film directed by Roberto Alcott and starring James Bond


In [90]:
from torch.optim import AdamW

In [91]:
optimizer = AdamW(model.parameters(), lr=5e-5)

In [92]:
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)

In [93]:
model.train()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [102]:
from tqdm import tqdm

In [167]:
def prepare_batch(batch):

  inputs = [t for t in tqdm(batch['title'])]
  targets = [d for d in tqdm(batch['description'])]

  input_encodings = tokenizer(inputs, padding=True, truncation=True,
                               return_tensors="pt", max_length=512)
  target_encodings = tokenizer(targets, padding=True, truncation=True,
                                return_tensors="pt", max_length=512)

  target_encodings['input_ids'][target_encodings['input_ids'] == tokenizer.pad_token_id] = -100

  return input_encodings, target_encodings


In [168]:
input, target = prepare_batch(ds)

100%|██████████| 238256/238256 [00:00<00:00, 4532507.07it/s]
100%|██████████| 238256/238256 [00:00<00:00, 3737612.93it/s]


In [169]:
print(input[:1])
print(target[:1])

{'input_ids': tensor([[7036,   51,   53,  262,  291,    7,    3,   18, 9047,    1,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])}
{'input_ids': tensor([[ 7036,    51,    53,   262,   291,     7,    19,     3,     9,  2783,
         17201,    18,    89,    23,   110, 12032,

In [150]:
def train_step(inputs, targets):
    ## forward pass
    outputs = model(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        labels=targets  # уже с -100 для padding
    )

    loss = outputs.loss()

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

In [151]:
from torch.utils.data import Dataset, DataLoader

In [154]:
# class MovieDataset(Dataset):
#     def __init__(self, dataframe):
#         self.data = dataframe

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         return self.data.iloc[idx]


In [155]:
# dataset = MovieDataset(ds)

In [170]:
data_loader = DataLoader(ds, batch_size=4, shuffle=True)

In [171]:
data_loader

<torch.utils.data.dataloader.DataLoader at 0x7ecba519d110>

In [127]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [131]:
model.train()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [133]:
def train_model(inputs, targets, data_loader):
  for epoch in tqdm(range(10)):
      total_loss = 0
      step_count = 0

      for inputs, targets in data_loader:
          # Forward pass
          outputs = model(
              input_ids=inputs['input_ids'],
              attention_mask=inputs['attention_mask'],
              labels=targets
          )

          loss = outputs.loss

          # Backward pass
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()

          total_loss += loss.item()
          step_count += 1

          if step_count % 10 == 0:
              print(f"Epoch {epoch+1}, Step {step_count}, Loss: {loss.item():.4f}")

      avg_loss = total_loss / step_count
      print(f"Epoch {epoch+1} DONE! Average Loss: {avg_loss:.4f}")

In [172]:
train_model(inputs=input, targets=target, data_loader=data_loader)

  0%|          | 0/10 [00:00<?, ?it/s]


TypeError: string indices must be integers, not 'str'