In [1]:
%pip install transformers==4.2.2

Collecting transformers==4.2.2
  Downloading transformers-4.2.2-py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 2.8 MB/s eta 0:00:01
Collecting numpy
  Downloading numpy-1.24.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[K     |████████████████████████████████| 17.3 MB 10.5 MB/s eta 0:00:01    |▎                               | 153 kB 11.3 MB/s eta 0:00:02
[?25hCollecting tokenizers==0.9.4
  Downloading tokenizers-0.9.4-cp38-cp38-manylinux2010_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 11.3 MB/s eta 0:00:01
[?25hCollecting filelock
  Downloading filelock-3.9.0-py3-none-any.whl (9.7 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 11.5 MB/s eta 0:00:01
[?25hCollecting regex!=2019.12.17
  Downloading regex-2022.10.31-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (772 kB)
[K     |████████████████████████████████| 772 kB 1

In [3]:
!nvidia-smi

Tue Jan 17 13:38:02 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:0A:00.0 N/A |                  N/A |
| N/A   47C    P0    N/A /  N/A |    698MiB /  1994MiB |     N/A      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Load the Kaggle recipe dataset (recipes.json) and build the train/test text files

In [3]:
%pip install scikit-learn==0.23.2

Collecting scikit-learn==0.23.2
  Downloading scikit_learn-0.23.2-cp38-cp38-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 4.4 MB/s eta 0:00:01
[?25hCollecting scipy>=0.19.1
  Downloading scipy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[K     |████████████████████████████████| 34.5 MB 679 kB/s  eta 0:00:01     |█████████▌                      | 10.3 MB 11.3 MB/s eta 0:00:03     |████████████████                | 17.4 MB 11.2 MB/s eta 0:00:02
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: scipy, threadpoolctl, scikit-learn
Successfully installed scikit-learn-0.23.2 scipy-1.10.0 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
import re
import json
from sklearn.model_selection import train_test_split


# Load the raw recipe records. The encoding is given explicitly so that
# non-ASCII characters are decoded the same way on every platform instead of
# depending on the locale's default encoding.
with open('recipes.json', encoding='utf-8') as f:
    data = json.load(f)

def build_text_files(data_json, dest_path):
    """Flatten the instruction texts of all records into one plain-text file.

    For every record the ``'Instructions'`` value is converted to ``str``,
    stripped of leading/trailing whitespace, and every remaining whitespace
    character (tabs, newlines, ...) is replaced by a single space. The cleaned
    texts are written to ``dest_path`` back to back, each one followed by two
    spaces.

    Args:
        data_json: Iterable of mappings that each provide an ``'Instructions'``
            key.
        dest_path: Path of the text file to create (an existing file is
            overwritten).

    Raises:
        KeyError: If a record has no ``'Instructions'`` key.
    """
    cleaned_texts = (
        re.sub(r"\s", " ", str(record['Instructions']).strip())
        for record in data_json
    )
    # The context manager guarantees the file is flushed and closed even if a
    # record is malformed; a single join avoids repeated string concatenation.
    # UTF-8 is written explicitly so the result does not depend on the locale.
    with open(dest_path, 'w', encoding='utf-8') as out_file:
        out_file.write("".join(text + "  " for text in cleaned_texts))

# Hold out 15 % of the records for evaluation. A fixed random_state makes the
# split, and therefore the two text files derived from it, reproducible when
# the notebook is re-run from a fresh kernel.
train, test = train_test_split(data, test_size=0.15, random_state=42)

# Write one plain-text corpus per split.
build_text_files(train,'train_dataset.txt')
build_text_files(test,'test_dataset.txt')

print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))

Train dataset length: 10361
Test dataset length: 1829


# Tokenizer (German)

In [2]:
from transformers import AutoTokenizer

# Locations of the plain-text corpora produced by build_text_files.
train_path, test_path = 'train_dataset.txt', 'test_dataset.txt'

# Tokenizer that belongs to the pretrained "dbmdz/german-gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/german-gpt2")


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [8]:
%pip install torch

Collecting torch
  Downloading torch-1.13.1-cp38-cp38-manylinux1_x86_64.whl (887.4 MB)
[K     |████████████████████████████████| 887.4 MB 5.8 kB/s  eta 0:00:01   |▍                               | 10.0 MB 4.1 MB/s eta 0:03:33     |████▌                           | 123.9 MB 12.3 MB/s eta 0:01:03     |███████▊                        | 212.9 MB 10.6 MB/s eta 0:01:04     |████████▌                       | 236.1 MB 12.1 MB/s eta 0:00:54     |██████████▎                     | 286.5 MB 10.5 MB/s eta 0:00:58     |███████████▎                    | 313.2 MB 12.2 MB/s eta 0:00:48     |███████████▌                    | 318.5 MB 12.0 MB/s eta 0:00:48     |██████████████                  | 391.2 MB 11.6 MB/s eta 0:00:43     |██████████████▋                 | 404.4 MB 10.8 MB/s eta 0:00:45     |█████████████████████           | 583.7 MB 12.0 MB/s eta 0:00:26     |███████████████████████▌        | 650.8 MB 11.0 MB/s eta 0:00:22
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99; platform_system == "L

In [3]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path,test_path,tokenizer,block_size=128):
    """Create the train/test datasets and the collator used for fine-tuning.

    Args:
        train_path: Path of the plain-text training corpus.
        test_path: Path of the plain-text evaluation corpus.
        tokenizer: Tokenizer handed to both ``TextDataset`` instances and to
            the data collator.
        block_size: ``block_size`` passed to both ``TextDataset`` instances.
            Defaults to 128, the value previously hard-coded.

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple.
    """
    def _text_dataset(file_path):
        # Both splits must be built with identical settings.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size,
        )

    train_dataset = _text_dataset(train_path)
    test_dataset = _text_dataset(test_path)

    # mlm=False turns off masked-language-model masking in the collator.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

# Build both datasets and the collator from the two text files.
train_dataset, test_dataset, data_collator = load_dataset(
    train_path=train_path,
    test_path=test_path,
    tokenizer=tokenizer,
)



# Initialize Trainer with TrainingArguments and GPT-2 model

In [5]:
# AutoModelWithLMHead stays in the import list in case another cell still
# refers to it; this cell uses its non-deprecated causal-LM replacement.
from transformers import AutoModelForCausalLM, AutoModelWithLMHead, Trainer, TrainingArguments

# Pretrained German GPT-2 checkpoint (same identifier as the tokenizer).
model = AutoModelForCausalLM.from_pretrained("dbmdz/german-gpt2")


# NOTE(review): the recorded run of this notebook ended in a CUDA out-of-memory
# error on a 1.95 GiB GPU. On small GPUs, lower the per-device batch sizes
# (optionally combined with gradient_accumulation_steps) before training.
training_args = TrainingArguments(
    output_dir="./gpt2-gerchef",  # directory that receives checkpoints
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=3,  # number of training epochs
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    # Without an explicit step-based strategy the default is "no", eval_steps
    # is ignored and eval_dataset is never evaluated during training.
    evaluation_strategy="steps",
    eval_steps=400,  # number of update steps between two evaluations
    save_steps=800,  # a checkpoint is saved every save_steps update steps
    warmup_steps=500,  # number of warmup steps for the learning rate scheduler
    prediction_loss_only=True,
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Downloading: 100%|██████████| 510M/510M [00:44<00:00, 11.6MB/s]
    Found GPU0 NVIDIA GeForce GT 730 which is of cuda capability 3.5.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is 3.7.
    


In [6]:
# Start fine-tuning with the Trainer configured above.
# NOTE(review): the recorded execution of this cell aborted with
# "CUDA out of memory" on a 1.95 GiB GPU — confirm the target hardware has
# enough memory (or reduce the batch sizes) before re-running.
trainer.train()



OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 1.95 GiB total capacity; 1.06 GiB already allocated; 31.31 MiB free; 1.14 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF