In [3]:
model_name = "distilgpt2"
device     = "gpu"

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

model

  from .autonotebook import tqdm as notebook_tqdm


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [5]:
import pandas as pd

df = pd.read_csv("./inputs/wiki_movie_plots_deduped.csv")
df

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...,...
34881,2014,The Water Diviner,Turkish,Director: Russell Crowe,Director: Russell Crowe\r\nCast: Russell Crowe...,unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


In [6]:
df = df[:1000]
df

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...,...
995,1930,Playing Around,American,Mervyn LeRoy,"Alice White, Chester Morris",drama,https://en.wikipedia.org/wiki/Playing_Around,Alice White plays the part of a working class ...
996,1930,Raffles,American,George Fitzmaurice,"Ronald Colman, Kay Francis",mystery,https://en.wikipedia.org/wiki/Raffles_(1930_film),Gentleman jewel thief A.J. Raffles (Ronald Col...
997,1930,Reaching for the Moon,American,Edmund Goulding,"Douglas Fairbanks, Edward Everett Horton, Bing...",musical,https://en.wikipedia.org/wiki/Reaching_for_the...,"Wall Street wizard, Larry Day, new to the ways..."
998,1930,Recaptured Love,American,John G. Adolfi,"Belle Bennett, Dorothy Burgess",musical,https://en.wikipedia.org/wiki/Recaptured_Love,"In this drama, a 50-year-old married man (play..."


In [7]:
from datasets import Dataset 

def make_dataset(df):
    def map_dataset(batch):
        return tokenizer(
            batch["Plot"],
            truncation=True,
            max_length=128,
            return_overflowing_tokens=True
        )

    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(
        map_dataset,
        batched=True,
        batch_size=8,
        remove_columns=list(df.columns)
    )
    dataset = dataset.remove_columns(["overflow_to_sample_mapping"])
    dataset = dataset.train_test_split(test_size=0.2)
    return dataset

In [8]:
dataset = make_dataset(df)
dataset

Map: 100%|██████████| 1000/1000 [00:00<00:00, 2248.66 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2188
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 548
    })
})

In [9]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [10]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./output/model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10
)

trainer = Trainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    args=training_args,
    data_collator=data_collator
)

trainer.train()

  0%|          | 0/2740 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                  
 10%|█         | 275/2740 [00:48<38:50,  1.06it/s]

{'eval_loss': 3.7915360927581787, 'eval_runtime': 3.4728, 'eval_samples_per_second': 157.798, 'eval_steps_per_second': 19.869, 'epoch': 1.0}


 18%|█▊        | 500/2740 [01:25<06:12,  6.01it/s]

{'loss': 3.936, 'learning_rate': 1.635036496350365e-05, 'epoch': 1.82}


                                                  
 20%|██        | 549/2740 [01:37<34:51,  1.05it/s]

{'eval_loss': 3.7515931129455566, 'eval_runtime': 3.4981, 'eval_samples_per_second': 156.658, 'eval_steps_per_second': 19.725, 'epoch': 2.0}


                                                  
 30%|███       | 823/2740 [02:27<30:41,  1.04it/s]

{'eval_loss': 3.7390315532684326, 'eval_runtime': 3.526, 'eval_samples_per_second': 155.417, 'eval_steps_per_second': 19.569, 'epoch': 3.0}


 36%|███▋      | 1000/2740 [02:56<04:53,  5.94it/s]

{'loss': 3.7009, 'learning_rate': 1.27007299270073e-05, 'epoch': 3.65}


                                                   
 40%|████      | 1097/2740 [03:17<25:46,  1.06it/s]

{'eval_loss': 3.732489585876465, 'eval_runtime': 3.4644, 'eval_samples_per_second': 158.18, 'eval_steps_per_second': 19.917, 'epoch': 4.0}


                                                   
 50%|█████     | 1371/2740 [04:04<21:24,  1.07it/s]

{'eval_loss': 3.7292916774749756, 'eval_runtime': 3.4537, 'eval_samples_per_second': 158.671, 'eval_steps_per_second': 19.979, 'epoch': 5.0}


 55%|█████▍    | 1500/2740 [04:25<03:19,  6.22it/s]

{'loss': 3.6154, 'learning_rate': 9.05109489051095e-06, 'epoch': 5.47}


                                                   
 60%|██████    | 1645/2740 [04:54<17:08,  1.06it/s]

{'eval_loss': 3.728799819946289, 'eval_runtime': 3.4571, 'eval_samples_per_second': 158.512, 'eval_steps_per_second': 19.959, 'epoch': 6.0}


                                                   
 70%|███████   | 1919/2740 [05:43<13:05,  1.05it/s]

{'eval_loss': 3.7283449172973633, 'eval_runtime': 3.5122, 'eval_samples_per_second': 156.028, 'eval_steps_per_second': 19.646, 'epoch': 7.0}


 73%|███████▎  | 2000/2740 [05:57<01:58,  6.24it/s]

{'loss': 3.5495, 'learning_rate': 5.401459854014599e-06, 'epoch': 7.3}


                                                   
 80%|████████  | 2193/2740 [06:33<08:52,  1.03it/s]

{'eval_loss': 3.729825258255005, 'eval_runtime': 3.5736, 'eval_samples_per_second': 153.346, 'eval_steps_per_second': 19.308, 'epoch': 8.0}


                                                   
 90%|█████████ | 2467/2740 [07:22<04:24,  1.03it/s]

{'eval_loss': 3.7310678958892822, 'eval_runtime': 3.5605, 'eval_samples_per_second': 153.913, 'eval_steps_per_second': 19.38, 'epoch': 9.0}


 91%|█████████ | 2500/2740 [07:28<00:39,  6.01it/s]

{'loss': 3.5147, 'learning_rate': 1.7518248175182485e-06, 'epoch': 9.12}


                                                   
100%|██████████| 2740/2740 [08:13<00:00,  5.56it/s]

{'eval_loss': 3.7323896884918213, 'eval_runtime': 3.588, 'eval_samples_per_second': 152.73, 'eval_steps_per_second': 19.231, 'epoch': 10.0}
{'train_runtime': 493.0763, 'train_samples_per_second': 44.374, 'train_steps_per_second': 5.557, 'train_loss': 3.647957817133326, 'epoch': 10.0}





TrainOutput(global_step=2740, training_loss=3.647957817133326, metrics={'train_runtime': 493.0763, 'train_samples_per_second': 44.374, 'train_steps_per_second': 5.557, 'train_loss': 3.647957817133326, 'epoch': 10.0})

In [24]:
from transformers import AutoTokenizer,AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("output/model/checkpoint-2500")

prompt = "A Green Alien lands "
inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

outputs = model.generate(
    inputs.input_ids,
    max_new_tokens=100,
    do_sample=True,
    top_k=100,
    top_p=0.95
    )

outputs = tokenizer.batch_decode(outputs)
outputs

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['A Green Alien lands iced up in the attic of the Hollywood Studios, just outside of Greenwich Village, causing his own personal mischief. He leaves the studio after the show in a hurry. Upon arriving at the studio by telephone, he realizes the studio is locked in a huge revolving door. He sneaks out and shows his name, but his name has been stolen and a poster stolen from his office appears on the wall outside. He then discovers that a hidden vase in the attic contains a key and that he had hidden']