In [25]:
model_name = "distilgpt2"
device     = "gpu"

In [26]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [27]:
import pandas as pd

df = pd.read_csv("./inputs/wiki_movie_plots_deduped.csv")
df

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...,...
34881,2014,The Water Diviner,Turkish,Director: Russell Crowe,Director: Russell Crowe\r\nCast: Russell Crowe...,unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


In [28]:
df = df[:10000]
df

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...,...
9995,1982,Night Shift,American,Ron Howard,"Henry Winkler, Michael Keaton, Shelley Long",comedy,https://en.wikipedia.org/wiki/Night_Shift_(film),"Chuck, formerly a successful stockbroker, has ..."
9996,1982,An Officer and a Gentleman,American,Taylor Hackford,"Richard Gere, Louis Gossett, Jr., Debra Winger...",drama,https://en.wikipedia.org/wiki/An_Officer_and_a...,"Zachary ""Zack"" Mayo is preparing to report to ..."
9997,1982,One from the Heart,American,Francis Ford Coppola,"Teri Garr, Frederic Forrest, Nastassja Kinski,...","drama, musical",https://en.wikipedia.org/wiki/One_from_the_Heart,"The evening of July 4, in Las Vegas, Hank, a m..."
9998,1982,Pandemonium,American,Al Sole,"Tom Smothers, Paul Reubens, Carol Kane",comedy,https://en.wikipedia.org/wiki/Pandemonium_(film),"In the fictional town of It Had To Be, Indiana..."


In [29]:
from datasets import Dataset 

def make_dataset(df):
    def map_dataset(batch):
        return tokenizer(
            batch["Plot"],
            truncation=True,
            max_length=128,
            return_overflowing_tokens=True
        )

    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(
        map_dataset,
        batched=True,
        batch_size=8,
        remove_columns=list(df.columns)
    )
    dataset = dataset.remove_columns(["overflow_to_sample_mapping"])
    dataset = dataset.train_test_split(test_size=0.2)
    return dataset

In [30]:
dataset = make_dataset(df)
dataset

Map: 100%|██████████| 10000/10000 [00:06<00:00, 1501.45 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 30754
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 7689
    })
})

In [31]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [32]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./output/model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10
)

trainer = Trainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    args=training_args,
    data_collator=data_collator
)

trainer.train()

  0%|          | 0/38450 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  1%|▏         | 500/38450 [01:21<1:44:18,  6.06it/s]Checkpoint destination directory ./output/model\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 4.0865, 'learning_rate': 1.973992197659298e-05, 'epoch': 0.13}


  3%|▎         | 1000/38450 [02:45<1:44:11,  5.99it/s]Checkpoint destination directory ./output/model\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 3.9813, 'learning_rate': 1.9479843953185956e-05, 'epoch': 0.26}


  4%|▍         | 1500/38450 [04:10<1:43:15,  5.96it/s]Checkpoint destination directory ./output/model\checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 3.9554, 'learning_rate': 1.9219765929778935e-05, 'epoch': 0.39}


  5%|▌         | 2000/38450 [05:35<1:41:20,  5.99it/s]Checkpoint destination directory ./output/model\checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 3.9251, 'learning_rate': 1.8959687906371913e-05, 'epoch': 0.52}


  7%|▋         | 2500/38450 [06:59<1:41:26,  5.91it/s]Checkpoint destination directory ./output/model\checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 3.9111, 'learning_rate': 1.8699609882964892e-05, 'epoch': 0.65}


  8%|▊         | 3000/38450 [08:22<1:36:34,  6.12it/s]

{'loss': 3.8927, 'learning_rate': 1.8439531859557868e-05, 'epoch': 0.78}


  9%|▉         | 3500/38450 [09:45<1:34:40,  6.15it/s]

{'loss': 3.8851, 'learning_rate': 1.8179453836150846e-05, 'epoch': 0.91}


                                                      
 10%|█         | 3846/38450 [11:34<111:48:13, 11.63s/it]

{'eval_loss': 3.7517614364624023, 'eval_runtime': 49.761, 'eval_samples_per_second': 154.519, 'eval_steps_per_second': 19.332, 'epoch': 1.0}


 10%|█         | 4000/38450 [11:59<1:33:32,  6.14it/s]  

{'loss': 3.8513, 'learning_rate': 1.7919375812743825e-05, 'epoch': 1.04}


 12%|█▏        | 4500/38450 [13:22<1:35:44,  5.91it/s]

{'loss': 3.8044, 'learning_rate': 1.76592977893368e-05, 'epoch': 1.17}


 13%|█▎        | 5000/38450 [14:47<1:31:14,  6.11it/s]

{'loss': 3.7994, 'learning_rate': 1.739921976592978e-05, 'epoch': 1.3}


 14%|█▍        | 5500/38450 [16:12<1:32:51,  5.91it/s]

{'loss': 3.7984, 'learning_rate': 1.7139141742522758e-05, 'epoch': 1.43}


 16%|█▌        | 6000/38450 [17:38<1:30:35,  5.97it/s]

{'loss': 3.7861, 'learning_rate': 1.6879063719115737e-05, 'epoch': 1.56}


 17%|█▋        | 6500/38450 [19:04<1:29:14,  5.97it/s]

{'loss': 3.7875, 'learning_rate': 1.6618985695708712e-05, 'epoch': 1.69}


 18%|█▊        | 6956/38450 [20:22<1:25:43,  6.12it/s]

KeyboardInterrupt: 

In [None]:
from transformers import AutoTokenizer,AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("output/model/checkpoint-6500")

prompt = "A Green Alien lands "
inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

outputs = model.generate(
    inputs.input_ids,
    max_new_tokens=100,
    do_sample=True,
    top_k=100,
    top_p=0.95
    )

outputs = tokenizer.batch_decode(outputs)
outputs

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['A Green Alien lands iced up in the attic of the Hollywood Studios, just outside of Greenwich Village, causing his own personal mischief. He leaves the studio after the show in a hurry. Upon arriving at the studio by telephone, he realizes the studio is locked in a huge revolving door. He sneaks out and shows his name, but his name has been stolen and a poster stolen from his office appears on the wall outside. He then discovers that a hidden vase in the attic contains a key and that he had hidden']

In [34]:
from transformers import AutoTokenizer,AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("output/model/checkpoint-6500")

prompt = "A Green Alien lands "
inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

outputs = model.generate(
    inputs.input_ids,
    max_new_tokens=100,
    do_sample=True,
    top_k=100,
    top_p=0.95
    )

outputs = tokenizer.batch_decode(outputs)
outputs

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['A Green Alien lands \xa0 in the North and, with the help of fellow Martian Alan Rourke (who runs a large-scale show), the pilot is assigned by a female engineer. At the station, three "black" alien children are raised. A small team of aliens infiltrate the small station, then follow them to the nearby hills of the North. The aliens use weapons and other tools to escape and rescue the children. The children, now young adults, and their children, are given valuable research opportunities when their parents']