In [None]:
import pandas as pd

def _load_split(csv_path):
    """Read one CSV split into a DataFrame and discard rows with any missing value."""
    split_frame = pd.read_csv(csv_path)
    return split_frame.dropna()


# NOTE(review): as written, all three splits are read from the same path --
# confirm each split points at its own file, otherwise train/validation/test overlap.
df_train = _load_split('data')
df_validation = _load_split('data')
df_test = _load_split('data')

In [None]:
from datasets import Dataset, DatasetDict

# Pair each split name with its cleaned DataFrame (order: train, test, validation),
# then convert every frame to a Dataset without carrying over the pandas index.
split_frames = {
    'train': df_train,
    'test': df_test,
    'validation': df_validation,
}

dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in split_frames.items()
})

# Show the resulting splits as the cell output.
dataset

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and the seq2seq model; both are read from the same location.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="data",
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path="data",
)

In [None]:
def tokenize(batch):
    """Tokenize a batch of dialogues together with their reference summaries.

    Args:
        batch: Mapping with a 'dialogue' entry (model inputs) and a 'summary'
            entry (targets), as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's encoding for the batch. Sequences are truncated to at
        most 200 tokens and are returned unpadded.
    """
    # Padding is deliberately NOT applied here. Padding at tokenization time
    # fills the labels with the tokenizer's pad id, which the loss then treats
    # as real targets, and it pads every example to the longest one in the
    # mapped batch. Leaving sequences ragged lets the seq2seq data collator pad
    # each training batch dynamically and mask the label padding.
    # return_tensors is omitted as well: ragged sequences cannot be stacked
    # into a single tensor, and Dataset.map stores plain lists anyway.
    encoding = tokenizer(
        batch['dialogue'],
        text_target=batch['summary'],
        max_length=200,
        truncation=True,
    )

    return encoding

In [None]:
# Tokenize every split; batch_size=None passes each split to `tokenize`
# as one single batch.
samsum_pt = dataset.map(
    tokenize,
    batched=True,
    batch_size=None,
)

In [None]:
from transformers import DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer

# Collator that assembles seq2seq batches for this tokenizer/model pair.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# NOTE(review): gradient_accumulation_steps=500 combined with a per-device batch
# of 4 means one optimizer update per 2000 examples per device -- confirm this
# is intended and not a mix-up with another setting.
args = TrainingArguments(
    output_dir="train_dir",
    # Schedule and optimizer settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizing.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=500,
    # Evaluate and checkpoint once per epoch.
    eval_strategy='epoch',
    save_strategy='epoch',
)

# Train on the 'train' split and evaluate on the 'validation' split.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=samsum_pt['train'],
    eval_dataset=samsum_pt['validation'],
)

trainer.train()

In [None]:
# Write the trainer's model to disk so it can be reloaded by path later.
trainer.save_model(output_dir="data")

In [None]:
from transformers import pipeline

# Build a summarization pipeline from the model stored under 'data'.
pipe = pipeline(task='summarization', model='data')

# Dialogue to summarize; currently a placeholder containing only a newline.
custom_dialogue="""
"""

# Run the pipeline and show its result as the cell output.
output = pipe(custom_dialogue)
output