In [1]:
import pandas as pd
from datasets import Dataset
reviews = pd.read_csv('combined_reviews.csv', index_col=0)
reviews = reviews.drop(columns=['score', 'thumbsUpCount'])
# reviews = reviews.rename(columns={'replyContent': 'label', 'content': 'text'})
reviews = reviews.dropna()
reviews = Dataset.from_pandas(reviews)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
reviews[0]

{'replyContent': 'Hey Jiaxing! Hope you love the new "Favourites" feature in the GXS app! We\'re just as excited for all the cool things to come in the future. 💜',
 'content': 'Gxs is simple and easy to use, with a saving account/pocket that has minimal TnC. Finally I can add payees and there is a debit card too. I am looking forward to the introduction of credit card and perhaps, investment into money market funds.',
 '__index_level_0__': 0}

In [3]:
from transformers import AutoTokenizer
baseline_model = "distilbert/distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(baseline_model)

In [4]:
def preprocess_function(examples):
    return tokenizer(examples["replyContent"], truncation=True)

In [5]:
tokenized_reviews = reviews.map(preprocess_function, batched=True)

Map: 100%|██████████| 199/199 [00:00<00:00, 17010.06 examples/s]


In [6]:
tokenized_reviews = tokenized_reviews.train_test_split(test_size=0.1)

In [7]:
from transformers import DataCollatorForLanguageModeling
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [8]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(baseline_model)

In [9]:
from transformers import TrainingArguments, Trainer

save_path = "./gpt_model_causallm"

training_args = TrainingArguments(
    output_dir=save_path,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_reviews["train"],
    eval_dataset=tokenized_reviews["test"],
    data_collator=data_collator,
)

trainer.train()
model.save_pretrained(save_path)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
                                               
 33%|███▎      | 23/69 [00:12<00:22,  2.04it/s]

{'eval_loss': 2.738879680633545, 'eval_runtime': 0.4103, 'eval_samples_per_second': 48.744, 'eval_steps_per_second': 7.312, 'epoch': 1.0}


                                               
 67%|██████▋   | 46/69 [00:24<00:11,  2.07it/s]

{'eval_loss': 2.3985228538513184, 'eval_runtime': 0.3149, 'eval_samples_per_second': 63.509, 'eval_steps_per_second': 9.526, 'epoch': 2.0}


                                               
100%|██████████| 69/69 [00:36<00:00,  1.90it/s]


{'eval_loss': 2.3152339458465576, 'eval_runtime': 0.317, 'eval_samples_per_second': 63.096, 'eval_steps_per_second': 9.464, 'epoch': 3.0}
{'train_runtime': 36.2984, 'train_samples_per_second': 14.794, 'train_steps_per_second': 1.901, 'train_loss': 3.0795909494593525, 'epoch': 3.0}


In [10]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

100%|██████████| 3/3 [00:00<00:00, 12.06it/s]

Perplexity: 10.13





In [11]:
from transformers import pipeline, Conversation, AutoModelForCausalLM
save_path = "./gpt_model_causallm"
finetuned_model = AutoModelForCausalLM.from_pretrained(save_path)
generator = pipeline('text-generation', finetuned_model, tokenizer=tokenizer)
chatbot = pipeline('conversational',finetuned_model, tokenizer=tokenizer)

In [12]:
prompt = reviews[50]['content']
prompt

'Why do the app ask me to rate on Play Store if my rating on the app is 5, but there was no such questions when I put a 1 rating on the app? It encourages unfair ratings on the Play Store.'

In [13]:
generator(prompt, max_length=len(prompt)+50)[0]['generated_text'][len(prompt):]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


' No matter how you decide to rate, I would highly recommend you review the app for your iPhone. Please note: if you really want to take that step, please consider doing so before you can review the app. Thank you for your help! (To unsubscribe, add your review to our mailing list, then hit the Subscribe button next to our notification) <a href="https://www.pushover.com/services/feedback/8c0c6f3b3ad7fd4f49b56f6c45f40e53aa9c4f1/popout">@pushover.com</a> We\'re a huge customer and would gladly take your feedback with us! Happy new season! <a href="https://t.co/W2U8OeS7xYvS —@Pushover.com</a> ✕️ Thank you 🤔 <a'

In [14]:
conversation = Conversation(prompt)
conversation = chatbot(conversation)
conversation.messages[-1]['content']


No chat template is defined for this tokenizer - using the default template for the GPT2TokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


"Get your start-up-only business with the latest Android handset, iPhone Plus and for a price change. The app has been created for Apple Watch. When I was a little boy I was lucky enough to take the time to review the new app for mobile devices. Now that its release is arriving, we're here to take a closer look. We will take a closer look, in depth.\n\n\n\nIf you are having trouble reading our news article or find out more about our services, or for help with an issue with our app, please contact us at help@gxs.com.sg.\n\nIf you have been running a local server or are experiencing trouble running the app, please contact us at help@gxs.com.sg. Here are the steps and precautions required to take to recover and install the update.\n\n\n1. If you run to the local version of a local app or you are running the application from the App Store, you can install the update if you‽d believe your app has the same app and are experiencing an issue with mobile devices. Additionally, installing the up