In [38]:
import math
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import mysql.connector


In [None]:
# config = {
#     'user': 'root',
#     'password': 'JJY#91296517',
#     'host': '127.0.0.1',
#     'database': 'database',
# }

# # Establish a connection
# cnx = mysql.connector.connect(**config)

# # Define your query
# query = f"SELECT * FROM combined_reviews"

# # Use pandas to load sql query into a DataFrame
# reviews = pd.read_sql(query, con=cnx)

# print(reviews.head())  # Print the first few rows of the DataFrame

# # Don't forget to close the connection when done
# cnx.close()

In [2]:
reviews = pd.read_csv('combined_reviews.csv', index_col=0)
reviews = reviews.drop(columns=['score', 'thumbsUpCount'])
reviews = reviews.dropna()
inst = 'Below is a review for a banking app. Write a response that appropriately replies to the review from the perspective of the bank.'
text = []
for index, row in reviews.iterrows():
    text.append(f'{inst} ### Review: {row["content"]} ### Reply: {row["replyContent"]}')
reviews['text'] = text
reviews = Dataset.from_pandas(reviews)
reviews = reviews.train_test_split(test_size=0.1)

In [3]:
reviews

DatasetDict({
    train: Dataset({
        features: ['replyContent', 'content', 'text', '__index_level_0__'],
        num_rows: 179
    })
    test: Dataset({
        features: ['replyContent', 'content', 'text', '__index_level_0__'],
        num_rows: 20
    })
})

In [4]:
reviews['train'][0]

{'replyContent': "We're delighted to receive your 5-star rating and appreciate your feedback! 👍 Thank you for continued support in using our app! 💜",
 'content': 'Easy of use',
 'text': "Below is a review for a banking app. Write a response that appropriately replies to the review from the perspective of the bank. ### Review: Easy of use ### Reply: We're delighted to receive your 5-star rating and appreciate your feedback! 👍 Thank you for continued support in using our app! 💜",
 '__index_level_0__': 64}

In [5]:
# baseline_model = "distilbert/distilgpt2"
baseline_model = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(baseline_model)

In [6]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [7]:
tokenized_reviews = reviews.map(preprocess_function, batched=True, remove_columns=reviews['train'].column_names)

Map: 100%|██████████| 179/179 [00:00<00:00, 8364.03 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 5552.43 examples/s]


In [8]:
tokenized_reviews

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 179
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 20
    })
})

In [9]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [10]:
model = AutoModelForCausalLM.from_pretrained(baseline_model)

In [11]:
save_path = "./gpt_model_causallm"

training_args = TrainingArguments(
    output_dir=save_path,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_reviews["train"],
    eval_dataset=tokenized_reviews["test"],
    data_collator=data_collator,
)

trainer.train()
model.save_pretrained(save_path)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
                                               
 33%|███▎      | 23/69 [00:36<01:17,  1.68s/it]

{'eval_loss': 2.3148837089538574, 'eval_runtime': 1.5808, 'eval_samples_per_second': 12.652, 'eval_steps_per_second': 1.898, 'epoch': 1.0}


 67%|██████▋   | 46/69 [01:05<00:25,  1.09s/it]
 67%|██████▋   | 46/69 [01:05<00:25,  1.09s/it]

{'eval_loss': 2.042876958847046, 'eval_runtime': 0.736, 'eval_samples_per_second': 27.173, 'eval_steps_per_second': 4.076, 'epoch': 2.0}


                                               
100%|██████████| 69/69 [01:34<00:00,  1.38s/it]


{'eval_loss': 1.9908866882324219, 'eval_runtime': 0.716, 'eval_samples_per_second': 27.934, 'eval_steps_per_second': 4.19, 'epoch': 3.0}
{'train_runtime': 94.8694, 'train_samples_per_second': 5.66, 'train_steps_per_second': 0.727, 'train_loss': 2.591305276621943, 'epoch': 3.0}


In [12]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

100%|██████████| 3/3 [00:00<00:00,  5.57it/s]

Perplexity: 7.32





In [13]:
finetuned_model = AutoModelForCausalLM.from_pretrained(save_path)
generator = pipeline('text-generation', finetuned_model, tokenizer=tokenizer)

In [16]:
review = reviews['test'][0]['content']
prompt = f'{inst} ### Review: {review} ### Reply:'

In [22]:
# generate output method 1
generator(prompt, max_length=len(review)+50)[0]['generated_text'][141+len(review):]

" ### Reply: Hi Matt. Please send us your feedback via the feedback form on our Support Forums. We appreciate your concern. 💜 We're always on the look out for bugs, feedback and feature requests that arise with our app, and will be responsive to any potential issue that may arise. If you have any further concerns, feel free to reach us at help@xiaomi.com.sg."

In [21]:
# generate output method 2
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = finetuned_model.generate(input_ids, max_new_tokens = 50)
predicted_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(predicted_text[141+len(review):])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 ### Reply: Hi Rika! I'm sorry for the inconvenience. We need to make sure that you get the correct bank card and are given the best deal on your order. We have to understand before you can use our mobile apps to access our services, so


In [39]:
def cos_sim(pred, actl):
    count_vectorizer = CountVectorizer()
    vector_matrix = count_vectorizer.fit_transform([pred[141:], actl[141:]])
    cosine_similarity_matrix = cosine_similarity(vector_matrix)
    return cosine_similarity_matrix[0,1]

In [44]:
cosine_similarity_ls = []
for review in reviews['test']:
    content = review['content']
    prompt = f'{inst} ### Review: {content} ### Reply:'
    # pred = generator(prompt, max_length=len(content)+100)[0]['generated_text'][141+len(content):]
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = finetuned_model.generate(input_ids, max_new_tokens = 50)
    predicted_text = tokenizer.decode(output[0], skip_special_tokens=True)
    pred = predicted_text[141+len(review):]
    actl = review['replyContent']
    cs = cos_sim(pred, actl)
    cosine_similarity_ls.append(cs)

print(f'Average cosine similarity: {np.mean(cosine_similarity_ls)}')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Average cosine similarity: 0.15632927735967989


In [45]:
cosine_similarity_ls

[0.19611613513818407,
 0.0,
 0.0,
 0.0,
 0.18802535827258876,
 0.2574253920840865,
 0.16903085094570328,
 0.11933359262635951,
 0.0,
 0.19802950859533489,
 0.03580574370197164,
 0.32025630761017426,
 0.1507556722888818,
 0.07207499701564471,
 0.11821656093586509,
 0.33351867298253507,
 0.34782754811291194,
 0.29958563516135256,
 0.3205835717220036,
 0.0]