In [1]:
import math
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import mysql.connector
import json


In [8]:

with open('config.json') as config_file:
    config = json.load(config_file)['database']


cnx = mysql.connector.connect(**config)

# Define your query
query = f"SELECT * FROM combined_reviews"

# Use pandas to load sql query into a DataFrame
reviews = pd.read_sql(query, con=cnx)

print(reviews.head())  # Print the first few rows of the DataFrame

# Don't forget to close the connection when done
cnx.close()

  reviews = pd.read_sql(query, con=cnx)


                                        replyContent  score  \
0  Hi there! Thank you for your 5-star rating rev...      5   
1  Hi Jerry, this feature isn't ready at the mome...      1   
2  Thanks for your feedback on mobile wallets and...      4   
3  Hi Eric. We're sorry to hear this. We would li...      1   
4  Hi Hafidz Melek! Thank you for your 5-star rat...      5   

                                             content  thumbsUpCount date  
0  I like how the app is really light and fast co...              0  NaT  
1  Cannot add card to google pay. Also cannot add...              0  NaT  
2  Generally good. However please add either NFC ...              0  NaT  
3  With continuous failed to log in,contacted cso...              0  NaT  
4  I don't get the negative reviews here. I insta...              0  NaT  


In [9]:
#reviews = pd.read_csv('combined_reviews.csv', index_col=0)
reviews = reviews.drop(columns=['score', 'thumbsUpCount'])
reviews = reviews.dropna()
inst = 'Below is a review for a banking app. Write a response that appropriately replies to the review from the perspective of the bank.'
text = []
for index, row in reviews.iterrows():
    text.append(f'{inst} ### Review: {row["content"]} ### Reply: {row["replyContent"]}')
reviews['text'] = text
reviews = Dataset.from_pandas(reviews)
reviews = reviews.train_test_split(test_size=0.1)

In [10]:
reviews

DatasetDict({
    train: Dataset({
        features: ['replyContent', 'content', 'date', 'text', '__index_level_0__'],
        num_rows: 1790
    })
    test: Dataset({
        features: ['replyContent', 'content', 'date', 'text', '__index_level_0__'],
        num_rows: 199
    })
})

In [12]:
reviews['train'][0]

{'replyContent': '',
 'content': 'why my app cant use？ : 1 month ordy cannot use why?',
 'date': Timestamp('2021-01-03 10:55:52'),
 'text': 'Below is a review for a banking app. Write a response that appropriately replies to the review from the perspective of the bank. ### Review: why my app cant use？ : 1 month ordy cannot use why? ### Reply: ',
 '__index_level_0__': 1305}

In [13]:
# baseline_model = "distilbert/distilgpt2"
baseline_model = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(baseline_model)

In [14]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [15]:
tokenized_reviews = reviews.map(preprocess_function, batched=True, remove_columns=reviews['train'].column_names)

Map:   0%|          | 0/1790 [00:00<?, ? examples/s]

Map:   0%|          | 0/199 [00:00<?, ? examples/s]

In [16]:
tokenized_reviews

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1790
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 199
    })
})

In [17]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [18]:
model = AutoModelForCausalLM.from_pretrained(baseline_model)

In [19]:
save_path = "./gpt_model_causallm"

training_args = TrainingArguments(
    output_dir=save_path,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_reviews["train"],
    eval_dataset=tokenized_reviews["test"],
    data_collator=data_collator,
)

trainer.train()
model.save_pretrained(save_path)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/672 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [12]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

100%|██████████| 3/3 [00:00<00:00,  5.57it/s]

Perplexity: 7.32





In [13]:
finetuned_model = AutoModelForCausalLM.from_pretrained(save_path)
generator = pipeline('text-generation', finetuned_model, tokenizer=tokenizer)

In [16]:
review = reviews['test'][0]['content']
prompt = f'{inst} ### Review: {review} ### Reply:'

In [22]:
# generate output method 1
generator(prompt, max_length=len(review)+50)[0]['generated_text'][141+len(review):]

" ### Reply: Hi Matt. Please send us your feedback via the feedback form on our Support Forums. We appreciate your concern. 💜 We're always on the look out for bugs, feedback and feature requests that arise with our app, and will be responsive to any potential issue that may arise. If you have any further concerns, feel free to reach us at help@xiaomi.com.sg."

In [21]:
# generate output method 2
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = finetuned_model.generate(input_ids, max_new_tokens = 50)
predicted_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(predicted_text[141+len(review):])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 ### Reply: Hi Rika! I'm sorry for the inconvenience. We need to make sure that you get the correct bank card and are given the best deal on your order. We have to understand before you can use our mobile apps to access our services, so


In [39]:
def cos_sim(pred, actl):
    count_vectorizer = CountVectorizer()
    vector_matrix = count_vectorizer.fit_transform([pred[141:], actl[141:]])
    cosine_similarity_matrix = cosine_similarity(vector_matrix)
    return cosine_similarity_matrix[0,1]

In [44]:
cosine_similarity_ls = []
for review in reviews['test']:
    content = review['content']
    prompt = f'{inst} ### Review: {content} ### Reply:'
    # pred = generator(prompt, max_length=len(content)+100)[0]['generated_text'][141+len(content):]
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = finetuned_model.generate(input_ids, max_new_tokens = 50)
    predicted_text = tokenizer.decode(output[0], skip_special_tokens=True)
    pred = predicted_text[141+len(review):]
    actl = review['replyContent']
    cs = cos_sim(pred, actl)
    cosine_similarity_ls.append(cs)

print(f'Average cosine similarity: {np.mean(cosine_similarity_ls)}')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Average cosine similarity: 0.15632927735967989


Cosine similiarity measures how similiar content between two sentences are. The closer to 1, the better. However, it seems that GPT-2 finatuned with GXS bank replies and reviews does not seem to perform well. More data and a stronger model like Alpaca 7B would perhaps yielded better results.

In [45]:
cosine_similarity_ls

[0.19611613513818407,
 0.0,
 0.0,
 0.0,
 0.18802535827258876,
 0.2574253920840865,
 0.16903085094570328,
 0.11933359262635951,
 0.0,
 0.19802950859533489,
 0.03580574370197164,
 0.32025630761017426,
 0.1507556722888818,
 0.07207499701564471,
 0.11821656093586509,
 0.33351867298253507,
 0.34782754811291194,
 0.29958563516135256,
 0.3205835717220036,
 0.0]