In [None]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [None]:
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [None]:
def prepare_data(json_data):
    prepared_data = []
    for item in json_data:
        input_text = f"Customer: {item['customer']}\nSales Rep Response:"
        target_text = item['sales_rep']
        prepared_data.append(f"{input_text} {target_text}")
    return prepared_data

In [None]:
with open('genotek.json', 'r') as f:
    data = json.load(f)
prepared_data = prepare_data(data)

In [None]:
prepared_data

['Customer: Your delivery times are longer than other companies. Why should I wait?\nSales Rep Response: I understand your concern about delivery times. Could you tell me about your typical order volumes and frequency? This will help me explain how our shipping process might actually save you time in the long run.',
 "Customer: I've heard your customer service isn't very responsive. How can you assure me I won't be left hanging?\nSales Rep Response: I appreciate you bringing up this concern. Can you share what specific aspects of customer service are most critical for your business? This will help me highlight how our support system aligns with your needs.",
 "Customer: Your software seems complicated. I'm worried my staff won't be able to use it effectively.\nSales Rep Response: That's a valid concern. Could you tell me more about your team's experience with similar systems? This will help me suggest the most appropriate training program and estimate the learning curve.",
 "Customer: 

In [None]:
with open('prepared_data.txt', 'w') as f:
    for item in prepared_data:
        f.write(f"{item}\n")

In [None]:
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="prepared_data.txt",
    block_size=128
)



In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [None]:
pip install transformers[torch]



In [None]:
pip install accelerate -U



In [None]:
!pip uninstall transformers accelerate torch
!pip install transformers[torch] accelerate torch

Found existing installation: transformers 4.42.4
Uninstalling transformers-4.42.4:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.10/dist-packages/transformers-4.42.4.dist-info/*
    /usr/local/lib/python3.10/dist-packages/transformers/*
Proceed (Y/n)? y
  Successfully uninstalled transformers-4.42.4
Found existing installation: accelerate 0.32.1
Uninstalling accelerate-0.32.1:
  Would remove:
    /usr/local/bin/accelerate
    /usr/local/bin/accelerate-config
    /usr/local/bin/accelerate-estimate-memory
    /usr/local/bin/accelerate-launch
    /usr/local/bin/accelerate-merge-weights
    /usr/local/lib/python3.10/dist-packages/accelerate-0.32.1.dist-info/*
    /usr/local/lib/python3.10/dist-packages/accelerate/*
Proceed (Y/n)? y
  Successfully uninstalled accelerate-0.32.1
Found existing installation: torch 2.3.1
Uninstalling torch-2.3.1:
  Would remove:
    /usr/local/bin/convert-caffe2-to-onnx
    /usr/local/bin/convert-onnx-to-caffe2
    /usr/local/bi

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=60,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=120, training_loss=0.040737239519755046, metrics={'train_runtime': 1022.8358, 'train_samples_per_second': 0.411, 'train_steps_per_second': 0.117, 'total_flos': 27435663360000.0, 'train_loss': 0.040737239519755046, 'epoch': 60.0})

In [None]:
model.save_pretrained("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")

('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')

In [None]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
import torch

In [None]:
model_path = "./fine_tuned_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_path, pad_token='<|endoftext|>')
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
def generate_response_with_examples(customer_objection):
    prompt = f"""
Here are two examples of good sales representative responses:

Customer: Your product is more expensive than competitors. Why should I pay more?
Sales Rep:
1. Acknowledgment: I understand your concern about our pricing.
2. Question: Could you tell me which specific features are most important for your business?
3. Value proposition: Our higher price reflects superior quality and advanced features that often lead to greater long-term cost savings and efficiency for our clients.

Customer: I'm worried about the learning curve for my team.
Sales Rep:
1. Acknowledgment: It's natural to be concerned about the learning process for new software.
2. Question: Can you tell me about your team's experience with similar systems?
3. Value proposition: We offer comprehensive training and ongoing support to ensure a smooth transition and quick proficiency for your team.

Now, please respond to this customer objection:
Customer: {customer_objection}

Sales Representative's Response:
"""
def generate_response(customer_objection, max_length=150):
    input_text = f"Customer: {customer_objection}\nSales Rep Response:"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

    with torch.no_grad():
        output = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            temperature=0.4
        )

    response1 = tokenizer.decode(output[0], skip_special_tokens=True)
    response1 = response1.split("Sales Rep Response:")[1].strip()
    first_line = response1.split('\n')[0].strip()
    return first_line

test_objection = "Your training resources seem limited compared to others. How can I ensure my team will be fully prepared?"
response1 = generate_response(test_objection)
print(f"Generated response: {response1}")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated response: I understand your concern about training staff. Can you share which types of resources are most crucial for your operations? This will help me demonstrate how our onboarding process might actually save time in the long run or explore ways to expedite it.


In [None]:
pip install tabulate



In [None]:
from tabulate import tabulate

In [None]:
def generate_response(customer_objection, max_length=150):
    input_text = f"Customer: {customer_objection}\nSales Rep Response:"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

    with torch.no_grad():
        output = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            temperature=0.4
        )

    response1 = tokenizer.decode(output[0], skip_special_tokens=True)
    response1 = response1.split("Sales Rep Response:")[1].strip()
    first_line = response1.split('\n')[0].strip()
    return first_line

customer_concerns = [
    "I've heard that your product can be difficult to customize for unique business needs. How flexible is it?",
    "Your product seems to lack robust reporting features. How can I ensure it meets our reporting needs?",
    "I'm worried about the reliability of your product during peak usage times. How do you ensure consistent performance?",
    "Your product's mobile functionality seems limited. How can I be sure it will work well for our mobile workforce?",
    "I've noticed your product has fewer integrations with third-party apps. How can I ensure it fits into our existing tech ecosystem?"
]

responses = [generate_response(concern) for concern in customer_concerns]

table_data = []
for i, (concern, response) in enumerate(zip(customer_concerns, responses), start=1):
    table_data.append([f"Concern {i}", concern, response])

# Define table headers
headers = ["#", "Customer Concern", "Sales Rep Response"]

# Print the table
print(tabulate(table_data, headers=headers, tablefmt="grid"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


+-----------+------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| #         | Customer Concern                                                                                                                   | Sales Rep Response                                                                                                                                                                                                                                                                                      |
| Concern 1 | I've heard that your product can be difficult to customize for unique business needs. How flexib

In [None]:
!pip install pandas scikit-learn transformers datasets



In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the models and tokenizers
base_model_name = "gpt2"
fine_tuned_model_path = "./fine_tuned_gpt2"

base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)

# Set pad_token to eos_token if pad_token is not already set
if base_tokenizer.pad_token is None:
    base_tokenizer.pad_token = base_tokenizer.eos_token

if fine_tuned_tokenizer.pad_token is None:
    fine_tuned_tokenizer.pad_token = fine_tuned_tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
fine_tuned_model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_path)


In [164]:
import torch

def generate_responses(prompts, tokenizer, model, max_length=50):
    responses = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            output = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_length=max_length,
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                temperature=0.7
            )
        response = tokenizer.decode(output[0], skip_special_tokens=True)
        responses.append(response)
    return responses

# Example test prompts
test_prompts = [
    "I've heard that customizing your product for specific business needs can be challenging. How adaptable is it?",
    "I’m concerned about how reliable your product is during high-demand periods. What measures are in place to maintain performance?",
    "It seems like your product may not have strong reporting capabilities. How can you ensure it covers all our reporting needs?",
    "Your product’s mobile features seem a bit limited. How can you assure me that it’s suitable for a mobile workforce?",
    "I noticed your product integrates with fewer third-party applications. How can I ensure it fits with the tools we already use?"
]

# Generate responses from both models
base_responses = generate_responses(test_prompts, base_tokenizer, base_model)
fine_tuned_responses = generate_responses(test_prompts, fine_tuned_tokenizer, fine_tuned_model)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [165]:
print(base_responses)

["I've heard that customizing your product for specific business needs can be challenging. How adaptable is it?\n\nI think it's a good question. I think you can do a lot of things with your custom product. You can make it", "I’m concerned about how reliable your product is during high-demand periods. What measures are in place to maintain performance?\n\nI'm concerned that the product's performance is not being maintained. I'm also concerned with the fact that I", 'It seems like your product may not have strong reporting capabilities. How can you ensure it covers all our reporting needs?\n\nWe have a lot of reporting requirements that we need to meet. We have to be able to report on the most important issues', 'Your product’s mobile features seem a bit limited. How can you assure me that it’s suitable for a mobile workforce?\n\nWe are currently working on a solution to this problem. We are working with our partners to provide a', 'I noticed your product integrates with fewer third-part

In [160]:
print(fine_tuned_responses)

["I've heard that customizing your product for specific business needs can be challenging. How adaptable is it?\nSales Rep Response: I appreciate your concern about customizations. Can you share what specific aspects of customization are most critical for your operations", 'It seems like your product may not have strong reporting capabilities. How can you ensure it covers all our reporting needs?\nSales Rep Response: I appreciate your concern about our ability to meet our needs. Can you share what specific aspects of our onboarding', 'I’m concerned about how reliable your product is during high-demand periods. What measures are in place to maintain performance?\nSales Rep Response: I appreciate your concern about reliability. Can you share which types of adjustments are most important for your', 'Your product’s mobile features seem a bit limited. How can you assure me that it’s suitable for a mobile workforce?\nSales Rep Response: I appreciate your concern about accessibility. Can you 

In [161]:
def calculate_perplexity(prompts, tokenizer, model):
    total_log_likelihood = 0
    total_tokens = 0
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs.input_ids)
        log_likelihood = -outputs.loss.item()
        total_log_likelihood += log_likelihood
        total_tokens += len(inputs.input_ids[0])
    average_log_likelihood = total_log_likelihood / len(prompts)
    perplexity = torch.exp(torch.tensor(average_log_likelihood))
    return perplexity.item()

# Calculate perplexity for both models
base_perplexity = calculate_perplexity(test_prompts, base_tokenizer, base_model)
fine_tuned_perplexity = calculate_perplexity(test_prompts, fine_tuned_tokenizer, fine_tuned_model)

print(f"Base GPT-2 Perplexity: {base_perplexity}")
print(f"Fine-Tuned Model Perplexity: {fine_tuned_perplexity}")


Base GPT-2 Perplexity: 0.015939459204673767
Fine-Tuned Model Perplexity: 0.004965505562722683
