In [1]:
! pip install datasets



In [2]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
!pip install transformers[torch] -U



In [None]:
import json
with open('C:/Users/surya/Downloads/chit_chat_generator/chit_chat.json', 'r') as file:
    data = json.load(file)

In [4]:
import pandas as pd
import csv
import re
import string

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s\.,!?]', '', text)
    PUNCT_TO_REMOVE = string.punctuation
    clean_text = text.translate(str.maketrans("", "", PUNCT_TO_REMOVE))
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    return clean_text

In [None]:
formatted_data = []

for key, value in data.items():
    input_text = ""
    target_text = ""
    first_sender = None
    next_sender = None

    for messages in value["messages"]:
        for j, msg in enumerate(messages):
          msg_text = clean_text(msg["text"]).strip()
          if first_sender is None:
            first_sender = msg["sender"]

          if msg["sender"] == first_sender:
            if input_text == "":
              input_text = msg_text
            if j < len(messages) - 1:
              next_sender = messages[j + 1]["sender"]
            else:
              next_sender = None

            if msg["sender"] == next_sender:
              input_text = input_text + " </s> " + clean_text(messages[j + 1]["text"]).strip()
              continue
          elif msg["sender"] != first_sender:
            if target_text == "":
              target_text = msg_text
            if j < len(messages) - 1:
              next_sender = messages[j + 1]["sender"]
            else:
              next_sender = None

            if msg["sender"] == next_sender:
              target_text = target_text + " </s> " + clean_text(messages[j + 1]["text"]).strip()
              continue

            formatted_data.append({'input': input_text.strip() + " </s> ", 'target': target_text.strip() + " </s> "})
            input_text = ""
            target_text = ""

In [None]:
formatted_data_out = pd.DataFrame(formatted_data)
formatted_data_out.to_csv('C:/Users/surya/Downloads/chit_chat_generator/train_data.txt', sep='\t', index=False, header=True)
print("Formatted data stored in train_data.txt")

Formatted data stored in train_data.txt


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
model = T5ForConditionalGeneration.from_pretrained('google-t5/t5-small').to(device)
tokenizer = T5Tokenizer.from_pretrained('google-t5/t5-small')

In [None]:
formatted_dataset = {
    "input_text": [pair["input"] for pair in formatted_data],
    "output_text": [pair["target"] for pair in formatted_data]
}

In [None]:
def tokenize_function(talks):
    tokenized_inputs = tokenizer(talks['input_text'], padding='max_length', truncation=True, max_length=256)
    tokenized_outputs = tokenizer(talks['output_text'], padding='max_length', truncation=True, max_length=256)
    return {'input_ids': tokenized_inputs['input_ids'], 'attention_mask': tokenized_inputs['attention_mask'], 'labels': tokenized_outputs['input_ids']}

In [None]:
from datasets import Dataset
dataset = Dataset.from_dict(formatted_dataset)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

train_eval_split = tokenized_datasets.train_test_split(test_size=0.2, shuffle=False)
train_dataset = train_eval_split['train']
eval_dataset = train_eval_split['test']

val_test_split = eval_dataset.train_test_split(test_size = 0.5, shuffle=False)
val_dataset = val_test_split['train']
test_dataset = val_test_split['test']

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from torch.optim import AdamW

training_args = Seq2SeqTrainingArguments(
    output_dir='C:/Users/surya/Downloads/chit_chat_generator/results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='C:/Users/surya/Downloads/chit_chat_generator/logs',
    save_strategy='epoch',
    evaluation_strategy='epoch',
    learning_rate=3e-4
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)


In [None]:
trainer.train()

In [None]:
evaluation_metrics = trainer.evaluate()
print(evaluation_metrics["eval_loss"])


In [None]:
evaluation_metrics

In [None]:
tokenizer.save_pretrained('C:/Users/surya/Downloads/chit_chat_generator/results/t5/tokenizer')

In [None]:
trainer.save_model('C:/Users/surya/Downloads/chit_chat_generator/results/t5/model')

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
model.push_to_hub("kssumanth6/t5_small_chit_chat_generator_v2")

In [None]:
tokenizer.push_to_hub("kssumanth6/t5_small_chit_chat_generator_v2")

In [None]:
model_path = 'C:/Users/surya/Downloads/chit_chat_generator/results/t5/model'
tokenizer_path = 'C:/Users/surya/Downloads/chit_chat_generator/results/t5/tokenizer'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)
tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)

model.eval()

In [None]:
def generate_text(prompt_text, model, tokenizer, device):

    input_ids = tokenizer(prompt_text, return_tensors='pt', padding="max_length", max_length=256, truncation=True)

    input_ids = input_ids.to(device)

    outputs = model.generate(input_ids=input_ids['input_ids'], max_length=300, do_sample=True, num_beams=5, no_repeat_ngram_size=2)

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated_text

prompt_text = "Hi everyone"
generated_text = generate_text(prompt_text, model, tokenizer, device)
print("Generated text:", generated_text)


In [None]:
print(len(test_dataset))

In [None]:
original_sentences = []
generated_sentences = []
test_dataset = pd.DataFrame(test_dataset)
start = 0
end = 10
for index, row in test_dataset[start:end].iterrows():
    input_text, label = row['input_text'], row['output_text']
    generated_sentences.append(generate_text(input_text, model, tokenizer, device))
    original_sentences.append(label)

In [None]:
print(original_sentences[:10])
print(generated_sentences[:10])

In [None]:
input_ids_original = tokenizer(original_sentences, padding="max_length", max_length=256, return_tensors="pt", truncation=True)

input_ids_generated = tokenizer(generated_sentences, padding="max_length", max_length=256, return_tensors="pt", truncation=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
input_ids_original = input_ids_original.to(device)
input_ids_generated = input_ids_generated.to(device)

outputs = model(input_ids=input_ids_original["input_ids"], labels=input_ids_generated["input_ids"])
loss = outputs.loss.item()
perplexity = torch.exp(outputs.loss).item()

print(f"Loss: {loss}")
print(f"Perplexity: {perplexity}")


In [6]:
import pandas as pd
data_frame_2 = pd.read_csv('C:\\Users\\surya\\Downloads\\chit_chat_generator\\topical_chat.csv')

In [7]:
data_frame_2.columns
data_frame_2.dtypes

conversation_id     int64
message            object
sentiment          object
dtype: object

In [8]:
data_frame_2['conversation_id'] = data_frame_2['conversation_id'].astype(str)
data_frame_2['message'] = data_frame_2['message'].astype(str)
data_frame_2['sentiment'] = data_frame_2['sentiment'].astype(str)
print(data_frame_2.dtypes)

conversation_id    object
message            object
sentiment          object
dtype: object


In [10]:
data_frame_3 = data_frame_2.groupby('conversation_id')['message'].agg(list)
formatted_data = []
for index in range(len(data_frame_3)):
    current_row = data_frame_3[index]
    for i in range(1, len(current_row), 2):
        formatted_data.append({'input': clean_text(current_row[i-1]) + " </s> ", 'target': clean_text(current_row[i]) + " </s> "})
data_frame_4 = pd.DataFrame(formatted_data)

  current_row = data_frame_3[index]


In [11]:
data_frame_4[:10]

Unnamed: 0,input,target
0,are you a fan of google or microsoft </s>,both are excellent technology they are helpful...
1,im not a huge fan of google but i use it a lot...,google provides online related services and pr...
2,yeah their services are good im just not a fan...,google is leading the alphabet subsidiary and ...
3,did you know google had hundreds of live goats...,it is very interesting google provide chrome o...
4,i like google chrome do you use it as well for...,yesgoogle is the biggest search engine and goo...
5,by the way do you like fish </s>,yes they form a sister group of tourniquets th...
6,did you know that a seahorse is the only fish ...,freshwater fish only drink water through the s...
7,interesting they also have gills did you know ...,yes fish is the important resources of human w...
8,what about cats do you like cats im a dog fan ...,the cat is referred as domestic cat and wild c...
9,yeah cats can be cool but they sure do spend a...,cats hear the sounds too faint or too high fre...


In [12]:
data_frame_4['input'].to_csv('topical.csv', index=False)

In [66]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = T5ForConditionalGeneration.from_pretrained('kssumanth6/t5_small_chit_chat_generator_v2').to(device)
tokenizer = T5Tokenizer.from_pretrained('kssumanth6/t5_small_chit_chat_generator_v2')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [67]:
import pandas as pd
import csv
import re
import string
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s\.,!?]', '', text)
    PUNCT_TO_REMOVE = string.punctuation
    clean_text = text.translate(str.maketrans("", "", PUNCT_TO_REMOVE))
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    return clean_text

In [68]:
formatted_dataset = {
    "input_text": [pair["input"] for pair in formatted_data],
    "output_text": [pair["target"] for pair in formatted_data]
}

In [69]:
def tokenize_function(talks):
    tokenized_inputs = tokenizer(talks['input_text'], padding='max_length', truncation=True, max_length=256)
    tokenized_outputs = tokenizer(talks['output_text'], padding='max_length', truncation=True, max_length=256)
    return {'input_ids': tokenized_inputs['input_ids'], 'attention_mask': tokenized_inputs['attention_mask'], 'labels': tokenized_outputs['input_ids']}

In [70]:
from datasets import Dataset
dataset = Dataset.from_dict(formatted_dataset)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

train_eval_split = tokenized_datasets.train_test_split(test_size=0.2, shuffle=False)
train_dataset = train_eval_split['train']
eval_dataset = train_eval_split['test']

Map:   0%|          | 0/91174 [00:00<?, ? examples/s]



In [71]:
train_dataset[:10]

{'input_text': ['are you a fan of google or microsoft </s> ',
  'im not a huge fan of google but i use it a lot because i have to i think they are a monopoly in some sense </s> ',
  'yeah their services are good im just not a fan of intrusive they can be on our personal lives </s> ',
  'did you know google had hundreds of live goats to cut the grass in the past </s> ',
  'i like google chrome do you use it as well for your browser </s> ',
  'by the way do you like fish </s> ',
  'did you know that a seahorse is the only fish to have a neck </s> ',
  'interesting they also have gills did you know that jellyfish are immortal </s> ',
  'what about cats do you like cats im a dog fan myself </s> ',
  'yeah cats can be cool but they sure do spend a lot of their time sleeping </s> '],
 'output_text': ['both are excellent technology they are helpful in many ways for the security purpose both are super </s> ',
  'google provides online related services and products which includes online ads sea

In [72]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from torch.optim import AdamW

training_args = Seq2SeqTrainingArguments(
    output_dir='C:/Users/surya/Downloads/chit_chat_generator/results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='C:/Users/surya/Downloads/chit_chat_generator/logs',
    save_strategy='epoch',
    evaluation_strategy='epoch',
    learning_rate=3e-4
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

In [73]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3617,0.341108
2,0.3414,0.33072
3,0.3321,0.325636
4,0.3255,0.323122
5,0.3217,0.322251


TrainOutput(global_step=22795, training_loss=0.33953871512575146, metrics={'train_runtime': 28912.5869, 'train_samples_per_second': 12.614, 'train_steps_per_second': 0.788, 'total_flos': 2.467923914391552e+16, 'train_loss': 0.33953871512575146, 'epoch': 5.0})

In [74]:
evaluation_metrics = trainer.evaluate()
print(evaluation_metrics["eval_loss"])

0.3222512900829315


In [75]:
tokenizer.save_pretrained('C:/Users/surya/Downloads/chit_chat_generator/results/t5/tokenizer')

('C:/Users/surya/Downloads/chit_chat_generator/results/t5/tokenizer\\tokenizer_config.json',
 'C:/Users/surya/Downloads/chit_chat_generator/results/t5/tokenizer\\special_tokens_map.json',
 'C:/Users/surya/Downloads/chit_chat_generator/results/t5/tokenizer\\spiece.model',
 'C:/Users/surya/Downloads/chit_chat_generator/results/t5/tokenizer\\added_tokens.json')

In [76]:
trainer.save_model('C:/Users/surya/Downloads/chit_chat_generator/results/t5/model')

In [77]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [78]:
model.push_to_hub("kssumanth6/t5_small_chit_chat_generator_v3")

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kssumanth6/t5_small_chit_chat_generator_v3/commit/994e5df0c326ead87cd2c36b5fc342808ec30453', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='994e5df0c326ead87cd2c36b5fc342808ec30453', pr_url=None, pr_revision=None, pr_num=None)

In [79]:
tokenizer.push_to_hub("kssumanth6/t5_small_chit_chat_generator_v3")

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kssumanth6/t5_small_chit_chat_generator_v3/commit/382af010e51391c83cf98bdf942d1ca1955f6a20', commit_message='Upload tokenizer', commit_description='', oid='382af010e51391c83cf98bdf942d1ca1955f6a20', pr_url=None, pr_revision=None, pr_num=None)

In [110]:
def generate_text(prompt_text, model, tokenizer, device):

    input_ids = tokenizer(prompt_text, return_tensors='pt', padding="max_length", max_length=256, truncation=True)

    input_ids = input_ids.to(device)

    outputs = model.generate(input_ids=input_ids['input_ids'], max_length=300, do_sample=True, num_beams=3, no_repeat_ngram_size=2, early_stopping=True)

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated_text

prompt_text = "i like dance "
generated_text = generate_text(prompt_text, model, tokenizer, device)
print("Generated text:", generated_text)


Generated text: i like to dance a lot did you know that bruce lee was the first celebrity to appear on tv


In [113]:
original_sentences = []
generated_sentences = []
eval_dataset = pd.DataFrame(eval_dataset)
start = 0
end = 10
for index, row in eval_dataset[start:end].iterrows():
    input_text, label = row['input_text'], row['output_text']
    generated_sentences.append(generate_text(input_text, model, tokenizer, device))
    original_sentences.append(label)



In [114]:
print(original_sentences[:10])
print(generated_sentences[:10])

['and with the presence and ease of use of social media that also complicates things before there was only one big story each day but there are now like five or six that are being posted so i think journalist have a very tough job is there a certain news outlet you prefer </s> ', 'i like the new york times to catch up on news and do watch fox news as well i wonder why they discussed the possibility of creating a cable channel that only aired the simpsons is that show truly that popular </s> ', 'entertaining channel for sure but surprising that they threatened to sue the simpsons in order to stop them from parodying its antidemocratic party agenda i wonder if that was a result of the settlement </s> ', 'i didnt know that i wonder how the use of facebook compares with those stats since its valued at more than 104 billion now </s> ', 'what a horrible management decision considering that myspace is nonexistent nowadays and facebook is a big force iceland even has to rewrite their constitut

In [115]:
input_ids_original = tokenizer(original_sentences, padding="max_length", max_length=256, return_tensors="pt", truncation=True)

input_ids_generated = tokenizer(generated_sentences, padding="max_length", max_length=256, return_tensors="pt", truncation=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
input_ids_original = input_ids_original.to(device)
input_ids_generated = input_ids_generated.to(device)

outputs = model(input_ids=input_ids_original["input_ids"], labels=input_ids_generated["input_ids"])
loss = outputs.loss.item()
perplexity = torch.exp(outputs.loss).item()

print(f"Loss: {loss}")
print(f"Perplexity: {perplexity}")

Loss: 0.15013495087623596
Perplexity: 1.161991000175476
