In [1]:
!pip install transformers
!pip install torch
!pip install transformers[torch]

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.9 MB/s[0m eta [36m0:00:0

In [6]:
from google.colab import files, drive
uploaded = files.upload()
drive.mount('/content/drive')

Saving english-train.json to english-train.json
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import json
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from torch.utils.data import DataLoader, TensorDataset
from transformers import TrainingArguments, Trainer
from accelerate import Accelerator

In [39]:
# Load and preprocess the data from 'english-train.json'
data = json.load(open('english-train.json'))
training_data = [(item['utterances'][0], item['utterances'][1]) for item in data]

In [40]:
# Tokenization
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS token
encoded_data = tokenizer(training_data, return_tensors='pt', truncation=True, padding=True)

In [41]:
# Create datasets
input_ids = encoded_data['input_ids']
attention_mask = encoded_data['attention_mask']
labels = encoded_data['input_ids'].clone()

In [42]:
# Fine-tuning the model
model = GPT2LMHeadModel.from_pretrained("gpt2")
optimizer = AdamW(model.parameters(), lr=1e-5)
dataset = TensorDataset(input_ids, attention_mask, labels)

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=lambda features: {
        'input_ids': torch.stack([f[0] for f in features]),
        'attention_mask': torch.stack([f[1] for f in features]),
        'labels': torch.stack([f[2] for f in features]),
    },
    train_dataset=dataset,
)

trainer.train()



Step,Training Loss


TrainOutput(global_step=363, training_loss=0.853226254465823, metrics={'train_runtime': 75.529, 'train_samples_per_second': 19.145, 'train_steps_per_second': 4.806, 'total_flos': 397752816384000.0, 'train_loss': 0.853226254465823, 'epoch': 3.0})

In [43]:
# Save the fine-tuned model
model.save_pretrained("/content/drive/My Drive/saved_model_directory/model.pth")

In [54]:
from difflib import SequenceMatcher

def chatbot_response(input_text, prev_responses=[]):
    model_path = "/content/drive/My Drive/saved_model_directory/model.pth"
    model = GPT2LMHeadModel.from_pretrained(model_path)
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    with torch.no_grad():
        output = model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Split the generated text into individual responses
    responses = generated_text.split("doctor:")

    # Remove empty responses and strip leading/trailing whitespace
    responses = [response.strip() for response in responses if response.strip()]

    # Filter out repeated or similar responses
    unique_responses = []
    for response in responses:
        if response not in unique_responses and not any(similar_response(response, prev_resp) for prev_resp in prev_responses):
            unique_responses.append(response)

    # Join the unique responses back together
    doctor_response = "doctor:" + " ".join(unique_responses)

    # Split the generated text into individual responses
    responses = generated_text.split("doctor:")

    # Extract the last doctor's response (assuming it's the most recent)
    doctor_response = responses[-1].strip()

    # Add the new response to the list of previous responses
    prev_responses.append(doctor_response)

    return doctor_response

# Function to check if two responses are similar
def similar_response(response1, response2, threshold=0.7):
    # You can use a similarity metric like cosine similarity or Levenshtein distance
    # Here, we'll use a simple string similarity comparison for demonstration purposes
    return SequenceMatcher(None, response1, response2).ratio() > threshold


In [55]:
# Example usage
user_input = "i have all the symptoms except fever, i went to medicross and dr said i can get tested if i want to i'm not sure if i should. she gave me antibiotics klacid xl 500mg, she said i can take it if i feel worse i'm worried it will make immune system bad?"
response = chatbot_response(user_input)
print("Chatbot Response:", response)

Chatbot Response: in brief: yes, you should take it if you feel better. if you feel better, take it. if you feel worse, take it. if you feel better
