<a href="https://colab.research.google.com/github/ankitk75/AIML-Project-Series/blob/main/Admission_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import csv
import pandas as pd

# Define the input and output file paths
input_file_path = '/content/input.txt'
output_file_path = '/content/output.csv'

# Initialize lists to hold questions and answers
questions = []
answers = []

# Read the input file and parse the content
with open(input_file_path, 'r') as file:
    lines = file.readlines()
    for line in lines:
        if line.strip():  # Skip any empty lines
            question, answer = line.split(',', 1)  # Split only at the first comma
            questions.append(question.strip())
            answers.append(answer.strip())

# Write the questions and answers to a new CSV file
with open(output_file_path, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['question', 'answer'])  # Write the header row
    for question, answer in zip(questions, answers):
        csvwriter.writerow([question, answer])


In [12]:
def load_dataset_from_csv(csv_path):
    df = pd.read_csv(csv_path)
    dialogues = [{"input": row['question'], "output": row['answer']} for _, row in df.iterrows()]
    return Dataset.from_pandas(pd.DataFrame(dialogues))


# dataset = load_dataset_from_csv('output.csv')

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

# Add padding token
tokenizer.pad_token = tokenizer.eos_token

# Prepare the dataset
dataset = load_dataset_from_csv('output.csv')
train_test_split = dataset.train_test_split(test_size=0.1)
dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})


# Tokenize the dataset
def tokenize_function(examples):
    inputs = tokenizer(examples['input'], truncation=True, padding='max_length', max_length=128)
    outputs = tokenizer(examples['output'], truncation=True, padding='max_length', max_length=128)
    return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'],
            'labels': outputs['input_ids']}


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [5]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [14]:
# Tokenize the dataset
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    num_train_epochs=15,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,8.5636,8.281356
2,7.1687,3.015113
3,4.3471,2.65547
4,2.7987,2.313536
5,2.6502,2.174286
6,2.3583,2.030334
7,2.2054,1.929954
8,2.0206,1.821768
9,1.9621,1.757895
10,1.7147,1.68946


TrainOutput(global_step=195, training_loss=2.7521566928961336, metrics={'train_runtime': 104.6565, 'train_samples_per_second': 7.023, 'train_steps_per_second': 1.863, 'total_flos': 170648752619520.0, 'train_loss': 2.7521566928961336, 'epoch': 15.0})

In [15]:
model.save_pretrained("fine-tuned-dialoGPT-medium")
tokenizer.save_pretrained("fine-tuned-dialoGPT-medium")

('fine-tuned-dialoGPT-medium/tokenizer_config.json',
 'fine-tuned-dialoGPT-medium/special_tokens_map.json',
 'fine-tuned-dialoGPT-medium/vocab.json',
 'fine-tuned-dialoGPT-medium/merges.txt',
 'fine-tuned-dialoGPT-medium/added_tokens.json',
 'fine-tuned-dialoGPT-medium/tokenizer.json')

In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("fine-tuned-dialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("fine-tuned-dialoGPT-medium")

def run(user_text, chat_history_ids):
    input_ids = tokenizer.encode(user_text + tokenizer.eos_token, return_tensors='pt')
    if chat_history_ids is None:
        bot_history_ids = input_ids
    else:
        bot_history_ids = torch.cat([chat_history_ids, input_ids], dim=-1)

    chat_history_ids = model.generate(bot_history_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
    resp = tokenizer.decode(chat_history_ids[:, bot_history_ids.shape[-1]:][0], skip_special_tokens=True)

    return resp, chat_history_ids


In [26]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


# tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
# model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

tokenizer = AutoTokenizer.from_pretrained("fine-tuned-dialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("fine-tuned-dialoGPT-medium")


# Let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens,
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))


>> User:how to get admission in MIT?
DialoGPT: You need to be accepted by MIT to apply
>> User:What are the hostel facilities at Manipal Institute of Technology?
DialoGPT: The hostel is in the main building of the campus
>> User:Can international students apply for B.Tech at Manipal Institute of Technology?
DialoGPT: Yes, international students can apply for B.Tech at Manipal Institute of Technology
>> User:What is the selection process for BTech at MIT Manipal?
DialoGPT: There is a selection process for all courses at MIT
>> User:thank you
DialoGPT: You are welcome
