# Chat Bot Law Support

## Installing the necessary libraries

In [1]:
# Install Hugging Face Transformers, Datasets, and Accelerate
!pip install transformers datasets accelerate

# Install PyTorch (should be pre-installed, but just in case)
!pip install torch

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Co

## Base model training

In [28]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
from sklearn.model_selection import train_test_split

# Load the CSV file
csv_file = './legal_qa.csv'
df = pd.read_csv(csv_file)

# Prepare the dataset
df['text'] = df['Question'] + " " + df['Answer']
train_df, test_df = train_test_split(df[['text']], test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Add a padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the data
def tokenize_function(examples):
    encodings = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

    encodings['labels'] = encodings['input_ids'].copy()
    return encodings

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Ensuring the datasets are formatted correctly
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=150,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=5,
    learning_rate=3e-5,
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    weight_decay=0.01,
    load_best_model_at_end=True,
    fp16=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Early stopping if no improvement for 3 evals
)

# Fine-tune the model
trainer.train()


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
200,0.1279,0.551767
400,0.069,0.652239
600,0.0539,0.682327
800,0.0481,0.7343


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=800, training_loss=0.2382890198007226, metrics={'train_runtime': 103.3791, 'train_samples_per_second': 116.078, 'train_steps_per_second': 29.019, 'total_flos': 209033625600000.0, 'train_loss': 0.2382890198007226, 'epoch': 40.0})

In [29]:
# Load the best model checkpoint manually
best_checkpoint = "./results/checkpoint-800"
model = AutoModelForCausalLM.from_pretrained(best_checkpoint)

# Save this best model
model.save_pretrained("./fine-tuned-gpt2-best")
tokenizer.save_pretrained("./fine-tuned-gpt2-best")

('./fine-tuned-gpt2-best/tokenizer_config.json',
 './fine-tuned-gpt2-best/special_tokens_map.json',
 './fine-tuned-gpt2-best/vocab.json',
 './fine-tuned-gpt2-best/merges.txt',
 './fine-tuned-gpt2-best/added_tokens.json',
 './fine-tuned-gpt2-best/tokenizer.json')

In [36]:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned model and tokenizer
model_path = "./fine-tuned-gpt2-best"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Function to generate a response
def generate_response(prompt, max_length=150, num_beams=5, early_stopping=True):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        num_beams=num_beams,  # Beam search for better quality
        early_stopping=early_stopping,
        pad_token_id=tokenizer.eos_token_id  # Set pad token id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Test the chatbot with some example questions
example_questions = [
    "What is a bench warrant?"
]

# Generate and print responses for the example questions
for question in example_questions:
    print(f"Question: {question}")
    print(f"Response: {generate_response(question)}")
    print("-" * 50)

Question: What is a bench warrant?
Response: What is a bench warrant? A bench warrant is a court order authorizing the enforcement of a judgment, typically involving the seizure of property to satisfy a debt.
--------------------------------------------------


## Generating more synthetic data

In [39]:
import pandas as pd

# Load the current dataset
file_path = "./legal_qa.csv"
df = pd.read_csv(file_path)

# Basic general questions and answers
general_qa = [
    {"Question": "Hello, how are you?", "Answer": "I'm just a bot, but I'm here to help! How can I assist you today?"},
    {"Question": "What is your name?", "Answer": "I'm your friendly legal assistant chatbot."},
    {"Question": "What can you do?", "Answer": "I can help answer your questions about legal topics. Ask me anything!"},
    {"Question": "Can you tell me a joke?", "Answer": "Why don't scientists trust atoms? Because they make up everything!"},
    {"Question": "Who created you?", "Answer": "I was created by a team of developers to assist with legal information."},
    {"Question": "How can I contact a lawyer?", "Answer": "You can contact a lawyer through various online legal services or by searching for local legal firms."},
    {"Question": "What is the time?", "Answer": "I don't have access to real-time data, but you can check the time on your device."},
    {"Question": "Can you help with medical advice?", "Answer": "I'm specialized in legal information. For medical advice, please consult a healthcare professional."},
    {"Question": "What is the capital of France?", "Answer": "The capital of France is Paris."},
    {"Question": "How do I reset my password?", "Answer": "For password resets, you should follow the instructions provided by the service you're using."}
]

# Augment existing questions by rephrasing
augmented_qa = []
for idx, row in df.iterrows():
    question = row['Question']
    answer = row['Answer']
    augmented_qa.append({"Question": question, "Answer": answer})
    augmented_qa.append({"Question": f"Can you explain how to {question.lower()}?", "Answer": answer})
    augmented_qa.append({"Question": f"What steps are involved in {question.lower()}?", "Answer": answer})
    augmented_qa.append({"Question": f"Tell me the process of {question.lower()}.", "Answer": answer})
    augmented_qa.append({"Question": f"How can I {question.lower()} legally?", "Answer": answer})

# Combine original, general, and augmented data
combined_qa = pd.DataFrame(general_qa + augmented_qa)

combined_qa = combined_qa.sample(n=100, random_state=42)

# Save the new dataset
combined_qa.to_csv("./combined_legal_qa.csv", index=False)
print("Synthetic data generated and saved to /combined_legal_qa.csv")


Synthetic data generated and saved to /combined_legal_qa.csv


## Incremental Training

In [67]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned model and tokenizer
model_path = "./fine-tuned-gpt2-best"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

In [53]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
from sklearn.model_selection import train_test_split

# Load the CSV file
csv_file = './combined_legal_qa.csv'
df = pd.read_csv(csv_file)

# Prepare the dataset
df['text'] = df['Question'] + " " + df['Answer']
train_df, test_df = train_test_split(df[['text']], test_size=0.1, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


# Add a padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the data
def tokenize_function(examples):
    encodings = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

    encodings['labels'] = encodings['input_ids'].copy()
    return encodings

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Ensure the datasets are formatted correctly
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=150,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=5,
    learning_rate=3e-5,
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    weight_decay=0.01,
    load_best_model_at_end=True,
    fp16=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Early stopping if no improvement for 3 evals
)

# Fine-tune the model
trainer.train()


Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
200,0.0213,0.195321
400,0.0217,0.202602


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=450, training_loss=0.040164023141066235, metrics={'train_runtime': 55.9265, 'train_samples_per_second': 24.139, 'train_steps_per_second': 8.046, 'total_flos': 88186060800000.0, 'train_loss': 0.040164023141066235, 'epoch': 150.0})

In [54]:
# Load the best model checkpoint manually
best_checkpoint = "./results/checkpoint-200"
model = AutoModelForCausalLM.from_pretrained(best_checkpoint)

# Save this best model
model.save_pretrained("./fine-tuned-gpt2-best")
tokenizer.save_pretrained("./fine-tuned-gpt2-best")

('./fine-tuned-gpt2-best/tokenizer_config.json',
 './fine-tuned-gpt2-best/special_tokens_map.json',
 './fine-tuned-gpt2-best/vocab.json',
 './fine-tuned-gpt2-best/merges.txt',
 './fine-tuned-gpt2-best/added_tokens.json',
 './fine-tuned-gpt2-best/tokenizer.json')

## Testing the model on sample inputs

In [66]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned model and tokenizer
model_path = "./fine-tuned-gpt2-best"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Function to generate a response
def generate_response(prompt, max_length=150, num_beams=5, early_stopping=True):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        num_beams=num_beams,  # Beam search for better quality
        early_stopping=early_stopping,
        pad_token_id=tokenizer.eos_token_id  # Set pad token id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Test the chatbot with some example questions
example_questions = [
    "Hi i am not feeling well."
]

# Generate and print responses for the example questions
for question in example_questions:
    print(f"Question: {question}")
    print(f"Response: {generate_response(question)}")
    print("-" * 50)

Question: Hi i am not feeling well.
Response: Hi i am not feeling well. How can i help?
--------------------------------------------------


## Saving model to cloud

In [68]:
from google.colab import drive

# Load the fine-tuned model and tokenizer
model_path = "./fine-tuned-gpt2-best"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Mount Google Drive
drive.mount('/content/drive')

# Path to save in Google Drive
save_path = '/content/drive/MyDrive/Chatbot/models/Final_Model/Chat_Bot_Law_GPT2'


# Save model and tokenizer to Google Drive
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f'Model and tokenizer saved to {save_path}')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model and tokenizer saved to /content/drive/MyDrive/Chatbot/models/Final_Model/Chat_Bot_Law_GPT2


## Metrics Evaluation

In [74]:
!pip install nltk
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=6e1255c45a22e1d031e62b593bdc96087ae4232a9d590dc558de6703bd9f5dbb
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [71]:
import pandas as pd
import glob


folder_path = "./"


csv_files = glob.glob(folder_path + "*.csv")

# Print the list of found CSV files
print(f"Found CSV files: {csv_files}")

if not csv_files:
    print("No CSV files found in the specified directory.")
else:
    # Load and concatenate all CSV files
    dataframes = [pd.read_csv(file) for file in csv_files]
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Save the combined DataFrame to a new CSV file
    combined_csv_path = "./testdata_legal_qa.csv"
    combined_df.to_csv(combined_csv_path, index=False)
    print(f"Combined CSV file saved to {combined_csv_path}")



Found CSV files: ['./combined_legal_qa.csv', './legal_qa.csv', './augmented_legal_qa.csv', './real_estate_questions_answers.csv']
Combined CSV file saved to ./testdata_legal_qa.csv


In [78]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Download NLTK data for METEOR
nltk.download('wordnet')
nltk.download('punkt')

# Load the fine-tuned model and tokenizer
model_path = "./fine-tuned-gpt2-best"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Function to generate a response
def generate_response(prompt, max_length=150, num_beams=5, early_stopping=True):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        num_beams=num_beams,  # Beam search for better quality
        early_stopping=early_stopping,
        pad_token_id=tokenizer.eos_token_id  # Set pad token id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Load the combined dataset
combined_csv_path = "./testdata_legal_qa.csv"
combined_df = pd.read_csv(combined_csv_path)

# Initialize metrics
bleu_scores = []
rouge_l_scores = []
meteor_scores = []

# Initialize ROUGE scorer
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# Function to calculate METEOR score
def meteor_score(reference, hypothesis):
    reference_tokens = nltk.word_tokenize(reference)
    hypothesis_tokens = nltk.word_tokenize(hypothesis)
    return nltk.translate.meteor_score.meteor_score([reference_tokens], hypothesis_tokens)

# Evaluate the model
for index, row in combined_df.iterrows():
    question = row['Question']
    actual_answer = row['Answer']
    generated_answer = generate_response(question)

    # Calculate BLEU score
    smoothing_function = SmoothingFunction().method4
    bleu = sentence_bleu([actual_answer.split()], generated_answer.split(), smoothing_function=smoothing_function)
    bleu_scores.append(bleu)

    # Calculate ROUGE-L score
    rouge_l = rouge.score(actual_answer, generated_answer)['rougeL'].fmeasure
    rouge_l_scores.append(rouge_l)

    # Calculate METEOR score
    meteor = meteor_score(actual_answer, generated_answer)
    meteor_scores.append(meteor)


# Calculate average scores
average_bleu = sum(bleu_scores) / len(bleu_scores)
average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
average_meteor = sum(meteor_scores) / len(meteor_scores)

# Print average scores
print(f"Average BLEU Score: {average_bleu}")
print(f"Average ROUGE-L Score: {average_rouge_l}")
print(f"Average METEOR Score: {average_meteor}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Average BLEU Score: 0.5407452461414209
Average ROUGE-L Score: 0.6936331744368349
Average METEOR Score: 0.7691570935387103
