Fine-tuning

In [1]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import json

# Step 1: Load the Preprocessed Dataset
def load_data(json_file):
    """
    Load the dataset from a JSON file and convert it to InputExample format.
    """
    with open(json_file, "r", encoding="utf-8") as file:
        data = json.load(file)
    
    # Convert each entry to InputExample
    examples = [
        InputExample(
            texts=[item["sentence1"], item["sentence2"]],
            label=float(item["label"])
        )
        for item in data
    ]
    return examples

# Step 2: Fine-Tune the Model
def fine_tune_model(train_examples, model_name, output_dir, batch_size=32, epochs=3):
    """
    Fine-tune a pre-trained SentenceTransformer model using the provided examples.
    """
    # Load a pre-trained SentenceTransformer model
    model = SentenceTransformer(model_name)

    # Create a DataLoader
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)

    # Define the loss function (CosineSimilarityLoss)
    train_loss = losses.CosineSimilarityLoss(model)

    # Fine-tune the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=100,
        show_progress_bar=True
    )

    # Save the fine-tuned model
    model.save(output_dir)
    print(f"Fine-tuned model saved to {output_dir}")

# Main Script
if __name__ == "__main__":
    # File paths
    input_json_file = "semantic_search_data.json"  # Replace with your JSON file
    model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Base model
    output_dir = "fine_tuned_semantic_search_model"  # Directory to save the model

    # Parameters
    batch_size = 32
    epochs = 3

    # Load data
    train_examples = load_data(input_json_file)
    print(f"Loaded {len(train_examples)} training examples.")

    # Fine-tune the model
    fine_tune_model(train_examples, model_name, output_dir, batch_size, epochs)


  from .autonotebook import tqdm as notebook_tqdm



Loaded 1001 training examples.


100%|██████████| 96/96 [08:51<00:00,  5.54s/it]


{'train_runtime': 531.7817, 'train_samples_per_second': 5.647, 'train_steps_per_second': 0.181, 'train_loss': 0.24934768676757812, 'epoch': 3.0}


                                                                             

Fine-tuned model saved to fine_tuned_semantic_search_model


Validating

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np

# Step 1: Load the Fine-Tuned Model
def load_model(model_path):
    """
    Load the fine-tuned SentenceTransformer model.
    """
    return SentenceTransformer(model_path)

# Step 2: Load the Validation Dataset
def load_validation_data(json_file):
    """
    Load the validation dataset from the JSON file.
    """
    with open(json_file, "r", encoding="utf-8") as file:
        data = json.load(file)
    
    # Extract unique questions and answers for validation
    questions = list(set(item["sentence1"] for item in data))
    answers = list(set(item["sentence2"] for item in data))
    return questions, answers

# Step 3: Encode Questions and Answers
def encode_texts(model, texts):
    """
    Encode a list of texts using the model.
    """
    return model.encode(texts, convert_to_tensor=True)

# Step 4: Perform Semantic Search
def semantic_search(question, question_embedding, answers, answer_embeddings, top_k=3):
    """
    Perform semantic search to find the most similar answers for a given question.
    """
    # Compute cosine similarities
    similarities = cosine_similarity(
        question_embedding.detach().numpy().reshape(1, -1), 
        answer_embeddings.detach().numpy()
    )[0]

    # Get top-k answers
    top_k_indices = np.argsort(similarities)[-top_k:][::-1]
    top_k_answers = [(answers[i], similarities[i]) for i in top_k_indices]
    return top_k_answers

# Step 5: Validate the Model
def validate_model(model, questions, answers, top_k=3):
    """
    Validate the model by encoding and retrieving the most relevant answers for each question.
    """
    print("Encoding questions and answers...")
    question_embeddings = encode_texts(model, questions)
    answer_embeddings = encode_texts(model, answers)

    print("\nPerforming semantic search for validation...")
    for question in questions[:5]:  # Validate on a subset of questions
        question_embedding = model.encode(question, convert_to_tensor=True)
        results = semantic_search(question, question_embedding, answers, answer_embeddings, top_k=top_k)
        
        print(f"\nQuestion: {question}")
        print("Top Answers:")
        for idx, (answer, similarity) in enumerate(results):
            print(f"{idx+1}. {answer} (Similarity: {similarity:.4f})")

# Main Script
if __name__ == "__main__":
    # File paths
    model_path = "fine_tuned_semantic_search_model"  # Path to fine-tuned model
    validation_json_file = "semantic_search_data.json"  # JSON file used for validation

    # Parameters
    top_k = 3  # Number of top answers to retrieve

    # Load the fine-tuned model
    model = load_model(model_path)
    print("Model loaded.")

    # Load the validation data
    questions, answers = load_validation_data(validation_json_file)
    print(f"Loaded {len(questions)} unique questions and {len(answers)} unique answers.")

    # Validate the model
    validate_model(model, questions, answers, top_k=top_k)


Model loaded.
Loaded 987 unique questions and 987 unique answers.
Encoding questions and answers...

Performing semantic search for validation...

Question: តើមូលនិធិអភិវឌ្ឍន៍ព្រៃឈើជាតិមានប្រភពចំណូលមកពីអ្វីខ្លះ ?
Top Answers:
1. ក្រសួងការបរទេសនិងសហប្រតិបត្តិការអន្តរជាតិស្ថិតនៅក្នុងវិស័យរដ្ឋបាល ។ (Similarity: 0.7895)
2. នៅជនបទកម្ពុជាប្រភេទឥន្ធនៈសំខាន់ជាងគេសម្រាប់ចម្អិនអាហារនោះអុស ។ (Similarity: 0.7895)
3. ដើម្បីឱ្យអាពាហ៍ពិពាហ៍ប្រព្រឹត្តទៅតាមផ្លូវច្បាប់គូស្រករទាំងពីរត្រូវទៅចុះកិច្ចសន្យាអាពាហ៍ពិពាហ៍នៅមុខមន្ត្រីអត្រានុកូលដ្ឋាននៅគេដ្ឋានខាងស្រី ។ (Similarity: 0.7895)

Question: តើចំណែកនៃភាគបម្រុងមានអ្វីខ្លះ ?
Top Answers:
1. ក្រសួងការបរទេសនិងសហប្រតិបត្តិការអន្តរជាតិស្ថិតនៅក្នុងវិស័យរដ្ឋបាល ។ (Similarity: 0.7895)
2. នៅជនបទកម្ពុជាប្រភេទឥន្ធនៈសំខាន់ជាងគេសម្រាប់ចម្អិនអាហារនោះអុស ។ (Similarity: 0.7895)
3. ដើម្បីឱ្យអាពាហ៍ពិពាហ៍ប្រព្រឹត្តទៅតាមផ្លូវច្បាប់គូស្រករទាំងពីរត្រូវទៅចុះកិច្ចសន្យាអាពាហ៍ពិពាហ៍នៅមុខមន្ត្រីអត្រានុកូលដ្ឋាននៅគេដ្ឋានខាងស្រី ។ (Similarity: 0.7895)

Question: តើដូចម្ដេចដែលហៅថាខ្មាំង

Unique question validating

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np

# Step 1: Load the Fine-Tuned Model
def load_model(model_path):
    """
    Load the fine-tuned SentenceTransformer model.
    """
    return SentenceTransformer(model_path)

# Step 2: Load the Answers Dataset
def load_answers(json_file):
    """
    Load the answers from the JSON file.
    """
    with open(json_file, "r", encoding="utf-8") as file:
        data = json.load(file)
    
    # Extract unique answers
    answers = list(set(item["sentence2"] for item in data))
    return answers

# Step 3: Encode Texts
def encode_texts(model, texts):
    """
    Encode a list of texts using the model.
    """
    return model.encode(texts, convert_to_tensor=True)

# Step 4: Perform Semantic Search
def semantic_search(question, question_embedding, answers, answer_embeddings, top_k=3):
    """
    Perform semantic search to find the most similar answers for a given question.
    """
    # Compute cosine similarities
    similarities = cosine_similarity(
        question_embedding.detach().numpy().reshape(1, -1), 
        answer_embeddings.detach().numpy()
    )[0]

    # Get top-k answers
    top_k_indices = np.argsort(similarities)[-top_k:][::-1]
    top_k_answers = [(answers[i], similarities[i]) for i in top_k_indices]
    return top_k_answers

# Main Script
if __name__ == "__main__":
    # File paths
    model_path = "fine_tuned_semantic_search_model"  # Path to the fine-tuned model
    validation_json_file = "semantic_search_data.json"  # JSON file containing answers

    # Parameters
    top_k = 3  # Number of top answers to retrieve
    specific_question = "ដូចម្តេចដែលហៅថា ការប្រើប្រាស់បច្ចេកទេសទំនើប ?"  # The question you want to validate

    # Load the fine-tuned model
    model = load_model(model_path)
    print("Model loaded.")

    # Load answers
    answers = load_answers(validation_json_file)
    print(f"Loaded {len(answers)} unique answers.")

    # Encode the specific question
    print(f"\nEncoding the question: {specific_question}")
    question_embedding = encode_texts(model, [specific_question])

    # Encode all answers
    print("Encoding answers...")
    answer_embeddings = encode_texts(model, answers)

    # Perform semantic search
    print("\nPerforming semantic search...")
    results = semantic_search(specific_question, question_embedding, answers, answer_embeddings, top_k=top_k)

    # Display results
    print(f"\nQuestion: {specific_question}")
    print("Top Answers:")
    for idx, (answer, similarity) in enumerate(results):
        print(f"{idx+1}. {answer} (Similarity: {similarity:.4f})")


Model loaded.
Loaded 987 unique answers.

Encoding the question: តើ​អធិប​តេយ្យ​មាន​ន័យ​ដូចម្ដេច?
Encoding answers...

Performing semantic search...

Question: តើ​អធិប​តេយ្យ​មាន​ន័យ​ដូចម្ដេច?
Top Answers:
1. ក្រសួងការបរទេសនិងសហប្រតិបត្តិការអន្តរជាតិស្ថិតនៅក្នុងវិស័យរដ្ឋបាល ។ (Similarity: 0.7895)
2. នៅជនបទកម្ពុជាប្រភេទឥន្ធនៈសំខាន់ជាងគេសម្រាប់ចម្អិនអាហារនោះអុស ។ (Similarity: 0.7895)
3. ដើម្បីឱ្យអាពាហ៍ពិពាហ៍ប្រព្រឹត្តទៅតាមផ្លូវច្បាប់គូស្រករទាំងពីរត្រូវទៅចុះកិច្ចសន្យាអាពាហ៍ពិពាហ៍នៅមុខមន្ត្រីអត្រានុកូលដ្ឋាននៅគេដ្ឋានខាងស្រី ។ (Similarity: 0.7895)
