Model training for research purposes<br>
By Aashik Baruwal

In [None]:
# Install the `transformers` and `torch` packages into the notebook runtime.
!pip install transformers torch

In [None]:
from google.colab import files

# Prompt for the question-generation script first, then for the concept-map
# JSON files. Every prompt is followed by its own files.upload() call, so
# `uploaded` ends up holding the result of the last call only.
for upload_prompt in (
    "Upload your generate_questions_from_map.py script:",
    "Upload your concept map JSON files (upload as many as you need):",
):
    print(upload_prompt)
    uploaded = files.upload()

In [4]:
import os
import shutil

# Make sure the folder layout exists before anything is moved into it.
for directory in (
    'scripts/data/concept_maps',
    'scripts/data/generated_questions',
    'scripts',
):
    os.makedirs(directory, exist_ok=True)

# Sort the entries of the current working directory into that layout:
# the generator script goes to scripts/, every *.json file goes to the
# concept-map folder, and everything else is left where it is.
for fname in os.listdir():
    if fname == 'generate_questions_from_map.py':
        shutil.move(fname, f'scripts/{fname}')
    elif fname.endswith('.json'):
        shutil.move(fname, f'scripts/data/concept_maps/{fname}')

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
!python scripts/generate_questions_from_map.py


In [None]:
# Upgrade (or install) `transformers` and `datasets`; -q keeps pip's output quiet.
!pip install --upgrade transformers datasets -q
import pandas as pd
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import torch
import os



def build_qa_samples(df):
    """Turn concept-map question rows into SQuAD-style extractive QA samples.

    Each row must provide the columns ``Source``, ``Verb``, ``Target``,
    ``Question`` and ``Answer``. The context is the simple sentence
    ``"<Source> <Verb> <Target>."`` and the answer is located inside it with
    a case-insensitive search.

    Rows are skipped when:
      * any of the required cells is missing (otherwise the missing value
        would be formatted as the literal text "nan" and could even be
        "found" inside words such as "banana"),
      * the answer is empty after stripping whitespace,
      * the answer does not occur in the constructed context.

    Args:
        df: pandas DataFrame holding the question rows.

    Returns:
        A list of dicts with the keys ``context``, ``question`` and
        ``answers`` (``{'text': [str], 'answer_start': [int]}``). The stored
        answer text is the exact slice of the context, so it always agrees
        with ``answer_start`` even when the CSV answer differs in case.

    Raises:
        KeyError: if one of the required columns is absent from ``df``.
    """
    required_columns = ['Source', 'Verb', 'Target', 'Question', 'Answer']
    complete_rows = df.dropna(subset=required_columns)

    samples = []
    for row in complete_rows.to_dict('records'):
        # Simple context construction: Source Verb Target.
        context = f"{row['Source']} {row['Verb']} {row['Target']}."
        answer = str(row['Answer']).strip()
        if not answer:
            # str.find('') returns 0, which would create an empty-answer sample.
            continue

        # Case-insensitive search for robustness.
        answer_start = context.lower().find(answer.lower())
        if answer_start == -1:
            # Extractive QA needs the answer to be present in the context.
            continue

        samples.append({
            'context': context,
            'question': row['Question'],
            'answers': {
                'text': [context[answer_start:answer_start + len(answer)]],
                'answer_start': [answer_start],
            },
        })
    return samples


# You can replace this with your actual path
cs_csv_path = '/content/scripts/data/generated_questions/cs_concept_map_questions.csv'
sc_csv_path = '/content/scripts/data/generated_questions/sc_concept_map_questions.csv'

# Start from a known state so that a failed load can never silently reuse
# DataFrames left over from an earlier run of this cell.
cs_df = None
sc_df = None

# Loading the CSVs
try:
    cs_df = pd.read_csv(cs_csv_path)
    sc_df = pd.read_csv(sc_csv_path)
except FileNotFoundError:
    print("‚ùå Error: Make sure your CSV files are uploaded or Drive is mounted and paths are correct.")
    print(f"Expected paths: {cs_csv_path}, {sc_csv_path}")
else:
    print("‚úÖ Successfully loaded CSV files.")

# `dataset` stays None unless every step below succeeds; the rest of the
# cell checks it before loading the model or training.
dataset = None

# We will Combine both datasets
if cs_df is not None and sc_df is not None:
    df = pd.concat([cs_df, sc_df], ignore_index=True)
    print(f"Combined dataframe shape: {df.shape}")

    # Prepare samples for QA training
    samples = build_qa_samples(df)
    print(f"Prepared {len(samples)} samples for training.")

    # Convert the training samples into a Huggingface Dataset
    if samples:
        dataset = Dataset.from_list(samples)
        print("‚úÖ Converted samples to Huggingface Dataset.")
        print(dataset)
    else:
        print("‚ùå No valid training samples were prepared. Please check your CSV data and the data preparation logic.")
else:
    print("‚ùå DataFrames were not loaded. Cannot proceed with data preparation.")

# Our selected model for low-end resource settings
# (the checkpoint name passed to both from_pretrained() calls below).
model_name = "distilbert-base-uncased-distilled-squad"


# Only load the tokenizer and the QA model when a dataset was prepared;
# otherwise there is nothing to tokenize or train on.
if dataset is not None:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    print(f"‚úÖ Loaded tokenizer and model: {model_name}")

    # Define a function to preprocess the dataset for tokenization
    def find_answer_token_span(offsets, sequence_ids, start_char, end_char):
        """Locate the context tokens that cover an answer's character span.

        Args:
            offsets: per-token ``(char_start, char_end)`` pairs for one
                encoded example; ``char_end`` is treated as exclusive.
            sequence_ids: per-token sequence id for the same example; only
                tokens whose id is ``1`` (the context) are considered.
            start_char: index of the first answer character in the context.
            end_char: index one past the last answer character.

        Returns:
            ``(start_token, end_token)`` as inclusive token indices, or
            ``None`` when the answer is empty or is not fully covered by
            context tokens (for example because it was truncated away).
        """
        if end_char <= start_char:
            return None

        token_start_index = -1
        token_end_index = -1
        for token_index, span in enumerate(offsets):
            if sequence_ids[token_index] != 1:
                # Question tokens, special tokens and padding are not candidates.
                continue
            # int() lets this accept tuples/lists as well as tensor rows.
            token_char_start, token_char_end = int(span[0]), int(span[1])

            # Strict upper bound: a token that merely ENDS where the answer
            # starts does not contain the first answer character.
            if token_start_index == -1 and token_char_start <= start_char < token_char_end:
                token_start_index = token_index

            # Strict lower bound: a token that merely STARTS at the exclusive
            # end offset (e.g. the "." right after the answer) is not part of
            # the answer.
            if token_char_start < end_char <= token_char_end:
                token_end_index = token_index

        if token_start_index == -1 or token_end_index == -1:
            return None
        return token_start_index, token_end_index

    # Define a function to preprocess the dataset for tokenization
    def preprocess_function(examples):
        """Tokenize a batch of QA examples and attach answer token labels.

        Args:
            examples: batch dict with the keys ``question``, ``context`` and
                ``answers``; each answer provides ``text`` and
                ``answer_start`` lists whose first element is used.

        Returns:
            The tokenizer output for the batch (without ``offset_mapping``)
            extended with ``start_positions`` and ``end_positions`` tensors.
            Examples whose answer cannot be located in the tokenized context
            are labelled ``(0, 0)``, i.e. the first token of the sequence.
        """
        questions = examples["question"]
        contexts = examples["context"]
        answers = examples["answers"]

        # Tokenize the questions and contexts
        inputs = tokenizer(
            questions,
            contexts,
            truncation=True,
            padding='max_length',
            max_length=128,  # Keep max length reasonable for low resource
            return_offsets_mapping=True,
            return_tensors='pt'
        )

        # Offsets are only needed to compute the labels, not by the model.
        offset_mappings = inputs.pop("offset_mapping")

        start_positions = []
        end_positions = []

        for i, offset in enumerate(offset_mappings):
            answer = answers[i]
            start_char = answer["answer_start"][0]
            # Exclusive end: start index plus the length of the answer text.
            end_char = start_char + len(answer["text"][0])

            span = find_answer_token_span(
                offset, inputs.sequence_ids(i), start_char, end_char
            )
            if span is None:
                # Answer not found in the context tokens: point both labels
                # at position 0, as done for unanswerable SQuAD questions.
                span = (0, 0)

            start_positions.append(span[0])
            end_positions.append(span[1])

        # Add start and end positions to the inputs
        inputs["start_positions"] = torch.tensor(start_positions)
        inputs["end_positions"] = torch.tensor(end_positions)

        return inputs

    # Apply the preprocessing function to the dataset in batches. The raw
    # text columns are removed, so only the values returned by
    # preprocess_function remain.
    # No second `dataset is not None` check is needed: this code already sits
    # inside the enclosing `if dataset is not None:` branch, so the previous
    # inner `else` could never run.
    tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)
    print("‚úÖ Successfully tokenized and preprocessed dataset.")
    print(tokenized_dataset)

else:
    print("‚ùå Dataset is None. Skipping model loading and tokenization.")
    tokenizer = None
    model = None
    tokenized_dataset = None


# Train only when both the model and the tokenized dataset are available.
if model is not None and tokenized_dataset is not None:
    import inspect  # local import: only used to probe the Trainer signature

    # Define training arguments for the model
    training_args = TrainingArguments(
        output_dir="./qa_model",  # Directory to save the model checkpoints
        per_device_train_batch_size=8,  # Batch size per device - adjust based on Colab GPU RAM
        num_train_epochs=2,  # Number of training epochs - start small
        save_steps=200,  # Save the model every 200 steps - adjust based on dataset size
        save_total_limit=1,  # Keep only the latest checkpoint
        logging_steps=100,  # Log training progress every 100 steps
        learning_rate=3e-5,  # Learning rate
        weight_decay=0.01,  # Weight decay
        disable_tqdm=False,  # Enable progress bars
        push_to_hub=False,  # Do not push the model to Huggingface Hub
        # Optional: Add evaluation if you have a separate eval dataset
        # evaluation_strategy="steps",
        # eval_steps=200,
        # load_best_model_at_end=True, # Requires evaluation strategy
    )

    # This notebook installs transformers with `--upgrade`, so the installed
    # version is not fixed. Newer releases take the tokenizer through
    # `processing_class` and deprecate the older `tokenizer` keyword, so the
    # keyword is chosen from whatever the installed Trainer actually accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    if "processing_class" in trainer_params:
        tokenizer_kwarg = "processing_class"
    else:
        tokenizer_kwarg = "tokenizer"

    # Initialize the Trainer for model training
    trainer = Trainer(
        model=model,  # The QA model
        args=training_args,  # Training arguments
        train_dataset=tokenized_dataset,  # Tokenized training dataset
        **{tokenizer_kwarg: tokenizer},  # Tokenizer, under the supported keyword
    )

    # Train the model using the Trainer
    print("üöÄ Starting model training...")
    trainer.train()
    print("‚úÖ Model training complete.")

    # Persist the final model and tokenizer side by side so the directory can
    # be loaded again as a whole.
    final_model_dir = "./qa_model_final"
    trainer.save_model(final_model_dir)  # Save the model
    tokenizer.save_pretrained(final_model_dir)  # Save the tokenizer

    print(f"‚úÖ Final model and tokenizer saved at {final_model_dir}")
else:
    print("‚ùå Cannot proceed with training because dataset or model was not loaded.")


In [9]:
# Zip the saved model directory
model_dir_to_zip = "./qa_model_final"
zip_filename = "qa_model_final.zip"

# Use a shell command to create the zip file
!zip -r {zip_filename} {model_dir_to_zip}

print(f"‚úÖ Zipped model to {zip_filename}")

# Download the zip file
from google.colab import files

zip_filename = "qa_model_final.zip" # Ensure this matches the filename used in the zipping step

try:
    files.download(zip_filename)
    print(f"‚úÖ Initiated download of {zip_filename}. Check your browser's downloads.")
except FileNotFoundError:
    print(f"‚ùå Error: {zip_filename} not found. Make sure the zipping step completed successfully.")

  adding: qa_model_final/ (stored 0%)
  adding: qa_model_final/special_tokens_map.json (deflated 42%)
  adding: qa_model_final/tokenizer_config.json (deflated 75%)
  adding: qa_model_final/model.safetensors (deflated 8%)
  adding: qa_model_final/training_args.bin (deflated 52%)
  adding: qa_model_final/config.json (deflated 43%)
  adding: qa_model_final/tokenizer.json (deflated 71%)
  adding: qa_model_final/vocab.txt (deflated 53%)
‚úÖ Zipped model to qa_model_final.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úÖ Initiated download of qa_model_final.zip. Check your browser's downloads.
