<a href="https://colab.research.google.com/github/bhupesh-varma/PhysiSolve/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Step - 1
### spplitting the dataset into training, testing and evaluation

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import json
import random
from sklearn.model_selection import train_test_split
from collections import Counter

# Set random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Load the dataset
with open(r"/content/drive/MyDrive/dataset/high_school_physics.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract the 'subject' field for stratification
subjects = [item["subject"] for item in data]

# Verify initial distribution
print("Original subject distribution:", Counter(subjects))

# First split: 70% train, 30% temp (test + eval)
train_data, temp_data = train_test_split(
    data,
    train_size=0.7,
    stratify=subjects,
    random_state=RANDOM_SEED
)

# Second split: Split the 30% temp into 15% test and 15% eval (50/50 of temp)
test_data, eval_data = train_test_split(
    temp_data,
    test_size=0.5,  # 50% of 30% = 15% of original
    stratify=[item["subject"] for item in temp_data],
    random_state=RANDOM_SEED
)

# Save train, test, and evaluation sets
with open(r"/content/drive/MyDrive/dataset/train.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, indent=4)
with open(r"/content/drive/MyDrive/dataset/test.json", "w", encoding="utf-8") as f:
    json.dump(test_data, f, indent=4)
with open(r"/content/drive/MyDrive/dataset/eval.json", "w", encoding="utf-8") as f:
    json.dump(eval_data, f, indent=4)

# Print split sizes and subject distribution
print(f"\nDataset split into {len(train_data)} training, {len(test_data)} testing, and {len(eval_data)} evaluation samples.")
print("\nSubject distribution in each split:")
print("Train:", Counter([item["subject"] for item in train_data]))
print("Test:", Counter([item["subject"] for item in test_data]))
print("Eval:", Counter([item["subject"] for item in eval_data]))

Original subject distribution: Counter({'Electrostatics and Current Electricity': 76, 'Mechanics': 60, 'Kinematics': 55, 'Electromagnetism': 45, 'Thermodynamics': 44, 'Optics': 38, 'Atomic and Modern Physics': 30, 'Electronic Devices': 29, 'Periodic Motion': 13, 'Waves and Oscillations': 10})

Dataset split into 280 training, 60 testing, and 60 evaluation samples.

Subject distribution in each split:
Train: Counter({'Electrostatics and Current Electricity': 53, 'Mechanics': 42, 'Kinematics': 39, 'Thermodynamics': 31, 'Electromagnetism': 31, 'Optics': 27, 'Atomic and Modern Physics': 21, 'Electronic Devices': 20, 'Periodic Motion': 9, 'Waves and Oscillations': 7})
Test: Counter({'Electrostatics and Current Electricity': 11, 'Mechanics': 9, 'Kinematics': 8, 'Electromagnetism': 7, 'Thermodynamics': 6, 'Optics': 5, 'Atomic and Modern Physics': 5, 'Electronic Devices': 5, 'Periodic Motion': 2, 'Waves and Oscillations': 2})
Eval: Counter({'Electrostatics and Current Electricity': 12, 'Mechan

### Analysis of the Output
#### Original Distribution
- **Total samples**: 400
- **Subjects**:
  - Electrostatics and Current Electricity: 76
  - Mechanics: 60
  - Kinematics: 55
  - Electromagnetism: 45
  - Thermodynamics: 44
  - Optics: 38
  - Atomic and Modern Physics: 30
  - Electronic Devices: 29
  - Periodic Motion: 13
  - Waves and Oscillations: 10

#### Split Results
- **Train**: 280 samples (70%)
- **Test**: 60 samples (15%)
- **Eval**: 60 samples (15%)

#### Subject Distribution Across Splits
| Subject                          | Original | Train (70%) | Test (15%) | Eval (15%) |
|----------------------------------|----------|-------------|------------|------------|
| Electrostatics and Current Elec. | 76       | 53 (53.2)   | 11 (11.4)  | 12 (11.4)  |
| Mechanics                        | 60       | 42 (42)     | 9 (9)      | 9 (9)      |
| Kinematics                       | 55       | 39 (38.5)   | 8 (8.25)   | 8 (8.25)   |
| Electromagnetism                 | 45       | 31 (31.5)   | 7 (6.75)   | 7 (6.75)   |
| Thermodynamics                   | 44       | 31 (30.8)   | 6 (6.6)    | 7 (6.6)    |
| Optics                           | 38       | 27 (26.6)   | 5 (5.7)    | 6 (5.7)    |
| Atomic and Modern Physics        | 30       | 21 (21)     | 5 (4.5)    | 4 (4.5)    |
| Electronic Devices               | 29       | 20 (20.3)   | 5 (4.35)   | 4 (4.35)   |
| Periodic Motion                  | 13       | 9 (9.1)     | 2 (1.95)   | 2 (1.95)   |
| Waves and Oscillations           | 10       | 7 (7)       | 2 (1.5)    | 1 (1.5)    |


In [3]:
print(test_data[0])
print(test_data[3])

{'id': 298, 'question': 'What is the change in internal energy of an ideal gas during an isochoric process?', 'subject': 'Thermodynamics', 'choices': ['Zero', 'Positive', 'Negative', 'Depends on the process'], 'answer': 'D', 'explanation': 'In an isochoric process, the change in internal energy depends on the heat added to or removed from the gas.', 'dataset': 'high_school_physics'}
{'id': 206, 'question': 'An ideal gas is heated at constant volume. What happens to its pressure?', 'subject': 'Thermodynamics', 'choices': ['Increases', 'Decreases', 'Remains constant', 'Doubles'], 'answer': 'A', 'explanation': "According to Gay-Lussac's Law, P1/T1 = P2/T2. If temperature increases, pressure increases.", 'dataset': 'high_school_physics'}


In [4]:
!pip install -q transformers accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Zero shot Testing


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the Phi-3 Mini 4K model and tokenizer
model_id = "microsoft/phi-3-mini-4k-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # Automatically selects the device (CPU/GPU)
    torch_dtype=torch.float16,  # For memory-efficient execution on Colab
    trust_remote_code=True
)

# Function to evaluate the model's accuracy
def evaluate_model(model, dataset):
    correct = 0
    total = len(dataset)

    for item in dataset:
        question = item["question"]
        idx = ord(item["answer"]) - ord("A")  # Convert answer index (A, B, C, D) to index
        correct_answer = item["choices"][idx]

        # Create the prompt for Phi-3 to generate an answer
        prompt = f"Question: {question}\nChoices:\n"
        for i, choice in enumerate(item['choices']):
            prompt += f"{chr(65+i)}. {choice}\n"
        prompt += "Answer only (A, B, C, or D):"

        # Tokenize the prompt and generate a response
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=5, use_cache = False)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Compare predicted answer with the correct answer
        if correct_answer in prediction:
            correct += 1

    # Calculate accuracy
    accuracy = (correct / total) * 100
    return accuracy

# Test the model with the dataset
zero_shot_accuracy = evaluate_model(model, test_data)
print(f"Zero-Shot Accuracy: {zero_shot_accuracy:.2f}%")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]



Zero-Shot Accuracy: 100.00%


### Fine-Tuning PHi 3

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from torch.utils.data import Dataset
import json
import os
import time

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"

# Load model and tokenizer
model_id = "microsoft/phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

# Create custom dataset
class PhysicsDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = f"Question: {item['question']} Choices: {', '.join(item['choices'])} Answer:"
        idx = ord(item["answer"][0])-ord("A")
        target = item["choices"][idx]

        encodings = self.tokenizer(prompt, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        target_encodings = self.tokenizer(target, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")

        return {
            "input_ids": encodings["input_ids"].squeeze(),
            "attention_mask": encodings["attention_mask"].squeeze(),
            "labels": target_encodings["input_ids"].squeeze(),
        }

# Load json files
train_file_path = "/content/drive/MyDrive/dataset/train.json"
with open(train_file_path, "r", encoding="utf-8") as f:
    train_data = json.load(f)

# Prepare datasets
train_dataset = PhysicsDataset(train_data, tokenizer)

# Load evaluation data
eval_file_path = "/content/drive/MyDrive/dataset/eval.json"
with open(eval_file_path, "r", encoding="utf-8") as f:
    eval_data = json.load(f)

eval_dataset = PhysicsDataset(eval_data, tokenizer)

# Define output directories
model_save_dir = "/content/drive/MyDrive/dataset/trained_model"
results_dir = "/content/drive/MyDrive/dataset/results"

# Create directories if they don't exist
os.makedirs(model_save_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir=results_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    run_name=f"phi3-mini-{time.strftime('%Y%m%d-%H%M%S')}",
    report_to="none",
)

# Train model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

# Save the trained model and tokenizer properly
model.save_pretrained(model_save_dir)
tokenizer.save_pretrained(model_save_dir)
print(f"Model and tokenizer saved to {model_save_dir}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Testing the trained model on test.json

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import json
import torch
import os
from tqdm import tqdm

# Paths
model_save_dir = "/content/drive/MyDrive/dataset/trained_model"
test_file_path = "/content/drive/MyDrive/dataset/test.json"  # Adjust if your test file has a different name

# First, let's check if the model files exist
print("Checking model directory contents:")
for root, dirs, files in os.walk(model_save_dir):
    for file in files:
        print(os.path.join(root, file))

# Load the model and tokenizer
try:
    model = T5ForConditionalGeneration.from_pretrained(model_save_dir)
    tokenizer = T5Tokenizer.from_pretrained(model_save_dir)
    print("Model and tokenizer loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    # If model loading fails, load the original pretrained model
    print("Loading the base model instead...")
    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large", legacy=False)

# Load test data
with open(test_file_path, "r", encoding="utf-8") as f:
    test_data = json.load(f)

print(f"Loaded {len(test_data)} test examples")

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Testing function
def evaluate_model(model, tokenizer, test_data, device, correct, total):

    for item in tqdm(test_data):
        prompt = f"Question: {item['question']} Choices: {', '.join(item['choices'])} Answer:"
        correct_idx = ord(item["answer"][0]) - ord("A")
        correct_answer = item["choices"][correct_idx]

        # Tokenize input
        input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=128).input_ids.to(device)

        # Generate output
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                max_length=128,
                num_beams=4,
                early_stopping=True
            )

        # Decode output
        predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Simple string match for evaluation
        if predicted_text.strip() == correct_answer.strip():
            correct += 1
        else:
            # Print some examples of wrong predictions for debugging
            if total < 5:  # Limit to just a few examples
                print(f"\nQuestion: {item['question']}")
                print(f"Choices: {', '.join(item['choices'])}")
                print(f"Correct Answer: {correct_answer}")
                print(f"Predicted: {predicted_text}")

        total += 1

    accuracy = correct / total if total > 0 else 0
    return accuracy

# Evaluate model
print("Evaluating model on test set...")
correct = 0
total = 0
accuracy = evaluate_model(model, tokenizer, test_data, device, correct, total)
print(f"Test Accuracy: {accuracy:.4f} ({correct}/{total})")

Checking model directory contents:
Error loading model: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /content/drive/MyDrive/dataset/trained_model.
Loading the base model instead...
Loaded 60 test examples
Evaluating model on test set...


  2%|▏         | 1/60 [00:05<05:38,  5.74s/it]


Question: A 10 Ω resistor is connected across a 15 V battery. What is the current flowing through the resistor?
Choices: 0.5 A, 1 A, 1.5 A, 2 A
Correct Answer: 1.5 A
Predicted: 2 A


  3%|▎         | 2/60 [00:11<05:44,  5.94s/it]


Question: A force of 100 N is applied to a 50 kg object. What is the acceleration of the object?
Choices: 1 m/s², 2 m/s², 3 m/s², 4 m/s²
Correct Answer: 2 m/s²
Predicted: 4 m/s2


  5%|▌         | 3/60 [00:14<04:02,  4.26s/it]


Question: What is the equivalent resistance of two 10Ω resistors in parallel?
Choices: 5Ω, 10Ω, 20Ω, 0Ω
Correct Answer: 5Ω
Predicted: 0


  7%|▋         | 4/60 [00:18<03:54,  4.19s/it]


Question: A force of 80 N is applied to a 8 kg block. What is the acceleration of the block?
Choices: 5 m/s², 10 m/s², 15 m/s², 20 m/s²
Correct Answer: 10 m/s²
Predicted: 20 m/s2


  8%|▊         | 5/60 [00:21<03:35,  3.92s/it]


Question: Two resistors of 15Ω and 5Ω are connected in parallel. What is the equivalent resistance?
Choices: 1Ω, 2Ω, 3.75Ω, 5Ω
Correct Answer: 3.75Ω
Predicted: 5


100%|██████████| 60/60 [03:16<00:00,  3.27s/it]

Test Accuracy: 0.2667 (0/0)





In [1]:
import torch
import json
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset
import pickle

# Load trained model
model_path = "/content/drive/MyDrive/dataset/trained_model.pkl"
with open(model_path, "rb") as f:
    model = pickle.load(f)

# Load tokenizer
model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)

# Load test dataset
test_file_path = "/content/drive/MyDrive/dataset/test.json"
with open(test_file_path, "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Define test dataset class
class PhysicsDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = f"Question: {item['question']} Choices: {', '.join(item['choices'])} Answer:"
        idx = ord(item["answer"][0])-ord("A")  # Convert letter to index
        target = item["choices"][idx]

        encodings = self.tokenizer(prompt, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        target_encodings = self.tokenizer(target, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")

        return {
            "input_ids": encodings["input_ids"].squeeze(),
            "attention_mask": encodings["attention_mask"].squeeze(),
            "labels": target_encodings["input_ids"].squeeze(),
            "correct_answer": target
        }

# Prepare test dataset
test_dataset = PhysicsDataset(test_data, tokenizer)

# Function to evaluate accuracy
def evaluate_accuracy(model, dataset, tokenizer):
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0

    for item in dataset:
        input_ids = item["input_ids"].unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu")
        attention_mask = item["attention_mask"].unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu")

        # Generate prediction
        with torch.no_grad():
            output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)

        # Decode predicted and actual answer
        predicted_answer = tokenizer.decode(output[0], skip_special_tokens=True)
        correct_answer = item["correct_answer"]

        # Compare answers
        if predicted_answer.strip().lower() == correct_answer.strip().lower():
            correct += 1
        total += 1

    accuracy = (correct / total) * 100
    return accuracy

# Run evaluation
accuracy = evaluate_accuracy(model, test_dataset, tokenizer)
print(f"Test Accuracy: {accuracy:.2f}%")


KeyboardInterrupt: 