<a href="https://colab.research.google.com/github/YakshithK/mentora/blob/yakshith%2Fai-grader/ai-grader-microservice/Mentora_Scripts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import csv
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

print(" loading file...")
with open('leaf.jsonl', 'r', encoding='utf-8') as f:
    lines = f.readlines()

data = []
for line in lines:
    obj = json.loads(line)
    data.append({
        'split': obj['split'],
        'essay_text': obj['essay_text'],
        'human_feedback': obj['human_feedback_text']
    })

print(f"loaded {len(data)} essays.")

print("loading model...")
model_name = "KevSun/Engessay_grading_ML"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
print(f"🚀 Model loaded on: {device}")

def get_grades(text):
    if not text or not isinstance(text, str) or text.strip() == "":
        return [1.0] * 6  # Default fail-safe values
    encoded_input = tokenizer(
        text, return_tensors='pt', padding=True, truncation=True, max_length=64
    ).to(device)

    with torch.no_grad():
        outputs = model(**encoded_input)

    predictions = outputs.logits.squeeze().cpu().numpy()
    item_names = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
    scaled_scores = 2.25 * predictions - 1.25
    rounded_scores = [round(score * 2) / 2 for score in scaled_scores]
    return rounded_scores

print("starting grading...")
for i, item in enumerate(data):
    essay = item['essay_text']
    scores = get_grades(essay)
    item['grade'] = scores
    if i % 50 == 0:
        print(f"processed {i}/{len(data)} essays...")


def write_csv(filtered_data, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['essay_text', 'human_feedback', 'grade']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for row in filtered_data:
            # Only keep expected fields
            cleaned_row = {key: row[key] for key in fieldnames}
            writer.writerow(cleaned_row)


train_data = [d for d in data if d['split'] == 'train']
test_data  = [d for d in data if d['split'] == 'test']

print("writing csv files...")
write_csv(train_data, 'training.csv')
write_csv(test_data, 'testing.csv')

print("done! files saved: training.csv and testing.csv")


 loading file...
loaded 4918 essays.
loading model...
🚀 Model loaded on: cpu
starting grading...
processed 0/4918 essays...
processed 50/4918 essays...
processed 100/4918 essays...
processed 150/4918 essays...
processed 200/4918 essays...
processed 250/4918 essays...
processed 300/4918 essays...
processed 350/4918 essays...
processed 400/4918 essays...
processed 450/4918 essays...
processed 500/4918 essays...
processed 550/4918 essays...
processed 600/4918 essays...
processed 650/4918 essays...
processed 700/4918 essays...
processed 750/4918 essays...
processed 800/4918 essays...
processed 850/4918 essays...
processed 900/4918 essays...
processed 950/4918 essays...
processed 1000/4918 essays...
processed 1050/4918 essays...
processed 1100/4918 essays...
processed 1150/4918 essays...
processed 1200/4918 essays...
processed 1250/4918 essays...
processed 1300/4918 essays...
processed 1350/4918 essays...
processed 1400/4918 essays...
processed 1450/4918 essays...
processed 1500/4918 essays

In [10]:
!pip install transformers datasets torch pandas scikit-learn --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m121.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m95.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m770.2 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

In [24]:
train_df = pd.read_csv("training.csv").dropna(subset=["essay_text", "human_feedback"]).head(150)
test_df = pd.read_csv("testing.csv").dropna(subset=["essay_text", "human_feedback"]).head(150)

train_df["grading_pattern"] = train_df["human_feedback"].apply(
    lambda f: "Extract and list specific rubric-based patterns the teacher follows. For example:\n- Strict on thesis clarity\n- Tolerant of minor grammar errors\n- Values personal voice\nFeedback: " + f
)

test_df["grading_pattern"] = test_df["human_feedback"].apply(
    lambda f: "Extract and list specific rubric-based patterns the teacher follows. For example:\n- Strict on thesis clarity\n- Tolerant of minor grammar errors\n- Values personal voice\nFeedback: " + f
)

train_dataset = Dataset.from_pandas(train_df[["essay_text", "grading_pattern"]])
test_dataset = Dataset.from_pandas(test_df[["essay_text", "grading_pattern"]])


In [25]:
model_checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_input_length = 512
max_target_length = 256

def preprocess(example):
    inputs = tokenizer(example["essay_text"], truncation=True, padding="max_length", max_length=max_input_length)
    targets = tokenizer(example["grading_pattern"], truncation=True, padding="max_length", max_length=max_target_length)
    inputs["labels"] = targets["input_ids"]
    return inputs

train_tokenized = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)
test_tokenized = test_dataset.map(preprocess, remove_columns=test_dataset.column_names)

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [26]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [27]:
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [28]:
training_args = TrainingArguments(
    output_dir = "./flan-t5-feedback",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    weight_decay=0.01,
    load_best_model_at_end=True,
    push_to_hub=True,
    report_to="none"
)

In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,4.153334
2,No log,3.661808
3,4.676600,3.536682


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=114, training_loss=4.581195530138518, metrics={'train_runtime': 1651.476, 'train_samples_per_second': 0.272, 'train_steps_per_second': 0.069, 'total_flos': 83650727116800.0, 'train_loss': 4.581195530138518, 'epoch': 3.0})

In [30]:
def generate_insights(essay):
    inputs = tokenizer(essay, return_tensors="pt", truncation=True, padding=True, max_length=max_input_length)
    outputs = model.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


sample_essay = test_df.iloc[0]["essay_text"]
print(generate_insights(sample_essay))
print(sample_essay)

On the one hand, many people tend to repeat the same things as they believe they have achieved some security. However, change is not always a personal option. Even when people believe they are resisting change themselves, they cannot stop the world around them from changing. For instance, technology and scientific breakthroughs are changing the world on a daily basis. n On the other hand, I believe that change brings positive consequences for people. To give one example, repeating one job may be too tedious, and change can help people to have motivation for working. However, change is not always for the better. A lot of innovations, for example, are made with the aim of making money. In that case, new things should not be promoted. As a result, people need to keep pace with changes. n On the whole, I believe that people should not remain to do the same things, although all change does not necessarily have good outcomes. n On the whole, I believe that people should not remain to do the 

In [None]:
# Save model and tokenizer to a folder inside your Colab environment
model.save_pretrained("./my-fine-tuned-model")
tokenizer.save_pretrained("./my-fine-tuned-model")

In [None]:
import shutil

# Zip the folder
shutil.make_archive("my-fine-tuned-model", 'zip', "./my-fine-tuned-model")

In [None]:
from google.colab import files

# Download the zipped model folder
files.download("my-fine-tuned-model.zip")

In [15]:
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.7.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading https://download.pyto

In [49]:
from bertopic import BERTopic
import pandas as pd

# 1. Input: Essay feedback samples
feedback_sentences = [
    "Dear student, your discussion would have been better if you had tried to follow the proper discussion outline as indicated in the original prompt.",
    "You neglected to properly assess the requirements of the essay and represent it in your outline plus paragraph discussion.",
    "You were asked to discuss 2 points of view and then your personal opinion. You only discussed one point of view and your opinion.",
    "In terms of task accuracy, that would result in a score of 4 because your response to the required discussion is minimal.",
    "The failure of your opening statement, the paraphrasing sealed the failing score for your TA portion."
]

teacher_ids = ["teacher_A"] * len(feedback_sentences)

# 2. Train BERTopic model
topic_model = BERTopic(umap_model=None)
topics, probs = topic_model.fit_transform(feedback_sentences)

# 3. Analyze topic labels
topic_info = topic_model.get_topic_info()
print("\n=== Topic Labels ===")
print(topic_info)

# 4. Classify feedback as strict or not (you can replace this with ML)
sentiments = [
    "strict" if any(word in sent.lower() for word in ["better", "neglected", "failing"]) else "neutral"
    for sent in feedback_sentences
]

# 5. Create full DataFrame
df = pd.DataFrame({
    "teacher": teacher_ids,
    "sentence": feedback_sentences,
    "topic_id": topics,
    "sentiment": sentiments
})

# 6. Map topic ID to BERTopic label
topic_label_map = {row["Topic"]: row["Name"] for _, row in topic_info.iterrows()}
df["topic_label"] = df["topic_id"].map(topic_label_map)

# 7. Aggregate strictness per topic
agg = df.groupby(["teacher", "topic_label", "sentiment"]).size().unstack(fill_value=0)
agg["total"] = agg.sum(axis=1)
agg["strict_ratio"] = agg.get("strict", 0) / agg["total"]

# 8. Display profile
profile = agg.reset_index()[["teacher", "topic_label", "strict_ratio"]]
print("\n=== Grading Style Profile ===")
print(profile)


  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(


TypeError: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.

In [47]:
from collections import Counter
import re

def get_top_keywords(sentences, n=5):
    words = []
    for sent in sentences:
        words += re.findall(r'\b\w+\b', sent.lower())
    most_common = Counter(words).most_common(n)
    return [word for word, _ in most_common if word not in {"the", "you", "and", "your"}]

for cluster_id in sorted(set(clusters)):
    sents = df[df["cluster"] == cluster_id]["sentence"].tolist()
    keywords = get_top_keywords(sents)
    print(f"Topic {cluster_id}: {' / '.join(keywords)}")


Topic 0: of
Topic 1: in / of / terms / task / accuracy
Topic 2: discussion / dear / student


In [55]:
import pandas as pd
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [53]:
df = pd.read_csv("grammar.csv")

X = df['text']
y = df['label']

vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

In [54]:
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

In [57]:
model = Sequential([
    Dense(128, activation="relu", input_shape=(X_train_dense.shape[1],)),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])

In [59]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [61]:
early_stop = EarlyStopping(monitor='val_loss', patience=3)

history = model.fit(X_train_dense, y_train,
                    validation_split=0.2,
                    epochs=10,
                    batch_size=32,
                    callbacks=[early_stop])

Epoch 1/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0015 - val_accuracy: 1.0000 - val_loss: 5.6247e-04
Epoch 2/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0000 - loss: 0.0013 - val_accuracy: 1.0000 - val_loss: 4.5065e-04
Epoch 3/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0011 - val_accuracy: 1.0000 - val_loss: 3.6341e-04
Epoch 4/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 8.4662e-04 - val_accuracy: 1.0000 - val_loss: 2.9670e-04
Epoch 5/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0000 - loss: 7.1977e-04 - val_accuracy: 1.0000 - val_loss: 2.4739e-04
Epoch 6/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 7.2600e-04 - val_accuracy: 1.0000 - val_loss: 2.0813e-04
Epoc

In [64]:
y_pred = model.predict(X_test_dense)
y_pred_labels = (y_pred > 0.5).astype(int)

acc = accuracy_score(y_test, y_pred_labels)
cm = confusion_matrix(y_test, y_pred_labels)

print(f"Accuracy: {acc}")
print("Confusion Matrix:")
print(cm)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
Accuracy: 1.0
Confusion Matrix:
[[119   0]
 [  0 101]]
