# Grammar Scoring Engine for Spoken English
This project aims to develop a regression model that evaluates grammar usage from spoken audio samples, generating a continuous grammar score between 0 and 5.


In [None]:
import kagglehub

# Download the latest copy of the competition files.
# kagglehub handles are slugs, not Kaggle CLI command lines: competition data
# is fetched with `competition_download("<competition-slug>")`, whereas
# `dataset_download` expects an "<owner>/<dataset>" handle.
path = kagglehub.competition_download("shl-intern-hiring-assessment")

print("Path to dataset files:", path)

In [26]:
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
import whisper
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from scripts.model import predict_scores

# Read the training label table and report how many rows it holds.
train_df = pd.read_csv("data/audios/train.csv")
print(f"Number of training samples: {train_df.shape[0]}")

# Preview the first rows (rendered as the cell output).
train_df.head()


Number of training samples: 444


Unnamed: 0,filename,label
0,audio_710.wav,1.0
1,audio_1265.wav,1.0
2,audio_1114.wav,1.5
3,audio_946.wav,1.5
4,audio_1127.wav,2.0


# Preprocessing Summary
- All audio files were converted to text using OpenAI's Whisper model.
- Transcriptions were saved in `transcripts.csv`.
- Each transcription was analyzed for grammatical errors using LanguageTool.
- The grammar score is calculated as:
  > `grammar_score = max(0, 5 - 0.1 * num_errors)`

In [None]:
def transcribe_audio_files(data_dir="data/audios/train", csv_file="data/audios/transcripts.csv"):
    """Transcribe the ``.wav`` files in *data_dir* and write them to *csv_file*.

    Loads the Whisper ``"base"`` model once, walks the directory entries in
    sorted order, and collects one ``{"filename", "transcript"}`` row per
    file that transcribes successfully. The rows are written to *csv_file*
    without an index column.

    Args:
        data_dir: Directory that is listed for ``.wav`` files.
        csv_file: Destination path of the transcript CSV (output, overwritten).
    """
    asr = whisper.load_model("base")
    rows = []

    for name in tqdm(sorted(os.listdir(data_dir))):
        # Anything that is not a .wav file is ignored.
        if not name.endswith(".wav"):
            continue

        wav_path = os.path.join(data_dir, name)
        try:
            print(f"Transcribing: {wav_path}")
            output = asr.transcribe(wav_path)
            rows.append({"filename": name, "transcript": output["text"]})
        except Exception as err:
            # Best effort: a file that fails is reported and left out of the
            # CSV so the remaining files are still processed.
            print(f"Failed to transcribe {wav_path}: {err}")

    pd.DataFrame(rows).to_csv(csv_file, index=False)
    print(f"Transcripts saved to {csv_file}")
    
# Transcribe the training audio. Only the keywords the function defines
# (`data_dir`, `csv_file`) are passed; `csv_file` is the OUTPUT path, so it
# points at transcripts.csv and never at the label file train.csv.
transcribe_audio_files(
    data_dir="data/audios/train",
    csv_file="data/audios/transcripts.csv",
)


train_model(transcript_csv_path, train_csv_path): Trains and saves a regression model using the transcript texts (from the transcript CSV) and the grammar scores (from the training label CSV).
predict_scores(transcripts): Loads the trained model and predicts grammar scores for given transcripts.

In [None]:
MODEL_PATH = "model/grammar_model.joblib"

def train_model(transcript_csv_path, train_csv_path):
    """Fit a TF-IDF + linear-regression pipeline and save it to ``MODEL_PATH``.

    The label table and the transcript table are inner-joined on
    ``filename``; rows with a missing ``transcript`` or ``label`` are dropped
    before fitting.

    Args:
        transcript_csv_path: CSV with ``filename`` and ``transcript`` columns.
        train_csv_path: CSV with ``filename`` and ``label`` columns.

    Raises:
        ValueError: If no rows remain after the join and the missing-value
            filter, so there is nothing to fit on.
    """
    transcripts = pd.read_csv(transcript_csv_path)
    labels = pd.read_csv(train_csv_path)

    merged = pd.merge(labels, transcripts, on="filename", how="inner")
    merged = merged.dropna(subset=["transcript", "label"])
    if merged.empty:
        # Fail with an actionable message instead of an obscure
        # empty-vocabulary error from the vectorizer.
        raise ValueError(
            "No training rows left after joining labels with transcripts on "
            "'filename'; check that both CSVs refer to the same audio files."
        )

    X = merged["transcript"]
    y = merged["label"]

    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("lr", LinearRegression()),
    ])
    pipeline.fit(X, y)

    # Derive the output directory from MODEL_PATH so the two cannot drift
    # apart; skip makedirs when the path has no directory component.
    model_dir = os.path.dirname(MODEL_PATH)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump(pipeline, MODEL_PATH)

def predict_scores(transcripts):
    """Predict grammar scores for *transcripts* with the saved pipeline.

    Args:
        transcripts: Transcript texts; a ``pandas.Series`` is converted to a
            plain list before prediction, anything else is passed through.

    Returns:
        The pipeline's predictions clipped to the range [0, 5].

    Raises:
        FileNotFoundError: If no file exists at ``MODEL_PATH``.
    """
    texts = transcripts.tolist() if isinstance(transcripts, pd.Series) else transcripts

    if not os.path.exists(MODEL_PATH):
        raise FileNotFoundError("Trained model not found.")

    model = joblib.load(MODEL_PATH)
    raw_scores = model.predict(texts)
    # Predictions are clamped to the 0-5 range.
    return np.clip(raw_scores, 0, 5)


run_pipeline(input_csv, test_list, train_csv, output_csv): Runs the full pipeline: trains the model on the transcripts and training labels, then writes grammar score predictions for the files in the test list.

In [38]:
def run_pipeline(input_csv, test_list, train_csv, output_csv):
    """Train the model, score the test files, and write the predictions.

    Args:
        input_csv: Transcript CSV (``filename``, ``transcript``); used both
            for training and for looking up the test transcripts.
        test_list: CSV listing the test files in a ``filename`` column.
        train_csv: Label CSV handed to ``train_model``.
        output_csv: Destination CSV with ``filename`` and ``label`` columns.

    Raises:
        ValueError: If the scored table does not have exactly one row per
            row of the test list.
    """
    train_model(input_csv, train_csv)

    transcripts_df = pd.read_csv(input_csv)
    test_df = pd.read_csv(test_list)

    # Left join keeps every test file; files without a transcript are scored
    # on an empty string.
    joined = test_df.merge(transcripts_df, on="filename", how="left")
    joined["transcript"] = joined["transcript"].fillna("")
    joined["label"] = predict_scores(joined["transcript"])
    scored_df = joined[["filename", "label"]]

    expected_rows = test_df.shape[0]
    actual_rows = scored_df.shape[0]
    if actual_rows != expected_rows:
        raise ValueError(f"Expected {expected_rows} rows, got {actual_rows}.")

    scored_df.to_csv(output_csv, index=False)


In [39]:
# Train on the labelled transcripts, then score the files listed in test.csv.
run_pipeline(
    train_csv="data/audios/train.csv",
    input_csv="data/audios/transcripts.csv",
    test_list="data/audios/test.csv",
    output_csv="data/scored_transcripts.csv",
)


In [9]:
def score_transcript(text):
    """Return a placeholder 0-5 score based on the transcript's word count.

    This is example logic, not the trained model: every 20 whitespace-
    separated words add one point, capped at 5.0 (reached at 100 words).

    Args:
        text: Transcript text. ``None`` and float NaN (what pandas yields for
            a missing transcript) are treated like an empty transcript.

    Returns:
        The score rounded to two decimals; 0.0 for missing or blank text.
    """
    import math

    # Missing values would otherwise crash on `.strip()`.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return 0.0
    if not text.strip():
        return 0.0

    word_count = len(text.split())
    # A word count is never negative, so only the upper bound needs clamping.
    score = min(5.0, word_count / 20)
    return round(score, 2)


evaluate_rmse(pred_csv_path, label_csv_path): Calculates the RMSE between predicted and true grammar scores.

In [16]:

def evaluate_rmse(pred_csv_path, label_csv_path):
    """Compute the RMSE between predicted and true grammar scores.

    Both CSVs are read and inner-joined on ``filename`` so that each
    prediction is compared with the label of the same file. Both files must
    carry their score in a ``label`` column; after the join these become
    ``label_true`` (from *label_csv_path*) and ``label_pred`` (from
    *pred_csv_path*).

    Args:
        pred_csv_path: CSV with ``filename`` and predicted ``label``.
        label_csv_path: CSV with ``filename`` and true ``label``.

    Returns:
        The root mean squared error over the joined rows.
    """
    pred_df = pd.read_csv(pred_csv_path)
    label_df = pd.read_csv(label_csv_path)

    # Merge on filename to ensure proper alignment
    merged = pd.merge(label_df, pred_df, on="filename", suffixes=("_true", "_pred"))

    # Take the square root of the MSE explicitly: the `squared=False`
    # keyword is deprecated and absent from recent scikit-learn releases.
    mse = mean_squared_error(merged["label_true"], merged["label_pred"])
    return np.sqrt(mse)


In [27]:
# Inputs: the training labels and the Whisper transcripts.
train_df = pd.read_csv("data/audios/train.csv")
transcript_df = pd.read_csv("data/audios/transcripts.csv")

# Attach each transcript to its label row by filename, then discard the rows
# that have no transcript.
merged_df = train_df.merge(transcript_df, on="filename", how="left")
merged_df = merged_df.dropna(subset=["transcript"])

# Score the remaining transcripts with the saved model.
merged_df["predicted"] = predict_scores(merged_df["transcript"])

# Persist filename/prediction pairs for the RMSE check.
predictions_df = merged_df[["filename", "predicted"]]
predictions_df.to_csv("data/train_predictions.csv", index=False)



In [28]:
# Compare the saved training-set predictions with the true labels.
true = pd.read_csv("data/audios/train.csv")
pred = pd.read_csv("data/train_predictions.csv")

merged = pd.merge(true, pred, on="filename")
# Take the square root of the MSE explicitly: the `squared=False` keyword is
# deprecated and absent from recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(merged["label"], merged["predicted"]))
print("RMSE:", rmse)


RMSE: 0.2099146758382819




# Conclusion
- Whisper-based transcription enabled high-quality text generation from spoken audio.
- Grammar scores derived using rule-based error detection.
- A linear regression model on TF-IDF features of the transcripts was used to predict the MOS grammar scores.
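- RMSE on training set: `0.2099146758382819` (in-sample: the model is evaluated on the same data it was fit on, so this is not an estimate of held-out performance)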
