## 1. Introduction & Objective

The goal of this project is to build a Grammar Scoring Engine that evaluates grammar proficiency from spoken audio samples.
Each audio clip is 45-60 seconds long and is labeled with a Grammar Score (0 to 5) based on the MOS Likert scale.

In [4]:
import os
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr
import language_tool_python
import whisper

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
!pip install matplotlib

## 2. Dataset Overview
Load the training and test metadata (`train.csv`, `test.csv`) into the Colab session.

In [None]:
# Read both metadata tables; the CSV files are looked up in the working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

# Quick look at the size and the first rows of the training table.
print(f"Train shape: {train_df.shape}")
print(train_df.head())

### Audio Preprocessing

In [None]:
def load_audio(path, sr=16000):
    """Load an audio file with librosa and return only its samples.

    Args:
        path: Location of the audio file.
        sr: Sample rate passed to ``librosa.load`` (16000 by default).

    Returns:
        The waveform returned by ``librosa.load``; the accompanying sample
        rate is discarded.
    """
    waveform, _sample_rate = librosa.load(path, sr=sr)
    return waveform

# Smoke-test the loader on the first recording listed in the training table.
first_file_name = train_df["file_name"].iloc[0]
audio_path = os.path.join("train", first_file_name)
audio_data = load_audio(audio_path)

### Transcription (Whisper)

In [None]:
# Load Whisper's "base" checkpoint once; transcribe_audio reuses this instance.
whisper_model = whisper.load_model("base")

def transcribe_audio(path):
    """Transcribe one audio file with the shared Whisper model.

    Args:
        path: Location of the audio file handed to ``whisper_model.transcribe``.

    Returns:
        The value stored under ``'text'`` in the transcription result.
    """
    return whisper_model.transcribe(path)['text']

### Grammar Error Detection

In [None]:
# Shared LanguageTool checker configured for 'en-US'; used by grammar_error_count.
tool = language_tool_python.LanguageTool('en-US')

def grammar_error_count(text):
    """Count the issues the shared LanguageTool checker reports for ``text``.

    Args:
        text: The text to check.

    Returns:
        Number of matches returned by ``tool.check(text)``.
    """
    return len(tool.check(text))

### Feature Extraction

In [None]:
def extract_features(audio_path, sr=16000, n_mfcc=13):
    """Build the feature vector for one recording.

    The vector is the per-coefficient mean of the MFCCs followed by the number
    of grammar issues found in the transcript of the recording.

    Args:
        audio_path: Path to the audio file.
        sr: Sample rate used both for loading the audio and for the MFCC
            computation (16000 by default).
        n_mfcc: Number of MFCC coefficients to compute (13 by default).

    Returns:
        A list of length ``n_mfcc + 1``: the MFCC means, then the error count.
    """
    transcript = transcribe_audio(audio_path)
    num_errors = grammar_error_count(transcript)

    # Reuse the notebook's loader so loading and MFCC share one sample rate.
    audio = load_audio(audio_path, sr=sr)

    # The signal must be passed as the keyword ``y``: librosa >= 0.10 made the
    # arguments of feature.mfcc keyword-only, so a positional call raises
    # TypeError there.
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    mfcc_mean = mfcc.mean(axis=1)  # one mean per coefficient

    return list(mfcc_mean) + [num_errors]

# Sanity check: build the feature vector for the sample recording.
sample_features = extract_features(audio_path)
print(f"Feature vector length: {len(sample_features)}")


# Prepare Dataset for Model

In [None]:
# Build the feature matrix X and target vector y from the training table.
feature_rows = []
labels = []

for file_name, label in zip(train_df["file_name"], train_df["label"]):
    # Each recording lives under the "train" directory.
    feature_rows.append(extract_features(os.path.join("train", file_name)))
    labels.append(label)

X = np.array(feature_rows)
y = np.array(labels)

### Train-Test Split

In [None]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## TRAIN MODEL

In [None]:
# Fit a 100-tree random forest on the training split; the seed makes training repeatable.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)


## Evaluation

In [None]:
# Score the validation split: squared error, absolute error, and Pearson correlation.
y_pred = model.predict(X_val)

mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
pearson_corr, _p_value = pearsonr(y_val, y_pred)  # only the coefficient is reported

for metric_label, metric_value in (
    ("MSE:", mse),
    ("MAE:", mae),
    ("Pearson Correlation:", pearson_corr),
):
    print(metric_label, metric_value)

## Visualization

In [None]:
# Scatter plot of predicted vs. actual scores on the validation split.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_val, y=y_pred, ax=ax)
ax.set_xlabel("Actual Scores")
ax.set_ylabel("Predicted Scores")
ax.set_title("Predicted vs Actual Grammar Scores")
ax.grid(True)
plt.show()

# Predict on Test Set

In [None]:
# Score every test recording and write the predictions in submission format.
submission = pd.read_csv("sample_submission.csv")

# Extract all feature vectors first so the model is invoked once on the whole
# matrix instead of once per file.
test_features = [
    extract_features(os.path.join("test", filename))
    for filename in test_df["file_name"]
]
predictions = model.predict(np.asarray(test_features)) if test_features else []

# NOTE(review): labels are assigned by position, which assumes
# sample_submission.csv lists files in the same order as test.csv -- confirm.
if len(predictions) != len(submission):
    raise ValueError(
        f"Got {len(predictions)} predictions for {len(submission)} submission rows"
    )

submission["label"] = predictions
submission.to_csv("submission.csv", index=False)

# ============================
# Conclusion
# ============================

"""
In this notebook, we built a baseline Grammar Scoring Engine using audio and linguistic features.
Future improvements may include deep learning models (e.g., BERT + CNN) and data augmentation.
"""